CC-统计词频

作者：

在

分别用C和C++实现词频统计，要求用fopen打开文本文件，最后将统计结果输出至文件。

1、首先用纯C实现，定义一个存放单词和出现频次的结点，用链表实现存储，代码如下：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// 定义一个存放单词和频次的节点，用链表实现存储
typedef struct WordNode {
    char word[100];          // 存放单词
    int  word_cnt;           // 单词出现次数
    struct WordNode* p_next; // 结构体指针
} WordNode;

// 函数声明
void ProcessWord(char* word);     // 处理当前单词并统计
void CountWord(char* word);       // 统计当前单词
WordNode* SearchWord(char* word); // 查找当前单词
void SortResult(void);            // 对统计结果按字典顺序排序
void PrintResult(void);           // 输出统计结果
void Release(void);               // 释放内存

// 全局变量
WordNode* g_p_head = NULL; // 存放单词链表的头指针

int main(int argc, char* argv[]) {
    
    if (argc != 2) {
        printf("usage: %s filepath", argv[0]);
        exit(1);
    }
    
    char temp_word[100]; // 用于临时存放当前单词
    FILE* fp_read = NULL;
    if ((fp_read = fopen(argv[1], "r")) == NULL) {
        printf("open %s error!\n", argv[1]);
        exit(1);
    }
    
    // 循环读取文本中的单词并进行处理
    while (fscanf(fp_read, "%s", temp_word) != EOF) {
        ProcessWord(temp_word);
    }
    
    fclose(fp_read); // 关闭文件
    SortResult();    // 按字典顺序排序所统计的单词
    PrintResult();   // 输出统计结果至文本
    Release();       // 释放内存
    
    return 0;
}

// 处理当前单词并统计
void ProcessWord(char* word) {
    
    char* special_char = ",;.:?\"\'\\><-+=|*^&%$#@!()"; // 可能存在的特殊字符，标点符号
    int spec_length = strlen(special_char);
    int word_length = strlen(word);
    
    int int_res = atoi(word);
    // 如果当前单词不是数字而是英文单词，才进行单词统计
    if (int_res == 0 && strcmp("0", word) != 0) { 
        int i, j;
        for (i = 0; i < word_length; i++) {    
            // 将单词中的大写字母均转化为小写字母
            if (word[i] >= 'A' && word[i] <= 'Z') {
                word[i] = word[i] + 32;
            }
            
            // 将单词中可能存在的特殊字符均以空格符代替
            for (j = 0; j < spec_length; j++) {
                if (word[i] == special_char[j]) {
                    word[i] = ' ';
                }
            }
        }
        
        // 用strtok分割经上述步骤处理的单词(可能存在多个单词)，并对每个单词进行统计
        char* temp_word = NULL;
        for (temp_word = strtok(word, " "); 
                 temp_word != NULL; 
                 temp_word = strtok(NULL, " ")) {
            CountWord(temp_word);
        }
    } else { // 说明传入的字符串是数字，直接返回不做处理
        return;
    }
}

// 统计当前单词
void CountWord(char* word) {
    
    WordNode* p_find = NULL;
    p_find = SearchWord(word);
    if (p_find == NULL) {
        return;
    } else {
        p_find->word_cnt++; // 对应单词次数加一
    }
}

// 查找当前单词(链表查找效率较低下)
WordNode* SearchWord(char* word) {
    // 链表为空时，统计第一个单词
    if (g_p_head == NULL) {
        g_p_head = new WordNode;
        strcpy(g_p_head->word, word);
        g_p_head->word_cnt = 0;
        g_p_head->p_next = NULL;
        
        return g_p_head;
    }
    
    // 检查现有链表，确定当前查找单词的位置
    WordNode* p_cur = g_p_head;
    WordNode* p_pre = NULL;
    while ((p_cur != NULL) && (strcmp(p_cur->word, word) != 0)) {
        p_pre = p_cur;
        p_cur = p_cur->p_next;
    }
    
    // 该单词不存在时，直接加入链表尾部
    if (p_cur == NULL) {
        p_cur = new WordNode;
        strcpy(p_cur->word, word);
        p_cur->word_cnt = 0;
        p_cur->p_next = NULL;
        
        p_pre->p_next = p_cur;
    }
    
    return p_cur;
}

// 对统计结果按字典顺序排序
void SortResult(void) {
    
    WordNode* p_pre = NULL;
    WordNode* p_cur = NULL;
    WordNode  p_tmp;
    
    for (p_pre = g_p_head; p_pre->p_next != NULL; p_pre = p_pre->p_next) {
        for (p_cur = p_pre->p_next; p_cur != NULL; p_cur = p_cur->p_next) {
            if (strcmp(p_pre->word, p_cur->word) > 0) {
                // 交换结点的数据域和指针域
                p_tmp  = *p_pre;
                *p_pre = *p_cur;
                *p_cur = p_tmp;
                
                p_tmp.p_next  = p_pre->p_next;
                p_pre->p_next = p_cur->p_next;
                p_cur->p_next = p_tmp.p_next;
            }
        }
    }
}

// 输出统计结果
void PrintResult(void) {
    
    FILE* fp_write = fopen("result_liang.txt", "w");
    if (g_p_head == NULL) {
        printf("No word in this file!\n");
        return;
    }
    
    WordNode* p_cur = g_p_head;
    while (p_cur != NULL) {
        fprintf(fp_write, "%-20s\t%d\n", p_cur->word, p_cur->word_cnt);
        p_cur = p_cur->p_next;
    }
    
    fclose(fp_write);
}

// 释放内存
void Release(void) {
    
    if (g_p_head == NULL) {
        return;
    }
    
    WordNode* p_cur = g_p_head;
    WordNode* p_p_tmp = NULL;
    while (p_cur != NULL) {
        p_p_tmp = p_cur->p_next;
        delete p_cur;
        p_cur = p_p_tmp;
    }
}

经编译运行结果可以发现两个问题：
一是整个过程用链表实现，在统计单词时需要查找整个链表，效率较低，且后期还需要对链表进行排序，总之，利用这种方式实现整体效率较为低下；二是在代码的ProcessWord函数中，以下过程是在处理文本中可能存在的数值的情况，利用atoi函数判断，atoi函数返回值为0则代表当前word不是数值而是英文单词，否则返回转换的数值。但有一种情况，即当word为“0”或是“00”或是“000”等时，返回的结果也为0，只利用strcmp(“0”, word)判断是不行的，若文本中存在类似“00”或者“000”等多0的情况，统计结果中还是会出现“0”和它的频次。

int int_res = atoi(word);
// 如果当前单词不是数字而是英文单词，才进行单词统计
if (int_res == 0 && strcmp("0", word) != 0) { 
    int i, j;
    for (i = 0; i < word_length; i++) {    
        // 将单词中的大写字母均转化为小写字母
        if (word[i] >= 'A' && word[i] <= 'Z') {
            word[i] = word[i] + 32;
        }
 
        // 将单词中可能存在的特殊字符均以空格符代替
        for (j = 0; j < spec_length; j++) {
            if (word[i] == special_char[j]) {
                word[i] = ' ';
            }
        }
    }
        
    // 用strtok分割经上述步骤处理的单词(可能存在多个单词)，并对每个单词进行统计
    char* temp_word = NULL;
    for (temp_word = strtok(word, " "); 
         temp_word != NULL; 
         temp_word = strtok(NULL, " ")) {
        CountWord(temp_word);
    }
} else { // 说明传入的字符串是数字，直接返回不做处理
    return;
}

2、下面用C++实现，绝大部分功能实现都使用库函数，涉及了一些C++11的内容，代码如下：

#include <iostream>
#include <map>
#include <vector>
#include <algorithm>
#include <string>
#include <locale>
#include <cstdio>
#include <cstring>

// 函数声明
bool GetFileContent(const std::string& file_path, std::string* content);
bool SplitWords(const std::string& content, std::vector<std::string>* words);
bool CountWordFrequency(const std::vector<std::string>& words, std::map<std::string, int>* word_2_count);
bool OutputResultToFile(const std::map<std::string, int>& word_2_count, const std::string& file_path);

int main(int argc, char* argv[]) {
  if (argc != 2) {
    printf("Usage: %s filepath", argv[0]);
    return false;
  }
    
  std::string file_content;
  // 读取文件内容并存入file_content中
  bool get_file_result = GetFileContent(argv[1], &file_content);
  if (!get_file_result) {
    printf("Get file %s content error!\n", argv[1]);
    return false;
  }
    
  // 划分file_content中的单词并存入vector容器words中
  std::vector<std::string> words;
  bool split_word_result = SplitWords(file_content, &words);
  if (!split_word_result) {
    printf("Split file %s content to words error!\n ", argv[1]);
    return false;
  }
    
  // 统计单词和词频并用map容器words_frequency_map存放
  std::map<std::string, int> words_frequency_map;
  bool count_word_frequency_result = CountWordFrequency(words, &words_frequency_map);
  if (!count_word_frequency_result) {
    printf("Count word frequency error!\n");
    return false;
  }
    
  // 将统计结果写入指定文本文件中
  std::string result_file = "result_liang.txt";
  bool output_to_file_result = OutputResultToFile(words_frequency_map, result_file);
  if (!output_to_file_result) {
    printf("Output result to file %s error!\n", result_file.c_str());
    return false;
  }

  return 0;
}

bool GetFileContent(const std::string& file_path, std::string* content) {
  // 要检查传入参数
  if (file_path.empty()) 
    return false;
  if (!content) 
    return false;
  
  FILE* fp = fopen(file_path.c_str(), "r");
  if (!fp) 
    return false;
  
  std::string inner_content;
  inner_content.reserve(4096); // 预留空间，提升效率
    
  const int buffer_size = 4096;
  char buffer[buffer_size];
  size_t read_size = 0;
  bool ok = true;

  while (1) {
        
    // 每读一次写一次
    read_size = fread(buffer, 1, sizeof(buffer), fp);
    if (read_size >= sizeof(buffer)) {
      inner_content.append(buffer, read_size);
      continue;
    }
    
    // 读到文件尾部，将剩余内容写入inner_content并跳出循环
    if (feof(fp)) {
      inner_content.append(buffer, read_size);
      break;
    }

    // on error
    ok = false;
    break;
  }
  
  if (ok)
    content->swap(inner_content); // 交换空间给传出参数
    // *content = std::move(inner_content);

  fclose(fp);
  return ok;
}

bool SplitWords(const std::string& content, std::vector<std::string>* words) {
  if (!words)
    return false;
  if (content.empty()) // ?
    return true;
  
  std::string word;
  std::vector<std::string> wordlist;
    
    // 循环读取内容并划分单词存入vector容器中
  int chr = 0;
  for (int i = 0; i < (int)content.size(); ++i) { //++i占用的空间比i++要小，注意运算符重载中i++有一个亚元
    chr = content[i];
    if (!std::isalpha(chr)) { // 不是英文字母
      if (!word.empty()) // 不为空，即word中存入了单词，将该单词放入wordlist容器中
        wordlist.emplace_back(std::move(word)); // 相比于push_back，emplace_back可以避免额外类的复制和移动操作
                                                                                                // 用move顺便清空word
      continue;
    }

    word.append(1, chr);
  }
    
  if (!word.empty()) // 将最后一个单词放入wordlist中
    wordlist.emplace_back(std::move(word));

  words->swap(wordlist); // 交换空间给传出参数
  return true;
}

bool CountWordFrequency(const std::vector<std::string>& words, std::map<std::string, int>* word_2_count) {
  if (!word_2_count)
    return false;
  if (words.empty()) // ?
    return true;
  
  std::map<std::string, int> inner_word_counts;
  std::string lower_word;
  auto it = inner_word_counts.begin();
  std::pair<std::map<std::string, int>::iterator, bool> insert_result;
    
  for (const auto& word : words) { // for auto 
        
    // 将单词转化为小写
    lower_word.resize(word.size());
    std::transform(word.cbegin(), word.cend(), lower_word.begin(), ::tolower);

    it = inner_word_counts.find(lower_word);
    if (it == inner_word_counts.end()) { // 若是新的单词
      insert_result = inner_word_counts.insert(std::make_pair(lower_word, 0));
      it = insert_result.first;
    }

    it->second++;
  }

  word_2_count->swap(inner_word_counts);
  return true;
}

bool OutputResult(const std::map<std::string, int>& word_2_count, FILE* fp) {
  if (!fp)
    return false;
  if (word_2_count.empty())
    return true;
  
  int size = 0;
  bool ok = true;
  for (const auto& pair : word_2_count) {
    size = fprintf(fp, "%-20s\t%d\n", pair.first.c_str(), pair.second);
    if (size < 0) {
      ok = false;
      break;
    }
  }

  return ok;
}

bool OutputResultToFile(const std::map<std::string, int>& word_2_count, const std::string& file_path) {
  if (file_path.empty())
    return false;
  if (word_2_count.empty())
    return true;
    
  FILE* fp = fopen(file_path.c_str(), "w");
  if (!fp)
    return false;
  
  bool ok = OutputResult(word_2_count, fp);
  if (!ok) {
    fclose(fp);
    return false;
  }

  fclose(fp);
  return ok;
}

未完待续…

评论

发表回复取消回复

这个站点使用 Akismet 来减少垃圾评论。了解你的评论数据如何被处理。

更多文章