CC-统计词频 | 乐趣区

分别用C和C++实现词频统计，要求用fopen打开文本文件，最后将统计结果输出至文件。

1、首先用纯C实现，定义一个存放单词和出现频次的结点，用链表实现存储，代码如下：

#include <stdio.h>#include <stdlib.h>#include <string.h>// 定义一个存放单词和频次的节点，用链表实现存储typedef struct WordNode {    char word[100];          // 存放单词    int  word_cnt;           // 单词出现次数    struct WordNode* p_next; // 结构体指针} WordNode;// 函数声明void ProcessWord(char* word);     // 处理当前单词并统计void CountWord(char* word);       // 统计当前单词WordNode* SearchWord(char* word); // 查找当前单词void SortResult(void);            // 对统计结果按字典顺序排序void PrintResult(void);           // 输出统计结果void Release(void);               // 释放内存// 全局变量WordNode* g_p_head = NULL; // 存放单词链表的头指针int main(int argc, char* argv[]) {        if (argc != 2) {        printf("usage: %s filepath", argv[0]);        exit(1);    }        char temp_word[100]; // 用于临时存放当前单词    FILE* fp_read = NULL;    if ((fp_read = fopen(argv[1], "r")) == NULL) {        printf("open %s error!\n", argv[1]);        exit(1);    }        // 循环读取文本中的单词并进行处理    while (fscanf(fp_read, "%s", temp_word) != EOF) {        ProcessWord(temp_word);    }        fclose(fp_read); // 关闭文件    SortResult();    // 按字典顺序排序所统计的单词    PrintResult();   // 输出统计结果至文本    Release();       // 释放内存        return 0;}// 处理当前单词并统计void ProcessWord(char* word) {        char* special_char = ",;.:?\"\'\\><-+=|*^&%$#@!()"; // 可能存在的特殊字符，标点符号    int spec_length = strlen(special_char);    int word_length = strlen(word);        int int_res = atoi(word);    // 如果当前单词不是数字而是英文单词，才进行单词统计    if (int_res == 0 && strcmp("0", word) != 0) {         int i, j;        for (i = 0; i < word_length; i++) {                // 将单词中的大写字母均转化为小写字母            if (word[i] >= 'A' && word[i] <= 'Z') {                word[i] = word[i] + 32;            }                        // 将单词中可能存在的特殊字符均以空格符代替            for (j = 0; j < spec_length; j++) {                if (word[i] == special_char[j]) {                    word[i] = ' ';                }            }        }                // 用strtok分割经上述步骤处理的单词(可能存在多个单词)，并对每个单词进行统计        char* temp_word = NULL;        for (temp_word = strtok(word, " ");                  temp_word != NULL;                  temp_word = strtok(NULL, " ")) {            CountWord(temp_word);        }    } else { // 说明传入的字符串是数字，直接返回不做处理        return;    }}// 统计当前单词void CountWord(char* word) {        WordNode* p_find = NULL;    p_find = SearchWord(word);    if (p_find == NULL) {        return;    } else {        p_find->word_cnt++; // 对应单词次数加一    }}// 查找当前单词(链表查找效率较低下)WordNode* SearchWord(char* word) {    // 链表为空时，统计第一个单词    if (g_p_head == NULL) {        g_p_head = new WordNode;        strcpy(g_p_head->word, word);        g_p_head->word_cnt = 0;        g_p_head->p_next = NULL;                return g_p_head;    }        // 检查现有链表，确定当前查找单词的位置    WordNode* p_cur = g_p_head;    WordNode* p_pre = NULL;    while ((p_cur != NULL) && (strcmp(p_cur->word, word) != 0)) {        p_pre = p_cur;        p_cur = p_cur->p_next;    }        // 该单词不存在时，直接加入链表尾部    if (p_cur == NULL) {        p_cur = new WordNode;        strcpy(p_cur->word, word);        p_cur->word_cnt = 0;        p_cur->p_next = NULL;                p_pre->p_next = p_cur;    }        return p_cur;}// 对统计结果按字典顺序排序void SortResult(void) {        WordNode* p_pre = NULL;    WordNode* p_cur = NULL;    WordNode  p_tmp;        for (p_pre = g_p_head; p_pre->p_next != NULL; p_pre = p_pre->p_next) {        for (p_cur = p_pre->p_next; p_cur != NULL; p_cur = p_cur->p_next) {            if (strcmp(p_pre->word, p_cur->word) > 0) {                // 交换结点的数据域和指针域                p_tmp  = *p_pre;                *p_pre = *p_cur;                *p_cur = p_tmp;                                p_tmp.p_next  = p_pre->p_next;                p_pre->p_next = p_cur->p_next;                p_cur->p_next = p_tmp.p_next;            }        }    }}// 输出统计结果void PrintResult(void) {        FILE* fp_write = fopen("result_liang.txt", "w");    if (g_p_head == NULL) {        printf("No word in this file!\n");        return;    }        WordNode* p_cur = g_p_head;    while (p_cur != NULL) {        fprintf(fp_write, "%-20s\t%d\n", p_cur->word, p_cur->word_cnt);        p_cur = p_cur->p_next;    }        fclose(fp_write);}// 释放内存void Release(void) {        if (g_p_head == NULL) {        return;    }        WordNode* p_cur = g_p_head;    WordNode* p_p_tmp = NULL;    while (p_cur != NULL) {        p_p_tmp = p_cur->p_next;        delete p_cur;        p_cur = p_p_tmp;    }}

经编译运行结果可以发现两个问题：
一是整个过程用链表实现，在统计单词时需要查找整个链表，效率较低，且后期还需要对链表进行排序，总之，利用这种方式实现整体效率较为低下；二是在代码的ProcessWord函数中，以下过程是在处理文本中可能存在的数值的情况，利用atoi函数判断，atoi函数返回值为0则代表当前word不是数值而是英文单词，否则返回转换的数值。但有一种情况，即当word为“0”或是“00”或是“000”等时，返回的结果也为0，只利用strcmp("0", word)判断是不行的，若文本中存在类似“00”或者“000”等多0的情况，统计结果中还是会出现“0”和它的频次。

int int_res = atoi(word);// 如果当前单词不是数字而是英文单词，才进行单词统计if (int_res == 0 && strcmp("0", word) != 0) {     int i, j;    for (i = 0; i < word_length; i++) {            // 将单词中的大写字母均转化为小写字母        if (word[i] >= 'A' && word[i] <= 'Z') {            word[i] = word[i] + 32;        }         // 将单词中可能存在的特殊字符均以空格符代替        for (j = 0; j < spec_length; j++) {            if (word[i] == special_char[j]) {                word[i] = ' ';            }        }    }            // 用strtok分割经上述步骤处理的单词(可能存在多个单词)，并对每个单词进行统计    char* temp_word = NULL;    for (temp_word = strtok(word, " ");          temp_word != NULL;          temp_word = strtok(NULL, " ")) {        CountWord(temp_word);    }} else { // 说明传入的字符串是数字，直接返回不做处理    return;}

2、下面用C++实现，绝大部分功能实现都使用库函数，涉及了一些C++11的内容，代码如下：

#include <iostream>#include <map>#include <vector>#include <algorithm>#include <string>#include <locale>#include <cstdio>#include <cstring>// 函数声明bool GetFileContent(const std::string& file_path, std::string* content);bool SplitWords(const std::string& content, std::vector<std::string>* words);bool CountWordFrequency(const std::vector<std::string>& words, std::map<std::string, int>* word_2_count);bool OutputResultToFile(const std::map<std::string, int>& word_2_count, const std::string& file_path);int main(int argc, char* argv[]) {  if (argc != 2) {    printf("Usage: %s filepath", argv[0]);    return false;  }      std::string file_content;  // 读取文件内容并存入file_content中  bool get_file_result = GetFileContent(argv[1], &file_content);  if (!get_file_result) {    printf("Get file %s content error!\n", argv[1]);    return false;  }      // 划分file_content中的单词并存入vector容器words中  std::vector<std::string> words;  bool split_word_result = SplitWords(file_content, &words);  if (!split_word_result) {    printf("Split file %s content to words error!\n ", argv[1]);    return false;  }      // 统计单词和词频并用map容器words_frequency_map存放  std::map<std::string, int> words_frequency_map;  bool count_word_frequency_result = CountWordFrequency(words, &words_frequency_map);  if (!count_word_frequency_result) {    printf("Count word frequency error!\n");    return false;  }      // 将统计结果写入指定文本文件中  std::string result_file = "result_liang.txt";  bool output_to_file_result = OutputResultToFile(words_frequency_map, result_file);  if (!output_to_file_result) {    printf("Output result to file %s error!\n", result_file.c_str());    return false;  }  return 0;}bool GetFileContent(const std::string& file_path, std::string* content) {  // 要检查传入参数  if (file_path.empty())     return false;  if (!content)     return false;    FILE* fp = fopen(file_path.c_str(), "r");  if (!fp)     return false;    std::string inner_content;  inner_content.reserve(4096); // 预留空间，提升效率      const int buffer_size = 4096;  char buffer[buffer_size];  size_t read_size = 0;  bool ok = true;  while (1) {            // 每读一次写一次    read_size = fread(buffer, 1, sizeof(buffer), fp);    if (read_size >= sizeof(buffer)) {      inner_content.append(buffer, read_size);      continue;    }        // 读到文件尾部，将剩余内容写入inner_content并跳出循环    if (feof(fp)) {      inner_content.append(buffer, read_size);      break;    }    // on error    ok = false;    break;  }    if (ok)    content->swap(inner_content); // 交换空间给传出参数    // *content = std::move(inner_content);  fclose(fp);  return ok;}bool SplitWords(const std::string& content, std::vector<std::string>* words) {  if (!words)    return false;  if (content.empty()) // ?    return true;    std::string word;  std::vector<std::string> wordlist;        // 循环读取内容并划分单词存入vector容器中  int chr = 0;  for (int i = 0; i < (int)content.size(); ++i) { //++i占用的空间比i++要小，注意运算符重载中i++有一个亚元    chr = content[i];    if (!std::isalpha(chr)) { // 不是英文字母      if (!word.empty()) // 不为空，即word中存入了单词，将该单词放入wordlist容器中        wordlist.emplace_back(std::move(word)); // 相比于push_back，emplace_back可以避免额外类的复制和移动操作                                                                                                // 用move顺便清空word      continue;    }    word.append(1, chr);  }      if (!word.empty()) // 将最后一个单词放入wordlist中    wordlist.emplace_back(std::move(word));  words->swap(wordlist); // 交换空间给传出参数  return true;}bool CountWordFrequency(const std::vector<std::string>& words, std::map<std::string, int>* word_2_count) {  if (!word_2_count)    return false;  if (words.empty()) // ?    return true;    std::map<std::string, int> inner_word_counts;  std::string lower_word;  auto it = inner_word_counts.begin();  std::pair<std::map<std::string, int>::iterator, bool> insert_result;      for (const auto& word : words) { // for auto             // 将单词转化为小写    lower_word.resize(word.size());    std::transform(word.cbegin(), word.cend(), lower_word.begin(), ::tolower);    it = inner_word_counts.find(lower_word);    if (it == inner_word_counts.end()) { // 若是新的单词      insert_result = inner_word_counts.insert(std::make_pair(lower_word, 0));      it = insert_result.first;    }    it->second++;  }  word_2_count->swap(inner_word_counts);  return true;}bool OutputResult(const std::map<std::string, int>& word_2_count, FILE* fp) {  if (!fp)    return false;  if (word_2_count.empty())    return true;    int size = 0;  bool ok = true;  for (const auto& pair : word_2_count) {    size = fprintf(fp, "%-20s\t%d\n", pair.first.c_str(), pair.second);    if (size < 0) {      ok = false;      break;    }  }  return ok;}bool OutputResultToFile(const std::map<std::string, int>& word_2_count, const std::string& file_path) {  if (file_path.empty())    return false;  if (word_2_count.empty())    return true;      FILE* fp = fopen(file_path.c_str(), "w");  if (!fp)    return false;    bool ok = OutputResult(word_2_count, fp);  if (!ok) {    fclose(fp);    return false;  }  fclose(fp);  return ok;}

未完待续...