关于python:语料的流式加载

os.listdir('data/')  # Takes a directory path and returns a list of the names of the sub-directories and files inside it.

# Corpus streaming
class MySentences(object):
    """Stream a directory of text files as tokenised sentences.

    Iterating over an instance walks every entry of ``dirname``, reads it
    line by line and yields one list of tokens per line, so the whole
    corpus never has to be held in memory at once.

    The names ``smart_open``, ``jieba``, ``porter_stemmer``,
    ``wordnet_lemmatizer`` and ``stoplist`` are expected to be defined at
    module level before iteration starts.
    """

    def __init__(self, dirname):
        """Remember the directory whose files will be streamed.

        Args:
            dirname: Path of the directory containing the corpus files.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of cleaned tokens for every line of every file.

        Yields:
            list: The tokens of one line, stripped of surrounding
            whitespace, with stop words and tokens of length 1 or less
            removed.
        """
        for fname in os.listdir(self.dirname):
            print('正在解决文件{}'.format(fname))
            path = os.path.join(self.dirname, fname)
            # Use a context manager so the handle is closed even when the
            # consumer stops iterating early or an error is raised.
            with smart_open(path, 'r', encoding='utf-8') as corpus_file:
                for line in corpus_file:
                    # Lower-case the English words in the line.
                    line = line.lower()
                    # NOTE(review): the stemmer and lemmatizer receive the
                    # whole line as if it were a single word -- confirm this
                    # is intended rather than per-token normalisation.
                    line = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(line))
                    # Optional: protect an important multi-word term from
                    # being split by the tokenizer, either by rewriting it as
                    # one underscore-joined word or by registering it with
                    # jieba:
                    # line = line.replace('social listening','social_listening')
                    # jieba.add_word('social_listening')
                    # jieba.add_word('社会化凝听')
                    # Strip each token *before* filtering so whitespace-only
                    # tokens are never yielded as empty strings and padded
                    # stop words are still recognised.
                    tokens = (token.strip() for token in jieba.lcut(line))
                    yield [
                        token for token in tokens
                        if token not in stoplist and len(token) > 1
                    ]

# os.listdir('filefolder') takes a directory path and returns a list of the
# names of the sub-directories and files inside it.
os.listdir('data/')

# os.path.join("filefolder_road", "filename") joins a directory path and a
# file name into a single path; MySentences does the same with
# os.path.join(self.dirname, fname).
os.path.join('data/', 'android智能手机编程_国家凋谢大学_王立.txt')

# Stream the file line by line instead of loading it whole; the context
# manager closes the handle once the loop finishes.
with smart_open(os.path.join('data/', 'android智能手机编程_国家凋谢大学_王立.txt'), encoding='utf8') as corpus_file:
    for line in corpus_file:
        print(line)