# Collect the full path of every article (.txt) under data/train/THUCNews.
def readfullnames():
    """Return the path of every file under data/train/THUCNews/<category>/."""
    root = 'data/train/THUCNews'
    return [
        root + '/' + category + '/' + name
        for category in os.listdir(root)
        for name in os.listdir(root + '/' + category)
    ]
# Path of each category's dictionary file.
def readfillnames():
    """Return the dictionary-file path for every category under data/train/分词.

    Paths containing 'txt.txt' (stale artifacts from an earlier run) are
    filtered out.
    """
    root = 'data/train/分词'
    candidates = (root + '/' + category + '/' + category + '.txt'
                  for category in os.listdir(root))
    return [path for path in candidates if 'txt.txt' not in path]
# Path of each category's word-frequency file.
def readcipin_fullnames():
    """Return the frequency-file path ('词频<category>.txt') for every category.

    Paths containing 'txt.txt' (stale artifacts) are skipped.
    """
    root = 'data/train/分词/'
    result = []
    for category in os.listdir(root):
        candidate = root + category + '/词频' + category + '.txt'
        if 'txt.txt' in candidate:
            continue
        result.append(candidate)
    return result
# Read the stop-word list.
def read_stopwords():
    """Read data/stopwords.txt and return one stripped word per line.

    Returns: list[str] — every line of the file, stripped (empty lines are
    kept as '' exactly like the original code did).

    Fix: the original opened the file and never closed it; a `with` block
    now closes the handle deterministically.
    """
    with open('data/stopwords.txt', 'r') as ifs:
        # NOTE(review): encoding is left at the platform default, as in the
        # original — confirm the stopword file's encoding matches.
        return [line.strip() for line in ifs]
# Write one category's dictionary to its file.
def write_words(words_dic, dfs):
    """Write every (word, count) pair of words_dic to dfs.

    Each pair is emitted on its own line in the form 'word : count'.
    """
    for word, count in words_dic.items():
        dfs.write('%s : %s' % (word, count))
        dfs.write('\n')
# NOTE(review): this fragment is the body of a per-file word-segmentation loop
# whose enclosing `def`/`for` header is not visible in this chunk.  `fullname`,
# `dirname`, `all_stopwords_list`, `all_words` and `words_dic` all come from
# that enclosing scope — confirm against the full file.  Indentation below is
# reconstructed from a whitespace-mangled source; verify the nesting of the
# trailing ofs.write('\n') (assumed: one output line per input line).
filename = fullname.split('/')[-1]
print(fullname + '===================================================')
# Input: one raw article; output: its space-separated segmented words,
# written under data/train/分词/<dirname>/<filename>.
ifs = open(fullname, 'r')
ofs = open('data/train/分词/' + dirname + '/' + filename, 'w')
# Words seen at least once in THIS document (used for the per-category
# document-frequency update at the bottom).
words_temp = []
for line in ifs.readlines():
    line = line.strip()
    try:
        # jieba segmentation; NOTE(review): the bare `except` silently skips
        # any line jieba fails on — consider narrowing it.
        words = jieba.cut(line)
    except:
        continue
    for w in words:
        # Skip whitespace-only tokens and stop words.
        if w.strip() == '':
            continue
        if w in all_stopwords_list:
            continue
        if w not in words_temp:
            words_temp.append(w)
        # all_words: corpus-wide term frequency (every occurrence counted).
        if w not in all_words.keys():
            all_words[w] = 1
        else:
            all_words[w] += 1
        print(w)
        ofs.write(w + ' ')
    ofs.write('\n')
# words_dic: document frequency — each distinct word counted once per file.
for t in words_temp:
    if t not in words_dic.keys():
        words_dic[t] = 1
    else:
        words_dic[t] += 1
# Merge the per-category dictionaries into one total dictionary.
def sumdic(fillname_list):
    """Merge per-category dictionary files into one total word dictionary.

    Each file holds lines of the form 'word : count'; counts for the same
    word are summed across files, then rare words (count < 2) are dropped.

    fillname_list: paths of the dictionary files to merge.  If empty/None,
        falls back to readfillnames() — the original code unconditionally
        clobbered this parameter with readfillnames(), silently ignoring
        the caller's argument.
    Returns: the filtered {word: count} dict (the original built the dict
        and returned None, discarding it).
    """
    if not fillname_list:
        fillname_list = readfillnames()
    dic = {}
    for file in fillname_list:
        # `with` closes each file deterministically (original leaked handles).
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                dic[key] = dic.get(key, 0) + value
    print("程序运行中,请稍后。。。")
    # Drop words seen fewer than 2 times.  (The original comment said
    # "fewer than nine times", but the code has always checked < 2.)
    # Iterate over a snapshot: deleting keys while iterating dic.keys()
    # directly raises RuntimeError in Python 3.
    for t in list(dic.keys()):
        if dic[t] < 2:
            del dic[t]
    return dic
# Merge the per-category word-frequency files into one total frequency dict.
def sumcipindic(cipin_fullnamelist=None):
    """Merge per-category frequency files into one total {word: count} dict.

    Each file holds lines of the form 'word : count'; counts for the same
    word are summed across files.

    cipin_fullnamelist: optional list of frequency-file paths; defaults to
        readcipin_fullnames(), matching the original behavior (the new
        parameter is backward-compatible).
    Returns: the merged dict (the original built it and returned None).
    """
    if cipin_fullnamelist is None:
        cipin_fullnamelist = readcipin_fullnames()
    cipin_dic = {}
    for file in cipin_fullnamelist:
        # `with` closes each file deterministically (original leaked handles).
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                cipin_dic[key] = cipin_dic.get(key, 0) + value
    print("请稍后。。。")
    return cipin_dic