文本分类

1.对每个类别进行分词,统计每个类别的字典及总的字典

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

"""
@Time : 2020/11/29 14:38
@Auth : penghui
"""
# 对每个类别进行分词,统计每个类别的字典以及总的字典
import jieba
import os
import xlwt

# Collect the full path of every corpus file (one subdirectory per category).
def readfullnames(root='data/train/THUCNews'):
    """Return the full paths of all files under each category subdirectory.

    root -- corpus directory; defaults to the project's training corpus.
            Paths are built with '/' (not os.path.join) because segfile()
            later splits them on 'data/train/THUCNews/' and '/'.
    """
    fullname_list = []
    for category in os.listdir(root):
        category_dir = root + '/' + category
        for filename in os.listdir(category_dir):
            fullname_list.append(category_dir + '/' + filename)
    return fullname_list

# Collect the path of each category's dictionary file.
def readfillnames(root='data/train/分词'):
    """Return the path 'root/<cat>/<cat>.txt' for every category directory.

    Entries whose path contains 'txt.txt' are skipped: a stray '字典.txt'
    directory would otherwise yield a bogus '字典.txt.txt' dictionary path.
    """
    fillname_list = []
    for category in os.listdir(root):
        fillname = root + '/' + category + '/' + category + '.txt'
        if 'txt.txt' not in fillname:
            fillname_list.append(fillname)
    return fillname_list

# Collect the path of each category's term-frequency file.
def readcipin_fullnames(root='data/train/分词'):
    """Return the path 'root/<cat>/词频<cat>.txt' for every category directory.

    Like readfillnames(), paths containing 'txt.txt' (caused by stray
    '.txt'-named directories) are filtered out.
    """
    fillname_list = []
    for category in os.listdir(root):
        fillname = root + '/' + category + '/词频' + category + '.txt'
        if 'txt.txt' not in fillname:
            fillname_list.append(fillname)
    return fillname_list


# Load the stop-word list, one word per line.
def read_stopwords(path='data/stopwords.txt'):
    """Return the stop words from path as a list of stripped lines.

    Uses a context manager so the file handle is always closed (the
    original leaked it), and an explicit encoding so behavior does not
    depend on the platform's locale default.
    """
    with open(path, 'r', encoding='utf-8') as ifs:
        return [line.strip() for line in ifs]


# Serialize a word dictionary to an already-open file object.
def write_words(words_dic, dfs):
    """Write words_dic to dfs, one 'key : value' entry per line."""
    for word, count in words_dic.items():
        dfs.write('%s : %s' % (word, count))
        dfs.write('\n')


# Segment every corpus file with jieba; per category, track in how many
# documents each word appears (document frequency) and how often it occurs
# in total (term frequency), and dump both dictionaries per category.
def segfile(fullname_list):
    """Segment each file in fullname_list and write per-category statistics.

    For every input file a space-separated token file is written under
    'data/train/分词/<category>/'.  When the category changes (the input
    list is assumed grouped by category), two files are flushed for the
    finished category: '<cat>.txt' (document frequency) and
    '词频<cat>.txt' (term frequency).

    Fixes over the original: the starting category is derived from
    fullname_list itself instead of the module-global fillname_list; the
    per-category output files are opened only when actually written
    (the original re-opened and truncated them on every input file); all
    file handles are managed with `with`; stop words are held in a set
    for O(1) membership tests.
    """
    if not fullname_list:
        return
    all_stopwords = set(read_stopwords())
    words_dic = {}   # word -> number of documents containing it (current category)
    all_words = {}   # word -> total occurrences (current category)

    def _dump(category):
        # Flush both dictionaries of a finished category to disk.
        with open('data/train/分词/' + category + '/' + category + '.txt', 'w') as dfs:
            write_words(words_dic, dfs)
        with open('data/train/分词/' + category + '/词频' + category + '.txt', 'w') as ddfs:
            write_words(all_words, ddfs)

    name_temp = fullname_list[0].split('data/train/THUCNews/')[1].split('/')[0]
    for fullname in fullname_list:
        dirname = fullname.split('data/train/THUCNews/')[1].split('/')[0]
        if name_temp != dirname:
            # Category boundary: persist the previous category and reset.
            _dump(name_temp)
            words_dic.clear()
            all_words.clear()
            name_temp = dirname

        filename = fullname.split('/')[-1]
        print(fullname + '===================================================')
        words_temp = []  # distinct (non-stop) words of the current document
        with open(fullname, 'r') as ifs, \
                open('data/train/分词/' + dirname + '/' + filename, 'w') as ofs:
            for line in ifs:
                line = line.strip()
                try:
                    words = jieba.cut(line)
                except Exception:
                    # jieba may choke on malformed text; skip the line.
                    continue
                for w in words:
                    if w.strip() == '':
                        continue
                    if w in all_stopwords:
                        continue
                    if w not in words_temp:
                        words_temp.append(w)
                    all_words[w] = all_words.get(w, 0) + 1
                    print(w)
                    ofs.write(w + ' ')
                ofs.write('\n')

        # Fold this document's distinct words into the document frequency.
        for t in words_temp:
            words_dic[t] = words_dic.get(t, 0) + 1

    # Persist the final category.
    _dump(name_temp)


# Merge all per-category dictionaries into the global dictionary file.
def sumdic(fillname_list):
    """Merge the dictionaries listed in fillname_list into 'data/train/字典.txt'.

    Each input line has the form 'word : count'.  Words with a total count
    below 2 are dropped.  Fixes over the original: the fillname_list
    parameter is actually used (it was silently discarded and recomputed);
    input files are closed via `with`; rare-word deletion iterates over a
    snapshot of the keys — deleting from a dict while iterating its live
    keys view raises RuntimeError in Python 3.
    """
    dic = {}
    for file in fillname_list:
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                dic[key] = dic.get(key, 0) + value
        print("程序运行中,请稍后。。。")

    # Drop rare words (total count < 2); iterate a snapshot so deletion
    # during iteration is safe.
    for t in list(dic.keys()):
        if dic[t] < 2:
            del dic[t]

    with open('data/train/字典.txt', 'w') as afs:
        write_words(dic, afs)


# Merge all per-category term-frequency files into the global one.
def sumcipindic():
    """Merge every '词频<cat>.txt' file into 'data/train/词频字典.txt'.

    Each input line has the form 'word : count'; counts for the same word
    are summed across categories.  All file handles are closed via `with`
    (the original leaked both the readers and the writer).
    """
    cipin_dic = {}
    for file in readcipin_fullnames():
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                cipin_dic[key] = cipin_dic.get(key, 0) + value
        print("请稍后。。。")

    with open('data/train/词频字典.txt', 'w') as afs:
        write_words(cipin_dic, afs)

if __name__ == '__main__':
    # Ensure each category has an output directory under data/train/分词.
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir pair and
    # avoids shadowing the builtin `dir`.
    for category in os.listdir('data/train/THUCNews'):
        os.makedirs('data/train/分词/' + category, exist_ok=True)

    fullname_list = readfullnames()
    fillname_list = readfillnames()
    segfile(fullname_list)
    sumdic(fillname_list)
    sumcipindic()