文本分类

1.对每个类别进行分词,统计每个类别的字典及总的字典

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

"""
@Time : 2020/11/29 14:38
@Auth : penghui
"""
# 对每个类别进行分词,统计每个类别的字典以及总的字典
import jieba
import os
import xlwt

# Collect the full path of every corpus file (one subdirectory per category).
def readfullnames(root='data/train/THUCNews'):
    """Return the full paths of all files under each category subdirectory.

    root -- corpus directory; defaults to the project's training corpus.
            Paths are built with '/' (not os.path.join) because segfile()
            later splits them on 'data/train/THUCNews/' and '/'.
    """
    fullname_list = []
    for category in os.listdir(root):
        category_dir = root + '/' + category
        for filename in os.listdir(category_dir):
            fullname_list.append(category_dir + '/' + filename)
    return fullname_list

# Collect the path of each category's dictionary file.
def readfillnames(root='data/train/分词'):
    """Return the path 'root/<cat>/<cat>.txt' for every category directory.

    Entries whose path contains 'txt.txt' are skipped: a stray '字典.txt'
    directory would otherwise yield a bogus '字典.txt.txt' dictionary path.
    """
    fillname_list = []
    for category in os.listdir(root):
        fillname = root + '/' + category + '/' + category + '.txt'
        if 'txt.txt' not in fillname:
            fillname_list.append(fillname)
    return fillname_list

# Collect the path of each category's term-frequency file.
def readcipin_fullnames(root='data/train/分词'):
    """Return the path 'root/<cat>/词频<cat>.txt' for every category directory.

    Like readfillnames(), paths containing 'txt.txt' (caused by stray
    '.txt'-named directories) are filtered out.
    """
    fillname_list = []
    for category in os.listdir(root):
        fillname = root + '/' + category + '/词频' + category + '.txt'
        if 'txt.txt' not in fillname:
            fillname_list.append(fillname)
    return fillname_list


# Load the stop-word list, one word per line.
def read_stopwords(path='data/stopwords.txt'):
    """Return the stop words from path as a list of stripped lines.

    Uses a context manager so the file handle is always closed (the
    original leaked it), and an explicit encoding so behavior does not
    depend on the platform's locale default.
    """
    with open(path, 'r', encoding='utf-8') as ifs:
        return [line.strip() for line in ifs]


# Serialize a word dictionary to an already-open file object.
def write_words(words_dic, dfs):
    """Write words_dic to dfs, one 'key : value' entry per line."""
    for word, count in words_dic.items():
        dfs.write('%s : %s' % (word, count))
        dfs.write('\n')


# Segment every corpus file with jieba; per category, track in how many
# documents each word appears (document frequency) and how often it occurs
# in total (term frequency), and dump both dictionaries per category.
def segfile(fullname_list):
    """Segment each file in fullname_list and write per-category statistics.

    For every input file a space-separated token file is written under
    'data/train/分词/<category>/'.  When the category changes (the input
    list is assumed grouped by category), two files are flushed for the
    finished category: '<cat>.txt' (document frequency) and
    '词频<cat>.txt' (term frequency).

    Fixes over the original: the starting category is derived from
    fullname_list itself instead of the module-global fillname_list; the
    per-category output files are opened only when actually written
    (the original re-opened and truncated them on every input file); all
    file handles are managed with `with`; stop words are held in a set
    for O(1) membership tests.
    """
    if not fullname_list:
        return
    all_stopwords = set(read_stopwords())
    words_dic = {}   # word -> number of documents containing it (current category)
    all_words = {}   # word -> total occurrences (current category)

    def _dump(category):
        # Flush both dictionaries of a finished category to disk.
        with open('data/train/分词/' + category + '/' + category + '.txt', 'w') as dfs:
            write_words(words_dic, dfs)
        with open('data/train/分词/' + category + '/词频' + category + '.txt', 'w') as ddfs:
            write_words(all_words, ddfs)

    name_temp = fullname_list[0].split('data/train/THUCNews/')[1].split('/')[0]
    for fullname in fullname_list:
        dirname = fullname.split('data/train/THUCNews/')[1].split('/')[0]
        if name_temp != dirname:
            # Category boundary: persist the previous category and reset.
            _dump(name_temp)
            words_dic.clear()
            all_words.clear()
            name_temp = dirname

        filename = fullname.split('/')[-1]
        print(fullname + '===================================================')
        words_temp = []  # distinct (non-stop) words of the current document
        with open(fullname, 'r') as ifs, \
                open('data/train/分词/' + dirname + '/' + filename, 'w') as ofs:
            for line in ifs:
                line = line.strip()
                try:
                    words = jieba.cut(line)
                except Exception:
                    # jieba may choke on malformed text; skip the line.
                    continue
                for w in words:
                    if w.strip() == '':
                        continue
                    if w in all_stopwords:
                        continue
                    if w not in words_temp:
                        words_temp.append(w)
                    all_words[w] = all_words.get(w, 0) + 1
                    print(w)
                    ofs.write(w + ' ')
                ofs.write('\n')

        # Fold this document's distinct words into the document frequency.
        for t in words_temp:
            words_dic[t] = words_dic.get(t, 0) + 1

    # Persist the final category.
    _dump(name_temp)


# Merge all per-category dictionaries into the global dictionary file.
def sumdic(fillname_list):
    """Merge the dictionaries listed in fillname_list into 'data/train/字典.txt'.

    Each input line has the form 'word : count'.  Words with a total count
    below 2 are dropped.  Fixes over the original: the fillname_list
    parameter is actually used (it was silently discarded and recomputed);
    input files are closed via `with`; rare-word deletion iterates over a
    snapshot of the keys — deleting from a dict while iterating its live
    keys view raises RuntimeError in Python 3.
    """
    dic = {}
    for file in fillname_list:
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                dic[key] = dic.get(key, 0) + value
        print("程序运行中,请稍后。。。")

    # Drop rare words (total count < 2); iterate a snapshot so deletion
    # during iteration is safe.
    for t in list(dic.keys()):
        if dic[t] < 2:
            del dic[t]

    with open('data/train/字典.txt', 'w') as afs:
        write_words(dic, afs)


# Merge all per-category term-frequency files into the global one.
def sumcipindic():
    """Merge every '词频<cat>.txt' file into 'data/train/词频字典.txt'.

    Each input line has the form 'word : count'; counts for the same word
    are summed across categories.  All file handles are closed via `with`
    (the original leaked both the readers and the writer).
    """
    cipin_dic = {}
    for file in readcipin_fullnames():
        with open(file, 'r') as dfs:
            for line in dfs:
                key = line.split(':')[0].strip()
                value = int(line.split(':')[-1].strip())
                cipin_dic[key] = cipin_dic.get(key, 0) + value
        print("请稍后。。。")

    with open('data/train/词频字典.txt', 'w') as afs:
        write_words(cipin_dic, afs)

if __name__ == '__main__':
    # Ensure each category has an output directory under data/train/分词.
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir pair and
    # avoids shadowing the builtin `dir`.
    for category in os.listdir('data/train/THUCNews'):
        os.makedirs('data/train/分词/' + category, exist_ok=True)

    fullname_list = readfullnames()
    fillname_list = readfillnames()
    segfile(fullname_list)
    sumdic(fillname_list)
    sumcipindic()