This repository was archived by the owner on Feb 16, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathparse.py
More file actions
92 lines (87 loc) · 4.44 KB
/
parse.py
File metadata and controls
92 lines (87 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# encoding=utf-8
import jieba
import pandas as pd
from utils import contains
from tqdm import tqdm
from re import findall
"""
parse对消息进行分词。会分出词语和表情两列。
生成keywords.csv文件作为输出。
"""
# Messages whose content matches any of these patterns are dropped before
# tokenizing (garbled payloads or digits-only messages).
_UNWANTED_MSG_PATTERNS = [
    '<.+',  # e.g. <msg or <?xml
    r'^\d{1,}$',  # digits only, e.g. verification codes
]

# Token filters applied to every piece produced by jieba.
_WORD_CHARS = r'[\[\]一-龟a-zA-Z0-9]+'  # has a CJK char, letter, digit or [ ]
_NUMBER = r'^\d{1,}$|^\d{1,}\.\d{1,}$'  # integer or decimal number
_SINGLE_LETTER = r'^[a-zA-Z]$'  # one lone English letter
_ALNUM_ONLY = r'^[0-9a-zA-Z]+$'  # emoji name made of ASCII letters/digits only


def _is_keyword(word, stop_words):
    """Return True if a jieba token should be kept.

    A token is kept when it is not a stop word, is not blank, contains at
    least one CJK character, ASCII letter, digit or square bracket, is not a
    plain number (integer or decimal) and is not a single English letter.
    """
    return bool(
        word not in stop_words
        and word.strip()
        and findall(_WORD_CHARS, word)
        and not findall(_NUMBER, word)
        and not findall(_SINGLE_LETTER, word)
    )


def _split_keywords_and_emoji(tokens, emoji_eng2cn):
    """Separate bracketed emoji from ordinary keywords.

    jieba splits a WeChat emoji such as ``[Cry]`` into the three tokens
    ``[``, ``Cry``, ``]``, so they are glued back together here. The scan is
    a single left-to-right pass: a ``]`` closes an emoji only when the token
    two positions before it (among the tokens kept so far) is ``[``. Any
    other bracket is left in the keyword list, and scanning continues, so one
    stray ``]`` can no longer prevent later emoji from being recognised.

    Args:
        tokens: Filtered tokens of one message, in order.
        emoji_eng2cn: Mapping from an English emoji name to its Chinese name.

    Returns:
        A ``(keywords, emoji)`` tuple of lists. Emoji names found in
        ``emoji_eng2cn`` are converted to Chinese; names made only of ASCII
        letters/digits that are not in the mapping are discarded together
        with their brackets; every other name is kept unchanged.
    """
    keywords = []
    emoji = []
    for token in tokens:
        if token != ']' or len(keywords) < 2 or keywords[-2] != '[':
            keywords.append(token)
            continue
        emoji_text = keywords[-1]
        del keywords[-2:]  # drop the opening bracket and the emoji name
        if emoji_text in emoji_eng2cn:
            # English name present in the dictionary: convert it to Chinese.
            emoji.append('[' + emoji_eng2cn[emoji_text] + ']')
        elif not findall(_ALNUM_ONLY, emoji_text):
            # Not purely letters/digits (e.g. a Chinese name): keep as is.
            emoji.append('[' + emoji_text + ']')
        # Otherwise: unknown letters/digits-only name, silently discarded.
    return keywords, emoji


def parse(msg_file='msg.csv', emoji_file='emoji.txt',
          stopword_file='stopwords_hit_modified.txt',
          transform_file='transformDict.txt',
          user_dict_file='userDict.txt', process_rows='all'):
    """Tokenize chat messages into a keywords column and an emoji column.

    Reads ``input_data/<msg_file>``, tokenizes ``StrContent`` with jieba and
    writes the rows that produced at least one keyword or emoji to
    ``temp_files/keywords.csv``.

    Args:
        msg_file: CSV in ``input_data/`` with the columns ``IsSender``,
            ``StrContent`` and ``StrTime``; rows with a missing value in any
            of them are dropped.
        emoji_file: Table in ``input_data/`` (read with ``pd.read_table``)
            with the columns ``eng`` and ``cn``.
        stopword_file: UTF-8 text file in ``input_data/``, one stop word per
            line.
        transform_file: Table in ``input_data/`` (read with
            ``pd.read_table``) with the columns ``original`` and
            ``transformed``; a token equal to ``original`` is replaced by
            ``transformed``.
        user_dict_file: File in ``input_data/`` passed to
            ``jieba.load_userdict``.
        process_rows: ``'all'`` or the number of leading rows (counted after
            the unwanted messages were removed) to tokenize. Values larger
            than the number of available rows are clamped. Rows that are not
            tokenized do not appear in the output.

    Raises:
        Exception: Any error raised while processing a row is re-raised after
            that row has been written to ``bug.csv``.
    """
    print('开始分词')
    records = pd.read_csv(
        f'input_data/{msg_file}',
        usecols=['IsSender', 'StrContent', 'StrTime']).dropna(how='any')
    # Drop messages that are garbled or digits-only.
    # NOTE(review): `contains` comes from utils; presumably it returns True
    # when the text matches any of the patterns - confirm.
    unwanted = records['StrContent'].apply(
        lambda text: contains(text, _UNWANTED_MSG_PATTERNS))
    # reset_index yields a fresh frame with a 0..n-1 index, so rows can be
    # addressed by position below.
    records = records[~unwanted].reset_index(drop=True)

    emoji_eng2cn = pd.read_table(
        f'input_data/{emoji_file}').set_index('eng').to_dict()['cn']
    # The stop-word list is not the original HIT list: [] were removed so that
    # WeChat emoji can still be matched, and some new words were added.
    with open(f'input_data/{stopword_file}', 'r', encoding='utf-8') as f1:
        stop_words = set(f1.read().splitlines())
    transform_dict = pd.read_table(
        f'input_data/{transform_file}'
    ).set_index('original').to_dict()['transformed']
    jieba.load_userdict(f'input_data/{user_dict_file}')

    n_rows = records.shape[0]
    if process_rows == 'all':
        process_rows = n_rows
    else:
        # Filtering may have left fewer rows than the caller asked for.
        process_rows = min(process_rows, n_rows)

    # Object dtype so that strings can be stored without a dtype conflict;
    # NaN marks rows that have not been (or will not be) tokenized.
    records['keywords'] = pd.Series(
        float('nan'), index=records.index, dtype=object)
    records['emoji'] = pd.Series(
        float('nan'), index=records.index, dtype=object)

    for i in tqdm(range(process_rows)):
        try:
            tokens = [
                transform_dict.get(word, word)
                for word in jieba.cut(records.at[i, 'StrContent'],
                                      use_paddle=True)  # request paddle mode
                if _is_keyword(word, stop_words)
            ]
            keywords, emoji = _split_keywords_and_emoji(tokens, emoji_eng2cn)
            records.at[i, 'keywords'] = ', '.join(keywords)
            records.at[i, 'emoji'] = ', '.join(emoji)
        except Exception as e:
            print(f'数据文件某行有问题,异常为{e}。请检查生成的bug.csv,可以提交给开发者。')
            records.loc[[i], :].to_csv(
                'bug.csv', index=False, encoding='utf_8_sig')
            raise

    # Turn empty strings into NaN so that dropna below can see them.
    records.replace('', float('nan'), inplace=True)
    # Some messages consist only of stop words and yield nothing; drop them.
    records.dropna(how='all', subset=['keywords', 'emoji'], inplace=True)
    records.to_csv('temp_files/keywords.csv', index=False,
                   encoding='utf_8_sig')
    print('=' * 20)
# Script entry point: tokenize using the default input file names.
if __name__ == "__main__":
    parse()