失眠网 > 微博爬虫及舆情分析-3.文本清理与制作词云

微博爬虫及舆情分析-3.文本清理与制作词云

时间：2019-01-28 19:00:46

1、文本清理

import pandas as pd
import pymysql
from sqlalchemy import create_engine
import re
import jieba
import jieba.analyse

# 1. Load the scraped weibo posts from mblog.csv and preview the first rows.
#    (index_col=None: no CSV column is used as the index.)
mblog_frame = pd.read_csv('mblog.csv', index_col=None)
# Notebook-style preview; as a bare expression this only displays output
# in an interactive session.
mblog_frame.head(2)

# 2. Strip non-content characters from the weibo text and extract keywords.
# Each helper below receives one DataFrame row (via DataFrame.apply(axis=1)).


def clean_text(raw):
    """Strip markup and noise from a row's ``raw_text``.

    Removes HTML tags, ``#`` characters and newlines, and then everything
    from the first ``http://`` / ``https://`` occurrence to the end of the
    text.

    Returns:
        The cleaned string, or ``None`` when ``raw_text`` is missing, empty
        or not a string.
    """
    raw_text = raw['raw_text']
    # Missing values usually arrive as float NaN, which is truthy and would
    # make re.sub raise TypeError, so check the type explicitly.
    if not isinstance(raw_text, str) or not raw_text:
        return None
    text = re.sub('<[^<]*>', '', raw_text)  # drop HTML tags
    text = re.sub('[#\n]+', '', text)  # drop '#' markers and newlines
    # Drop the trailing URL. Note this cuts from the FIRST URL to the end,
    # so any text after a mid-sentence link is removed as well.
    text = re.sub('https?://.*$', '', text)
    return text


def get_chinese_text(raw):
    """Keep only runs of two or more CJK characters from a row's ``text``.

    Digits, Latin letters, punctuation and isolated single Chinese
    characters are all discarded.

    Returns:
        A ``(mid, chinese_text)`` tuple, or ``None`` when ``text`` is
        missing, empty or not a string.
    """
    text = raw['text']
    if not isinstance(text, str) or not text:
        return None
    res_text = ''.join(re.findall(r"[\u4e00-\u9fff]{2,}", text))
    return (raw['mid'], res_text)


def get_keywords(raw):
    """Extract keywords from a row's ``chinese_text`` with jieba.

    Long posts (``isLongText == 1``) yield up to 50 keywords; other posts
    use jieba's default ``topK``.

    Returns:
        A ``(mid, keywords)`` tuple where ``keywords`` is a list of strings,
        or ``None`` when ``chinese_text`` is missing, empty or not a string.
    """
    chinese_text = raw['chinese_text']
    if not isinstance(chinese_text, str) or not chinese_text:
        return None
    if raw['isLongText'] == 1:
        keywords = jieba.analyse.extract_tags(chinese_text, topK=50)
    else:
        keywords = jieba.analyse.extract_tags(chinese_text)
    return (raw['mid'], keywords)


def clean_created_date(raw, today='09-15', yesterday='09-14'):
    """Normalize the relative timestamps found in ``created_at``.

    Values ending in '前' (e.g. "N minutes ago") become ``today`` and values
    starting with '昨天' ("yesterday") become ``yesterday``; anything else
    is returned unchanged. The defaults are the dates hard-coded for the
    original crawl; pass other values for data crawled on a different day.
    """
    created_date = raw['created_at']
    if not isinstance(created_date, str):
        # Missing value: nothing to normalize.
        return created_date
    if created_date.endswith('前'):
        created_date = today
    elif created_date.startswith('昨天'):
        created_date = yesterday
    return created_date


# Normalized creation date.
mblog_frame['created_date'] = mblog_frame.apply(clean_created_date, axis=1)
# Cleaned text. NOTE: despite the column name this holds the output of
# clean_text (markup stripped), not the Chinese-only text.
mblog_frame['chinese_text'] = mblog_frame.apply(clean_text, axis=1)
# Keep only the columns needed downstream.
res_mblog = pd.DataFrame(
    mblog_frame,
    columns=['mid', 'chinese_text', 'like_count', 'comments_count',
             'reposts_count', 'created_date', 'user_id'],
)
# Write to CSV so the cleaning result can be inspected.
res_mblog.to_csv('clean_mblog.csv', encoding='utf_8_sig', index=False)

# Extract keywords per post, then flatten to one (mid, keyword) pair per row.
mid_with_keyword = list(mblog_frame.apply(get_keywords, axis=1))
# get_keywords returns None for posts without text; skip those entries
# instead of failing when indexing into them.
keywords_list = [
    (entry[0], word)
    for entry in mid_with_keyword
    if isinstance(entry, tuple)
    for word in entry[1]
]
mid_with_keyword = pd.DataFrame(keywords_list, columns=['mid', 'keyword'])
# Write to CSV so the result can be inspected.
mid_with_keyword.to_csv('keyword.csv', encoding='utf_8_sig', index=False)

2、制作词云

from collections import Counter

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Load the (mid, keyword) pairs from keyword.csv.
keyword_frame = pd.read_csv('keyword.csv', index_col=False)
# Flatten every keyword occurrence into one list.
all_keyword = list(keyword_frame.keyword)

# Count occurrences per keyword and tabulate them as (word, count) rows.
# Passing columns= to the constructor also works when there are no keywords,
# whereas assigning .columns afterwards fails on an empty frame.
word_freq_frame = pd.DataFrame(
    list(Counter(all_keyword).items()), columns=['word', 'count']
)
top100_freq_word = word_freq_frame.sort_values('count', ascending=False).head(100)
# word -> count mapping for the 100 most frequent keywords.
top100_freq_word_dict = dict(
    zip(top100_freq_word['word'], top100_freq_word['count'])
)

plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable with that font
# NOTE: `%matplotlib inline` is IPython magic and a SyntaxError in a plain
# .py file; in a Jupyter notebook run it in its own cell before this code.
plt.rcParams['figure.dpi'] = 100  # figure resolution

# font_path must point to a font with CJK glyphs, otherwise the Chinese
# words cannot be drawn.
wc = WordCloud(background_color="white", max_words=2000, font_path='simhei.ttf')
wc.generate_from_frequencies(top100_freq_word_dict)
plt.imshow(wc)
plt.axis('off')
plt.show()

如果觉得《微博爬虫及舆情分析-3.文本清理与制作词云》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。