import matplotlib.pyplot as plt
from lxml import etree
from wordcloud import WordCloud
# Hard-coded absolute directory path (note the trailing slash).
# NOTE(review): presumably the directory holding this script's input/output
# files -- its use is not visible in this part of the file; confirm.
path = "/home/jimo/workspace/temp/python/love/"
defanalyze(line): '''解析html''' html = etree.HTML(line.encode(encoding='utf-8')) # 解析微博内容 contents = html.xpath('//span[@class="ctt"]') content = " ".join([c.xpath("string(.)") for c in contents]) # for c in contents: # text = c.xpath("string(.)") # print(text)
# 解析发的时间和来自哪里 times = html.xpath('//span[@class="ct"]') time = [] fromWhere = [] for t in times: text = t.xpath("string(.)") splitIndex = text.index("来自") time.append(text[:splitIndex - 1]) fromWhere.append(text[splitIndex:])