数据分析|Python提取微博用户信息和画像

摘要:网络上的数据大多是非结构化数据,需要先进行预处理,才能开展下一步的文本分析。本文用 Python 调用微博 API 获取指定账号的粉丝列表,并将每位用户的名称、ID、性别、标签、地点等信息提取出来,写入 XML 文件。

提取微博用户信息代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
 # Export Weibo follower profiles to wbuserprofile.xml.
#
# Step 1 fetches the follower ID list of one account; step 2 fetches the
# profile and tags of every follower and writes one <user> record each.
#
# NOTE: `urllib2`, `time`, `access_token`, `fansmethod`, `userprofmethod` and
# `tagsmethod` are used here but not defined in this excerpt; they have to be
# provided by code that runs before this block.
import io
import json
from xml.sax.saxutils import escape

# Every call uses the same URL shape: method name, token, then up to three
# key=value pairs (unused pairs are passed as empty strings).
API_URL = ("https://api.weibo.com/2/{0}.json?access_token={1}"
           "&{2}={3}&{4}={5}&{6}={7}")

# How long to wait after a failed request before moving on to the next user.
RETRY_DELAY_SECONDS = 3600


def _fetch_json(method, *params):
    """Call one Weibo API method and return its decoded JSON response.

    `params` are the six positional values (three key/value pairs) that fill
    the query-string slots of API_URL.

    The response is parsed with json.loads instead of eval(): eval() would
    execute whatever the server sends as Python code, and it cannot read the
    JSON literals true/false/null unless aliases for them already exist.
    """
    response = urllib2.urlopen(API_URL.format(method, access_token, *params))
    try:
        return json.loads(response.read().decode("utf-8"))
    finally:
        response.close()


# print(...) with a single string argument works as both a Python 2 print
# statement and a Python 3 function call.
print("开始获取粉丝 ID")
fansidlist = _fetch_json(
    fansmethod, 'screen_name', '大都会', 'count', 2000, 'cursor', 1)['ids']
print("获取粉丝 ID 成功")

# Aliases that let eval() read JSON literals. This block no longer needs them
# (json.loads handles those literals); they are kept in case other code in
# the file still relies on them.
false = False
null = None
true = True

print("开始写入粉丝用户信息")
# io.open with an explicit encoding accepts the unicode text produced by
# json.loads on both Python 2 and 3; `with` closes the file even on errors.
with io.open("wbuserprofile.xml", "w", encoding="utf-8") as outputfile:
    for uid in fansidlist:
        outputfile.write(u"<user>\n\t<id>{0}</id>\n".format(uid))
        print("写入 uid{}".format(uid))
        try:
            # Fetch and parse the profile once, then reuse the parsed dict.
            profile = _fetch_json(userprofmethod, "uid", uid, '', '', '', '')
            # escape() keeps &, < and > in user-supplied text from breaking
            # the markup.
            outputfile.write(u"\t<screen_name>" + escape(profile["screen_name"])
                             + u"</screen_name>\n")
            print("写入 uid{} 微博名称完成".format(uid))
            outputfile.write(u"\t<location>" + escape(profile["location"])
                             + u"</location>\n")
            print("写入 uid{} 微博地点完成".format(uid))
            # The closing tag used to be misspelled "</gander>".
            outputfile.write(u"\t<gender>" + escape(profile["gender"])
                             + u"</gender>\n")
            print("写入 uid{} 微博性别完成".format(uid))

            tags = _fetch_json(tagsmethod, "uid", uid, '', '', '', '')
            print("开始写入 uid{} 标签".format(uid))
            outputfile.write(u"\t<tags>\n")
            for tag in tags:
                # Only entries whose key is all digits are written; other
                # keys in the same dict are skipped.
                for key, value in tag.items():
                    if key.isdigit():
                        outputfile.write(u"\t" + escape(value) + u"\n")
            outputfile.write(u"\t</tags>\n")
            print("完成写入 uid{} 标签".format(uid))
        except Exception as err:
            # Best effort: skip this user and back off before the next one.
            # `except Exception` (not a bare except) lets Ctrl-C / SystemExit
            # stop the script instead of triggering the one-hour sleep.
            print("连接出错无法写入 , 跳过!")
            print(repr(err))
            print("sleep 60 mins")
            time.sleep(RETRY_DELAY_SECONDS)
        finally:
            # Always close the record, with a newline, on success and failure.
            outputfile.write(u"</user>\n")

print("完成粉丝用户信息")
print("文件写入结束")

提取到的微博用户画像(输出示例)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
 <user> 
<id>2863185903</id>
<screen_name> 花开有季 xn</screen_name>
<location> 北京 延庆县 </location>
<gender>f</gander>
<tags>
星座命理
娱乐
</tags>
</user>
<user>
<id>1246347253</id>
<screen_name> 杰里 - 商 </screen_name>
<location> 北京 房山区 </location>
<gender>m</gander>
<tags>
搞笑幽默
</tags>
</user>
<user>
<id>3265394820</id>
<screen_name> 小荷相公丶 </screen_name>
<location> 重庆 </location>
<gender>f</gander>
<tags>
</tags>
</user>
<user>
<id>2036066523</id>
<screen_name> 青春的 Dalin</screen_name>
<location> 江西 南昌 </location>
<gender>f</gander>
<tags>
</tags>
</user>
<user>
<id>2013144111</id>
<screen_name> 势必拿下会计证 _ 微微猫 </screen_name>
<location> 北京 宣武区 </location>
<gender>f</gander>
<tags>
旅游
WE
90 后
</tags>
</user>
Thanks!