OWS tweets
import re
import twitter_text #pip install twitter-text-py
import csv
import sys

def flushPrint(s):
    # overwrite the current line to show progress without flooding the output
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
def extract_rt_user(tweet):
    # extract the screen name of the retweeted user, e.g. "RT @user: ..." or "via @user"
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass
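For example (a minimal sketch; the sample tweets are made up):

print extract_rt_user('RT @OccupyWallSt: We are the 99%')  # -> OccupyWallSt
print extract_rt_user('no retweet here')                   # -> None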
def cleanTweet(tweet):
    """Strip mentions, urls, and symbols; return the tweet as a list of lower-case words."""
    tweet = tweet.decode('utf-8').strip()
    rt_name = extract_rt_user(tweet)
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    # remove @-mentions and urls
    for ia in at_names:
        tweet = tweet.replace(ia, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    tweet = tweet.replace('RT @', '').replace('@', '').replace('"', '').replace('#', '')
    # replace remaining separators with spaces
    seps = ['(', ')', '!', ':', '.', '?', ',', '=', u'\xa0', '/', '\\', '\n', '-', '|', ';', u'&', '*', "'", '+']
    for s in seps:
        tweet = tweet.replace(s, ' ')
    tweet = tweet.split(' ')
    tweet = [t.lower() for t in tweet if t != '']
    return tweet
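A quick usage check (a minimal sketch; the sample tweet is made up, and the exact output depends on the twitter_text extractor):

print cleanTweet('RT @OccupyWallSt: We are the 99% http://t.co/abc #ows')
# -> something like ['we', 'are', 'the', '99%', 'ows']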
from collections import defaultdict
data_dict = defaultdict(lambda: defaultdict(int))  # data_dict[term][date] = count
error_num = 0
line_num = 0
total_num = 0
bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)  # count inside the loop so every chunk is included
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            tweet = i[1]
            if len(date) == 10:
                tweet = cleanTweet(tweet)
                for tt in tweet:
                    data_dict[tt][date] += 1
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num
import json
with open('/Users/chengjun/百度云同步盘/Writing/OWS/term_vectors.json', 'w') as f:
    json.dump(data_dict, f)
bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
# format: data[date][attribute] = int
at_dict = defaultdict(lambda: defaultdict(int))
rt_dict = defaultdict(lambda: defaultdict(int))
url_dict = defaultdict(lambda: defaultdict(int))
tag_dict = defaultdict(lambda: defaultdict(int))
user_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
while chunk:
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            user = i[8]  # from_user_id
            tweet = i[1]
            ex = twitter_text.Extractor(tweet)
            at_names = ex.extract_mentioned_screen_names()
            urls = ex.extract_urls()
            hashtags = ex.extract_hashtags()
            rt_user = extract_rt_user(tweet)
            if len(date) == 10:
                if at_names:
                    for at_name in at_names:
                        at_dict[date][at_name] += 1
                if rt_user:
                    rt_dict[date][rt_user] += 1
                if urls:
                    for url in urls:
                        url_dict[date][url] += 1
                if hashtags:
                    for tag in hashtags:
                        tag_dict[date][tag] += 1
                user_dict[date][user] += 1
            else:
                error_num += 1
        except Exception, e:
            print e
    chunk = bigfile.readlines(chunkSize)
print line_num, error_num
import json
with open('/Users/chengjun/百度云同步盘/Writing/OWS/at_dict.json', 'w') as f:
    json.dump(at_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/rt_dict.json', 'w') as f:
    json.dump(rt_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/url_dict.json', 'w') as f:
    json.dump(url_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json', 'w') as f:
    json.dump(tag_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/user_dict.json', 'w') as f:
    json.dump(user_dict, f)
Locate the date field in the tweets, read out all dates and users, and count the number of lines read, the total number of lines, and the number of error lines.
from collections import defaultdict
import csv
data_dict = defaultdict(list)  # data_dict[date] = list of user ids
error_num = 0
line_num = 0
total_num = 0
bigfile = open('D:/Data/ows/ows-raw.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        try:
            date = i[3]
            if len(date) == 10:
                data_dict[date].append(i[8])
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num
Output in "date-tweets-users" format:
import pandas as pd
data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns = ['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'tweets', 'users'])
print dat
- Returned result:

           date  tweets  users
108  2011-10-06   49638  18487
107  2011-10-07   65238  23460
110  2011-10-08   65949  23243
..          ...     ...    ...
13   2012-02-16   12837   4428
14   2012-02-17   12468   4299
21   2012-02-18    4859   2012

[136 rows x 3 columns]
Plot a line chart of the daily numbers of tweets and users (a sketch follows).
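A minimal plotting sketch, reusing the style of the urls/users figure further down this page (it assumes the dat data frame built above):

%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.tweets, 'r-o', label = 'tweets')
plt.plot(dat.date, dat.users, 'g-o', label = 'users')
plt.legend(loc=2, fontsize=8)
plt.show()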
Fit the relationship between the numbers of users and tweets; the result is consistent with a power-law distribution (see the sketch below).
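A minimal fitting sketch, mirroring the log-log OLS regression used for urls vs. users later on this page:

import numpy as np
import statsmodels.api as sm
x = np.log(dat.users)
y = np.log(dat.tweets)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
print res.params[1], res.rsquared  # slope estimates the power-law exponent; a power law is a straight line in log-log space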
Extract the urls from the tweets file, and count the number of lines read, the total number of lines in the file, and the number of error lines.
from collections import defaultdict
import csv
import re
data_dict = defaultdict(list)  # data_dict[date] = list of urls
error_num = 0
line_num = 0
total_num = 0
bigfile = open('D:/Data/ows-raw.txt', 'rb')
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
# compile the url pattern once, outside the loop
url_patterns = re.compile(r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+", re.IGNORECASE)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 1000000 == 0:
            flushPrint(line_num)
        try:
            urls = url_patterns.findall(i[1])
            date = i[3]
            if len(date) == 10:
                for ui in urls:
                    data_dict[date].append(ui)
        except Exception, e:
            print e
            error_num += 1
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num
Build a data frame in date-urls-users format:
import pandas as pd
data = [[d, len(data_dict[d]), len(set(data_dict[d]))] for d in data_dict]
dat = pd.DataFrame(data, columns = ['date', 'urls', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'urls', 'users'])
print dat
Plot cumulative frequency line charts of users and urls with date on the x axis; the result shows that the two fluctuate together (positively correlated).
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.urls, 'r-o', label = "urls")
plt.plot(dat.date, dat.users, 'g-o', label = "users")
plt.legend(loc=2, fontsize=8)
# plt.yscale('log')  # optional: log-scale the y axis
plt.show()
Fit the daily counts of urls and users to test whether they follow a power law; the result shows that both are consistent with a power-law distribution.
import statsmodels.api as sm
x = np.log(dat.users)
y = np.log(dat.urls)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(dat.users, dat.urls, 'rs', label= 'Data')
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$')
plt.ylabel(r'$Urls$')
plt.text(max(dat.users)/4,max(dat.urls)/20,
r'$\beta$ = ' + str(round(beta,2)) +'\t' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
BSTS Code of Occupy Central
setwd("/Users/chengjun/bigdata/")
oc = read.csv("./occupycentral_wordfreq.csv", sep = ",",
header = F, stringsAsFactors = F, encoding= "utf-8")
oc15 = read.csv("./occupycentral_wordfreq_tfidf.csv", sep = ",",
header = F, stringsAsFactors = F )
query = read.csv("./occupycentralgoogletrends.csv", sep = ",",
header = T, stringsAsFactors = F, encoding= "utf-8")
query = query[1:27,]
names(query)
data = data.frame( t(oc[,2:28]) )
data$queryf = log(query$fanti+1)
data$queryj = log(query$jianti+1)
data$querye = log(query$occupy.central+1)
queryb = c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 67.1245791245791, 2.097643097643072,
54.538720538720554, 2.097643097643072, 0.0, 0.0, 0.0,
10.488215488215474, 0.0, 31.464646464646478, 0.0,
10.488215488215417, 4.195286195286144, 31.464646464646421,
199.27609427609428, 125.85858585858585, 12.585858585858489,
2.097643097643072, 0.0, 0.0, 0.0)
data$queryb = log(queryb +1)
data$y = c(1. , 8.33333333, 31.74193548, 34.72413793,
41.70833333, 50.06666667, 52.35483871, 33.80645161,
43.46666667, 36.53333333, 29.36666667, 30.03225806,
34.29032258, 31.21428571, 33.93548387, 34.96666667,
60.16129032, 22.62068966, 59.06451613, 27.51612903,
16.8 , 55.93548387, 32.53333333, 77.29032258,
70.35483871, 38.78571429, 39.89285714)
name = oc[,1]
cat(name)
require(zoo)
date = c('2013-01-01', '2013-02-01','2013-03-01','2013-04-01',
'2013-05-01','2013-06-01', '2013-07-01','2013-08-01',
'2013-09-01','2013-10-01','2013-11-01','2013-12-01',
'2014-01-01','2014-02-01', '2014-03-01','2014-04-01',
'2014-05-01','2014-06-01','2014-07-01','2014-08-01',
'2014-09-01','2014-10-01','2014-11-01','2014-12-01',
'2015-01-01','2015-02-01','2015-03-01')
date = as.Date(strptime(date, "%Y-%m-%d"))
dt = zoo(data, date)
# par(mfrow=c(3, 2))
plot(dt[,121:125], main = "")
matplot(scale(dt[,121:125]), type = "l", lwd = 2, main = "", ylab = "value")
legend(10, 3,c('Google Fanti','Google Jianti','Google English', "Baidu Jianti",'News'),
col=1:5,lty=1:5,cex=1,ncol=1, lwd = 2)
cor(dt[,121:125])
# clustering
mydata = data.frame( oc15[,2:28] )
name = oc15[,1]
mydata <- scale(t(mydata)) # standardize variables
class(mydata)
# Determine number of clusters
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:30) wss[i] <- sum(kmeans(mydata,
centers=i)$withinss)
plot(1:30, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
# Ward Hierarchical Clustering
d <- dist(mydata, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D")
plot(fit) # display dendrogram
groups <- cutree(fit, k=5) # cut tree into 5 clusters
# draw dendrogram with red borders around the 5 clusters
rect.hclust(fit, k=5, border="red")
printGroup = function(n){
for (i in which(groups ==n)){
cat(name[i], ' , ')
}
}
printGroup(1)
for (i in 1:15){
cat(i, name[i], sep = "-->")
}
- 反对派 , 和平 , 警方 , 团体 , 行政长官 , 意见 , 学生 , 大学 , 运动 , 国家 ,
- 内地 , 违法 , 选举 , 委会 , 提名 , 政策 , 人民 , 批评 , 事件 , 中国 , 本港 ,
- 梁振英 , 游行 , 戴耀廷 , 法治 , 组织 , 利益 , 自由 , 我们 , 委员会 , 争取 ,
- 法律 , 记者 , 示威 , 个人 , 活动 , 发起人 , 特区政府 , 基本法 , 国际 , 经济 , 传媒 , 民意 ,
printGroup(2)
- 立法会 , 方案 , 港人 , 公民 , 会议 , 特首 , 普选 , 占领 , 行动 , 政治 , 民主 , 行政 ,
- 代表 , 政改 , 反对 , 中环 , 党 , 政府 , 主席 , 社会 , 中央 , 市民 , 议员 ,
printGroup(3)
- 中央政府 , 抗命 , 官员 , 激进 , 安全 , 港独 , 制度 , 年轻人 , 民主派 , 程序 ,
- 政党 , 台湾 , 美国 , 生活 , 商讨 , 公众 , 建制 , 教育 , 抗争 , 政制 , 共识 ,
- 繁荣 , 青年 , 投资 , 集会 , 诉求 , 建议 , 规定 , 地方 , 学者 , 理性 , 市场 ,
- 民主党 , 台独 , 暴力 , 言论 , 泛民 , 认同 , 全国人大常委会 , 北京 , 尊重 ,
- 质疑 , 英国 , 公司 , 一国两制 ,
- 历史 , 投票 , 示威者 , 候选人 , 爱港 , 对抗 , 合作 , 爱国 , 调查 , 公投 ,
printGroup(4)
- 香港 , 占 ,
# bsts of news
library(bsts)
# dt$poll = (lowess(dt$poll, f = .03))$y
plot(date, dt$y, 'l', col='red')
ss0 <- AddLocalLevel(list(), dt$y)
ss1 <- AddSeasonal(ss0, y = dt$y, nseasons = 9, season.duration = 3)  # 9 seasons x 3 months = 27 observations
trend.model <- bsts(dt$y, ss0, niter = 1000, bma.method = c("ODA"),seed = 1)
trend.seasonal.model <- bsts(dt$y, ss1, niter = 1000, bma.method = c("ODA"), seed = 1)
model <- bsts(y ~ ., data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model1<- bsts(y ~dt[,120], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model2<- bsts(y ~dt[,120]+dt[,115], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model3<- bsts(y ~dt[,120]+dt[,115]+dt[,113], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model4<- bsts(y ~dt[,120]+dt[,115]+dt[,113]+dt[,110], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model5<- bsts(y ~dt[,120]+dt[,115]+dt[,113]+dt[,110]+dt[,109], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model6<- bsts(y ~dt[,120]+dt[,115]+dt[,113]+dt[,110]+dt[,109]+dt[,107], data = dt, niter = 1000, state.specification=ss0,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
> print_names(var_names)
120, 民意, 115, 调查, 113, 合作, 110, 市民, 109, 基本法, 107, 特区政府, 103, 活动,
102, 示威者, 100, 示威, 99, 社会, 98, 记者, 94, 一国两制, 92, 法律, 91, 争取,
89, 英国, 87, 党, 86, 尊重, 82, 全国人大常委会, 80, 反对, 76, 组织, 74, 暴力,
71, 戴耀廷, 68, 行政, 64, 梁振英, 60, 占, 57, 事件, 52, 政策, 49, 诉求, 48, 集会,
45, 青年, 41, 政制, 39, 占领, 37, 普选, 35, 内地, 33, 建制, 29, 运动, 25, 特首,
24, 学生, 20, 行政长官, 18, 团体, 14, 年轻人, 13, 制度, 11, 警方, 10, 安全,
8, 港人, 6, 激进, 5, 官员, 2, 立法会, 1, 反对派, 65, 理性,
CompareBstsModels(list(trend = trend.model,
#"trend and seasonal" = trend.seasonal.model,
"model1"=model1,
"model2"=model2,
"model3"=model3,
"model4"=model4,
"model5"=model5,
"model6"=model6,
"all"=model), xlab = "")
scope = c(min(dt$y), max(dt$y))
r.square = function(model) as.character(round(summary(model)$relative.gof, 3))
par(mfrow=c(4, 2))
par(mar=c(rep(3, 4)))
plot(trend.model, ylim = scope, main = paste('Trend (relative.gof = ', r.square(trend.model), ')') , xlab = "", ylab = "Poll")
# plot(trend.seasonal.model, ylim = scope, main = paste('add Seasonal (relative.gof = ', r.square(trend.seasonal.model), ')'), xlab = "", ylab = "Poll")
plot(model1, ylim = scope, main = paste('model1 (relative.gof = ', r.square(model1), ')'), xlab = "", ylab = "Poll")
plot(model2, ylim = scope, main = paste('model2 (relative.gof = ', r.square(model2), ')'), xlab = "", ylab = "Poll")
plot(model3, ylim = scope, main = paste('model3 (relative.gof = ', r.square(model3), ')'), xlab = "", ylab = "Poll")
plot(model4, ylim = scope, main = paste('model4 (relative.gof = ', r.square(model4), ')'), xlab = "", ylab = "Poll")
plot(model5, ylim = scope, main = paste('model5 (relative.gof = ', r.square(model5), ')'), xlab = "", ylab = "Poll")
plot(model6, ylim = scope, main = paste('model6 (relative.gof = ', r.square(model6), ')'), xlab = "", ylab = "Poll")
plot(model, ylim = scope, main = paste('all (relative.gof = ', r.square(model), ')'), xlab = "", ylab = "Poll")
# pred <- predict(model, horizon = 12, burn = 100)
# plot(pred)
model <- bsts(y ~dt[,274]+dt[,272], data = dt, niter = 1000, state.specification=ss1,
expected.model.size = 3, bma.method = c("ODA"), seed = 1)
summary(model)
par(mfrow=c(1,1))
plot(model)
# abline(h=0, col = 'red')
plot(model, "components")
plot(model, "coefficients")
plot(model, "size")
plot(model, "predictors")
plot(model1, 'state')
plot(trend.model, 'state')
print_names = function(var_names){
  # coefficient names look like "X120"; strip the "X" and look up the word
  for (i in var_names){
    i = strsplit(i, "X")[[1]][2]
    i = as.numeric(i)
    #cat(i, "", sep = ",")
    cat(i, name[i], " ", sep = "-->")
  }
}
var_names = rownames(summary(model)$coefficients)[1:50]
print_names(var_names)
120,115,113,110,109,107,103,102,100,99,98,94,92,91,89,87,86,
82,80,76,74,71,68,64,60,57,52,49,
48,45,41,39,37,35,33,29,25,24,20,18,14,13,11,10,8,6,5,2,1,65,
mydata = dt[,var_names]
120-->民意--> 115-->调查--> 113-->合作--> 110-->市民--> 109-->基本法--> 107-->特区政府--> 103-->活动--> 102-->示威者--> 100-->示威--> 99-->社会--> 98-->记者--> 94-->一国两制--> 92-->法律--> 91-->争取--> 89-->英国--> 87-->党--> 86-->尊重--> 82-->全国人大常委会--> 80-->反对--> 76-->组织--> 74-->暴力--> 71-->戴耀廷--> 68-->行政--> 64-->梁振英--> 60-->占--> 57-->事件--> 52-->政策--> 49-->诉求--> 48-->集会--> 45-->青年--> 41-->政制--> 39-->占领--> 37-->普选--> 35-->内地--> 33-->建制--> 29-->运动--> 25-->特首--> 24-->学生--> 20-->行政长官--> 18-->团体--> 14-->年轻人--> 13-->制度--> 11-->警方--> 10-->安全--> 8-->港人--> 6-->激进--> 5-->官员--> 2-->立法会--> 1-->反对派--> 65-->理性-->
The left figure shows a subset of all users; the right figure shows a subset of active users (those with more than 50 tweets). This basically confirms <math>S(t) \sim t^\mu</math>; a sketch of the estimate follows. See Human Mobility.
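A minimal sketch of estimating mu for one growth curve (assumptions: items is the time-ordered sequence of items, e.g. urls or hashtags, in a single user's tweets; the exact quantity plotted in the original figures may differ):

import numpy as np
import statsmodels.api as sm

def estimate_mu(items):
    # S(t): number of distinct items after the user's first t tweets
    seen, S = set(), []
    for it in items:
        seen.add(it)
        S.append(len(seen))
    t = np.arange(1, len(S) + 1)
    xx = sm.add_constant(np.log(t), prepend=True)
    return sm.OLS(np.log(S), xx).fit().params[1]  # slope estimates mu in S(t) ~ t**mu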
GitHub notebook: https://github.com/qinqiang2015/tweets/blob/master/users_urls_at_name_rt_name.ipynb
王成军、党明辉、顾慧君 (2015). 参与者、议题与行动:香港"占领中环"运动中的新闻报道 (Participants, Issues, and Action: News Coverage of Hong Kong's "Occupy Central" Movement). Manuscript, Computational Communication Lab. Paper link: http://computational-communication.com/wiki/images/b/b7/20160113%E3%80%8A%E5%8F%82%E4%B8%8E%E8%80%85%E3%80%81%E8%AE%AE%E9%A2%98%E4%B8%8E%E8%A1%8C%E5%8A%A8_%E9%A6%99%E6%B8%AF_%E5%8D%A0%E9%A2%86%E4%B8%AD%E7%8E%AF_%E8%BF%90%E5%8A%A8%E4%B8%AD%E7%9A%84%E6%96%B0%E9%97%BB%E6%8A%A5%E9%81%93%E3%80%8B.pdf