# OWS tweets
```python
import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
```
```python
import re
import csv
import sys
import json
import twitter_text  # pip install twitter-text-py
from collections import defaultdict

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

def extract_rt_user(tweet):
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass

def cleanTweet(tweet):
    '''replace symbols, return word list'''
    tweet = tweet.decode('utf-8').strip()
    rt_name = extract_rt_user(tweet)  # extracted but not used here
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    for ia in at_names:
        tweet = tweet.replace(ia, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    tweet = tweet.replace('RT @', '').replace('@', '').replace('"', '').replace('#', '')
    seps = ['(', ')', '!', ':', '.', '?', ',', '=', u'\xa0', '/', '\\', '\n',
            '-', '|', ';', u'&', '*', "'", '+']
    for s in seps:
        tweet = tweet.replace(s, ' ')
    tweet = tweet.split(' ')
    tweet = [t.lower() for t in tweet if t != '']
    return tweet

data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
total_num = 0
bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)  # count every chunk, not just the first
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            tweet = i[1]
            if len(date) == 10:
                tweet = cleanTweet(tweet)
                for tt in tweet:
                    data_dict[tt][date] += 1
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num

with open('/Users/chengjun/百度云同步盘/Writing/OWS/term_vectors.json', 'w') as f:
    json.dump(data_dict, f)
```
```python
import re
import csv
import sys
import json
import twitter_text  # pip install twitter-text-py
from collections import defaultdict

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

def extract_rt_user(tweet):
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass

bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)

at_dict = defaultdict(lambda: defaultdict(int))
rt_dict = defaultdict(lambda: defaultdict(int))
url_dict = defaultdict(lambda: defaultdict(int))
tag_dict = defaultdict(lambda: defaultdict(int))
user_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
while chunk:
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            user = i[8]  # from_user_id
            tweet = i[1]
            ex = twitter_text.Extractor(tweet)
            at_names = ex.extract_mentioned_screen_names()
            urls = ex.extract_urls()
            hashtags = ex.extract_hashtags()
            rt_user = extract_rt_user(tweet)
            if len(date) == 10:
                if at_names:
                    for at_name in at_names:
                        at_dict[date][at_name] += 1
                if rt_user:
                    rt_dict[date][rt_user] += 1
                if urls:
                    for url in urls:
                        url_dict[date][url] += 1
                if hashtags:
                    for tag in hashtags:
                        tag_dict[date][tag] += 1
                user_dict[date][user] += 1
            else:
                error_num += 1
        except Exception, e:
            print e
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, error_num

with open('/Users/chengjun/百度云同步盘/Writing/OWS/at_dict.json', 'w') as f:
    json.dump(at_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/rt_dict.json', 'w') as f:
    json.dump(rt_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/url_dict.json', 'w') as f:
    json.dump(url_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json', 'w') as f:
    json.dump(tag_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/user_dict.json', 'w') as f:
    json.dump(user_dict, f)
```
- Format: `data[date][attribute] = int`
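For a quick check of this structure, the dumped JSON files can be reloaded and queried by date. A minimal sketch, assuming the `tag_dict.json` file written above (the hashtag key is a hypothetical example):

```python
import json

# Reload one of the nested date -> attribute -> count dictionaries dumped above.
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json') as f:
    tag_dict = json.load(f)

# json.dump flattened the defaultdicts into plain dicts, so use .get().
day = tag_dict.get('2011-10-06', {})       # a date present in this dataset
print len(day), 'distinct hashtags on 2011-10-06'
print day.get('ows', 0), 'uses of #ows'    # 'ows' is a hypothetical key
```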
Locate the date field in the tweets, read out all dates and users, and count the number of lines read, the total number of lines, and the number of error lines.
```python
from collections import defaultdict
import csv

data_dict = defaultdict(list)
error_num = 0
line_num = 0
total_num = 0
bigfile = open('D:/Data/ows/ows-raw.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        try:
            date = i[3]
            if len(date) == 10:
                data_dict[date].append(i[8])  # from_user_id
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num
```
Output the results in the format "date-tweets-users".
```python
import pandas as pd

data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns=['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'tweets', 'users'])  # pandas < 0.17; use sort_values in newer versions
print dat
```
```
           date  tweets  users
108  2011-10-06   49638  18487
107  2011-10-07   65238  23460
110  2011-10-08   65949  23243
..          ...     ...    ...
13   2012-02-16   12837   4428
14   2012-02-17   12468   4299
21   2012-02-18    4859   2012

[136 rows x 3 columns]
```
- Returned results
Plot a line chart of the daily numbers of tweets and users.
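The plotting code itself does not appear on this page; a minimal sketch, assuming the `dat` DataFrame built above and mirroring the urls/users plot further down:

```python
import matplotlib.pyplot as plt

# Daily tweet and user counts on a shared date axis.
fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.tweets, 'r-o', label='tweets')
plt.plot(dat.date, dat.users, 'g-o', label='users')
plt.legend(loc=2, fontsize=8)
plt.show()
```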
Fit the relationship between the daily numbers of users and tweets; the result is consistent with a power-law distribution.
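The fitting code is likewise not shown here; a minimal sketch of an OLS fit in log-log space, assuming `dat` from above and following the same recipe as the users/urls fit later on this page:

```python
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# A power law tweets ~ users**beta is linear in log-log space.
x = np.log(dat.users)
y = np.log(dat.tweets)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
print 'beta =', round(beta, 2), 'R^2 =', round(res.rsquared, 2)

plt.plot(dat.users, dat.tweets, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), '-', label='Fit')
plt.xscale('log'); plt.yscale('log')
plt.xlabel('Users'); plt.ylabel('Tweets')
plt.legend(loc=2, numpoints=1)
plt.show()
```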
Extract the url part from the tweets file, and count the number of lines read, the total number of lines in the file, and the number of error lines.
```python
from collections import defaultdict
import csv
import re
import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

data_dict = defaultdict(list)
error_num = 0
line_num = 0
total_num = 0
url_patterns = re.compile(
    r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+",
    re.IGNORECASE)  # compile once, outside the loop
bigfile = open('D:/Data/ows-raw.txt', 'rb')
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 1000000 == 0:
            flushPrint(line_num)
        try:
            urls = url_patterns.findall(i[1])
            date = i[3]
            for ui in urls:
                if len(date) == 10:
                    data_dict[date].append(ui)
        except Exception, e:
            print e
            error_num += 1
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num
```
Build a data frame in the format date-urls-users.
```python
import pandas as pd

data = [[d, len(data_dict[d]), len(set(data_dict[d]))] for d in data_dict]
dat = pd.DataFrame(data, columns=['date', 'urls', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'urls', 'users'])
print dat
```
Plot line charts of the daily frequencies of users and urls with date on the x axis; the result shows that the fluctuations of the two series are positively correlated.
```python
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.urls, 'r-o', label='urls')
plt.plot(dat.date, dat.users, 'g-o', label='users')
plt.legend(loc=2, fontsize=8)
plt.show()
```
- Optionally add `plt.yscale('log')` to put the y axis on a log scale.
Fit the daily counts of urls and users to test for a power-law relationship; the result shows both are consistent with a power-law distribution.
```python
import statsmodels.api as sm

x = np.log(dat.users)
y = np.log(dat.urls)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
r2 = res.rsquared

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(dat.users, dat.urls, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), "-", label='Fit')
plt.yscale('log'); plt.xscale('log')
plt.xlabel(r'$Users$')
plt.ylabel(r'$Urls$')
plt.text(max(dat.users)/4, max(dat.urls)/20,
         r'$\beta$ = ' + str(round(beta, 2)) + '\t' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
```
## BSTS Code of Occupy Central
```r
setwd("/Users/chengjun/bigdata/")
oc = read.csv("./occupycentral_wordfreq.csv", sep = ",", header = F,
              stringsAsFactors = F, encoding = "utf-8")
oc15 = read.csv("./occupycentral_wordfreq_tfidf.csv", sep = ",", header = F,
                stringsAsFactors = F)
query = read.csv("./occupycentralgoogletrends.csv", sep = ",", header = T,
                 stringsAsFactors = F, encoding = "utf-8")
query = query[1:27, ]
names(query)

data = data.frame(t(oc[, 2:28]))
data$queryf = log(query$fanti + 1)
data$queryj = log(query$jianti + 1)
data$querye = log(query$occupy.central + 1)
queryb = c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 67.1245791245791, 2.097643097643072,
           54.538720538720554, 2.097643097643072, 0.0, 0.0, 0.0,
           10.488215488215474, 0.0, 31.464646464646478, 0.0,
           10.488215488215417, 4.195286195286144, 31.464646464646421,
           199.27609427609428, 125.85858585858585, 12.585858585858489,
           2.097643097643072, 0.0, 0.0, 0.0)
data$queryb = log(queryb + 1)
data$y = c(1., 8.33333333, 31.74193548, 34.72413793, 41.70833333, 50.06666667,
           52.35483871, 33.80645161, 43.46666667, 36.53333333, 29.36666667,
           30.03225806, 34.29032258, 31.21428571, 33.93548387, 34.96666667,
           60.16129032, 22.62068966, 59.06451613, 27.51612903, 16.8,
           55.93548387, 32.53333333, 77.29032258, 70.35483871, 38.78571429,
           39.89285714)
name = oc[, 1]
cat(name)

require(zoo)
date = c('2013-01-01', '2013-02-01', '2013-03-01', '2013-04-01', '2013-05-01',
         '2013-06-01', '2013-07-01', '2013-08-01', '2013-09-01', '2013-10-01',
         '2013-11-01', '2013-12-01', '2014-01-01', '2014-02-01', '2014-03-01',
         '2014-04-01', '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01',
         '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01', '2015-01-01',
         '2015-02-01', '2015-03-01')
date = as.Date(strptime(date, "%Y-%m-%d"))
dt = zoo(data, date)
plot(dt[, 121:125], main = "")
matplot(scale(dt[, 121:125]), type = "l", lwd = 2, main = "", ylab = "value")
legend(10, 3, c('Google Fanti', 'Google Jianti', 'Google English',
                "Baidu Jianti", 'News'),
       col = 1:5, lty = 1:5, cex = 1, ncol = 1, lwd = 2)
cor(dt[, 121:125])
```
- Optionally `par(mfrow = c(3, 2))` to lay the plots out in a 3×2 grid.
```r
mydata = data.frame(oc15[, 2:28])
name = oc15[, 1]
mydata <- scale(t(mydata))  # standardize variables
class(mydata)
```
- clustering
```r
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:30) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:30, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
```
- Determine number of clusters
```r
d <- dist(mydata, method = "euclidean")  # distance matrix
fit <- hclust(d, method = "ward.D")
plot(fit)  # display dendrogram
groups <- cutree(fit, k = 5)  # cut tree into 5 clusters
```
- Ward Hierarchical Clustering
```r
rect.hclust(fit, k = 5, border = "red")

printGroup = function(n) {
  for (i in which(groups == n)) {
    cat(name[i], ' , ')
  }
}

printGroup(1)

for (i in 1:15) {
  cat(i, name[i], sep = "-->")
}
```
- Draw dendrogram with red borders around the 5 clusters
```r
printGroup(2)
```
- 反对派 , 和平 , 警方 , 团体 , 行政长官 , 意见 , 学生 , 大学 , 运动 , 国家 ,
- 内地 , 违法 , 选举 , 委会 , 提名 , 政策 , 人民 , 批评 , 事件 , 中国 , 本港 ,
- 梁振英 , 游行 , 戴耀廷 , 法治 , 组织 , 利益 , 自由 , 我们 , 委员会 , 争取 ,
- 法律 , 记者 , 示威 , 个人 , 活动 , 发起人 , 特区政府 , 基本法 , 国际 , 经济 , 传媒 , 民意 ,
```r
printGroup(3)
```
- 立法会 , 方案 , 港人 , 公民 , 会议 , 特首 , 普选 , 占领 , 行动 , 政治 , 民主 , 行政 ,
- 代表 , 政改 , 反对 , 中环 , 党 , 政府 , 主席 , 社会 , 中央 , 市民 , 议员 ,
```r
printGroup(4)
```
- 中央政府 , 抗命 , 官员 , 激进 , 安全 , 港独 , 制度 , 年轻人 , 民主派 , 程序 ,
- 政党 , 台湾 , 美国 , 生活 , 商讨 , 公众 , 建制 , 教育 , 抗争 , 政制 , 共识 ,
- 繁荣 , 青年 , 投资 , 集会 , 诉求 , 建议 , 规定 , 地方 , 学者 , 理性 , 市场 ,
- 民主党 , 台独 , 暴力 , 言论 , 泛民 , 认同 , 全国人大常委会 , 北京 , 尊重 ,
- 质疑 , 英国 , 公司 , 一国两制 ,
- 历史 , 投票 , 示威者 , 候选人 , 爱港 , 对抗 , 合作 , 爱国 , 调查 , 公投 ,
- 香港 , 占 ,
```r
library(bsts)
```
- BSTS of news
```r
plot(date, dt$y, 'l', col = 'red')
ss0 <- AddLocalLevel(list(), dt$y)
ss1 <- AddSeasonal(ss0, y = dt$y, nseasons = 9, season.duration = 3)  # 27
trend.model <- bsts(dt$y, ss0, niter = 1000, bma.method = c("ODA"), seed = 1)
trend.seasonal.model <- bsts(dt$y, ss1, niter = 1000, bma.method = c("ODA"), seed = 1)
model <- bsts(y ~ ., data = dt, niter = 1000, state.specification = ss0,
              expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model1 <- bsts(y ~ dt[,120], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model2 <- bsts(y ~ dt[,120] + dt[,115], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model3 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model4 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110], data = dt,
               niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model5 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110] + dt[,109],
               data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model6 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110] + dt[,109] + dt[,107],
               data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
```
- `dt$poll = (lowess(dt$poll, f = .03))$y`
```
120, 民意, 115, 调查, 113, 合作, 110, 市民, 109, 基本法, 107, 特区政府,
103, 活动, 102, 示威者, 100, 示威, 99, 社会, 98, 记者, 94, 一国两制,
92, 法律, 91, 争取, 89, 英国, 87, 党, 86, 尊重, 82, 全国人大常委会,
80, 反对, 76, 组织, 74, 暴力, 71, 戴耀廷, 68, 行政, 64, 梁振英, 60, 占,
57, 事件, 52, 政策, 49, 诉求, 48, 集会, 45, 青年, 41, 政制, 39, 占领,
37, 普选, 35, 内地, 33, 建制, 29, 运动, 25, 特首, 24, 学生, 20, 行政长官,
18, 团体, 14, 年轻人, 13, 制度, 11, 警方, 10, 安全, 8, 港人, 6, 激进,
5, 官员, 2, 立法会, 1, 反对派, 65, 理性,
```

```r
CompareBstsModels(list(trend = trend.model,
                       # "trend and seasonal" = trend.seasonal.model,
                       "model1" = model1, "model2" = model2, "model3" = model3,
                       "model4" = model4, "model5" = model5, "model6" = model6,
                       "all" = model),
                  xlab = "")
scope = c(min(dt$y), max(dt$y))
r.square = function(model) as.character(round(summary(model)$relative.gof, 3))
par(mfrow = c(4, 2))
par(mar = c(rep(3, 4)))
plot(trend.model, ylim = scope,
     main = paste('Trend (relative.gof = ', r.square(trend.model), ')'),
     xlab = "", ylab = "Poll")
```
- Output of `print_names(var_names)`
```r
plot(model1, ylim = scope, main = paste('model1 (relative.gof = ', r.square(model1), ')'),
     xlab = "", ylab = "Poll")
plot(model2, ylim = scope, main = paste('model2 (relative.gof = ', r.square(model2), ')'),
     xlab = "", ylab = "Poll")
plot(model3, ylim = scope, main = paste('model3 (relative.gof = ', r.square(model3), ')'),
     xlab = "", ylab = "Poll")
plot(model4, ylim = scope, main = paste('model4 (relative.gof = ', r.square(model4), ')'),
     xlab = "", ylab = "Poll")
plot(model5, ylim = scope, main = paste('model5 (relative.gof = ', r.square(model5), ')'),
     xlab = "", ylab = "Poll")
plot(model6, ylim = scope, main = paste('model6 (relative.gof = ', r.square(model6), ')'),
     xlab = "", ylab = "Poll")
plot(model, ylim = scope, main = paste('all (relative.gof = ', r.square(model), ')'),
     xlab = "", ylab = "Poll")
```
- `plot(trend.seasonal.model, ylim = scope, main = paste('add Seasonal (relative.gof = ', r.square(trend.seasonal.model), ')'), xlab = "", ylab = "Poll")`
```r
model <- bsts(y ~ dt[,274] + dt[,272], data = dt, niter = 1000,
              state.specification = ss1, expected.model.size = 3,
              bma.method = c("ODA"), seed = 1)
summary(model)
par(mfrow = c(1, 1))
plot(model)
```
- `pred <- predict(model, horizon = 12, burn = 100)`
- `plot(pred)`
plot(model, "components") plot(model, "coefficients") plot(model, "size") plot(model, "predictors") plot(model1, 'state') plot(trend.model, 'state') print_names = function(var_names){ for (i in var_names){ i =strsplit(i, "X")[[1]][2] i = as.numeric(i) #cat(i, "",sep = ",") cat(i, name[i], " ",sep = "-->") } } var_names = rownames(summary(model)$coefficients)[1:50] print_names(var_names) 120,115,113,110,109,107,103,102,100,99,98,94,92,91,89,87,86, 82,80,76,74,71,68,64,60,57,52,49, 48,45,41,39,37,35,33,29,25,24,20,18,14,13,11,10,8,6,5,2,1,65, mydata = dt[,var_names] 120-->民意--> 115-->调查--> 113-->合作--> 110-->市民--> 109-->基本法--> 107-->特区政府--> 103-->活动--> 102-->示威者--> 100-->示威--> 99-->社会--> 98-->记者--> 94-->一国两制--> 92-->法律--> 91-->争取--> 89-->英国--> 87-->党--> 86-->尊重--> 82-->全国人大常委会--> 80-->反对--> 76-->组织--> 74-->暴力--> 71-->戴耀廷--> 68-->行政--> 64-->梁振英--> 60-->占--> 57-->事件--> 52-->政策--> 49-->诉求--> 48-->集会--> 45-->青年--> 41-->政制--> 39-->占领--> 37-->普选--> 35-->内地--> 33-->建制--> 29-->运动--> 25-->特首--> 24-->学生--> 20-->行政长官--> 18-->团体--> 14-->年轻人--> 13-->制度--> 11-->警方--> 10-->安全--> 8-->港人--> 6-->激进--> 5-->官员--> 2-->立法会--> 1-->反对派--> 65-->理性-->
- `abline(h = 0, col = 'red')`
The left figure shows a subset of all users; the right figure shows a subset of active users (those with more than 50 tweets). The results basically confirm $S(t) \sim t^{\mu}$. See Human Mobility.
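The figures themselves are not reproduced here. As a minimal sketch of how such an exponent can be estimated via an OLS fit in log-log space (the series `S` below is a synthetic stand-in, not the real data; replace it with, e.g., the average number of distinct items a user has adopted after t tweets):

```python
import numpy as np
import statsmodels.api as sm

# Hypothetical stand-in series that grows as t^mu with mu = 0.6.
t = np.arange(1, 51)
S = t ** 0.6

# Fit log(S) = constant + mu * log(t).
x = sm.add_constant(np.log(t), prepend=True)
res = sm.OLS(np.log(S), x).fit()
constant, mu = res.params
print 'mu =', round(mu, 2), 'R^2 =', round(res.rsquared, 2)
```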
GitHub address: https://github.com/qinqiang2015/tweets/blob/master/users_urls_at_name_rt_name.ipynb
王成军 (Wang Chengjun)、党明辉 (Dang Minghui)、顾慧君 (Gu Huijun) (2015). 参与者、议题与行动:香港"占领中环"运动中的新闻报道 (Participants, Issues, and Actions: News Coverage of the "Occupy Central" Movement in Hong Kong). Manuscript, Computational Communication Lab. [Paper link](http://computational-communication.com/wiki/images/b/b7/20160113%E3%80%8A%E5%8F%82%E4%B8%8E%E8%80%85%E3%80%81%E8%AE%AE%E9%A2%98%E4%B8%8E%E8%A1%8C%E5%8A%A8_%E9%A6%99%E6%B8%AF_%E5%8D%A0%E9%A2%86%E4%B8%AD%E7%8E%AF_%E8%BF%90%E5%8A%A8%E4%B8%AD%E7%9A%84%E6%96%B0%E9%97%BB%E6%8A%A5%E9%81%93%E3%80%8B.pdf)