# OWS tweets
```python
import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
```
```python
import re
import csv
import sys
import json
import twitter_text  # pip install twitter-text-py
from collections import defaultdict

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

def extract_rt_user(tweet):
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass

def cleanTweet(tweet):
    '''replace symbols, return word list'''
    tweet = tweet.decode('utf-8').strip()
    rt_name = extract_rt_user(tweet)  # extracted but not used here
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    for ia in at_names:
        tweet = tweet.replace(ia, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    tweet = tweet.replace('RT @', '').replace('@', '').replace('"', '').replace('#', '')
    seps = ['(', ')', '!', ':', '.', '?', ',', '=', u'\xa0', '/', '\\', '\n',
            '-', '|', ';', u'&', '*', "'", '+']
    for s in seps:
        tweet = tweet.replace(s, ' ')
    tweet = tweet.split(' ')
    tweet = [t.lower() for t in tweet if t != '']
    return tweet

data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
total_num = 0
bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)  # count every chunk, not just the first
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            tweet = i[1]
            if len(date) == 10:
                tweet = cleanTweet(tweet)
                for tt in tweet:
                    data_dict[tt][date] += 1
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num

with open('/Users/chengjun/百度云同步盘/Writing/OWS/term_vectors.json', 'w') as f:
    json.dump(data_dict, f)
```
```python
import re
import csv
import sys
import json
import twitter_text  # pip install twitter-text-py
from collections import defaultdict

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

def extract_rt_user(tweet):
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass

bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)

at_dict = defaultdict(lambda: defaultdict(int))
rt_dict = defaultdict(lambda: defaultdict(int))
url_dict = defaultdict(lambda: defaultdict(int))
tag_dict = defaultdict(lambda: defaultdict(int))
user_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
while chunk:
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            user = i[8]  # from_user_id
            tweet = i[1]
            ex = twitter_text.Extractor(tweet)
            at_names = ex.extract_mentioned_screen_names()
            urls = ex.extract_urls()
            hashtags = ex.extract_hashtags()
            rt_user = extract_rt_user(tweet)
            if len(date) == 10:
                if at_names:
                    for at_name in at_names:
                        at_dict[date][at_name] += 1
                if rt_user:
                    rt_dict[date][rt_user] += 1
                if urls:
                    for url in urls:
                        url_dict[date][url] += 1
                if hashtags:
                    for tag in hashtags:
                        tag_dict[date][tag] += 1
                user_dict[date][user] += 1
            else:
                error_num += 1
        except Exception, e:
            print e
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, error_num

with open('/Users/chengjun/百度云同步盘/Writing/OWS/at_dict.json', 'w') as f:
    json.dump(at_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/rt_dict.json', 'w') as f:
    json.dump(rt_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/url_dict.json', 'w') as f:
    json.dump(url_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json', 'w') as f:
    json.dump(tag_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/user_dict.json', 'w') as f:
    json.dump(user_dict, f)
```
- Format: `data[date][attribute] = int`
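For a quick check of this structure, the dumped JSON files can be reloaded and queried by date. A minimal sketch, assuming the `tag_dict.json` file written above (the hashtag key is a hypothetical example):

```python
import json

# Reload one of the nested date -> attribute -> count dictionaries dumped above.
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json') as f:
    tag_dict = json.load(f)

# json.dump flattened the defaultdicts into plain dicts, so use .get().
day = tag_dict.get('2011-10-06', {})       # a date present in this dataset
print len(day), 'distinct hashtags on 2011-10-06'
print day.get('ows', 0), 'uses of #ows'    # 'ows' is a hypothetical key
```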
Locate the date field in the tweets, read out all dates and users, and count the number of lines read, the total number of lines, and the number of error lines.
```python
from collections import defaultdict
import csv

data_dict = defaultdict(list)
error_num = 0
line_num = 0
total_num = 0
bigfile = open('D:/Data/ows/ows-raw.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        try:
            date = i[3]
            if len(date) == 10:
                data_dict[date].append(i[8])  # from_user_id
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num
```
Output the results in the format "date-tweets-users".
```python
import pandas as pd

data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns=['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'tweets', 'users'])  # pandas < 0.17; use sort_values in newer versions
print dat
```
```
           date  tweets  users
108  2011-10-06   49638  18487
107  2011-10-07   65238  23460
110  2011-10-08   65949  23243
..          ...     ...    ...
13   2012-02-16   12837   4428
14   2012-02-17   12468   4299
21   2012-02-18    4859   2012

[136 rows x 3 columns]
```
- Returned results
Plot a line chart of the daily numbers of tweets and users.
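The plotting code itself does not appear on this page; a minimal sketch, assuming the `dat` DataFrame built above and mirroring the urls/users plot further down:

```python
import matplotlib.pyplot as plt

# Daily tweet and user counts on a shared date axis.
fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.tweets, 'r-o', label='tweets')
plt.plot(dat.date, dat.users, 'g-o', label='users')
plt.legend(loc=2, fontsize=8)
plt.show()
```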
Fit the relationship between the daily numbers of users and tweets; the result is consistent with a power-law distribution.
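The fitting code is likewise not shown here; a minimal sketch of an OLS fit in log-log space, assuming `dat` from above and following the same recipe as the users/urls fit later on this page:

```python
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# A power law tweets ~ users**beta is linear in log-log space.
x = np.log(dat.users)
y = np.log(dat.tweets)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
print 'beta =', round(beta, 2), 'R^2 =', round(res.rsquared, 2)

plt.plot(dat.users, dat.tweets, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), '-', label='Fit')
plt.xscale('log'); plt.yscale('log')
plt.xlabel('Users'); plt.ylabel('Tweets')
plt.legend(loc=2, numpoints=1)
plt.show()
```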
Extract the url part from the tweets file, and count the number of lines read, the total number of lines in the file, and the number of error lines.
```python
from collections import defaultdict
import csv
import re
import sys

def flushPrint(s):
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()

data_dict = defaultdict(list)
error_num = 0
line_num = 0
total_num = 0
url_patterns = re.compile(
    r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+",
    re.IGNORECASE)  # compile once, outside the loop
bigfile = open('D:/Data/ows-raw.txt', 'rb')
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk),
                       delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 1000000 == 0:
            flushPrint(line_num)
        try:
            urls = url_patterns.findall(i[1])
            date = i[3]
            for ui in urls:
                if len(date) == 10:
                    data_dict[date].append(ui)
        except Exception, e:
            print e
            error_num += 1
            pass
    chunk = bigfile.readlines(chunkSize)

print line_num, total_num, error_num
```
Build a data frame in the format date-urls-users.
```python
import pandas as pd

data = [[d, len(data_dict[d]), len(set(data_dict[d]))] for d in data_dict]
dat = pd.DataFrame(data, columns=['date', 'urls', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date', 'urls', 'users'])
print dat
```
Plot line charts of the daily frequencies of users and urls with date on the x axis; the result shows that the fluctuations of the two series are positively correlated.
```python
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.urls, 'r-o', label='urls')
plt.plot(dat.date, dat.users, 'g-o', label='users')
plt.legend(loc=2, fontsize=8)
plt.show()
```
- Optionally add `plt.yscale('log')` to put the y axis on a log scale.
Fit the daily counts of urls and users to test for a power-law relationship; the result shows both are consistent with a power-law distribution.
```python
import statsmodels.api as sm

x = np.log(dat.users)
y = np.log(dat.urls)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
r2 = res.rsquared

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(dat.users, dat.urls, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), "-", label='Fit')
plt.yscale('log'); plt.xscale('log')
plt.xlabel(r'$Users$')
plt.ylabel(r'$Urls$')
plt.text(max(dat.users)/4, max(dat.urls)/20,
         r'$\beta$ = ' + str(round(beta, 2)) + '\t' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
```
## BSTS Code of Occupy Central
```r
setwd("/Users/chengjun/bigdata/")
oc = read.csv("./occupycentral_wordfreq.csv", sep = ",", header = F,
              stringsAsFactors = F, encoding = "utf-8")
oc15 = read.csv("./occupycentral_wordfreq_tfidf.csv", sep = ",", header = F,
                stringsAsFactors = F)
query = read.csv("./occupycentralgoogletrends.csv", sep = ",", header = T,
                 stringsAsFactors = F, encoding = "utf-8")
query = query[1:27, ]
names(query)

data = data.frame(t(oc[, 2:28]))
data$queryf = log(query$fanti + 1)
data$queryj = log(query$jianti + 1)
data$querye = log(query$occupy.central + 1)
queryb = c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 67.1245791245791, 2.097643097643072,
           54.538720538720554, 2.097643097643072, 0.0, 0.0, 0.0,
           10.488215488215474, 0.0, 31.464646464646478, 0.0,
           10.488215488215417, 4.195286195286144, 31.464646464646421,
           199.27609427609428, 125.85858585858585, 12.585858585858489,
           2.097643097643072, 0.0, 0.0, 0.0)
data$queryb = log(queryb + 1)
data$y = c(1., 8.33333333, 31.74193548, 34.72413793, 41.70833333, 50.06666667,
           52.35483871, 33.80645161, 43.46666667, 36.53333333, 29.36666667,
           30.03225806, 34.29032258, 31.21428571, 33.93548387, 34.96666667,
           60.16129032, 22.62068966, 59.06451613, 27.51612903, 16.8,
           55.93548387, 32.53333333, 77.29032258, 70.35483871, 38.78571429,
           39.89285714)
name = oc[, 1]
cat(name)

require(zoo)
date = c('2013-01-01', '2013-02-01', '2013-03-01', '2013-04-01', '2013-05-01',
         '2013-06-01', '2013-07-01', '2013-08-01', '2013-09-01', '2013-10-01',
         '2013-11-01', '2013-12-01', '2014-01-01', '2014-02-01', '2014-03-01',
         '2014-04-01', '2014-05-01', '2014-06-01', '2014-07-01', '2014-08-01',
         '2014-09-01', '2014-10-01', '2014-11-01', '2014-12-01', '2015-01-01',
         '2015-02-01', '2015-03-01')
date = as.Date(strptime(date, "%Y-%m-%d"))
dt = zoo(data, date)
plot(dt[, 121:125], main = "")
matplot(scale(dt[, 121:125]), type = "l", lwd = 2, main = "", ylab = "value")
legend(10, 3, c('Google Fanti', 'Google Jianti', 'Google English',
                "Baidu Jianti", 'News'),
       col = 1:5, lty = 1:5, cex = 1, ncol = 1, lwd = 2)
cor(dt[, 121:125])
```
- Optionally `par(mfrow = c(3, 2))` to lay the plots out in a 3×2 grid.
```r
mydata = data.frame(oc15[, 2:28])
name = oc15[, 1]
mydata <- scale(t(mydata))  # standardize variables
class(mydata)
```
- clustering
```r
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:30) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:30, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
```
- Determine number of clusters
```r
d <- dist(mydata, method = "euclidean")  # distance matrix
fit <- hclust(d, method = "ward.D")
plot(fit)  # display dendrogram
groups <- cutree(fit, k = 5)  # cut tree into 5 clusters
```
- Ward Hierarchical Clustering
```r
rect.hclust(fit, k = 5, border = "red")

printGroup = function(n) {
  for (i in which(groups == n)) {
    cat(name[i], ' , ')
  }
}

printGroup(1)

for (i in 1:15) {
  cat(i, name[i], sep = "-->")
}
```
- Draw dendrogram with red borders around the 5 clusters
```r
printGroup(2)
```
- 反对派 , 和平 , 警方 , 团体 , 行政长官 , 意见 , 学生 , 大学 , 运动 , 国家 ,
- 内地 , 违法 , 选举 , 委会 , 提名 , 政策 , 人民 , 批评 , 事件 , 中国 , 本港 ,
- 梁振英 , 游行 , 戴耀廷 , 法治 , 组织 , 利益 , 自由 , 我们 , 委员会 , 争取 ,
- 法律 , 记者 , 示威 , 个人 , 活动 , 发起人 , 特区政府 , 基本法 , 国际 , 经济 , 传媒 , 民意 ,
```r
printGroup(3)
```
- 立法会 , 方案 , 港人 , 公民 , 会议 , 特首 , 普选 , 占领 , 行动 , 政治 , 民主 , 行政 ,
- 代表 , 政改 , 反对 , 中环 , 党 , 政府 , 主席 , 社会 , 中央 , 市民 , 议员 ,
```r
printGroup(4)
```
- 中央政府 , 抗命 , 官员 , 激进 , 安全 , 港独 , 制度 , 年轻人 , 民主派 , 程序 ,
- 政党 , 台湾 , 美国 , 生活 , 商讨 , 公众 , 建制 , 教育 , 抗争 , 政制 , 共识 ,
- 繁荣 , 青年 , 投资 , 集会 , 诉求 , 建议 , 规定 , 地方 , 学者 , 理性 , 市场 ,
- 民主党 , 台独 , 暴力 , 言论 , 泛民 , 认同 , 全国人大常委会 , 北京 , 尊重 ,
- 质疑 , 英国 , 公司 , 一国两制 ,
- 历史 , 投票 , 示威者 , 候选人 , 爱港 , 对抗 , 合作 , 爱国 , 调查 , 公投 ,
- 香港 , 占 ,
```r
library(bsts)
```
- BSTS of news
```r
plot(date, dt$y, 'l', col = 'red')
ss0 <- AddLocalLevel(list(), dt$y)
ss1 <- AddSeasonal(ss0, y = dt$y, nseasons = 9, season.duration = 3)  # 27
trend.model <- bsts(dt$y, ss0, niter = 1000, bma.method = c("ODA"), seed = 1)
trend.seasonal.model <- bsts(dt$y, ss1, niter = 1000, bma.method = c("ODA"), seed = 1)
model <- bsts(y ~ ., data = dt, niter = 1000, state.specification = ss0,
              expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model1 <- bsts(y ~ dt[,120], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model2 <- bsts(y ~ dt[,120] + dt[,115], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model3 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113], data = dt, niter = 1000,
               state.specification = ss0, expected.model.size = 3,
               bma.method = c("ODA"), seed = 1)
model4 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110], data = dt,
               niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model5 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110] + dt[,109],
               data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model6 <- bsts(y ~ dt[,120] + dt[,115] + dt[,113] + dt[,110] + dt[,109] + dt[,107],
               data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
```
- `dt$poll = (lowess(dt$poll, f = .03))$y`
```
120, 民意, 115, 调查, 113, 合作, 110, 市民, 109, 基本法, 107, 特区政府,
103, 活动, 102, 示威者, 100, 示威, 99, 社会, 98, 记者, 94, 一国两制,
92, 法律, 91, 争取, 89, 英国, 87, 党, 86, 尊重, 82, 全国人大常委会,
80, 反对, 76, 组织, 74, 暴力, 71, 戴耀廷, 68, 行政, 64, 梁振英, 60, 占,
57, 事件, 52, 政策, 49, 诉求, 48, 集会, 45, 青年, 41, 政制, 39, 占领,
37, 普选, 35, 内地, 33, 建制, 29, 运动, 25, 特首, 24, 学生, 20, 行政长官,
18, 团体, 14, 年轻人, 13, 制度, 11, 警方, 10, 安全, 8, 港人, 6, 激进,
5, 官员, 2, 立法会, 1, 反对派, 65, 理性,
```

```r
CompareBstsModels(list(trend = trend.model,
                       # "trend and seasonal" = trend.seasonal.model,
                       "model1" = model1, "model2" = model2, "model3" = model3,
                       "model4" = model4, "model5" = model5, "model6" = model6,
                       "all" = model),
                  xlab = "")
scope = c(min(dt$y), max(dt$y))
r.square = function(model) as.character(round(summary(model)$relative.gof, 3))
par(mfrow = c(4, 2))
par(mar = c(rep(3, 4)))
plot(trend.model, ylim = scope,
     main = paste('Trend (relative.gof = ', r.square(trend.model), ')'),
     xlab = "", ylab = "Poll")
```
- Output of `print_names(var_names)`
```r
plot(model1, ylim = scope, main = paste('model1 (relative.gof = ', r.square(model1), ')'),
     xlab = "", ylab = "Poll")
plot(model2, ylim = scope, main = paste('model2 (relative.gof = ', r.square(model2), ')'),
     xlab = "", ylab = "Poll")
plot(model3, ylim = scope, main = paste('model3 (relative.gof = ', r.square(model3), ')'),
     xlab = "", ylab = "Poll")
plot(model4, ylim = scope, main = paste('model4 (relative.gof = ', r.square(model4), ')'),
     xlab = "", ylab = "Poll")
plot(model5, ylim = scope, main = paste('model5 (relative.gof = ', r.square(model5), ')'),
     xlab = "", ylab = "Poll")
plot(model6, ylim = scope, main = paste('model6 (relative.gof = ', r.square(model6), ')'),
     xlab = "", ylab = "Poll")
plot(model, ylim = scope, main = paste('all (relative.gof = ', r.square(model), ')'),
     xlab = "", ylab = "Poll")
```
- `plot(trend.seasonal.model, ylim = scope, main = paste('add Seasonal (relative.gof = ', r.square(trend.seasonal.model), ')'), xlab = "", ylab = "Poll")`
```r
model <- bsts(y ~ dt[,274] + dt[,272], data = dt, niter = 1000,
              state.specification = ss1, expected.model.size = 3,
              bma.method = c("ODA"), seed = 1)
summary(model)
par(mfrow = c(1, 1))
plot(model)
```
- `pred <- predict(model, horizon = 12, burn = 100)`
- `plot(pred)`
plot(model, "components") plot(model, "coefficients") plot(model, "size") plot(model, "predictors") plot(model1, 'state') plot(trend.model, 'state') print_names = function(var_names){ for (i in var_names){ i =strsplit(i, "X")[[1]][2] i = as.numeric(i) #cat(i, "",sep = ",") cat(i, name[i], " ",sep = "-->") } } var_names = rownames(summary(model)$coefficients)[1:50] print_names(var_names) 120,115,113,110,109,107,103,102,100,99,98,94,92,91,89,87,86, 82,80,76,74,71,68,64,60,57,52,49, 48,45,41,39,37,35,33,29,25,24,20,18,14,13,11,10,8,6,5,2,1,65, mydata = dt[,var_names] 120-->民意--> 115-->调查--> 113-->合作--> 110-->市民--> 109-->基本法--> 107-->特区政府--> 103-->活动--> 102-->示威者--> 100-->示威--> 99-->社会--> 98-->记者--> 94-->一国两制--> 92-->法律--> 91-->争取--> 89-->英国--> 87-->党--> 86-->尊重--> 82-->全国人大常委会--> 80-->反对--> 76-->组织--> 74-->暴力--> 71-->戴耀廷--> 68-->行政--> 64-->梁振英--> 60-->占--> 57-->事件--> 52-->政策--> 49-->诉求--> 48-->集会--> 45-->青年--> 41-->政制--> 39-->占领--> 37-->普选--> 35-->内地--> 33-->建制--> 29-->运动--> 25-->特首--> 24-->学生--> 20-->行政长官--> 18-->团体--> 14-->年轻人--> 13-->制度--> 11-->警方--> 10-->安全--> 8-->港人--> 6-->激进--> 5-->官员--> 2-->立法会--> 1-->反对派--> 65-->理性-->
- `abline(h = 0, col = 'red')`
The left figure shows a subset of all users; the right figure shows a subset of active users (those with more than 50 tweets). The results basically confirm $S(t) \sim t^{\mu}$. See Human Mobility.
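The figures themselves are not reproduced here. As a minimal sketch of how such an exponent can be estimated via an OLS fit in log-log space (the series `S` below is a synthetic stand-in, not the real data; replace it with, e.g., the average number of distinct items a user has adopted after t tweets):

```python
import numpy as np
import statsmodels.api as sm

# Hypothetical stand-in series that grows as t^mu with mu = 0.6.
t = np.arange(1, 51)
S = t ** 0.6

# Fit log(S) = constant + mu * log(t).
x = sm.add_constant(np.log(t), prepend=True)
res = sm.OLS(np.log(S), x).fit()
constant, mu = res.params
print 'mu =', round(mu, 2), 'R^2 =', round(res.rsquared, 2)
```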
GitHub address: https://github.com/qinqiang2015/tweets/blob/master/users_urls_at_name_rt_name.ipynb
王成军 (Wang Chengjun)、党明辉 (Dang Minghui)、顾慧君 (Gu Huijun) (2015). 参与者、议题与行动:香港"占领中环"运动中的新闻报道 (Participants, Issues, and Actions: News Coverage of the "Occupy Central" Movement in Hong Kong). Manuscript, Computational Communication Lab. [Paper link](http://computational-communication.com/wiki/images/b/b7/20160113%E3%80%8A%E5%8F%82%E4%B8%8E%E8%80%85%E3%80%81%E8%AE%AE%E9%A2%98%E4%B8%8E%E8%A1%8C%E5%8A%A8_%E9%A6%99%E6%B8%AF_%E5%8D%A0%E9%A2%86%E4%B8%AD%E7%8E%AF_%E8%BF%90%E5%8A%A8%E4%B8%AD%E7%9A%84%E6%96%B0%E9%97%BB%E6%8A%A5%E9%81%93%E3%80%8B.pdf)