python
- https://github.com/solarlee/Awesome-Python-Toolbox
- Numerical Python http://nbviewer.jupyter.org/github/jrjohansson/numerical-python-book-code/blob/master/ch13-code-listing.ipynb
- An Introduction to Statistics with Python http://nbviewer.jupyter.org/github/data-journalism/statsintro_python/tree/master/ipynb/
Sometimes, for reproducibility, we need to set the random seed like this:
```python
import random

l = [1, 2, 3, 4, 5]
r = random.Random(500)  # the seed number is arbitrary

# each call builds a fresh generator from the same seed,
# so both picks are guaranteed to be identical
random.Random(500).choice(l), random.Random(500).choice(l)

# a single seeded generator advances its state between calls:
# the two picks may differ, but the pair is the same on every run
r.choice(l), r.choice(l)
```
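As a side note (not from the original page): re-seeding the module-level generator replays the same sequence, which is another common reproducibility pattern:

```python
import random

random.seed(500)          # seed the module-level generator
a = [random.choice([1, 2, 3, 4, 5]) for _ in range(3)]
random.seed(500)          # re-seed: the same sequence comes back
b = [random.choice([1, 2, 3, 4, 5]) for _ in range(3)]
print a == b              # True
```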
```python
import numpy as np

def log_binning(x, y, bin_count=50):
    # logarithmically spaced bins covering the positive range of x
    max_x = np.log10(max(x))
    max_y = np.log10(max(y))
    max_base = max([max_x, max_y])
    xx = [i for i in x if i > 0]
    min_x = np.log10(np.min(xx))
    bins = np.logspace(min_x, max_base, num=bin_count)
    # Based on: http://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy
    bin_means_y = np.histogram(x, bins, weights=y)[0] / np.histogram(x, bins)[0]
    bin_means_x = np.histogram(x, bins, weights=x)[0] / np.histogram(x, bins)[0]
    return bin_means_x, bin_means_y
```
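A usage sketch for log_binning with hypothetical power-law data (the sample and plotting calls are illustrative only):

```python
import numpy as np
import matplotlib.pyplot as plt

# hypothetical degree distribution: degrees k and their probabilities P(k)
k = np.arange(1, 1000)
pk = k ** -2.5
pk = pk / pk.sum()

bx, by = log_binning(k, pk, bin_count=15)
plt.loglog(k, pk, 'b.', alpha=0.3, label='raw')
plt.loglog(bx, by, 'ro', label='log-binned')
plt.legend()
plt.show()
```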
Usually the hist command is enough; but when we want to take the log of the data first and then call hist, the bins need the following adjustment.
```python
# The most direct way is to compute the log10 of the limits, build
# linearly spaced bins in log space, and convert back by raising 10 to their power:
import pylab as pl
import numpy as np

data = np.random.normal(size=10000)
MIN, MAX = .01, 10.0
pl.figure()
pl.hist(data, bins=10 ** np.linspace(np.log10(MIN), np.log10(MAX), 50))
pl.gca().set_xscale("log")
pl.show()
```
- http://stackoverflow.com/questions/6855710/how-to-have-logarithmic-bins-in-a-python-histogram
```python
import pandas as pd

data = {'xi': range(1, 12),
        'y': [3, 5, 9, 12, 18, 22, 28, 35, 49, 60, 65]}
df = pd.DataFrame(data)
```
GroupBy operations in Pandas are very efficient, and the .get_group() method returns the rows of any single group. For example, to extract the cell-tower sequences of 140,000 users from 70 million call records, first GroupBy the user; each user's tower sequence can then be fetched directly.
```python
In [16]: df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]})

In [17]: df3.groupby(['X']).get_group('A')
Out[17]:
   X  Y
0  A  1
2  A  3

In [18]: df3.groupby(['X']).get_group('B')
Out[18]:
   X  Y
1  B  4
3  B  2
```
- Get the cell-tower sequence of each user:

```python
dfgroupbyobj = call_data_selected.groupby('calling_nbr')
node_list = dfgroupbyobj.get_group(i)['calling_cell'].tolist()
```
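For many users at once, iterating the groupby object directly avoids repeated .get_group() calls; a sketch with toy stand-in data (the column names follow the snippet above):

```python
import pandas as pd

# toy call records, a stand-in for call_data_selected
call_data_selected = pd.DataFrame({
    'calling_nbr': ['u1', 'u2', 'u1', 'u1', 'u2'],
    'calling_cell': ['c3', 'c1', 'c3', 'c7', 'c2'],
})
# one pass yields a (user, sub-DataFrame) pair per user
tower_seq = {user: grp['calling_cell'].tolist()
             for user, grp in call_data_selected.groupby('calling_nbr')}
print tower_seq  # {'u1': ['c3', 'c3', 'c7'], 'u2': ['c1', 'c2']}
```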
sort a dict
```python
# several ways to sort
webt = sorted(web.iteritems(), key=lambda (k, v): (-v, k))  # dict by value desc, then key (Python 2)
sorted(data, key=lambda x: -x)           # a list of numbers, descending
sorted(d.items(), key=lambda x: x[1])    # dict items by value, ascending
sorted(data, reverse=True)               # simplest descending sort
```
how to return index of a sorted list?
```python
>>> s = [2, 3, 1, 4, 5]
>>> sorted(range(len(s)), key=lambda k: s[k])
[2, 0, 1, 3, 4]
```
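If NumPy is available, np.argsort gives the same answer (a side note, not from the original):

```python
import numpy as np

s = [2, 3, 1, 4, 5]
print np.argsort(s)  # [2 0 1 3 4]
```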
http://file.allitebooks.com/20160505/Matplotlib%20for%20Python%20Developers.pdf
Matplotlib for Python Developers
This book is highly recommended.
See: https://networkx.github.io/documentation/latest/reference/
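The drawing snippet below assumes a graph G whose edges carry a 'times' attribute; a hypothetical toy graph for trying it out:

```python
import networkx as nx

# toy graph with a 'times' edge attribute, so the drawing
# snippet below has something to render
G = nx.Graph()
G.add_edge('a', 'b', times=3)
G.add_edge('b', 'c', times=1)
G.add_edge('a', 'c', times=2)
```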
```python
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=20, node_color='b')
nx.draw_networkx_labels(G, pos, font_size=5)  # the keyword is font_size, not fontsize
nx.draw_networkx_edges(G, pos, edge_color='k')
edge_labels = nx.get_edge_attributes(G, 'times')
# the keyword is edge_labels, not labels
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, label_pos=0.5)
plt.show()
```
The result is shown in the figure.
```python
from jieba import cut
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

txt = ["我来到北京清华大学",
       "他来到了网易杭研大厦",
       "小明硕士毕业与中国科学院",
       "我爱北京天安门"]
corpus = [' '.join(cut(i, cut_all=False)) for i in txt]
# CountVectorizer turns the texts into a term-frequency matrix:
# element a[i][j] is the count of word j in document i
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every word
transformer = TfidfTransformer()
# the inner fit_transform builds the term-frequency matrix,
# the outer one converts it to tf-idf
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()  # all words in the bag-of-words model
weight = tfidf.toarray()
for i in word:
    print i
weight
```
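A possible follow-up, building on the word and weight variables above, to show each document's heaviest word (illustrative only):

```python
import numpy as np

# weight has shape (n_documents, n_words); argmax over each row
# picks the index of the heaviest word in that document
for doc_id, row in enumerate(weight):
    top = np.argmax(row)
    print doc_id, word[top], row[top]
```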
https://github.com/whtsky/WeRoBot
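WeRoBot is a WeChat bot framework. A minimal echo bot looks roughly like the sketch below, adapted from the project's README of this era; check the repo for the current API:

```python
import werobot

robot = werobot.WeRoBot(token='tokenhere')  # token must match the WeChat console

@robot.handler
def hello(message):
    return 'Hello World!'

robot.run()
```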
```python
import powerlaw
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

def plotPowerlaw(data, ax, col, xlab):
    fit = powerlaw.Fit(data, xmin=1)
    fit.plot_pdf(color=col, linewidth=2)
    fit = powerlaw.Fit(data)
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax)
    a, x = fit.power_law.alpha, fit.power_law.xmin
    minx, maxx = ax.get_xlim()
    miny, maxy = ax.get_ylim()
    ax.text(minx * 5, maxy / 10,
            r"$\alpha = %d \:\:, x_{min} = %d$" % (a, x), fontsize=20)
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$P(k)$', fontsize=20)

def plotDegreeDistribution(G):
    # G is a networkx graph; count how many nodes have each degree
    degs = defaultdict(int)
    for i in G.degree().values():
        degs[i] += 1
    items = sorted(degs.items())
    x, y = np.array(items).T
    y = y / float(np.sum(y))  # normalize counts into probabilities
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$k$', fontsize=20)
    plt.ylabel('$P(k)$', fontsize=20)
```
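A usage sketch on a synthetic scale-free graph (the Barabási–Albert graph is illustrative); it also builds the rank-ordered degree_list that the regression snippet below expects:

```python
import networkx as nx
import matplotlib.pyplot as plt

G = nx.barabasi_albert_graph(1000, 3)  # synthetic scale-free graph
plotDegreeDistribution(G)
plt.show()

# a rank-ordered degree list, as the OLS fit below expects
degree_list = sorted(G.degree().values(), reverse=True)
```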
```python
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

# fit log(degree) against log(rank): the slope is the scaling exponent
x = np.log(range(1, len(degree_list) + 1))
y = np.log([d + 1 for d in degree_list])
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
r2 = res.rsquared

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(range(len(degree_list)), degree_list, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), "-", label='Fit')
plt.yscale('log'); plt.xscale('log')
plt.xlabel(r'$Users$', fontsize=20)
plt.ylabel(r'$Replies$', fontsize=20)
plt.text(max(range(len(degree_list))) / 300, max(degree_list) / 20,
         r'$\beta$ = ' + str(round(beta, 2)) + '\n' +
         r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
print beta, r2
```
```python
# Write intermediate results to a log file in real time.
# Open the file once, outside the loop, so that mode 'w+'
# does not truncate it on every iteration.
log_file = open('log_file.log', 'w+')
for k, i in enumerate(user_list):
    node_entropy = entropy(node_list)
    entropy_list.append(node_entropy)
    if k % 100 == 0:
        log_file.write(str(k))
        log_file.write('\t')
        log_file.write(str(float(k) / nums))
        log_file.write('\t')
        log_file.write(str(node_entropy))
        log_file.write('\n')
        log_file.flush()
```

```python
import sys

def flushPrint(s):
    # rewrite the same screen line to show progress
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
```
- Print real-time results and progress to the screen; a usage sketch follows.
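A usage sketch, assuming the flushPrint defined above (the loop and timing are illustrative):

```python
import time

for k in range(1000):
    if k % 100 == 0:
        flushPrint('%s / 1000 done' % k)  # keeps rewriting one screen line
    time.sleep(0.001)
```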