Python.pandas - k821209/pipelines GitHub Wiki
Pandas
- http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
- http://pandas.pydata.org/pandas-docs/dev/cookbook.html
- https://gist.github.com/why-not/4582705
- https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
pandas plot
# pandas x ticks label rotation
ax = eedf[3].value_counts().head(20).plot('bar')
ax.set_xticklabels(ax.get_xticklabels(),rotation=45,ha='right')
plt.show()
Pandas corr to tree
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy as hc
import numpy as np
# pandas .corr() 결과임.
df_diff_acc_same_treat.sort_index(inplace=True)
df_diff_acc_same_treat = df_diff_acc_same_treat[df_diff_acc_same_treat.index]
# 트리그리기
corr = 1 - df_diff_acc_same_treat
corr_condensed = hc.distance.squareform(corr) # convert to condensed
z = hc.linkage(corr_condensed, method='average')
dendrogram = hc.dendrogram(z, labels=corr.columns)
plt.show()
Pandas df to pdf
def to_html_pretty(df, filename='/tmp/out.html', title=''):
'''
Write an entire dataframe to an HTML file
with nice formatting.
Thanks to @stackoverflowuser2010 for the
pretty printer see https://stackoverflow.com/a/47723330/362951
'''
ht = ''
if title != '':
ht += '<h2> %s </h2>\n' % title
ht += df.to_html(classes='wide', escape=False)
with open(filename, 'w') as f:
f.write(HTML_TEMPLATE1 + ht + HTML_TEMPLATE2)
HTML_TEMPLATE1 = '''
<html>
<head>
<style>
h2 {
text-align: center;
font-family: Helvetica, Arial, sans-serif;
}
table {
margin-left: auto;
margin-right: auto;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 5px;
text-align: center;
font-family: Helvetica, Arial, sans-serif;
font-size: 90%;
}
table tbody tr:hover {
background-color: #dddddd;
}
.wide {
width: 90%;
}
</style>
</head>
<body>
'''
HTML_TEMPLATE2 = '''
</body>
</html>
'''
def df2pdf(df,out_pdf,title):
intermediate_html = '/tmp/intermediate.html'
to_html_pretty(df,intermediate_html,title)
import weasyprint
weasyprint.HTML(intermediate_html).write_pdf(out_pdf)
SPSS to pandas
import pandas as pd
import numpy as np
import savReaderWriter as spss
# metadata parsing
## column metadata 로딩
with spss.SavHeaderReader('./HN0811_ALLDXA(65세남성).sav',ioUtf8=True) as header:
metadata = header.all()
report = str(header)
dicC2M = dict([[x, y.decode('euc-kr')] for x, y in metadata.varLabels.items()])
# data parsing
with spss.SavReader('./HN0811_ALLDXA(65세남성).sav') as reader:
records = reader.all()
header = reader.header
df = pd.DataFrame(records,columns=header)
df.head()
# na많은 column 제외하기
nacount = np.array([len(df[x].isnull().nonzero()[0]) for x in df.columns])
df_lownas_dropna = df[df.columns[(nacount < 100 )]].dropna()
df_lownas_dropna.head()
merge df
reduce(lambda a,b : pd.merge(a,b,on="target_id",how='outer'),df_list_v55)
determine type
isinstance(df_sub_ix.loc['STRG.12455.1|m.45759'],pd.Series)
True
read_csv
df_gff = pd.read_csv(file_gff,sep='\t',header=None,comment='#')
# comment line을 무시!
Index
df.set_index(['fileName','phrase'],inplace=True)
df.sortlevel(inplace=True)
index column value ret
df_sam_index.index.values
df_sam_index.index.get_level_values('READID') # 특히 멀티 인덱싱일때 필요
multi column selection
http://stackoverflow.com/questions/16353729/pandas-how-to-use-apply-function-to-multiple-columns
def get_genename(df):
x = df['platename']
y = df['Pos.Nr']
genename = df_mayra_plate_index_platename.loc[(x,y)]['ChlreGene_ID55']
return genename
df_mayra_pcrlist['genename'] = df_mayra_pcrlist.apply(get_genename,axis=1)
df_mayra_plate_index_platename.loc[('P1kCre1+2',1)]
ChlreGene_ID55 Cre11.g468300
samegenemodel 1
Name: (P1kCre1+2, 1), dtype: object
visualization
http://pandas.pydata.org/pandas-docs/stable/visualization.html
import pandas as pd
df = pd.read_csv('S_total.CG.q.dict.exonmethylv.txt',sep='\t') # header=None 을 넣으면 헤더를 숫자로 알아서 넣어줌.
print(df)
result
genename TotalM TotalU Methylation Level
Vradi02g00170.1 0 831 0.000000 2.747080
Vradi0223s00280.1 0 289 0.000000 0.000000
Vradi06g13740.1 0 679 0.000000 0.000000
Vradi05g10140.1 0 580 0.000000 5.414350
merge
>>> A >>> B
lkey value rkey value
0 foo 1 0 foo 5
1 bar 2 1 bar 6
2 baz 3 2 qux 7
3 foo 4 3 bar 8
>>> merge(A, B, left_on='lkey', right_on='rkey', how='outer')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 4 foo 5
2 bar 2 bar 6
3 bar 2 bar 8
4 baz 3 NaN NaN
5 NaN NaN qux 7
merge 이후 nan 있는 행/열 지우기
In [45]: df
Out[45]:
one two three
a NaN 0.036220 0.184735
c NaN -0.271020 1.288393
e NaN 0.000000 0.000000
f NaN 0.000000 0.000000
h NaN -0.059268 0.249698
In [46]: df.dropna(axis=0)
Out[46]:
Empty DataFrame
Columns: [one, two, three]
Index: []
In [47]: df.dropna(axis=1)
Out[47]:
two three
a 0.036220 0.184735
c -0.271020 1.288393
e 0.000000 0.000000
f 0.000000 0.000000
h -0.059268 0.249698
In [48]: df['one'].dropna()
Out[48]: Series([], Name: one, dtype: float64)
add column
trait_category라는 column을 새로 만들고 이 것은 QTL_name이라는 기존의 Column의 값을 dicName2Cat이라는 딕셔너리에 키값으로 넣은 뒤 나오는 value값으로 채운다.
df_soybean_QTL["trait_category"] = df_soybean_QTL["QTL_name"].apply(dicName2Cat.get)
word frequency
from collections import Counter
counter = Counter(df_PGDB_sub["Scientific name"])
counter
Counter({'Solanum lycopersicum': 21607, 'Arachis hypogaea': 14392, 'Manihot esculenta': 7946, 'Trifolium pratense': 7493, 'Glycine max': 7020, 'Fragaria x ananassa': 5818, 'Capsicum annuum': 5753, 'Eucalyptus camaldulensis': 5684, 'Raphanus sativus': 5068, 'Brassica napus': 2156, 'Trifolium repens': 1993, 'Brassica rapa': 1893, 'Brassica oleracea': 1556, 'Medicago truncatula': 1435, 'Lotus japonicus': 1155, 'Nicotiana tabacum': 912, 'Malus x domestica': 902, 'Jatropha curcas': 855, 'Camellia sinensis': 850, 'Solanum tuberosum': 550, 'Dianthus caryophyllus': 496, 'Vitis vinifera': 424, 'Lactuca sativa': 420, 'Prunus persica': 401, 'Allium cepa': 271, 'Ricinus communis': 223, 'Brachypodium distachyon': 214, 'Theobroma cacao': 199, 'Cajanus cajan': 142, 'Fragaria vesca': 105, 'Ipomoea nil': 75, 'Populus trichocarpa': 67, 'Carica papaya': 53, 'Phoenix dactylifera': 42, 'Citrus unshiu': 41, 'Eucalyptus globulus': 38, 'Allium fistulosum': 33, 'Chrysanthemum morifolium': 26, 'Spinacia oleracea': 23, 'Cucumis sativus': 2})
word count
c = df_PGDB_sub["Scientific name"].value_counts()
c
#c.plot(kind='bar') # plotting
Solanum lycopersicum 21607 Arachis hypogaea 14392 Manihot esculenta 7946 Trifolium pratense 7493 Glycine max 7020 Fragaria x ananassa 5818 Capsicum annuum 5753 Eucalyptus camaldulensis 5684 Raphanus sativus 5068 Brassica napus 2156 Trifolium repens 1993 Brassica rapa 1893 Brassica oleracea 1556 Medicago truncatula 1435 Lotus japonicus 1155 Nicotiana tabacum 912 Malus x domestica 902 Jatropha curcas 855 Camellia sinensis 850 Solanum tuberosum 550 ... dtype: int64
**dataframe 만들기 **
#array to df
pd.DataFrame(data[1:,1:], index=data[1:,0], columns=data[0,1:])
각 row를 iteration
- df.iterrows() : generator를 만들기 때문에 for를 사용가능
원하는 col을 index화
- df.set_index("원하는col이름")
index select
- df.loc["indexname"]
select multiple column
column reordering
- df = df.reindex_axis(sorted(df.columns), axis=1)
- df = df.reindex_axis([l,i,s,t], axis=1)
index를 generate
- df.index
for i in df.index:
print df.ix[i].values # values : 반환되는 series를 numpy array로 바꿔줌
df 행열 전환
- df.T
data frame save
df.to_pickle(file_name) # where to save it, usually as a .pkl
df = pd.read_pickle(file_name)
df.to_csv('test.gff',index=None,header=False,sep='\t')
df[['genename',1,2,3,4,5,6,7,'info']].to_csv(file_gtf+'.converted.gff',index=None,header=False,sep='\t',quoting=csv.QUOTE_NONE)
df sort
result = df.sort(['A', 'B'], ascending=[1, 0])
triangular matrix에서 값만 가져오기
cor_dist = df_cor.values[np.tril_indices_from(df_cor,-1)]
column 가져오기
my_dataframe.columns.values
apply
multiple index
file_mayra_index = 'mayra.plate.index.txt'
df_mayra_index = pd.read_csv(file_mayra_index,sep='\t',index_col=['platename','Pos.Nr'])
df_mayra_index.loc['P1kCre1+2','1']