Python.pandas - k821209/pipelines GitHub Wiki

Pandas

pandas plot

# pandas x ticks label rotation
ax = eedf[3].value_counts().head(20).plot('bar')
ax.set_xticklabels(ax.get_xticklabels(),rotation=45,ha='right')
plt.show()

Pandas corr to tree

import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy as hc
import numpy as np
# pandas .corr() 결과임. 
df_diff_acc_same_treat.sort_index(inplace=True)
df_diff_acc_same_treat = df_diff_acc_same_treat[df_diff_acc_same_treat.index]

# 트리그리기 
corr = 1 - df_diff_acc_same_treat 
corr_condensed = hc.distance.squareform(corr) # convert to condensed
z = hc.linkage(corr_condensed, method='average')
dendrogram = hc.dendrogram(z, labels=corr.columns)
plt.show()

Pandas df to pdf

def to_html_pretty(df, filename='/tmp/out.html', title=''):
    '''
    Write an entire dataframe to an HTML file
    with nice formatting.
    Thanks to @stackoverflowuser2010 for the
    pretty printer see https://stackoverflow.com/a/47723330/362951
    '''
    ht = ''
    if title != '':
        ht += '<h2> %s </h2>\n' % title
    ht += df.to_html(classes='wide', escape=False)

    with open(filename, 'w') as f:
         f.write(HTML_TEMPLATE1 + ht + HTML_TEMPLATE2)

HTML_TEMPLATE1 = '''
<html>
<head>
<style>
  h2 {
    text-align: center;
    font-family: Helvetica, Arial, sans-serif;
  }
  table { 
    margin-left: auto;
    margin-right: auto;
  }
  table, th, td {
    border: 1px solid black;
    border-collapse: collapse;
  }
  th, td {
    padding: 5px;
    text-align: center;
    font-family: Helvetica, Arial, sans-serif;
    font-size: 90%;
  }
  table tbody tr:hover {
    background-color: #dddddd;
  }
  .wide {
    width: 90%; 
  }
</style>
</head>
<body>
'''

HTML_TEMPLATE2 = '''
</body>
</html>
'''

def df2pdf(df,out_pdf,title):
    intermediate_html = '/tmp/intermediate.html'
    to_html_pretty(df,intermediate_html,title)
    import weasyprint
    weasyprint.HTML(intermediate_html).write_pdf(out_pdf)

SPSS to pandas

import pandas as pd
import numpy as np
import savReaderWriter as spss
# metadata parsing
## column metadata 로딩 
with spss.SavHeaderReader('./HN0811_ALLDXA(65세남성).sav',ioUtf8=True) as header:
    metadata = header.all()
    report = str(header)
dicC2M = dict([[x, y.decode('euc-kr')] for x, y in metadata.varLabels.items()])

# data parsing
with spss.SavReader('./HN0811_ALLDXA(65세남성).sav') as reader:
    records = reader.all()
    header = reader.header
df = pd.DataFrame(records,columns=header)
df.head()

# na많은 column 제외하기 
nacount = np.array([len(df[x].isnull().nonzero()[0]) for x in df.columns])
df_lownas_dropna = df[df.columns[(nacount < 100 )]].dropna()
df_lownas_dropna.head()

merge df

reduce(lambda a,b : pd.merge(a,b,on="target_id",how='outer'),df_list_v55)

determine type

isinstance(df_sub_ix.loc['STRG.12455.1|m.45759'],pd.Series)
True

read_csv

df_gff = pd.read_csv(file_gff,sep='\t',header=None,comment='#')
# comment line을 무시!

Index

df.set_index(['fileName','phrase'],inplace=True)
df.sortlevel(inplace=True)

index column value ret

df_sam_index.index.values
df_sam_index.index.get_level_values('READID') # 특히 멀티 인덱싱일때 필요

multi column selection

http://stackoverflow.com/questions/16353729/pandas-how-to-use-apply-function-to-multiple-columns

def get_genename(df):
    x = df['platename']
    y = df['Pos.Nr']
    genename = df_mayra_plate_index_platename.loc[(x,y)]['ChlreGene_ID55']
    return genename
df_mayra_pcrlist['genename'] = df_mayra_pcrlist.apply(get_genename,axis=1)
df_mayra_plate_index_platename.loc[('P1kCre1+2',1)]
ChlreGene_ID55    Cre11.g468300
samegenemodel                 1
Name: (P1kCre1+2, 1), dtype: object

visualization

http://pandas.pydata.org/pandas-docs/stable/visualization.html

import pandas as pd
df = pd.read_csv('S_total.CG.q.dict.exonmethylv.txt',sep='\t') # header=None 을 넣으면 헤더를 숫자로 알아서 넣어줌.
print(df)

result

                   genename  TotalM    TotalU  Methylation Level
Vradi02g00170.1           0     831  0.000000           2.747080
Vradi0223s00280.1         0     289  0.000000           0.000000
Vradi06g13740.1           0     679  0.000000           0.000000
Vradi05g10140.1           0     580  0.000000           5.414350

merge

>>> A              >>> B
    lkey value         rkey value
0   foo  1         0   foo  5
1   bar  2         1   bar  6
2   baz  3         2   qux  7
3   foo  4         3   bar  8
>>> merge(A, B, left_on='lkey', right_on='rkey', how='outer')
   lkey  value_x  rkey  value_y
0  foo   1        foo   5
1  foo   4        foo   5
2  bar   2        bar   6
3  bar   2        bar   8
4  baz   3        NaN   NaN
5  NaN   NaN      qux   7

merge 이후 nan 있는 행/열 지우기

In [45]: df
Out[45]: 
   one       two     three
a  NaN  0.036220  0.184735
c  NaN -0.271020  1.288393
e  NaN  0.000000  0.000000
f  NaN  0.000000  0.000000
h  NaN -0.059268  0.249698

In [46]: df.dropna(axis=0)
Out[46]: 
Empty DataFrame
Columns: [one, two, three]
Index: []

In [47]: df.dropna(axis=1)
Out[47]: 
        two     three
a  0.036220  0.184735
c -0.271020  1.288393
e  0.000000  0.000000
f  0.000000  0.000000
h -0.059268  0.249698

In [48]: df['one'].dropna()
Out[48]: Series([], Name: one, dtype: float64)

add column

trait_category라는 column을 새로 만들고 이 것은 QTL_name이라는 기존의 Column의 값을 dicName2Cat이라는 딕셔너리에 키값으로 넣은 뒤 나오는 value값으로 채운다.

df_soybean_QTL["trait_category"] = df_soybean_QTL["QTL_name"].apply(dicName2Cat.get)

word frequency

from collections import Counter
counter = Counter(df_PGDB_sub["Scientific name"])
counter

Counter({'Solanum lycopersicum': 21607, 'Arachis hypogaea': 14392, 'Manihot esculenta': 7946, 'Trifolium pratense': 7493, 'Glycine max': 7020, 'Fragaria x ananassa': 5818, 'Capsicum annuum': 5753, 'Eucalyptus camaldulensis': 5684, 'Raphanus sativus': 5068, 'Brassica napus': 2156, 'Trifolium repens': 1993, 'Brassica rapa': 1893, 'Brassica oleracea': 1556, 'Medicago truncatula': 1435, 'Lotus japonicus': 1155, 'Nicotiana tabacum': 912, 'Malus x domestica': 902, 'Jatropha curcas': 855, 'Camellia sinensis': 850, 'Solanum tuberosum': 550, 'Dianthus caryophyllus': 496, 'Vitis vinifera': 424, 'Lactuca sativa': 420, 'Prunus persica': 401, 'Allium cepa': 271, 'Ricinus communis': 223, 'Brachypodium distachyon': 214, 'Theobroma cacao': 199, 'Cajanus cajan': 142, 'Fragaria vesca': 105, 'Ipomoea nil': 75, 'Populus trichocarpa': 67, 'Carica papaya': 53, 'Phoenix dactylifera': 42, 'Citrus unshiu': 41, 'Eucalyptus globulus': 38, 'Allium fistulosum': 33, 'Chrysanthemum morifolium': 26, 'Spinacia oleracea': 23, 'Cucumis sativus': 2})

word count

c = df_PGDB_sub["Scientific name"].value_counts()
c
#c.plot(kind='bar') # plotting

Solanum lycopersicum 21607 Arachis hypogaea 14392 Manihot esculenta 7946 Trifolium pratense 7493 Glycine max 7020 Fragaria x ananassa 5818 Capsicum annuum 5753 Eucalyptus camaldulensis 5684 Raphanus sativus 5068 Brassica napus 2156 Trifolium repens 1993 Brassica rapa 1893 Brassica oleracea 1556 Medicago truncatula 1435 Lotus japonicus 1155 Nicotiana tabacum 912 Malus x domestica 902 Jatropha curcas 855 Camellia sinensis 850 Solanum tuberosum 550 ... dtype: int64

**dataframe 만들기 **

#array to df
pd.DataFrame(data[1:,1:], index=data[1:,0], columns=data[0,1:])

각 row를 iteration

  • df.iterrows() : generator를 만들기 때문에 for를 사용가능

원하는 col을 index화

  • df.set_index("원하는col이름")

index select

  • df.loc["indexname"]

select multiple column

column reordering

  • df = df.reindex_axis(sorted(df.columns), axis=1)
  • df = df.reindex_axis([l,i,s,t], axis=1)

index를 generate

  • df.index
for i in df.index:
    print df.ix[i].values  # values : 반환되는 series를 numpy array로 바꿔줌

df 행열 전환

  • df.T

data frame save

df.to_pickle(file_name)  # where to save it, usually as a .pkl
df = pd.read_pickle(file_name)
df.to_csv('test.gff',index=None,header=False,sep='\t')
df[['genename',1,2,3,4,5,6,7,'info']].to_csv(file_gtf+'.converted.gff',index=None,header=False,sep='\t',quoting=csv.QUOTE_NONE)

df sort

result = df.sort(['A', 'B'], ascending=[1, 0])

triangular matrix에서 값만 가져오기

cor_dist = df_cor.values[np.tril_indices_from(df_cor,-1)]

column 가져오기

my_dataframe.columns.values

apply

http://stackoverflow.com/questions/13331698/how-to-apply-a-function-to-two-columns-of-pandas-dataframe

multiple index

file_mayra_index = 'mayra.plate.index.txt'
df_mayra_index   = pd.read_csv(file_mayra_index,sep='\t',index_col=['platename','Pos.Nr'])

df_mayra_index.loc['P1kCre1+2','1']
⚠️ **GitHub.com Fallback** ⚠️