pandas Notes - SeanWu1977/Machine-Learning GitHub Wiki

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Basis concept
DataFrame  1 : n Series
Series 可以是一排 column or row


# get columns :
===> Series
df ["column1"] 
===> Dataframe
df[ ["column1", "column2" ]

# get rows :
===> Series
df.iloc[n]
===> Dataframe
df.head(n)
df.tail(n)
df.iloc[n:m]     #n:start index(include), m: end index(exclude)
df.iloc[n:m:s]   #n:start index(include), m: end index(exclude), s: steps
df.iloc[n:m, x:y] 

df.loc[]是依row的label來抓,如果沒有特別去設定,label就是0,1,2,...

# change cell value
df.loc[0,"R"] = 3
df.at[1,"L"] = 2

# step to change one column value
step 1. 自定義轉換函數
    def foo(x):
        return do_something_for_x
step 2. 取出一排column
    dfc = df["column"]
step 3. apply(fun) #會將每一個值個別丟入fun內取得結果
    dfc.apply(foo)
step 4. 設定新column給step 3的產出
    df["new_column"] = dfc.apply(foo)

# filter 用法
df[跟資料比數一樣多的Ture/False 的 List/Series] # 注意是list/series
df[ [True, False, True] ] # df有三筆狀況下

# eval(字串): 會把字串用python程式執行,即把字串的"/'移除,並用python執行該行
eval("x = 4 + 3") ==> x = 4 + 3
eval("['A','B','C']") ==> ['A','B','C']

# **kwds
DataFrame.apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds)

def test(s, **kwargs):
    l = eval(s)
    tags = kwargs["tag"]
    if type(tags) == str:
        tags = [tags]
    for t in tags:
        if t in l:
            return True
    return False
    
df["tags"].apply(test, tag=['children','demo'])


# slicen
[n:m] = 從index n 取到 index m-1
[n:-m] = 從index n 取到 index (len(x)-1)-1
[n:m:s] = [start:end:step]  從index n 取到 index m-1 每次加s
[::s] = [start:end:step]  n,m 空白表示0:m+1 ==> 取全部
[::-s] = [start:end:step]  n,m 空白表示m:-1 每次少s

# ignore warning message : A value is trying to be set on a copy of a slice from a DataFrame
pd.options.mode.chained_assignment = None

# col1的資料
city = ["TW", "BJ", "TK", "SF", "NY", "KH"]
# col2的資料
amount = [30, 12, 6, 31, 44, 15]
# combine with column name
dict = {"city": city ,
        "amount": amount 
       }
# 將dictionary 轉 dataframe
df = pd.DataFrame(dict)

# 讀取csv並轉成dataframe
df = pd.read_csv("file_path",header=(None|colume|...) , sep=",")
from io import StringIO
csv_data = '''
A,B,C,D
1,2,3,4
,4,,5
3,6,33,77
123,66,33,'''
# python 2.7 , csv_data = unicode(csv_data)
df=pd.read_csv(StringIO(csv_data))
#回傳dataframe, 結果為每個欄位是否為null(true/false)
df.isnull

# groupby 和 以及 不同值
c = df.groupby('KUNNR').agg({'price': np.sum, 'VBELN': pd.Series.nunique})


# 讀取xls,結果會是sheet
xls = pd.ExcelFile("file_path")
# 讀取sheet1並轉成dataframe
sheet1 = xls.parse("Sheet1")

# 檢查欄位是否存在於dataframe
if 'col_name' in df.columns:

# dataframe 轉 array (pandas to numpy,沒有rol_index & col_index, 變成 numpy 的 array)
df.values
# 讀row name,沒有的話就是顯示0,1,2,3,...
df.index
# 讀column name,沒有的話就是顯示0,1,2,3,...
df.columns
# 設定column name
df.columns = ['ColA','ColB','ColC']
# 讀前n筆(預設是 n = 5 )
df.head(n)
# 讀最後n筆(預設是 n = 5 )
df.tail(n)
# 讀取col0 大於 100 的記錄
df[df.col0 > 100]
df[(df.col0 > 100) & (df.col0 < 200)]
To select rows whose column value equals a scalar, some_value, use ==:
df.loc[df['column_name'] == some_value]

To select rows whose column value is in an iterable, some_values, use isin:
df.loc[df['column_name'].isin(some_values)]

Combine multiple conditions with &:
df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)]



# 去重覆
df_user.drop_duplicates(inplace=True)
df_user.drop_duplicates(subset = [column list] ,inplace=True)

# .loc .iloc .ix
# n:m ==> n ~ m-1
# [x,y,z,...] ==> x,y,z,...
df.loc[row_index,col_name]  
df.iloc[row_index,col_index]
df.ix[row_index,col_index|col_name]
# 判斷df的欄位KKBER == 100 ,並只抓出第一/三欄
df[df.KKBER == 100].iloc[:,[0,2]]

# 重置索引 (iloc,loc.ix取出後,索引值會跟原本一樣,不會自動由 0 排列)
df.reset_index(drop=True)

# loop dataframe
for idx,v in df.iterrows():
  v[0] # 取第一欄位值  or v['col_name']

# 刪除row by index
df = df.drop(idx)

# 字串取代
df = df.replace("abc","ABC")

#字串前輟補0
df['Col'].apply(lambda x: x.zfill(15))
df.Col.apply(lambda x: x.zfill(15))

#字串前輟去0
df.col.str.lstrip('0')
df['col'].str.lstrip('0')

# df Join
# how : {'left', 'right', 'outer', 'inner'}, default: 'left'
# df2 left join df1
df1.join(df2.set_index('KEY'), on = 'KEY', how = 'left')

# df2 left join df1 , 可用於key/column名稱不同
df = pd.merge(df1, df2 , how = 'left', left_on=['PO'], right_on=['PO'])
df1.merge(df2, left_on = 'KUNNR', right_index=True ,how='left')

# dataframe 轉一維 
df.values.flatten()

# reverse column
df[df.columns[::-1]]

# select column
newdf = df[df.columns[2:4]]
df1 = df['b', 'c'](/SeanWu1977/Machine-Learning/wiki/'b',-'c')

# column rename
df.rename(columns={'OBJECTID':'PR'}, inplace=True)
df.columns =['OBJECTID','PR']

# sort df
df.sort_values(['a', 'b'], ascending=[True, False])

# read excel
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile 
df = pd.read_excel('File.xlsx', sheetname='Sheet1')

df['col_name'] # list
for i in df.index:
    print(df['col_name'][i])


# dataframe to excel(with template)
from openpyxl.utils.dataframe import dataframe_to_rows
rows = dataframe_to_rows(df)  # convert a dataframe into a worksheet
wb= openpyxl.load_workbook('H:/template.xlsx')
ws = wb.get_sheet_by_name('spam')
ws.title = 'df data'

for r_idx, row in enumerate(rows, 1): # loop each row ,enumerate(list, start-index)
    for c_idx, value in enumerate(row, 1): # loop each column in one row  ,enumerate(list, start-index)
         ws.cell(row=r_idx, column=c_idx, value=value)

wb.save('H:/df_out.xlsx')


# datafram to list
df['column_name'].tolist()  #one column
df.values.tolist() #list of lists