Perform Feature Engineering - setiamanlhc/python-snippet-code GitHub Wiki
Drop all records with null values, or fill the null values with a default value.
df_clean = df.dropna()
df_clean2 = df.fillna(value='0')
# Get the average value of the column and fill it into the null cells
# Mean of the 'marks' column (mean() skips NaN entries by default).
avg_score = df['marks'].mean()
# Fill only the nulls in 'marks' with that mean; passing the bare scalar would
# also write it into the null cells of every other column.
df_clean3 = df.fillna(value={'marks': avg_score})
df_clean3.head()
SOURCE_ABC = '.\\demo\\class_ABC.csv'
# Paths of the remaining input files.
SOURCE_DE = '.\\demo\\class_DE.csv'
SOURCE_TEACHERS = '.\\demo\\teachers.csv'

# Read each CSV into its own DataFrame, in the same order as the paths.
df_abc, df_de, df_teachers = map(
    pd.read_csv, (SOURCE_ABC, SOURCE_DE, SOURCE_TEACHERS)
)

# Stack the two class tables on top of each other.
class_frames = [df_abc, df_de]
df_abcde = pd.concat(class_frames)
df_combined = pd.merge(df_abcde, df_teachers, how='left', on = 'class')
# Using function
def remove_teacher(row):
    """Return the second space-separated token of ``row['teacher']``.

    Written for ``DataFrame.apply(..., axis=1)``. The first token of the
    teacher value is dropped and the token after it is returned (presumably
    the first token is a title or prefix -- confirm against the data).

    Args:
        row: Mapping-like row (for example a pandas Series) with a
            ``'teacher'`` key.

    Returns:
        The second token of the teacher string. A non-string value (such as
        the NaN a left merge leaves for unmatched rows) or a string with
        fewer than two tokens is returned unchanged instead of raising.
    """
    name = row['teacher']
    # Missing values arrive as NaN/None, which have no .split().
    if not isinstance(name, str):
        return name
    name_tokens = name.split(' ')
    # No separator means there is nothing to strip; keep the value as-is.
    if len(name_tokens) < 2:
        return name
    return name_tokens[1]
df_combined['teacher_new'] = df_combined.apply(remove_teacher, axis=1)
# Using split function
df[<col name 1>] , df[<col name 2>] = df[<col name to be splitted up>].str.split(<split token>,1).strUsing apply method to update existing column
def convert_inch_to_cm(row):
    """Return a copy of *row* with three columns multiplied by 2.54.

    Written for ``DataFrame.apply(..., axis=1)``; converts the ``'Father'``,
    ``'Mother'`` and ``'Height'`` values from inches to centimetres.

    Args:
        row: Mapping-like row (for example a pandas Series) holding numeric
            ``'Father'``, ``'Mother'`` and ``'Height'`` values and providing
            a ``copy()`` method.

    Returns:
        A new row object with the three values converted. The input row is
        left untouched.
    """
    inch_to_cm = 2.54
    # Work on a copy: pandas documents that a function passed to apply()
    # must not mutate the object it receives.
    converted = row.copy()
    for column in ('Father', 'Mother', 'Height'):
        converted[column] = row[column] * inch_to_cm
    return converted
df = df.apply(convert_inch_to_cm, axis=1)
# Using pandas vectorized functions
df[['Father', 'Mother', 'Height']] = df[['Father', 'Mother', 'Height']] * 2.54
#vectorized function to create new columns.
df[['Father_m', 'Mother_m', 'Height_m']] = df[['Father', 'Mother', 'Height']] / 100
df_combined = df_combined.drop(['teacher'], axis=1)
# drop the 'Alpha2_code' column that we don't need
mapping_country_3codes.drop(['Alpha2_code'], axis=1, inplace=True)
# Drop duplicate
df = df.drop_duplicates(subset = <list of column names>, keep = <'last' | 'first'>)
# Create the mapping in a dictionary
mapping_dict_gender = {
'M': 'M',
'F': 'F',
'female': 'F',
'male': 'M'
}
df['Gender'] = df['Gender'].map(mapping_dict_gender)
# Using dictionary mapping
# Map current column names (keys) to their replacements (values).
d = { 'old1': 'new column1', 'old2': 'new column 2' }
# rename() returns a new DataFrame unless inplace=True, so keep the result;
# otherwise the renamed frame is discarded and df stays unchanged.
df = df.rename(columns=d)
# direct renaming based on position
new_colnames = ['hotel', 'market_segment', 'repeat_guest_no', 'repeat_guest_yes']
df_pivot.columns = new_colnames