Lesson x: Extra Features - kode2go/python-pandas GitHub Wiki
Remove Outliers
from scipy import stats
import numpy as np
# --- Approach 1: drop values whose |z-score| exceeds a threshold -----------
mylist = [100, 110, 125, 95, 115, 5, 10, 100, 51, 105, 10]
print(mylist)

# Signed z-scores: distance of each value from the mean, in standard deviations.
z = stats.zscore(mylist)
print(z)

# Only the magnitude matters when deciding whether a value is an outlier.
z = list(np.abs(stats.zscore(mylist)))
thresh = 2

# Find the outlier positions first and rebuild the list afterwards.
# Removing items from a list while iterating over it shifts the remaining
# elements, so indices would no longer line up with the z-scores.
outlier_indices = [index for index, score in enumerate(z) if score > thresh]
for index in outlier_indices:
    print(index)
mylist = [
    value for index, value in enumerate(mylist)
    if index not in outlier_indices
]
print(mylist)

# --- Approach 2: boolean mask built directly from the z-scores -------------
mylist2 = [100, 110, 125, 95, 115, 5, 10, 100, 51, 105, 10]
# https://www.kite.com/python/answers/how-to-remove-outliers-from-a-pandas-dataframe-in-python
z_scores = np.abs(stats.zscore(mylist2))
# True where the value is within 1.5 standard deviations of the mean.
filt_ent = z_scores < 1.5
print(filt_ent)
Ref: https://pandas.pydata.org/
DataFrame to HTML Table
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 14 01:25:55 2021
"""
import pandas as pd
# Raw readings: one list per column, all of equal length.
data = {
    'DATE': [
        '2021-10-13', '2021-10-14', '2021-10-15',
        '2021-10-16', '2021-10-17', '2021-10-18',
    ],
    'MIN_TEMP': [68, 65, 60, 51, 30, 70],
    'MAX_TEMP': [71, 73, 63, 55, 32, 31],
}

# Build the frame; DATE is still a column of plain strings here.
df_temps = pd.DataFrame(data)
print(df_temps)

# Parse the DATE strings into datetime values.
df_temps = df_temps.assign(DATE=pd.to_datetime(df_temps['DATE']))
print(df_temps)

# Promote DATE from a regular column to the index.
df_temps = df_temps.set_index('DATE', drop=True)
print(df_temps)
def min_temp(row):
    """Pick a background colour for a row from its ``MIN_TEMP`` value.

    Written for ``Styler.apply(min_temp, subset=[...], axis=1)``, which calls
    the function once per row and expects one CSS string per cell back.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['MIN_TEMP']`` and ``len(row)``.

    Returns
    -------
    list of str
        ``len(row)`` copies of the chosen CSS declaration:
        light blue for 32..68 inclusive, red above 68, blue below 32,
        and an empty string when no comparison holds.
    """
    lightblue = 'background-color: lightblue;'
    red = 'background-color: red;'
    darkblue = 'background-color: blue;'
    default = ''

    temp = row['MIN_TEMP']
    if 32 <= temp <= 68:
        style = lightblue
    elif temp > 68:
        style = red
    elif temp < 32:
        style = darkblue
    else:
        # Reached only when every comparison is false, e.g. for NaN.
        style = default

    # must return one string per cell in this row
    return [style] * len(row)
def max_temp(row):
    """Pick a background colour for a row from its ``MAX_TEMP`` value.

    Written for ``Styler.apply(max_temp, subset=[...], axis=1)``, which calls
    the function once per row and expects one CSS string per cell back.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['MAX_TEMP']`` and ``len(row)``.

    Returns
    -------
    list of str
        ``len(row)`` copies of the chosen CSS declaration:
        light blue for 32..68 inclusive, red above 68, blue below 32,
        and an empty string when no comparison holds.
    """
    lightblue = 'background-color: lightblue;'
    red = 'background-color: red;'
    darkblue = 'background-color: blue;'
    default = ''

    temp = row['MAX_TEMP']
    if 32 <= temp <= 68:
        style = lightblue
    elif temp > 68:
        style = red
    elif temp < 32:
        style = darkblue
    else:
        # Reached only when every comparison is false, e.g. for NaN.
        style = default

    # must return one string per cell in this row
    return [style] * len(row)
# Colour each temperature column with its own rule, then draw a grey border
# around the table itself, its body cells and its header cells.
style1 = (
    df_temps.style
    .apply(min_temp, subset=['MIN_TEMP'], axis=1)
    .apply(max_temp, subset=['MAX_TEMP'], axis=1)
    .set_table_styles(
        [
            {"selector": "", "props": [("border", "1px solid grey")]},
            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
            {"selector": "th", "props": [("border", "1px solid grey")]},
        ]
    )
)

# Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.to_html() returns the same HTML string.
df_html = style1.to_html()

# The context manager closes the file even if the write fails.
with open("index.html", "w") as text_file:
    text_file.write(df_html)
Reorder Columns:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 20 16:30:54 2021
@author: BBarsch
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# Load the data, skipping the first two lines of the file.
dataFrame = pd.read_csv("dataext.csv", skiprows=2)
# # Index(['customerid', 'age', 'salary', 'balance', 'marital', 'jobedu',
# #        'targeted', 'default', 'housing', 'loan', 'contact', 'day', 'month',
# #        'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'response'],
# #       dtype='object')
dataFrame.drop('customerid', axis=1, inplace=True)

# Split the comma-separated 'jobedu' text: the first piece becomes 'job',
# the second becomes 'education'; the combined column is then dropped.
dataFrame['job'] = dataFrame["jobedu"].apply(lambda x: x.split(",")[0])
dataFrame['education'] = dataFrame["jobedu"].apply(lambda x: x.split(",")[1])
dataFrame.drop('jobedu', axis=1, inplace=True)

# Reorder columns: pop() removes a column and returns it, insert() puts it
# back at the requested position. 'salary' goes first, 'job' second.
firstColumn = dataFrame.pop('salary')
dataFrame.insert(0, 'salary', firstColumn)
jobColumn = dataFrame.pop('job')
dataFrame.insert(1, 'job', jobColumn)

# dataFrame.job.value_counts(normalize=True).plot.barh()
# plt.show()
# print(dataFrame.salary.describe())
# https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
# https://www.kite.com/python/answers/how-to-remove-outliers-from-a-pandas-dataframe-in-python
print(dataFrame.groupby('response')['salary'].mean())

# Absolute z-score of every salary value.
z = list(np.abs(stats.zscore(dataFrame['salary'])))

# Quartiles of the salary column alone.
Q1 = dataFrame['salary'].quantile(0.25)
Q3 = dataFrame['salary'].quantile(0.75)

# Quartiles of every numeric column. numeric_only=True is required because
# the frame still holds text columns: pandas >= 2.0 defaults to
# numeric_only=False and raises TypeError on them.
Q11 = dataFrame.quantile(0.25, numeric_only=True)
Q33 = dataFrame.quantile(0.75, numeric_only=True)
IQR = Q33 - Q11
print(IQR)
# dataFrame = dataFrame[(z<1).all(axis=1)]