Lesson x: Extra Features - kode2go/python-pandas GitHub Wiki

Remove Outliers

from scipy import stats
import numpy as np

# Sample readings; a few values (5, 10) sit far below the rest.
mylist = [100,110,125,95,115,5,10,100,51,105,10]
print(mylist)

# Signed z-scores: distance of each value from the mean, in standard deviations.
z = stats.zscore(mylist)
print(z)

# Only the magnitude matters for outlier detection, so keep absolute values.
z = list(np.abs(stats.zscore(mylist)))
thresh = 2
# print(np.where(z > 2))

# Build a new list instead of calling remove() while iterating: removing
# during iteration shifts the remaining items out of step with `z`, and
# remove() deletes by value, not by position.
kept = []
for index, value in enumerate(mylist):
    if z[index] > thresh:
        # Report the position of each value that is dropped as an outlier.
        print(index)
        continue
    kept.append(value)
mylist = kept

print(mylist)
mylist2 = [100,110,125,95,115,5,10,100,51,105,10]


# https://www.kite.com/python/answers/how-to-remove-outliers-from-a-pandas-dataframe-in-python

# Vectorised alternative: a boolean mask that is True for values to keep.
z_scores = np.abs(stats.zscore(mylist2))
filt_ent = z_scores < 1.5

print(filt_ent)

# Apply the mask so the outliers are actually removed.
mylist2_filtered = [value for value, keep in zip(mylist2, filt_ent) if keep]
print(mylist2_filtered)

Ref: https://pandas.pydata.org/

DataFrame to HTML Table

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 14 01:25:55 2021

"""

import pandas as pd

# Sample readings: one entry per day for each column
data = {
    'DATE': [
        '2021-10-13', '2021-10-14', '2021-10-15',
        '2021-10-16', '2021-10-17', '2021-10-18',
    ],
    'MIN_TEMP': [68, 65, 60, 51, 30, 70],
    'MAX_TEMP': [71, 73, 63, 55, 32, 31],
}

# Build the dataframe; DATE still holds plain strings at this point
df_temps = pd.DataFrame(data)
print(df_temps)

# Convert the DATE strings to a datetime column
df_temps['DATE'] = pd.to_datetime(df_temps['DATE'])
print(df_temps)

# Promote DATE from a regular column to the index
df_temps = df_temps.set_index('DATE', drop=True)
print(df_temps)
# Limits separating the three colour bands; both limits belong to the
# middle (lightblue) band.
_COLD_LIMIT = 32
_HOT_LIMIT = 68


def _temp_style(value):
    """Return the CSS background declaration for one temperature value.

    Values from 32 to 68 inclusive are lightblue, values above 68 are red
    and values below 32 are blue. A value that matches no band (for
    example NaN, for which every comparison is False) gets an empty style.
    """
    if _COLD_LIMIT <= value <= _HOT_LIMIT:
        return 'background-color: lightblue;'
    if value > _HOT_LIMIT:
        return 'background-color: red;'
    if value < _COLD_LIMIT:
        return 'background-color: blue;'
    return ''


def min_temp(row):
    """Style the MIN_TEMP cell of a row.

    Returns a one-element list, so it must be applied with a subset that
    contains only the 'MIN_TEMP' column (one string per cell in the row).
    """
    return [_temp_style(row['MIN_TEMP'])]


def max_temp(row):
    """Style the MAX_TEMP cell of a row.

    Returns a one-element list, so it must be applied with a subset that
    contains only the 'MAX_TEMP' column (one string per cell in the row).
    """
    return [_temp_style(row['MAX_TEMP'])]

# Colour each temperature column with its own row-wise function, then draw
# a thin grey border around the table, the body cells and the header cells.
style1 = (
    df_temps.style
    .apply(min_temp, subset=['MIN_TEMP'], axis=1)
    .apply(max_temp, subset=['MAX_TEMP'], axis=1)
    .set_table_styles(
        [
            {"selector": "", "props": [("border", "1px solid grey")]},
            {"selector": "tbody td", "props": [("border", "1px solid grey")]},
            {"selector": "th", "props": [("border", "1px solid grey")]},
        ]
    )
)

# Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.to_html() is its replacement.
df_html = style1.to_html()

# The context manager closes the file even if the write fails.
with open("index.html", "w", encoding="utf-8") as text_file:
    text_file.write(df_html)

Reorder Columns:

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 20 16:30:54 2021

@author: BBarsch
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats


# NOTE(review): skiprows=2 presumably skips two preamble lines above the
# header row of dataext.csv -- confirm against the file.
dataFrame = pd.read_csv("dataext.csv",skiprows=2)

# # Index(['customerid', 'age', 'salary', 'balance', 'marital', 'jobedu',
# #        'targeted', 'default', 'housing', 'loan', 'contact', 'day', 'month',
# #        'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'response'],
# #       dtype='object')

dataFrame.drop('customerid', axis=1, inplace=True)

# "jobedu" holds comma-separated text: the first piece becomes 'job' and
# the second becomes 'education'; the combined column is then dropped.
dataFrame['job'] = dataFrame["jobedu"].apply(lambda x: x.split(",")[0])
dataFrame['education'] = dataFrame["jobedu"].apply(lambda x: x.split(",")[1])
dataFrame.drop('jobedu', axis=1, inplace=True)

# Reorder columns: pop() removes a column and returns it, insert() puts it
# back at the requested position. 'salary' goes first, 'job' second.
firstColumn = dataFrame.pop('salary')
dataFrame.insert(0,'salary',firstColumn)

jobColumn = dataFrame.pop('job')
dataFrame.insert(1,'job',jobColumn)

# dataFrame.job.value_counts(normalize=True).plot.barh()
# plt.show()

# print(dataFrame.salary.describe())
# https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
# https://www.kite.com/python/answers/how-to-remove-outliers-from-a-pandas-dataframe-in-python
print(dataFrame.groupby('response')['salary'].mean())

# Absolute z-score of every salary value.
z = list(np.abs(stats.zscore(dataFrame['salary'])))

# Quartiles of the salary column alone.
Q1 = dataFrame['salary'].quantile(0.25)
Q3 = dataFrame['salary'].quantile(0.75)

# Quartiles of every numeric column. numeric_only=True is required because
# the frame also holds text columns (e.g. 'job', 'education'); since
# pandas 2.0 quantile() no longer skips them by default and raises.
Q11 = dataFrame.quantile(0.25, numeric_only=True)
Q33 = dataFrame.quantile(0.75, numeric_only=True)
IQR = Q33 - Q11
print(IQR)

# dataFrame = dataFrame[(z<1).all(axis=1)]