Python data wrangling - IanBrettell/template_repo GitHub Wiki

Libraries

import pandas as pd

Install a conda package in the current Jupyter kernel

# Jupyter cell: install a conda package into the environment the running
# kernel belongs to.
import sys
# `!` hands the line to the shell; IPython expands {sys.prefix} to the running
# interpreter's installation prefix, which is passed to conda as the target
# environment. --yes skips conda's confirmation prompt.
!conda install --yes --prefix {sys.prefix} numpy

Install a pip package in the current Jupyter kernel

# Jupyter cell: install a pip package for the interpreter the kernel runs on.
import sys
# IPython expands {sys.executable} to the path of the running Python binary,
# so `-m pip` is the pip that belongs to this interpreter.
!{sys.executable} -m pip install numpy

System

List full paths

import os

def list_full_paths(directory):
    """Return every entry of *directory* joined onto *directory*.

    Entries come back in the order ``os.listdir`` reports them and cover
    files and sub-directories alike. The result is only as absolute as
    *directory* itself; no path is resolved.
    """
    entries = os.listdir(directory)
    return [os.path.join(directory, entry) for entry in entries]

Arrays

Append to array in loop

# Walk the row labels of `samples` and collect each row's "int_floor" value.
out = []
for sample in samples.index:
    value = samples.loc[sample, "int_floor"]
    out.append(value)

Read file into list

# Read the file one line at a time, trimming leading/trailing whitespace
# (including the `\n` at the end of each line) as the lines are collected.
with open(filename) as f:
    content = [line.strip() for line in f]

Read in file with index

# Load a delimited file using its first column as the row index; anything
# after a "#" is treated as a comment and blank lines are skipped.
pd.read_csv(
    file,
    index_col=0,
    comment="#",
    skip_blank_lines=True,
)

List comprehensions

# Names of the directory's entries with the final extension removed.
[
    os.path.splitext(entry)[0]
    for entry in os.listdir("../introgression/release-102/unzipped")
]

Data frames

Read multiple CSVs and bind into single DF

# Column layout of the headerless per-sample CSVs. Built once, outside the
# loop, because it is the same for every file.
col_names = ['CHROM', 'POS', 'ALT', 'SVLEN', 'SVTYPE', 'CHR2', 'END', 'GT', 'LN', 'ST']
# Builtin `int` replaces `np.int`: the alias was deprecated in NumPy 1.20 and
# removed in 1.24, and it only ever pointed at the builtin.
col_dtypes = {
    'CHROM': str,
    'POS': int,
    'ALT': str,
    'SVLEN': int,
    'SVTYPE': str,
    'CHR2': str,
    'END': int,
    'GT': str,
    'LN': int,
    'ST': str,
}

# Read files
dfs = []
for f in target_files:
    # Sample name = file name without its extension. `str.strip('.csv')`
    # would remove any of the characters '.', 'c', 's', 'v' from BOTH ends
    # (e.g. "cvs1.csv" -> "1"); splitext removes only the trailing extension.
    sample = os.path.splitext(os.path.basename(f))[0]
    df = pd.read_csv(f, header=None, names=col_names, dtype=col_dtypes)
    # Tag every row with the sample it came from so rows stay traceable
    # after the frames are stacked.
    df['SAMPLE'] = sample
    dfs.append(df)

# bind together
# Each frame keeps its own row index, so index labels repeat across samples.
full_df = pd.concat(dfs)

Select columns

# by name: keep all rows, only the listed columns
pop_new = pop_file.loc[:, ['Sample', 'Population']]

# "contains": drop every column whose label matches the regex 'KW|iCab|Ho5'
# (i.e. contains "KW", "iCab" or "Ho5"). `~` negates the boolean mask and
# replaces the `mask == False` comparison.
# NOTE(review): assumes all column labels are strings, so the mask has no NaN.
df_filt = df.loc[:, ~df.columns.str.contains('KW|iCab|Ho5')]

Summary stats

# Summary statistics for every column of host_df; include='all' brings
# non-numeric columns into the summary alongside the numeric ones.
host_df.describe(include='all')