Libraries
import pandas as pd
Install a conda package in the current Jupyter kernel
import sys
# IPython shell escape: run conda against the prefix of the running
# interpreter (sys.prefix) and answer its prompts with --yes.
!conda install --yes --prefix {sys.prefix} numpy
Install a pip package in the current Jupyter kernel
import sys
# IPython shell escape: invoke pip through the running interpreter
# (sys.executable) so the package is installed for that same interpreter.
!{sys.executable} -m pip install numpy
System
List full paths
import os
def list_full_paths(directory):
    """Return every entry of *directory* joined onto *directory*.

    Args:
        directory: Path of the directory to list.

    Returns:
        A list with one ``os.path.join(directory, entry)`` string for each
        name returned by ``os.listdir(directory)``, in the same order.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            *directory* does not exist).
    """
    return [os.path.join(directory, entry) for entry in os.listdir(directory)]
Arrays
Append to a list in a loop
# Collect the "int_floor" value of every row label of `samples` into a list.
out = []
for sample in samples.index:
    out.append(samples.loc[sample, "int_floor"])
Read file into list
# Read all lines of the file; the context manager closes it afterwards.
with open(filename) as f:
    content = f.readlines()
# You may also want to remove whitespace characters like `\n` at the end of each line
content = [line.strip() for line in content]
Read in file with index
# Skip "#" comment lines and blank lines; use the first column as the index.
pd.read_csv(
    file,
    comment="#",
    skip_blank_lines=True,
    index_col=0,
)
List comprehensions
# Name of every directory entry with its last extension removed.
[
    os.path.splitext(entry)[0]
    for entry in os.listdir("../introgression/release-102/unzipped")
]
Data frames
Read multiple CSVs and bind into single DF
# Read files
dfs = []
for f in target_files:
    # Sample name = file name without its ".csv" suffix.  str.strip('.csv')
    # is not a suffix remover: it drops any of the characters ".", "c", "s",
    # "v" from BOTH ends, so "cases.csv" would become "ase".
    sample = os.path.basename(f)
    if sample.endswith('.csv'):
        sample = sample[:-len('.csv')]
    df = pd.read_csv(
        f,
        header=None,
        names=['CHROM', 'POS', 'ALT', 'SVLEN', 'SVTYPE', 'CHR2', 'END', 'GT', 'LN', 'ST'],
        # The builtin int is used because the np.int alias (which was just
        # the builtin int) has been removed from NumPy.
        dtype={
            'CHROM': str,
            'POS': int,
            'ALT': str,
            'SVLEN': int,
            'SVTYPE': str,
            'CHR2': str,
            'END': int,
            'GT': str,
            'LN': int,
            'ST': str,
        },
    )
    # Tag every row with the sample (file) it came from.
    df['SAMPLE'] = sample
    dfs.append(df)
# bind together
full_df = pd.concat(dfs)
Select columns
# by name
wanted_columns = ['Sample', 'Population']
pop_new = pop_file.loc[:, wanted_columns]
# "contains": keep only the columns whose name does not match the pattern
name_matches = df.columns.str.contains('KW|iCab|Ho5')
df_filt = df.loc[:, name_matches == False]
Summary stats
# include='all' asks describe() to summarise every column of host_df.
host_df.describe(include='all')