python.jupyter.quick - k821209/pipelines GitHub Wiki

μ›ν•˜λŠ” μ…€λ§Œ μ €μž₯

%save "RNAseq.bam.visualization.gffindexing.py" _ih[190] # _oh seems to be the output history.

gff parsing

# Read the GFF as a headerless, tab-separated table (columns are labelled
# 0..8) and skip the first two lines of the file.
df_gff = pd.read_csv(file_gff, sep='\t', header=None, skiprows=2)


def _id_prefix(attributes, n_parts):
    """Return the first *n_parts* dot-separated pieces of the leading attribute.

    The attribute string is cut at the first ';', every 'ID=' occurrence in
    that leading field is removed, and the remainder is truncated to its
    first *n_parts* '.'-separated components.
    """
    leading_field = attributes.split(';')[0]
    identifier = leading_field.replace('ID=', '')
    return '.'.join(identifier.split('.')[:n_parts])


# Column 8 carries the ';'-separated attributes. The gene name keeps two ID
# components and the transcript name keeps four.
df_gff['genename'] = df_gff[8].apply(lambda attrs: _id_prefix(attrs, 2))
df_gff['transcriptname'] = df_gff[8].apply(lambda attrs: _id_prefix(attrs, 4))

# Collect the 'longest' flag of every mRNA row, indexed by transcript name.
# Rows are selected where column 2 equals 'mRNA'.
mask        = (df_gff[2] == 'mRNA')
# Take an explicit copy: df_gff[mask] is a slice of df_gff, and adding a
# column to it directly triggers pandas' SettingWithCopyWarning.
df_gff_mRNA = df_gff[mask].copy()
# The flag is read positionally: the 4th ';'-separated attribute of column 8,
# with the 'longest=' prefix removed. NOTE(review): this assumes every mRNA row
# has 'longest=' as its 4th attribute -- confirm against the GFF source.
df_gff_mRNA['longest'] = df_gff_mRNA[8].apply(lambda x : x.split(';')[3].replace('longest=',''))
# Index by transcript name so the flag can be looked up per transcript.
df_gff_mRNA_index =  df_gff_mRNA.set_index('transcriptname')

def get_longest(x):
    """Look up the 'longest' value stored for transcript name *x*.

    The lookup goes through ``df_gff_mRNA_index`` (the mRNA rows indexed by
    'transcriptname').  Returns None when the lookup raises KeyError, i.e.
    when *x* is not present in that index.
    """
    try:
        record = df_gff_mRNA_index.loc[x]
        return record['longest']
    except KeyError:
        return None


# Copy the mRNA-level 'longest' value onto every row of the full table by
# matching on transcript name. Rows without a matching mRNA get None.
df_gff['longest'] = df_gff['transcriptname'].apply(get_longest)

# Build a two-level index of (gene name, 'longest' value) over the full table.
df_gff_index = df_gff.set_index(['genename', 'longest'])