python.jupyter.quick - k821209/pipelines GitHub Wiki
원하는 셀만 저장
%save "RNAseq.bam.visualization.gffindexing.py" _ih[190] # _oh 는 아웃풋 히스토리인듯.
gff parsing
# Load the GFF table: tab-separated, no header row, first two lines skipped.
df_gff = pd.read_csv(file_gff, sep='\t', header=None, skiprows=2)


def _id_prefix(attributes, n_parts):
    """Return the first *n_parts* dot-separated pieces of the leading field.

    *attributes* is a ';'-separated string; only its first field is used,
    with any 'ID=' text removed before splitting on '.'.
    """
    first_field = attributes.split(';')[0]
    identifier = first_field.replace('ID=', '')
    return '.'.join(identifier.split('.')[0:n_parts])


# Column 8 holds the ';'-separated string whose first field carries the ID.
# The gene name keeps the first two dot-separated ID parts, the transcript
# name keeps the first four.
df_gff['genename'] = df_gff[8].apply(_id_prefix, args=(2,))
df_gff['transcriptname'] = df_gff[8].apply(_id_prefix, args=(4,))
# Collect the `longest` flag of every mRNA row, keyed by transcript name.
def _longest_flag(attributes):
    """Return the value of the 'longest=' field in *attributes*, or None.

    *attributes* is a ';'-separated string of key=value fields. The field is
    located by its key rather than by a fixed position, so a different field
    order (or a missing field) no longer yields an unrelated value or an
    IndexError.
    """
    for field in attributes.split(';'):
        if field.startswith('longest='):
            return field[len('longest='):]
    return None


mask = (df_gff[2] == 'mRNA')
# .copy() gives an independent frame: adding a column to a boolean-mask
# selection of df_gff would otherwise raise pandas' SettingWithCopyWarning.
df_gff_mRNA = df_gff[mask].copy()
df_gff_mRNA['longest'] = df_gff_mRNA[8].apply(_longest_flag)
# Index by transcript name so the flag can be looked up per transcript.
df_gff_mRNA_index = df_gff_mRNA.set_index('transcriptname')
def get_longest(x):
    """Return the `longest` value recorded for transcript name *x*.

    Looks *x* up in the index of the module-level ``df_gff_mRNA_index``
    frame and returns the ``longest`` entry of the matching row.

    Returns None when *x* is not present in that index.
    """
    try:
        # Single (row, column) label lookup; avoids materializing the whole
        # row as an intermediate Series just to read one field.
        return df_gff_mRNA_index.loc[x, 'longest']
    except KeyError:
        return None
# Tag every GFF row with the `longest` value found for its transcript name
# (None where get_longest finds no matching transcript).
df_gff['longest'] = df_gff['transcriptname'].apply(func=get_longest)
# Build a copy of the table indexed by the (genename, longest) pair.
df_gff_index = df_gff.set_index(keys=['genename', 'longest'])