Simple pipelines - k821209/pipelines GitHub Wiki

mapping and snp calling

# mapping.sh
SAMPLE=${3}
READ1=${1}
READ2=${2}
REF=/data/k821209/KKH/ref/GCF_000188075.1_Si_gnG_genomic.fna
touch ${SAMPLE}.bwa.start
bwa mem -t 20 $REF $READ1 $READ2 -o ${SAMPLE}.sam 
touch ${SAMPLE}.bwa.end
# mapping2.sh
SAMPLE=${1}
REF=/data/k821209/KKH/ref/GCF_000188075.1_Si_gnG_genomic.fna
samtools fixmate -O bam ${SAMPLE}.sam ${SAMPLE}.bam &&
rm ${SAMPLE}.sam 
sambamba sort -t 5 -o ${SAMPLE}.sort.bam ${SAMPLE}.bam --tmpdir ./tmp &&
touch ${SAMPLE}.sort.end
bcftools mpileup -g 10 -Oz -o ${SAMPLE}.gvcf.gz -f ${REF} ${SAMPLE}.sort.bam &&
bcftools index ${SAMPLE}.gvcf.gz
# merge.sh
bcftools merge -Oz --gvcf ../ref/GCF_000188075.1_Si_gnG_genomic.fna --merge all -o ant_merged.vcf.v1.gz *.gvcf.gz 
# call
bcftools call -mv ant_merged.vcf.v1.gz -o ant_merged.vcf.v1.gz.mv.vcf
# plink
# to grep with minimum allele count (mac)
/data/k821209/tomato/program/plink2 --mac 3 --min-alleles 2 --out ${1}.plink2 --export vcf --new-id-max-allele-len 20 truncate --set-all-var-ids @:#:\$1,\$2 --vcf ${1} --freq cols=chrom,pos,ref,alt,eq,nobs --allow-extra-chr

dollop tree https://github.com/guyleonard/orthomcl_tools

# dollo gainloss : ๊ฐ ํด๋ ˆ์ด๋“œ๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋ชจ๋‘ ์กด์žฌํ•˜๋Š” ๊ฒƒ์œผ๋กœ ๋‚˜์˜ค๋Š” ๊ฒƒ๋“ค์€ root์œผ๋กœ ๋“ค์–ด๊ฐ€๋Š”๋“ฏ
#                  ์ด๋ผ ํด๋ ˆ์ด๋“œ์—์„œ 0 ๋žœ๋“œํ”Œ๋žœํŠธ ํด๋ ˆ์ด๋“œ์—์„œ 1 ์ด๋ฉด, ๋žœ๋“œ ํ”Œ๋žœํŠธ gain. (์ˆœ์ฐจ์ ์œผ๋กœ root์œผ๋กœ ๊ฐ„์ฃผํ•˜๊ณ  ์ƒˆ๋กœ ๊ณ„์‚ฐ) 
#+ dollo
from tqdm import tqdm

from collections import Counter
df_dollo = pd.DataFrame()
for c in tqdm(df_clusters.columns): 
    d = Counter(set([x.split('|')[0] for x in df_clusters.loc['genes'][c].split(',')]))
    for s in ['ZMA','ZMYS','OSA','SPO','WAU']:
        try:
            df_dollo.at[c,s] = d[s]
        except KeyError:
            df_dollo.at[c,s] = 0

with open('dollo.phy','w') as f: 
    print(' '.join(map(str,df_dollo.T.shape)),file=f)
    for c in df_dollo.columns:
        print(c,''.join(map(str,df_dollo[c].astype(int).values)),sep=' '*(10-len(c)),file=f)
#-
phylip dollop  # ์ˆซ์ž๋กœ ์„ ํƒํ•˜๋Š”๊ฑฐ ์ „๋ถ€ yes๋กœ outfile๋ฝ‘์ž 
perl ../extract_dollop_output_sequences_v2-fast.pl -i outfile -s 27661 -o output -cr -l -g list.txt
# 27661 : orthomcl ํด๋Ÿฌ์Šคํ„ฐ๊ฐฏ์ˆ˜
# list.txt : orthomcl ํด๋Ÿฌ์Šคํ„ฐ์•„์ด๋”” ๋ฆฌ์ŠคํŠธ (์—”ํ„ฐ) 
# outfile : phylip dollop ๊ฒฐ๊ณผ๋ฌผ 
# output_old ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ
cd output_old
old2new.ipynb ์‹คํ–‰ํ•ด์„œ outfile.phy_newstyle_report.txt ์ƒ์„ฑ
R < plot_tree_gains_loss_in_R.r --no-save # ํŠธ๋ฆฌ์ƒ์„ฑ ๋‹ค์Œ segment์—์„œ ์ˆ˜์ • ์—ฌ๋ถ€ ํ™•์ธ 
perl add_internal_node_labels.pl > outtree_nodelabel
# outtree ๋ฅผ ์ธํ’‹์œผ๋กœ ๊ฐ€์ง .. ํ•ด๋‹น ์ฝ”๋“œ๋ฅผ ์‚ดํŽด๋ณด๋ฉด ๋‹ค์Œ segment์—์„œ ํ™•์ธ... 

# plot_tree_gains_loss_in_R.r
library(ggplot2)
library(ggtree)

user_report <- read.csv("outfile.phy_newstyle_report.txt", sep = "\t", header=TRUE, as.is=1, row.names=NULL) # infile ์ด๋ฆ„ 
user_tree <- read.tree("outtree_nodelabel") # treeํŒŒ์ผ ์ด๋ฆ„ add_internal_node_labels.pl ์˜ ๊ฒฐ๊ณผ๋ฌผ 
user_p <- ggplot(user_tree, aes(x, y), ladderize=TRUE) + geom_tree() + theme_tree() + geom_tiplab(size=3, align=TRUE, color="purple", x=13) + xlab("") + ylab("") + geom_text(aes(label=Gain, x=branch,subset=!isTip), size=3, color="springgreen4", vjust=-0.6) + geom_text(aes(label=Gain,subset=isTip), size=3, color="springgreen4", hjust=0, x=13.5) + geom_text(aes(label=Loss, x=branch,subset=!isTip), size=3, color="firebrick3", vjust=1.3) + geom_text(aes(label=Loss,subset=isTip), size=3, color="firebrick3", hjust=0, x=14) + geom_text(aes(label=node,subset=!isTip), size=2, hjust=-1.5,color="grey") + scale_x_continuous(expand = c(.1, .2))
user_p <- user_p %<+% user_report
print(user_p)
#add_internal_node_labels.pl
#!/usr/bin/perl
use strict;
use warnings;

use Bio::Phylo::IO qw(parse); # bioperl ๊น”๊ธฐ ๊ท€์ฐฎ์œผ๋ฏ€๋กœ bioperl ๋„์ปค๋ฅผ ํ™œ์šฉํ•˜์ž 

#my $newick  = '(((trad,necr),didi),(nagr,(chre,phra)));';

my $filename = 'outtree'; # ์ธํ’‹์œผ๋กœ outtree๊ฐ€ ๋“ค์–ด๊ฐ
open my $newick_file, '<', $filename or die "Could not open file '$filename' $!";

my $newick_tree = '';
while (my $line = <$newick_file>) {
  chomp $line;
  $newick_tree = $line;
}

my $tree    = parse( '-format' => 'newick', '-string' => $newick_tree )->first;
my $number_of_leaves = 23; # ์ธํ’‹ species๊ฐฏ์ˆ˜ ๊ผญ ๋ฐ”๊ฟ”์ฃผ๋ผ
my $counter = $number_of_leaves + 1;

$tree->visit(
    sub {
        my $node = shift;
        $node->set_name( $counter++ ) if $node->is_internal;
    }
);

print $tree->to_newick( '-nodelabels' => 1 );

orthomcl-docker

> docker run -v /data/k821209/:/data/ -d -e MYSQL_ROOT_PASSWORD=7890uiop --name mysql_orthomcl mysql
> docker exec -it mysql_orthomcl bash 

https://www.jianshu.com/p/6f989a619c57

dbVendor=mysql 
# dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:localhost:3306
dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:mysql_socket=/var/run/mysqld/mysqld.sock
dbLogin=pgl
dbPassword=7890uiop
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5
oracleIndexTblSpc=NONE
> CREATE USER 'pgl'@'localhost' IDENTIFIED BY '7890uiop'; 
> grant all privileges on orthomcl.* to 'pgl'@'localhost'; 
> ALTER USER 'pgl'@'localhost' IDENTIFIED WITH mysql_native_password BY '7890uiop';
> create database orthomcl;
> SET GLOBAL local_infile = 1;
> SET GLOBAL  tmp_table_size =1024*1024*1024;
> SET GLOBAL innodb_buffer_pool_size=3*1024*1024*1024;

iqtree

# protein tree
$ ./iqtree-1.6.12-Linux/bin/iqtree -s KOG0014_selectedSpecies.fas.fas -st AA -m TEST -bb 1000 -alrt 1000 -nt AUTO 

SequenceServer

$ docker run --rm -itp 4567:4567 -v /path-to-database-dir:/db wurmlab/sequenceserver:1.0.11

dcGO ์‹œ๊ฐํ™”

import pandas as pd
import matplotlib.pyplot as plt
def get_plot(url,label):
    filename = label+'.'+url.split('/')[-1]
    ! wget {url}
    ! mv {url.split('/')[-1]} {filename}
    
    df_sb_enrich = pd.read_csv(filename,sep='\t',comment='#')
    c = df_sb_enrich.reset_index().columns[1:]
    df_sb_enrich = df_sb_enrich.reset_index()
    df_sb_enrich.columns = list(c) + ['']
    
    df_sb_enrich.sort_values(by='Z-score',ascending=False).head(30).plot(kind='barh',x='GO term',y='Z-score',\
                                                                         color='red',legend=False,xlim=(0,10),\
                                                                         figsize=(4,6))
    plt.ylabel('')
    plt.xlabel('Z-score')
    plt.savefig('{filename}.png'.format(filename=filename),dpi=300,bbox_inches='tight')
    return df_sb_enrich

์—ฝ๋ก์ฒด ์„œ์—ด NCBI์— ์—…๋กœ๋“œ ๋ฐฉ๋ฒ•

  • genebank file์„ sqnํŒŒ์ผ๋กœ ๋งŒ๋“ ๋‹ค. https://chlorobox.mpimp-golm.mpg.de/GenBank2Sequin.html
  • submission ์ •๋ณด๋Š” genebank2sequin ์— ์ ํžŒ๋Œ€๋กœ ์ˆ˜ํ–‰
  • file์— ๋Œ€ํ•œ ์ •๋ณด๋“ค์€ submission์ค‘ ์ƒ์„ฑํ•˜๋ผ๊ณ  ํ•˜๋Š” excelํŒŒ์ผ๋Œ€๋กœ ์ž‘์„ฑ
  • submission ํ•˜๋Š” ๊ณณ์€ https://submit.ncbi.nlm.nih.gov/subs/genome/
  • new submission์„ ๋ˆŒ๋Ÿฌ์„œ ์‹œํ‚ค๋Š”๋Œ€๋กœํ•˜๋ฉด ๋. #๋งค์šฐ ๊ท€์ฐฎ์Œ

GBS demult

flexbar -r PP-A1_whole_1.fastq.gz -p PP-A1_whole_2.fastq.gz -b PP-A1.barcode.txt.parsed.fa

Illumina adapter ์ œ๊ฑฐ

# https://github.com/seqan/flexbar
# preset truseq
parallel -j 1 "flexbar -n 10 -r {1}_1.fastq.gz -p {1}_2.fastq.gz --adapter-preset TruSeq -ap ON -z GZ -t {1}_trim" :::: samples

RF RNAseq analysis

## x,y ๋ฅผ ์ •์˜ํ•œ ๋’ค
## y๋Š” 0๊ณผ 1๋กœ ์ •์˜ ํ–ˆ๋‹ค๊ณ  ๊ฐ€์ •
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from multiprocessing import Pool, Manager

def skrun(fis,i,x,y):
    rf = RandomForestClassifier(n_estimators=2000,n_jobs=1)
    rf.fit(x,y)
    fi =rf.feature_importances_
    fis.append(fi)
l   = Manager()
fis = l.list()
p   = Pool(10)    

for i in range(0, 1000):
    p.apply_async(skrun, args=(fis,i,X,y))
p.close()
p.join()  

fi_sum = np.sum(np.stack(fis),axis=0)
## fi_sum ์ด ์ •์˜ ๋œ๋’ค
## fi_sum, ์‚ฌ์šฉ๋œ df_mat_ix_sub, y ๊ฐ’์„ ๊ธฐ์ค€์œผ๋กœ ์ค‘์š”ํ•ด ๋ณด์ด๋Š” ์œ ์ „์ž์˜ rank,regulation(up/down)๋ฅผ ๋Œ๋ ค์คŒ. 
def get_rankdf(fi_sum,df_mat_ix_sub,y): # df_mat_ixi_sub : columns์ด ์ •ํ•ด์ง€๊ณ ๋‚œ df_mat_ix
    df_gene2fi       = pd.DataFrame(index=df_mat_ix_sub.index)
    df_gene2fi['fi'] = fi_sum
    c1 = df_mat_ix_sub.columns[y==0]
    c2 = df_mat_ix_sub.columns[y==1]
    m = df_gene2fi['fi']>0
    df_gene2fi_cut = df_gene2fi[m]

    from scipy.stats import rankdata

    rank_temp = rankdata(df_gene2fi_cut['fi'].astype(float))
    rank      = max(rank_temp)-(rank_temp-1)
    df_gene2fi_cut['RNAseq_rank'] = rank

    m                   = df_gene2fi_cut.RNAseq_rank < 500 
    df_gene2fi_cut_rank = df_gene2fi_cut[m]

    df_annot_ix = df_annot.set_index('Transcript stable ID version')

    df_gene2fi_cut_rank['gene name'] = [df_annot_ix.loc[x]['Gene name'] for x in df_gene2fi_cut_rank.index]
    df_gene2fi_cut_rank['gene desc'] = [df_annot_ix.loc[x]['Gene description'] for x in df_gene2fi_cut_rank.index]

    def reg(x):
        try:
            return np.mean(df_mat_ix_sub[c2].loc[x].fillna(0).values)/np.mean(df_mat_ix_sub[c1].loc[x].fillna(0).values)
        except KeyError:
            return 0
    df_gene2fi_cut_rank['regulation'] = [reg(x) for x in df_gene2fi_cut_rank.index]
    return df_gene2fi_cut_rank

vcf2matrix

def matrix(x,c_gt):
    import numpy as np
    allele = np.array([x['REF']] + x['ALT'].split(','))
    gt_list = x[c_gt]
    def change(i):
        try:
            ix = list(map(int,i.split(':')[0].split('/')))
            return ''.join(allele[ix])
        except ValueError:
            return 'NN'
    return [change(i) for i in gt_list]

# multi accessions vcf
# zcat sample.vcf.gz | grep -v '##' > sample.vcf.grep 
# gzip sample.vcf.grep

df_ncbi     = pd.read_csv('./SRP059747.vcf.gz.grep.gz',compression='gzip',sep='\t')
m_snp       = df_ncbi['INFO'].apply(lambda x : x.split(';')[0] != 'INDEL')
df_ncbi_snp = df_ncbi[m_snp]

c_gt    = df_ncbi_snp.columns[9:]
gt_list = df_ncbi_snp.apply(matrix,axis=1)
gt_list = np.stack(gt_list)

AB1 to Fastq

parallel -j 10 "seqret -osformat="fastq-sanger" -filter {1} > {1}.fastq" ::: *.ab1

Heatmap clustering python

from scipy.cluster.hierarchy import dendrogram, ward
X = X_pca
fig = plt.figure(figsize=(5,10))
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
linkage_array = ward(X)
axdendro.axis('off')
Z = dendrogram(linkage_array,orientation='left')
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
Zix = Z['leaves']
axmatrix.matshow(X[Zix],aspect='auto', origin='lower')
axmatrix.axis('off')
axans = fig.add_axes([0.92,0.1,0.1,0.8])
im = axans.matshow(y.T[Zix],aspect='auto', origin='lower')
axans.axis('off')
axcolor = fig.add_axes([1.02,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)

GO enrichment

  • ์ธํ’‹์—๋Š” ์—†๋Š” ์œ ์ „์ž๊ฐ€ ์นด์šดํŒ… ๋˜๋Š” ๋ฌธ์ œ๊ฐ€ ์žˆ๋‹ค. ์–ด๋””์„œ ๊ปด๋“œ๋Š”์ง€ ์•Œ์ˆ˜๊ฐ€ ์—†์Œ;
import goenrich
O = goenrich.obo.ontology('../../go_db/go-basic.obo')
gene2go = goenrich.read.gene2go('../../go_db/gene2go',tax_id=3702)
values = {k: set(v) for k,v in gene2go.groupby('GO_ID')['GeneID']}
background_attribute = 'gene2go'
goenrich.enrich.propagate(O, values, background_attribute)
annot = pd.read_csv('../../go_db/Arabidopsis_thaliana.gene_info',sep='\t')
ATID = annot['LocusTag']
GENEID = annot['GeneID']
dicAT2GD = dict(zip(ATID,GENEID))

def get_GO_from_genelist(gl,f='test'): # gene list
    df = goenrich.enrich.analyze(O, gl, background_attribute, gvfile='temp.dot')
    subprocess.check_call(['dot', '-Tpng', 'temp.dot', '-o', '%s.png'%f])
    m = (df['namespace'] == 'biological_process') & (df['rejected'] == 1 )
    #m = (df['namespace'] == 'biological_process') & (df['p'] < 0.01 )
    #m = (df['p'] < 0.01 )
    return df[m]

gl_GD = [dicAT2GD[x] for x in gl] # gl is list of arabidopsis gene name 

df_out = get_GO_from_genelist(gl_GD,f='test')
import goenrich
O = goenrich.obo.ontology('../../go_db/go-basic.obo')
annot = goenrich.read.sgd('../../go_db/gene_association.tair.gz')
annot_ix = annot.set_index('db_object_name')
# db_object_name ์œผ๋กœ ํ•ด์•ผ, ์ธํ’‹๊ฐ’์„ ์ผ๋ฐ˜์ ์ธ ์œ ์ „์ž ์•„์ด๋””๋กœ ํ•  ์ˆ˜ ์žˆ๋‹ค. db_object_symbol๋กœ ํ•˜๋ฉด ์ •์‹ ์—†๋Š” ์œ ์ „์ž์ด๋ฆ„์ด๋ž‘, ์œ ์ „์ž์•„์ด๋””๋ž‘ ์„ž์—ฌ์„œ ๊ฒฐ๊ณผ์— ๋ฌธ์ œ๊ฐ€ ์ƒ๊น€. 
values = {k: set(v) for k,v in annot.groupby('go_id')['db_object_name']}
# reference group was restricted to nodes in AI1 
background_attribute = 'annot_sub'
goenrich.enrich.propagate(O, values, background_attribute)
annot_ix = annot.set_index('db_object_name')

def get_GO_from_genelist(gl,f='test'): # gene list
    df = goenrich.enrich.analyze(O, gl, background_attribute, gvfile='temp.dot')
    subprocess.check_call(['dot', '-Tpng', 'temp.dot', '-o', '%s.png'%f])
    #m = (df['namespace'] == 'biological_process') & (df['rejected'] == 1 )
    #m = (df['namespace'] == 'biological_process') & (df['p'] < 0.01 )
    m = (df['p'] < 0.01 )
    return df[m]

annotation cloud

from wordcloud import WordCloud
text = ' '.join(x.split('=')[0][:-2] for x in df_annot['annotation'].values)

# Generate a word cloud image
wordcloud = WordCloud(width=1600,height=1000,stopwords=["protein","homolog","kDa","related","Probable"]).generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

๊ทธ๋ฆผ์œผ๋กœ ๋™์˜์ƒ ๋งŒ๋“ค๊ธฐ

mencoder mf://*.png -mf w=800:h=600 -ovc x264 -of avi

kmer gen

def convert(n, base):
    T = "0123456789ABCDEF"
    q, r = divmod(n, base)
    if q == 0:
        return T[r]
    else:
        return convert(q, base) + T[r]
def kmer_gen(kmer_length):
    bases = {'0':'A','1':'T','2':'G','3':'C'}
    destination = int('3'*kmer_length,4)
    seq_list = []
    for i in range(destination+1):
        nseq = convert(i,4).zfill(kmer_length)
        seq = ''.join([bases[x] for x in nseq])
        seq_list.append(seq)
    return seq_list

http://gatkforums.broadinstitute.org/gatk/discussion/3891/calling-variants-in-rnaseq

blast xml parse


outxml = 'temp.%s.xml'%selected_sbix
print('cmd: blastn -query %s -subject %s -outfmt 5 -out %s'%(seqA_name,seqB_name,outxml))
subprocess.call('blastn -query %s -subject %s -outfmt 5 -out %s'%(seqA_name,seqB_name,outxml),shell=True)

tree = ET.parse(outxml)
root = tree.getroot()
dic = {'hitfrom':[],
       'hitto':[]
      }
with open('%s.twoblast.parsed.txt'%selected_sbix,'w') as f:
    print ('# synteny block id : %s'%selected_sbix,file=f)
    aligned_pos_ref = {0}
    aligned_pos_query = {0}
    for hsps in root.iter('Hsp'):
        hsp_num      = hsps.find('Hsp_num').text
        hsp_hit_from = hsps.find('Hsp_hit-from').text
        hsp_hit_to   = hsps.find('Hsp_hit-to').text
        hsp_query_from = hsps.find('Hsp_query-from').text
        hsp_query_to   = hsps.find('Hsp_query-to').text
        hsp_align_len  = hsps.find('Hsp_align-len').text

MUMMER

/program/MUMmer3.23/nucmer --prefix=ken2ute Ahal.assembly.scf.fasta.addhd.fa  Ahal_masked.rm.fa.addhd.fa
/program/MUMmer3.23/delta-filter -g -i 98 ken2ute.delta > ken2ute.delta.filter
/program/MUMmer3.23/show-coords -rcl ken2ute.delta.filter > ken2ute.flt.coords
# scaffold a ์™€ scaffold b ๊ฐ„์— alignment
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon -q ute_scaffold-58  --png
# scaffold a ์™€ ๋‚˜๋จธ์ง€ ๋ชจ๋‘ ๊ฐ„์˜ alignment 
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon   --png
# scaffold a ์™€ ๋‚˜๋จธ์ง€ ๋ชจ๋‘ ๊ฐ„์˜ alignment ์˜ coverage plot
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon   --png --coverage
# scaffold a ์™€ ๋‚˜๋จธ์ง€ ๋ชจ๋‘ ๊ฐ„์˜ alignment, coverage ๊ฐ’์„ gradient๋กœ ์ค€๋‹ค. ๊ฒฐ๊ณผ๋Š” dot plot like 
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon   --png  --color
# scaffold a ์™€ ๋‚˜๋จธ์ง€ ๋ชจ๋‘ ๊ฐ„์˜ alignment, SNP๋ฅผ ๋นจ๊ฐ›๊ฒŒ ํ‘œ์‹œ. ๊ฒฐ๊ณผ๋Š” dot plot like 
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon   --png  --SNP

augustus training

GFF=../my_csv.csv.addgene.gff3.sort.gff3
GL=../good.list.td # gene list
REF=../../ref/Ahal.assembly.scf.fasta
parallel -j 4 "cat ${GFF} | grep {1} > {1}.test.gff" :::: ${GL}
cat Gene*.test.gff > test.gff
rm Gene*.test.gff
/programs/augustus-3.2.2/scripts/gff2gbSmallDNA.pl test.gff ${REF} 1000 test.gb
/programs/augustus-3.2.2/scripts/randomSplit.pl test.gb 100
/programs/augustus-3.2.2/scripts/new_species.pl --species=aha_v1
/programs/augustus-3.2.2/bin/etraining --species=aha_v1 test.gb.train
/programs/augustus-3.2.2/scripts/optimize_augustus.pl --species=aha_v1 test.gb.train  # takes ~1d
augustus --species=aha_v1 test.gb.test | tee firsttest.out

http://busco.ezlab.org/

./BUSCO_v1.22.py -in Ahal.assembly.scf.fasta -c 5 -o Ahaleri -l embryophyta_odb9 -m genome --long
scipio.pl --blat_output=prot.vs.genome.psl Ahal.assembly.scf.fasta selected.fa > scipio.yaml
cat scipio.yaml | yaml2gff.1.4.pl > scipio.scipiogff
../scipiogff2gff.pl --in=scipio.scipiogff --out=scipio.gff
cat scipio.yaml | yaml2log.1.4.pl > scipio.log

TE ๊ฑธ๋ฅด๋Š”๋ฒ•

cat new_gene.fa.bp.na1.uniprot-all.fasta.out7.annot | grep -v 'Retrovirus-related' | grep -v 'Transposon' | grep -v 'retrotransposable' | grep -v 'Ribonuclease' |grep -v 'LINE' | grep -v 'retrotransposon' | grep -v 'Copia' | grep -v 'Zinc finger BED domain' | grep -v 'Replication protein' > noTE.list

gtf to gff3

# from https://github.com/jorvis/biocode
 python3 /program/biocode/gff/convert_augustus_to_gff3.py -i Ahal.assembly.genepred.gff -o Ahal.assembly.genepred.gff3

FTP

docker pull stilliard/pure-ftpd:hardened
docker run -d --name ftpd_server -p 0.0.0.0:2121:21 -p 30000-30009:30000-30009 -e "PUBLICHOST=localhost" -v /root/data1/ftp:/data  stilliard/pure-ftpd:hardened
docker exec -it ftpd_server /bin/bash
pure-pw useradd guest -u ftpuser -d /data
pure-pw mkdb

vcf merge

parallel -j 5 bgzip {1} ::: *.raw.vcf  # tabix ๋Š” bgzip์œผ๋กœ ํ•ด์•ผํ•จ! 
parallel -j 5 tabix -p vcf {1} ::: *.vcf.gz
vcf-merge *.vcf.gz > test.c.vcf

Htseq-count

htseq-count -q -f bam ${1}.tophat_out_maxintron10000/accepted_hits.bam Creinhardtii_281_v5.5.gene_exons.gff3.exon.gtf > ${1}.tophat_out_maxintron10000.count

Hisat2 & stringtie & transdecoder

/program/hisat2-2.0.3-beta/hisat2 --rna-strandness FR #RF for dUTP protocol   \
                                  --max-intronlen 30000 \
                                  -p 2 -x ../../../ref/Creinhardtii_281_v5.0 \ 
                                  -1 SRR1521680_1.fastq.gz -2 SRR1521680_2.fastq.gz\
                       | sambamba view -f bam -o SRR1521680.bam -S /dev/stdin
sambamba sort -t 10 SRR1521680.bam
/program/stringtie-1.2.2.Linux_x86_64/stringtie -p 5 -o SRR1521680.sorted.bam.stringtie.gff  SRR1521680.sorted.bam
/program/TransDecoder-2.1.0/util/cufflinks_gtf_to_alignment_gff3.pl intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff > intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff.gff3
/program/TransDecoder-2.1.0/util/cufflinks_gtf_genome_to_cdna_fasta.pl SRR1521680.sorted.bam.stringtie.gff ../../../ref/Creinhardtii_281_v5.0.fa > transcripts.fasta
/program/TransDecoder-2.1.0/TransDecoder.LongOrfs -t transcripts.fasta
/program/TransDecoder-2.1.0/TransDecoder.Predict -t transcripts.fasta
/program/TransDecoder-2.1.0/util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3

BWA

for input in SRR2751913
do
/program/bwa-0.7.12/bwa aln -e 50 -l 30 -k 1 -t 10 -M 6 -O 15 -E 8 ../ref/Cannuum/Pepper.v.1.55.total.chr ${input}_1.fastq.gz > ${input}_1.sai
/program/bwa-0.7.12/bwa aln -e 50 -l 30 -k 1 -t 10 -M 6 -O 15 -E 8 ../ref/Cannuum/Pepper.v.1.55.total.chr ${input}_2.fastq.gz > ${input}_2.sai
done

REF=../ref/Cannuum/Pepper.v.1.55.total.chr
SAMPLE=PERENNIAL
for input in SRR2751913
do
/program/bwa-0.7.12/bwa sampe -r @RG"\t"ID:${input}"\t"SM:${SAMPLE}"\t"LB:${SAMPLE}"\t"PL:Illumina ${REF} ${input}_1.sai ${input}_2.sai ${input}_1.fastq.gz ${input}_2.fastq.gz > ${input}.sam
done

$ sambamba view -f bam -o SRR2751913.bam -S -t 10 SRR2751913.sam
$ sambamba sort  -t 10 -o SRR2751913.sort.bam SRR2751913.bam; sambamba mpileup -t 10 -o SRR2751913.bcf SRR2751913.sort.bam  --samtools -gf ../ref/Cannuum/Pepper.v.1.55.total.chr.fa
$ sambamba mpileup -t 10 -o SRR2751914.cv.vcf SRR2751914.sorted.bam  --samtools -gu ../References/Cannum/Pepper.v.1.55.total.chr --bcftools call -cv

tophat

tophat --output-dir ${1}.tophat_out  --max-intron-length 1000 --num-threads 10 --no-discordant --no-mixed ../ref/Creinhardtii_281_v5.0 ${1}_1.fastq ${1}_2.fastq
parallel -j 2 tophat --num-threads 5 --max-intron-length 50000 --output-dir {1}_tophatout Alyrata_107_v1 {1} ::: *.fastq

blat

http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/ # download http://iubio.bio.indiana.edu/gmod/tandy/blat2gff.pl # psl2gff

blat Creinhardtii_281_v5.0.fa /dev/null /dev/null -tileSize=11 -makeOoc=Cre281.11.ooc
blat Creinhardtii_281_v5.0.fa SRR2132408_trinity_out.Trinity.fasta -ooc=Cre281.11.ooc -minIdentity=98 output.psl

transmission

https://help.ubuntu.com/community/TransmissionHowTo

$ sudo service transmission-daemon stop
$ sudo usermod -a -G debian-transmission k821209
$ sudo chgrp debian-transmission /home/k821209/download
$ sudo chmod 770 /home/k821209/download
$ sudo service transmission-daemon start
# torrent file : /var/lib/transmission-daemon/info/torrents/

Jbrowser

http://gmod.org/wiki/JBrowse_Configuration_Guide

$ bin/prepare-refseqs.pl --fasta name.fa
$ bin/flatfile-to-json.pl --gff Creinhardtii_281_v5.5.gene.gff3 --trackType CanvasFeatures --trackLabel original_genemodel
$ ../bin/flatfile-to-json.pl --trackLabel repeatrunner \
  --type protein_match:repeatrunner \ # repeatrunner  protein_match feature๋ฅผ ๊ฐ€์ ธ์˜ด
  --key RepeatRunner \   
  --className generic_parent \
  --subfeatureClasses '{"match_part" : "feature"}' \ # ์—ฌ๊ธฐ์„œ feature์— ํ•ด๋‹นํ•˜๋Š” ๊ฒƒ์„ ๋ฐ”๊พธ๋ฉด ๋””์ž์ธ์ด ๋ฐ”๋€๋‹ค 
                                                     # ๋ญ˜๋กœ ๋ฐ”๊พธ๋ฉด ๋˜๋Š”์ง€๋Š” JBrowse-1.11.6/track_styles.css
  --gff Creinhardtii_281_v5.0.all.gff.gff3
$ ../../bin/flatfile-to-json.pl --trackLabel est2genome --type expressed_sequence_match:est2genome --key est2genome --className generic_parent --subfeatureClasses '{"match_part" : "feature2"}' --gff Creinhardtii_281_v5.0.all.maker_try3.gff.fastaremoved.gff.est2genome.gff
$ ../../bin/generate-names.pl -v
# ์˜ฌ๋ฆฌ๊ณ  ๋‚œ ๋’ค์— ๊ณ ์น˜๋ ค๋ฉด data/trackList.json ๋ฅผ ๊ณ ์น˜๋ฉด๋จ
# removing track
$  ../bin/remove-track.pl  --trackLabel maker_gff

bam file

data/tracks.conf ์— ๋ถ™์—ฌ๋„ฃ์Œ ๋œ๋‹ค.

# make a section for the new track called tracks.(unique name)
[ tracks.my-bam-track ]

# settings for what data is shown in the track
storeClass     = JBrowse/Store/SeqFeature/BAM
urlTemplate    = ../../raw/volvox/volvox-sorted.bam
baiUrlTemplate = ../../raw/volvox/volvox-sorted.bam.bai

# settings for how the track looks
category = NGS   #< category for this track
type = JBrowse/View/Track/Alignments2
key  = BAM alignments from sample XYZ

Maker

http://gmod.org/wiki/MAKER_Tutorial

apt-get install libcr-dev mpich2 mpich2-doc
maker -CTL
mpiexec -n 4 /program/maker/bin/maker </dev/null

E-PCR

$ famap -tN -b ${1}.genome.famap ./${1} # ${1} : fasta file
$ fahash -b ${1}.genome.hash -w 12 -f3 ./${1}.genome.famap
# hashed
$ re-PCR -S Creinhardtii_281_v5.5.transcript_primaryTranscriptOnly.fa.genome.hash -n1 -g1 primers.sts -o primers.sts.trans.out
SSR.sts

Mungbean_SSR_ID_1 CAAAAACATGAGTTGCACACAA TCATAACGCAGAACAGCGAA
Mungbean_SSR_ID_2 ATGTGTGTGAGCACCTCGAC TTTGGCCATGCAAGATGTAA
Mungbean_SSR_ID_4 GCGGTTCACCTAGCCATAAA GGACCCTTCTGTGCGTGTAT
Mungbean_SSR_ID_5 GTTTGTGCTGCGGATTCTTT TTGGCAATTTGGACTAAGGC
Mungbean_SSR_ID_7 TTGACCCAAAACTTACCAATTT GCTAAGGACTGGGGGTCTTC

Trinity

$ ../../download/trinityrnaseq-2.0.6/Trinity --seqType fq --left SRR2132444_1.fastq --right SRR2132444_2.fastq --SS_lib_type FR --CPU 4 --max_memory 50G
$ /program/trinityrnaseq-2.0.6/trinity-plugins/TransDecoder-2.0.1/TransDecoder.LongOrfs -t SRR2132441_trinity_out.Trinity.fasta

MCSCANX

$ cat ath.gff.cor.gff aha.gff.cor.gff > At2Ah.gff
$ /data2/k821209/programs/MCScanX/MCScanX At2Ah
$ /usr/bin/perl /data2/k821209/programs/MCScanX/downstream_analyses/add_ka_and_ks_to_collinearity.pl -i At2Ah.collinearity -d At2Ah.cds -o At2Ah.collinearity.kaks
#synteny chromosome plot
#python3 /data2/k821209/good_python_code/synteny_plot_parse.py Pv2Va.collinearity.kaks Pv2Va Pv2Va.gff
$ python3 synteny_plot_parse.py Va2Gm.collinearity.kaks Va2Gm Va2Gm.gff
$ cat Pv2Va.collinearity.kaks.Rin Va2Gm.collinearity.kaks.Rin Va2Vr.collinearity.kaks.Rin > merge.Rin
library(ggplot2)
library(scales)
data<-read.table('merge.Rin',header=F,sep='\t')
sub<-subset(data,!grepl("s",data$V3)) # remove scaffolds
sub<-subset(sub,!grepl("SS",sub$V3)) # remove super scaffolds
g <- ggplot(data, aes(x=V6,y=V7,color=factor(V3))) + geom_point(alpha=0.5,size=1) + facet_grid(V5 ~ V1) + ylim(0,3) + 
scale_x_continuous(labels = comma) + theme(axis.text.x = element_text(angle = 60, hjust = 1))+ ylab('Ks value') + xlab('Genomic position')

png('all.png', height=600, width=1240, res = 100, units = 'px')
print(g)
dev.off()

Orthomcl

orthomcl ๋Š” ์•Œ์•„์„œ ์ด๋ฆ„์„ ๋ฐ”๊พธ๋ฉด๋œ๋‹ค.

#Make config file : orthomcl.config.template
dbVendor=mysql
dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:localhost:3306
dbLogin=orthomcl
dbPassword=8804555
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5
oracleIndexTblSpc=NONE
$ mysql -u root -p
mysql> CREATE DATABASE orthomcl;
mysql> GRANT SELECT,INSERT,UPDATE,DELETE,CREATE VIEW,CREATE, INDEX, DROP on orthomcl.* TO orthomcl@localhost;
mysql> set password for orthomcl@localhost = password('8804555');
$ orthomclInstallSchema orthomcl.config.template
$ orthomclAdjustFasta VRA final.assembly.pep.fasta.primary.SH.fa 1
$ mkdir ./compliantFasta/
$ mv VRA.fasta ./compliantFasta/ # same for the ATH.fasta etc.
$ orthomclFilterFasta ./compliantFasta/ 10 20
$ blastall -m 8 -a 10 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i goodProteins.fasta -o goodProteins.fasta.self.out # 27๊ฐœ ํ•  ๋•Œ ๋ณด๋ฆ„ ์ด์ƒ๊ฑธ๋ฆผ
#-z database size : grep -v '>' INPUT.fa | tr -d '\n' | wc
$ gpu:/data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t
$ orthomclBlastParser goodProteins.fasta.self.out compliantFasta/ >> similarSequences.txt # 27๊ฐœ accessionํ•  ๋•Œ ์ด ์Šคํ…๋งŒ 6์‹œ๊ฐ„ ์ •๋„ ๊ฑธ๋ฆผ
$ orthomclLoadBlast orthomcl.config.template similarSequences.txt # ์šฉ๋Ÿ‰ ๋งŽ์ด ๋จน์Œ ์ถฉ๋ถ„ํžˆ ํ™•๋ณด
$ orthomclPairs orthomcl.config.template orthomcl_pairs.log cleanup=no
$ orthomclDumpPairsFiles orthomcl.config.template
$ mcl mclInput --abc -I 1.5 -o mclOutput
$ orthomclMclToGroups CLST 1000 < mclOutput > groups.txt
$ python3 /data2/k821209/good_python_code/orthomcl/groups_count.py

Venndia gram via R

source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R")
sets <- read.delim("groups.txt.Rin")
setlistImp <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x]))
names(setlistImp) <- colnames(sets)
setlist5 <- setlistImp[1:5]; OLlist5 <- overLapper(setlist=setlist5, sep="_", type="vennsets")
counts <- sapply(OLlist5$Venn_List, length)
vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))

ipython notebook server

$ ipython notebook --profile=nbserver

Venndia gram

Filename : ts.Rin
Flower       Leaf       Pod       Root
Vang01g00010    Vang01g00010    Vang01g00010    Vang01g00010
Vang01g00020    Vang01g00030    Vang01g00030    Vang01g00020
Vang01g00030    Vang01g00050    Vang01g00050    Vang01g00030
Vang01g00050    Vang01g00110    Vang01g00110    Vang01g00050
Vang01g00110    Vang01g00120    Vang01g00120    Vang01g00110
Vang01g00120    Vang01g00160    Vang01g00160    Vang01g00120
Vang01g00140    Vang01g00170    Vang01g00170    Vang01g00140
Vang01g00160    Vang01g00250    Vang01g00250    Vang01g00160
Vang01g00170    Vang01g00270    Vang01g00270    Vang01g00170
Vang01g00180    Vang01g00310    Vang01g00310    Vang01g00250
Vang01g00210    Vang01g00320    Vang01g00320    Vang01g00270
.....
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R")
sets <- read.delim("ts.Rin")
setlist <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x]))
names(setlist) <- colnames(sets)
OLlist <- overLapper(setlist=setlist, sep="", type="vennsets"); OLlist; names(OLlist)
setlist5 <- setlist[1:5]; OLlist5 <- overLapper(setlist=setlist5, sep="", type="vennsets")
counts <- sapply(OLlist5$Venn_List, length)
vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))

Interpro

$/data2/k821209/programs/interproscan-5-RC4/interproscan.sh -appl Pfam -T /data2/k821209/Mungbean/maker4/TMP/ -i target.pep.fa

Repeatmasking

#BuildDatabase -name echinosophora -engine ncbi echinosophora_p_pilon.fasta
#RepeatModeler -pa 20 -engine ncbi -database echinosophora 2>&1 | tee repeatmodeler.log
RepeatMasker -pa 20 -e ncbi -lib  echinosophora-families.fa -dir ./echinosophora_repeat_v1/ -gff echinosophora_p_pilon.fasta

LTR search using gt

$ /data2/k821209/programs/genometools-1.5.1/bin/gt suffixerator -suf -lcp -dna -indexname ${1} -db ${1}
$ /data2/k821209/programs/genometools-1.5.1/bin/gt ltrharvest -gff3 ${1}.gff3 -index ${1}
$ /data2/k821209/programs/genometools-1.5.1/bin/gt gff3 -sort {1}.gff3 > {1}.sort.gff3
$ /data2/k821209/programs/genometools-1.5.1/bin/gt -j 2 ltrdigest -pptlen 10 30 -pbsoffset 0 3 -trnas  Athal-tRNAs.fa -hmms ./hmm/*.hmm -outfileprefix {1} {1}.sort.gff3 Athaliana_167.fa | tee {1}.ltrdigest_GyDB.gff3

Trinity RNAseq assemble

#single end
$ /path/trinityrnaseq_r2013-02-25/Trinity.pl --seqType fq --single ERR185151.fastq --JM 2G --CPU 4 --no_cleanup --monitoring
#paired end
$ /data2/k821209/programs/trinityrnaseq_r2013-02-25/Trinity.pl --seqType fq --left ERR173896_1.fastq --right ERR173896_2.fastq --SS_lib_type FR --CPU 4 --no_cleanup --monitoring --JM 2G

Jellyfish:Kmercount

$ zcat SunhwaN_?.fastq.gz | jellyfish count -C -m 22 -s 50G -t 10 -c 6 -o output /dev/fd/0
$ jellyfish histo -o hist.out output_0 

BSseq

$ ll *
 -rw-rw-r-- 1 k821209 k821209        148  1?? 21 11:21 run.bismark.sh
 -rw-r--r-- 1 k821209 k821209 8396394060  1?? 21 11:10 S_1_4_5_1.fastq.gz
 -rw-r--r-- 1 k821209 k821209 8014309256  1?? 21 11:13 S_1_4_5_2.fastq.gz
 gw:
 total 453976
 drwxrwxr-x 3 k821209 k821209      4096  1?? 21 11:21 ./
 drwxrwxr-x 3 k821209 k821209      4096  1?? 21 11:21 ../
 drwxrwxr-x 4 k821209 k821209      4096  1?? 21 11:21 Bisulfite_Genome/
 -rwxr-xr-x 1 k821209 k821209 464854829  1?? 21 11:14 Vradi_ver6.fa*
  1. file:run.bismark.sh
 path2bowtie=/where/is/
 path2samtools=/where/is/
 bismark_genome_preparation --bowtie2 --path_to_bowtie ${path2bowtie} ${PWD}/gw/
 bismark --gzip --bowtie2 --fastq --bam -p 4 -N 1 --samtools_path ${path2samtools} --path_to_bowtie ${path2bowtie} ${PWD}/gw/ -1 read_1.fastq.gz -2 read_2.fastq.gz
 bismark_methylation_extractor --multicore 3 --genome_folder ${PWD}/gw/ --scaffolds --cytosine_report --CX --paired-end --no_overlap --comprehensive --samtools_path ${path2samtools} --gzip --bedGraph --counts --buffer_size 10G resulting.bam
  1. bsseq (R)
## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("bsseq")

library(bsseq)
mungbean_methylation<-read.bismark(file=c("./S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov","./K_1_2_4_1.fastq.gz_bismark_bt2_pe.bismark.cov"),sampleNames =c("sunhwa","kyunggi"), fileType="cov",verbose=TRUE,rmZeroCov=FALSE)
mungbean_methylation_smed =  BSmooth(mungbean_methylation, mc.cores = 1, verbose = TRUE)

Genotype by sequencing (GBS)

For GBS, we can use whole reference genome for read mapping procedure or sub genomic region harboring the restriction enzyme site. I usually use whole genome; however in the case of interspecies crossing population that would have many segregation distortion, it was better to use the sub genomic region.

samtools mpileup -DSugf Enzyme_site_block_ApeKI.txt.fa *.bam | bcftools view -bvcg - > all.bvcg.bcf

Q value

False discovery rate (FDR) calculation using R

> source("http://bioconductor.org/biocLite.R")
> biocLite("qvalue")

> library(qvalue)
> p<-scan("S_all.pv.txt.pv") # only number containing file ex > 1 \n 0.01 \n 0.2 \n ...
> qobj <- qvalue(p)
> qobj1 <- qvalue(p, lambda=0.5, robust=TRUE)
> qobj2 <- qvalue(p, fdr.level=0.05, pi0.method="bootstrap")
> qwrite(qobj, filename="myresults.txt")

Weblogo python

# composition ์— ์˜ˆ์ƒ๋˜๋Š” ๋น„์œจ์„ ๋„ฃ์–ด์ค˜์•ผ ํ•จ!
weblogo --format pdf --size large --resolution 300 --datatype fasta --composition "{'A':20, 'C':30, 'G':30, 'T':20}" --fin S_total.CG.context.fa > S_total.CG.context.fa.pdf

RNA structure

~/programs/RNAstructure/exe/MaxExpect comp_test3.fa comp_test3.fa.me.ct --sequence
# ๊ฒฐ๊ณผ outfile.ct ๋Š” jViz.Rna ๋กœ Visualization