Simple pipelines - k821209/pipelines GitHub Wiki
mapping and snp calling
# mapping.sh
SAMPLE=${3}
READ1=${1}
READ2=${2}
REF=/data/k821209/KKH/ref/GCF_000188075.1_Si_gnG_genomic.fna
touch ${SAMPLE}.bwa.start
bwa mem -t 20 $REF $READ1 $READ2 -o ${SAMPLE}.sam
touch ${SAMPLE}.bwa.end
# mapping2.sh
SAMPLE=${1}
REF=/data/k821209/KKH/ref/GCF_000188075.1_Si_gnG_genomic.fna
samtools fixmate -O bam ${SAMPLE}.sam ${SAMPLE}.bam &&
rm ${SAMPLE}.sam
sambamba sort -t 5 -o ${SAMPLE}.sort.bam ${SAMPLE}.bam --tmpdir ./tmp &&
touch ${SAMPLE}.sort.end
bcftools mpileup -g 10 -Oz -o ${SAMPLE}.gvcf.gz -f ${REF} ${SAMPLE}.sort.bam &&
bcftools index ${SAMPLE}.gvcf.gz
# merge.sh
bcftools merge -Oz --gvcf ../ref/GCF_000188075.1_Si_gnG_genomic.fna --merge all -o ant_merged.vcf.v1.gz *.gvcf.gz
# call
bcftools call -mv ant_merged.vcf.v1.gz -o ant_merged.vcf.v1.gz.mv.vcf
# plink
# to grep with minimum allele count (mac)
/data/k821209/tomato/program/plink2 --mac 3 --min-alleles 2 --out ${1}.plink2 --export vcf --new-id-max-allele-len 20 truncate --set-all-var-ids @:#:\$1,\$2 --vcf ${1} --freq cols=chrom,pos,ref,alt,eq,nobs --allow-extra-chr
dollop tree https://github.com/guyleonard/orthomcl_tools
# dollo gainloss : ๊ฐ ํด๋ ์ด๋๋ฅผ ๊ธฐ์ค์ผ๋ก ๋ชจ๋ ์กด์ฌํ๋ ๊ฒ์ผ๋ก ๋์ค๋ ๊ฒ๋ค์ root์ผ๋ก ๋ค์ด๊ฐ๋๋ฏ
# ์ด๋ผ ํด๋ ์ด๋์์ 0 ๋๋ํ๋ํธ ํด๋ ์ด๋์์ 1 ์ด๋ฉด, ๋๋ ํ๋ํธ gain. (์์ฐจ์ ์ผ๋ก root์ผ๋ก ๊ฐ์ฃผํ๊ณ ์๋ก ๊ณ์ฐ)
#+ dollo
from tqdm import tqdm
from collections import Counter
df_dollo = pd.DataFrame()
for c in tqdm(df_clusters.columns):
d = Counter(set([x.split('|')[0] for x in df_clusters.loc['genes'][c].split(',')]))
for s in ['ZMA','ZMYS','OSA','SPO','WAU']:
try:
df_dollo.at[c,s] = d[s]
except KeyError:
df_dollo.at[c,s] = 0
with open('dollo.phy','w') as f:
print(' '.join(map(str,df_dollo.T.shape)),file=f)
for c in df_dollo.columns:
print(c,''.join(map(str,df_dollo[c].astype(int).values)),sep=' '*(10-len(c)),file=f)
#-
phylip dollop # ์ซ์๋ก ์ ํํ๋๊ฑฐ ์ ๋ถ yes๋ก outfile๋ฝ์
perl ../extract_dollop_output_sequences_v2-fast.pl -i outfile -s 27661 -o output -cr -l -g list.txt
# 27661 : orthomcl ํด๋ฌ์คํฐ๊ฐฏ์
# list.txt : orthomcl ํด๋ฌ์คํฐ์์ด๋ ๋ฆฌ์คํธ (์ํฐ)
# outfile : phylip dollop ๊ฒฐ๊ณผ๋ฌผ
# output_old ๋๋ ํ ๋ฆฌ ์์ฑ
cd output_old
old2new.ipynb ์คํํด์ outfile.phy_newstyle_report.txt ์์ฑ
R < plot_tree_gains_loss_in_R.r --no-save # ํธ๋ฆฌ์์ฑ ๋ค์ segment์์ ์์ ์ฌ๋ถ ํ์ธ
perl add_internal_node_labels.pl > outtree_nodelabel
# outtree ๋ฅผ ์ธํ์ผ๋ก ๊ฐ์ง .. ํด๋น ์ฝ๋๋ฅผ ์ดํด๋ณด๋ฉด ๋ค์ segment์์ ํ์ธ...
# plot_tree_gains_loss_in_R.r
library(ggplot2)
library(ggtree)
user_report <- read.csv("outfile.phy_newstyle_report.txt", sep = "\t", header=TRUE, as.is=1, row.names=NULL) # infile ์ด๋ฆ
user_tree <- read.tree("outtree_nodelabel") # treeํ์ผ ์ด๋ฆ add_internal_node_labels.pl ์ ๊ฒฐ๊ณผ๋ฌผ
user_p <- ggplot(user_tree, aes(x, y), ladderize=TRUE) + geom_tree() + theme_tree() + geom_tiplab(size=3, align=TRUE, color="purple", x=13) + xlab("") + ylab("") + geom_text(aes(label=Gain, x=branch,subset=!isTip), size=3, color="springgreen4", vjust=-0.6) + geom_text(aes(label=Gain,subset=isTip), size=3, color="springgreen4", hjust=0, x=13.5) + geom_text(aes(label=Loss, x=branch,subset=!isTip), size=3, color="firebrick3", vjust=1.3) + geom_text(aes(label=Loss,subset=isTip), size=3, color="firebrick3", hjust=0, x=14) + geom_text(aes(label=node,subset=!isTip), size=2, hjust=-1.5,color="grey") + scale_x_continuous(expand = c(.1, .2))
user_p <- user_p %<+% user_report
print(user_p)
#add_internal_node_labels.pl
#!/usr/bin/perl
use strict;
use warnings;
use Bio::Phylo::IO qw(parse); # bioperl ๊น๊ธฐ ๊ท์ฐฎ์ผ๋ฏ๋ก bioperl ๋์ปค๋ฅผ ํ์ฉํ์
#my $newick = '(((trad,necr),didi),(nagr,(chre,phra)));';
my $filename = 'outtree'; # ์ธํ์ผ๋ก outtree๊ฐ ๋ค์ด๊ฐ
open my $newick_file, '<', $filename or die "Could not open file '$filename' $!";
my $newick_tree = '';
while (my $line = <$newick_file>) {
chomp $line;
$newick_tree = $line;
}
my $tree = parse( '-format' => 'newick', '-string' => $newick_tree )->first;
my $number_of_leaves = 23; # ์ธํ species๊ฐฏ์ ๊ผญ ๋ฐ๊ฟ์ฃผ๋ผ
my $counter = $number_of_leaves + 1;
$tree->visit(
sub {
my $node = shift;
$node->set_name( $counter++ ) if $node->is_internal;
}
);
print $tree->to_newick( '-nodelabels' => 1 );
orthomcl-docker
> docker run -v /data/k821209/:/data/ -d -e MYSQL_ROOT_PASSWORD=7890uiop --name mysql_orthomcl mysql
> docker exec -it mysql_orthomcl bash
https://www.jianshu.com/p/6f989a619c57
dbVendor=mysql
# dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:localhost:3306
dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:mysql_socket=/var/run/mysqld/mysqld.sock
dbLogin=pgl
dbPassword=7890uiop
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5
oracleIndexTblSpc=NONE
> CREATE USER 'pgl'@'localhost' IDENTIFIED BY '7890uiop';
> grant all privileges on orthomcl.* to 'pgl'@'localhost';
> ALTER USER 'pgl'@'localhost' IDENTIFIED WITH mysql_native_password BY '7890uiop';
> create database orthomcl;
> SET GLOBAL local_infile = 1;
> SET GLOBAL tmp_table_size =1024*1024*1024;
> SET GLOBAL innodb_buffer_pool_size=3*1024*1024*1024;
iqtree
# protein tree
$ ./iqtree-1.6.12-Linux/bin/iqtree -s KOG0014_selectedSpecies.fas.fas -st AA -m TEST -bb 1000 -alrt 1000 -nt AUTO
SequenceServer
$ docker run --rm -itp 4567:4567 -v /path-to-database-dir:/db wurmlab/sequenceserver:1.0.11
dcGO ์๊ฐํ
import pandas as pd
import matplotlib.pyplot as plt
def get_plot(url,label):
filename = label+'.'+url.split('/')[-1]
! wget {url}
! mv {url.split('/')[-1]} {filename}
df_sb_enrich = pd.read_csv(filename,sep='\t',comment='#')
c = df_sb_enrich.reset_index().columns[1:]
df_sb_enrich = df_sb_enrich.reset_index()
df_sb_enrich.columns = list(c) + ['']
df_sb_enrich.sort_values(by='Z-score',ascending=False).head(30).plot(kind='barh',x='GO term',y='Z-score',\
color='red',legend=False,xlim=(0,10),\
figsize=(4,6))
plt.ylabel('')
plt.xlabel('Z-score')
plt.savefig('{filename}.png'.format(filename=filename),dpi=300,bbox_inches='tight')
return df_sb_enrich
์ฝ๋ก์ฒด ์์ด NCBI์ ์ ๋ก๋ ๋ฐฉ๋ฒ
- genebank file์ sqnํ์ผ๋ก ๋ง๋ ๋ค. https://chlorobox.mpimp-golm.mpg.de/GenBank2Sequin.html
- submission ์ ๋ณด๋ genebank2sequin ์ ์ ํ๋๋ก ์ํ
- file์ ๋ํ ์ ๋ณด๋ค์ submission์ค ์์ฑํ๋ผ๊ณ ํ๋ excelํ์ผ๋๋ก ์์ฑ
- submission ํ๋ ๊ณณ์ https://submit.ncbi.nlm.nih.gov/subs/genome/
- new submission์ ๋๋ฌ์ ์ํค๋๋๋กํ๋ฉด ๋. #๋งค์ฐ ๊ท์ฐฎ์
GBS demult
flexbar -r PP-A1_whole_1.fastq.gz -p PP-A1_whole_2.fastq.gz -b PP-A1.barcode.txt.parsed.fa
Illumina adapter ์ ๊ฑฐ
# https://github.com/seqan/flexbar
# preset truseq
parallel -j 1 "flexbar -n 10 -r {1}_1.fastq.gz -p {1}_2.fastq.gz --adapter-preset TruSeq -ap ON -z GZ -t {1}_trim" :::: samples
RF RNAseq analysis
## x,y ๋ฅผ ์ ์ํ ๋ค
## y๋ 0๊ณผ 1๋ก ์ ์ ํ๋ค๊ณ ๊ฐ์
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from multiprocessing import Pool, Manager
def skrun(fis,i,x,y):
rf = RandomForestClassifier(n_estimators=2000,n_jobs=1)
rf.fit(x,y)
fi =rf.feature_importances_
fis.append(fi)
l = Manager()
fis = l.list()
p = Pool(10)
for i in range(0, 1000):
p.apply_async(skrun, args=(fis,i,X,y))
p.close()
p.join()
fi_sum = np.sum(np.stack(fis),axis=0)
## fi_sum ์ด ์ ์ ๋๋ค
## fi_sum, ์ฌ์ฉ๋ df_mat_ix_sub, y ๊ฐ์ ๊ธฐ์ค์ผ๋ก ์ค์ํด ๋ณด์ด๋ ์ ์ ์์ rank,regulation(up/down)๋ฅผ ๋๋ ค์ค.
def get_rankdf(fi_sum,df_mat_ix_sub,y): # df_mat_ixi_sub : columns์ด ์ ํด์ง๊ณ ๋ df_mat_ix
df_gene2fi = pd.DataFrame(index=df_mat_ix_sub.index)
df_gene2fi['fi'] = fi_sum
c1 = df_mat_ix_sub.columns[y==0]
c2 = df_mat_ix_sub.columns[y==1]
m = df_gene2fi['fi']>0
df_gene2fi_cut = df_gene2fi[m]
from scipy.stats import rankdata
rank_temp = rankdata(df_gene2fi_cut['fi'].astype(float))
rank = max(rank_temp)-(rank_temp-1)
df_gene2fi_cut['RNAseq_rank'] = rank
m = df_gene2fi_cut.RNAseq_rank < 500
df_gene2fi_cut_rank = df_gene2fi_cut[m]
df_annot_ix = df_annot.set_index('Transcript stable ID version')
df_gene2fi_cut_rank['gene name'] = [df_annot_ix.loc[x]['Gene name'] for x in df_gene2fi_cut_rank.index]
df_gene2fi_cut_rank['gene desc'] = [df_annot_ix.loc[x]['Gene description'] for x in df_gene2fi_cut_rank.index]
def reg(x):
try:
return np.mean(df_mat_ix_sub[c2].loc[x].fillna(0).values)/np.mean(df_mat_ix_sub[c1].loc[x].fillna(0).values)
except KeyError:
return 0
df_gene2fi_cut_rank['regulation'] = [reg(x) for x in df_gene2fi_cut_rank.index]
return df_gene2fi_cut_rank
vcf2matrix
def matrix(x,c_gt):
import numpy as np
allele = np.array([x['REF']] + x['ALT'].split(','))
gt_list = x[c_gt]
def change(i):
try:
ix = list(map(int,i.split(':')[0].split('/')))
return ''.join(allele[ix])
except ValueError:
return 'NN'
return [change(i) for i in gt_list]
# multi accessions vcf
# zcat sample.vcf.gz | grep -v '##' > sample.vcf.grep
# gzip sample.vcf.grep
df_ncbi = pd.read_csv('./SRP059747.vcf.gz.grep.gz',compression='gzip',sep='\t')
m_snp = df_ncbi['INFO'].apply(lambda x : x.split(';')[0] != 'INDEL')
df_ncbi_snp = df_ncbi[m_snp]
c_gt = df_ncbi_snp.columns[9:]
gt_list = df_ncbi_snp.apply(matrix,axis=1)
gt_list = np.stack(gt_list)
AB1 to Fastq
parallel -j 10 "seqret -osformat="fastq-sanger" -filter {1} > {1}.fastq" ::: *.ab1
Heatmap clustering python
from scipy.cluster.hierarchy import dendrogram, ward
X = X_pca
fig = plt.figure(figsize=(5,10))
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
linkage_array = ward(X)
axdendro.axis('off')
Z = dendrogram(linkage_array,orientation='left')
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
Zix = Z['leaves']
axmatrix.matshow(X[Zix],aspect='auto', origin='lower')
axmatrix.axis('off')
axans = fig.add_axes([0.92,0.1,0.1,0.8])
im = axans.matshow(y.T[Zix],aspect='auto', origin='lower')
axans.axis('off')
axcolor = fig.add_axes([1.02,0.1,0.02,0.8])
plt.colorbar(im, cax=axcolor)
GO enrichment
- ์ธํ์๋ ์๋ ์ ์ ์๊ฐ ์นด์ดํ ๋๋ ๋ฌธ์ ๊ฐ ์๋ค. ์ด๋์ ๊ปด๋๋์ง ์์๊ฐ ์์;
import goenrich
O = goenrich.obo.ontology('../../go_db/go-basic.obo')
gene2go = goenrich.read.gene2go('../../go_db/gene2go',tax_id=3702)
values = {k: set(v) for k,v in gene2go.groupby('GO_ID')['GeneID']}
background_attribute = 'gene2go'
goenrich.enrich.propagate(O, values, background_attribute)
annot = pd.read_csv('../../go_db/Arabidopsis_thaliana.gene_info',sep='\t')
ATID = annot['LocusTag']
GENEID = annot['GeneID']
dicAT2GD = dict(zip(ATID,GENEID))
def get_GO_from_genelist(gl,f='test'): # gene list
df = goenrich.enrich.analyze(O, gl, background_attribute, gvfile='temp.dot')
subprocess.check_call(['dot', '-Tpng', 'temp.dot', '-o', '%s.png'%f])
m = (df['namespace'] == 'biological_process') & (df['rejected'] == 1 )
#m = (df['namespace'] == 'biological_process') & (df['p'] < 0.01 )
#m = (df['p'] < 0.01 )
return df[m]
gl_GD = [dicAT2GD[x] for x in gl] # gl is list of arabidopsis gene name
df_out = get_GO_from_genelist(gl_GD,f='test')
import goenrich
O = goenrich.obo.ontology('../../go_db/go-basic.obo')
annot = goenrich.read.sgd('../../go_db/gene_association.tair.gz')
annot_ix = annot.set_index('db_object_name')
# db_object_name ์ผ๋ก ํด์ผ, ์ธํ๊ฐ์ ์ผ๋ฐ์ ์ธ ์ ์ ์ ์์ด๋๋ก ํ ์ ์๋ค. db_object_symbol๋ก ํ๋ฉด ์ ์ ์๋ ์ ์ ์์ด๋ฆ์ด๋, ์ ์ ์์์ด๋๋ ์์ฌ์ ๊ฒฐ๊ณผ์ ๋ฌธ์ ๊ฐ ์๊น.
values = {k: set(v) for k,v in annot.groupby('go_id')['db_object_name']}
# reference group was restricted to nodes in AI1
background_attribute = 'annot_sub'
goenrich.enrich.propagate(O, values, background_attribute)
annot_ix = annot.set_index('db_object_name')
def get_GO_from_genelist(gl,f='test'): # gene list
df = goenrich.enrich.analyze(O, gl, background_attribute, gvfile='temp.dot')
subprocess.check_call(['dot', '-Tpng', 'temp.dot', '-o', '%s.png'%f])
#m = (df['namespace'] == 'biological_process') & (df['rejected'] == 1 )
#m = (df['namespace'] == 'biological_process') & (df['p'] < 0.01 )
m = (df['p'] < 0.01 )
return df[m]
annotation cloud
from wordcloud import WordCloud
text = ' '.join(x.split('=')[0][:-2] for x in df_annot['annotation'].values)
# Generate a word cloud image
wordcloud = WordCloud(width=1600,height=1000,stopwords=["protein","homolog","kDa","related","Probable"]).generate(text)
# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
๊ทธ๋ฆผ์ผ๋ก ๋์์ ๋ง๋ค๊ธฐ
mencoder mf://*.png -mf w=800:h=600 -ovc x264 -of avi
kmer gen
def convert(n, base):
T = "0123456789ABCDEF"
q, r = divmod(n, base)
if q == 0:
return T[r]
else:
return convert(q, base) + T[r]
def kmer_gen(kmer_length):
bases = {'0':'A','1':'T','2':'G','3':'C'}
destination = int('3'*kmer_length,4)
seq_list = []
for i in range(destination+1):
nseq = convert(i,4).zfill(kmer_length)
seq = ''.join([bases[x] for x in nseq])
seq_list.append(seq)
return seq_list
http://gatkforums.broadinstitute.org/gatk/discussion/3891/calling-variants-in-rnaseq
blast xml parse
outxml = 'temp.%s.xml'%selected_sbix
print('cmd: blastn -query %s -subject %s -outfmt 5 -out %s'%(seqA_name,seqB_name,outxml))
subprocess.call('blastn -query %s -subject %s -outfmt 5 -out %s'%(seqA_name,seqB_name,outxml),shell=True)
tree = ET.parse(outxml)
root = tree.getroot()
dic = {'hitfrom':[],
'hitto':[]
}
with open('%s.twoblast.parsed.txt'%selected_sbix,'w') as f:
print ('# synteny block id : %s'%selected_sbix,file=f)
aligned_pos_ref = {0}
aligned_pos_query = {0}
for hsps in root.iter('Hsp'):
hsp_num = hsps.find('Hsp_num').text
hsp_hit_from = hsps.find('Hsp_hit-from').text
hsp_hit_to = hsps.find('Hsp_hit-to').text
hsp_query_from = hsps.find('Hsp_query-from').text
hsp_query_to = hsps.find('Hsp_query-to').text
hsp_align_len = hsps.find('Hsp_align-len').text
MUMMER
/program/MUMmer3.23/nucmer --prefix=ken2ute Ahal.assembly.scf.fasta.addhd.fa Ahal_masked.rm.fa.addhd.fa
/program/MUMmer3.23/delta-filter -g -i 98 ken2ute.delta > ken2ute.delta.filter
/program/MUMmer3.23/show-coords -rcl ken2ute.delta.filter > ken2ute.flt.coords
# scaffold a ์ scaffold b ๊ฐ์ alignment
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon -q ute_scaffold-58 --png
# scaffold a ์ ๋๋จธ์ง ๋ชจ๋ ๊ฐ์ alignment
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon --png
# scaffold a ์ ๋๋จธ์ง ๋ชจ๋ ๊ฐ์ alignment ์ coverage plot
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon --png --coverage
# scaffold a ์ ๋๋จธ์ง ๋ชจ๋ ๊ฐ์ alignment, coverage ๊ฐ์ gradient๋ก ์ค๋ค. ๊ฒฐ๊ณผ๋ dot plot like
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon --png --color
# scaffold a ์ ๋๋จธ์ง ๋ชจ๋ ๊ฐ์ alignment, SNP๋ฅผ ๋นจ๊ฐ๊ฒ ํ์. ๊ฒฐ๊ณผ๋ dot plot like
/program/MUMmer3.23/mummerplot ute2ute.delta.filter -r ute_000430F-pilon --png --SNP
augustus training
GFF=../my_csv.csv.addgene.gff3.sort.gff3
GL=../good.list.td # gene list
REF=../../ref/Ahal.assembly.scf.fasta
parallel -j 4 "cat ${GFF} | grep {1} > {1}.test.gff" :::: ${GL}
cat Gene*.test.gff > test.gff
rm Gene*.test.gff
/programs/augustus-3.2.2/scripts/gff2gbSmallDNA.pl test.gff ${REF} 1000 test.gb
/programs/augustus-3.2.2/scripts/randomSplit.pl test.gb 100
/programs/augustus-3.2.2/scripts/new_species.pl --species=aha_v1
/programs/augustus-3.2.2/bin/etraining --species=aha_v1 test.gb.train
/programs/augustus-3.2.2/scripts/optimize_augustus.pl --species=aha_v1 test.gb.train # takes ~1d
augustus --species=aha_v1 test.gb.test | tee firsttest.out
./BUSCO_v1.22.py -in Ahal.assembly.scf.fasta -c 5 -o Ahaleri -l embryophyta_odb9 -m genome --long
scipio.pl --blat_output=prot.vs.genome.psl Ahal.assembly.scf.fasta selected.fa > scipio.yaml
cat scipio.yaml | yaml2gff.1.4.pl > scipio.scipiogff
../scipiogff2gff.pl --in=scipio.scipiogff --out=scipio.gff
cat scipio.yaml | yaml2log.1.4.pl > scipio.log
TE ๊ฑธ๋ฅด๋๋ฒ
cat new_gene.fa.bp.na1.uniprot-all.fasta.out7.annot | grep -v 'Retrovirus-related' | grep -v 'Transposon' | grep -v 'retrotransposable' | grep -v 'Ribonuclease' |grep -v 'LINE' | grep -v 'retrotransposon' | grep -v 'Copia' | grep -v 'Zinc finger BED domain' | grep -v 'Replication protein' > noTE.list
gtf to gff3
# from https://github.com/jorvis/biocode
python3 /program/biocode/gff/convert_augustus_to_gff3.py -i Ahal.assembly.genepred.gff -o Ahal.assembly.genepred.gff3
FTP
docker pull stilliard/pure-ftpd:hardened
docker run -d --name ftpd_server -p 0.0.0.0:2121:21 -p 30000-30009:30000-30009 -e "PUBLICHOST=localhost" -v /root/data1/ftp:/data stilliard/pure-ftpd:hardened
docker exec -it ftpd_server /bin/bash
pure-pw useradd guest -u ftpuser -d /data
pure-pw mkdb
vcf merge
parallel -j 5 bgzip {1} ::: *.raw.vcf # tabix ๋ bgzip์ผ๋ก ํด์ผํจ!
parallel -j 5 tabix -p vcf {1} ::: *.vcf.gz
vcf-merge *.vcf.gz > test.c.vcf
Htseq-count
htseq-count -q -f bam ${1}.tophat_out_maxintron10000/accepted_hits.bam Creinhardtii_281_v5.5.gene_exons.gff3.exon.gtf > ${1}.tophat_out_maxintron10000.count
Hisat2 & stringtie & transdecoder
/program/hisat2-2.0.3-beta/hisat2 --rna-strandness FR #RF for dUTP protocol \
--max-intronlen 30000 \
-p 2 -x ../../../ref/Creinhardtii_281_v5.0 \
-1 SRR1521680_1.fastq.gz -2 SRR1521680_2.fastq.gz\
| sambamba view -f bam -o SRR1521680.bam -S /dev/stdin
sambamba sort -t 10 SRR1521680.bam
/program/stringtie-1.2.2.Linux_x86_64/stringtie -p 5 -o SRR1521680.sorted.bam.stringtie.gff SRR1521680.sorted.bam
/program/TransDecoder-2.1.0/util/cufflinks_gtf_to_alignment_gff3.pl intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff > intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff.gff3
/program/TransDecoder-2.1.0/util/cufflinks_gtf_genome_to_cdna_fasta.pl SRR1521680.sorted.bam.stringtie.gff ../../../ref/Creinhardtii_281_v5.0.fa > transcripts.fasta
/program/TransDecoder-2.1.0/TransDecoder.LongOrfs -t transcripts.fasta
/program/TransDecoder-2.1.0/TransDecoder.Predict -t transcripts.fasta
/program/TransDecoder-2.1.0/util/cdna_alignment_orf_to_genome_orf.pl transcripts.fasta.transdecoder.gff3 intron3000.merge.sorted.bam.stringtie.gff.strandcor.gff.gff3 transcripts.fasta > transcripts.fasta.transdecoder.genome.gff3
BWA
for input in SRR2751913
do
/program/bwa-0.7.12/bwa aln -e 50 -l 30 -k 1 -t 10 -M 6 -O 15 -E 8 ../ref/Cannuum/Pepper.v.1.55.total.chr ${input}_1.fastq.gz > ${input}_1.sai
/program/bwa-0.7.12/bwa aln -e 50 -l 30 -k 1 -t 10 -M 6 -O 15 -E 8 ../ref/Cannuum/Pepper.v.1.55.total.chr ${input}_2.fastq.gz > ${input}_2.sai
done
REF=../ref/Cannuum/Pepper.v.1.55.total.chr
SAMPLE=PERENNIAL
for input in SRR2751913
do
/program/bwa-0.7.12/bwa sampe -r @RG"\t"ID:${input}"\t"SM:${SAMPLE}"\t"LB:${SAMPLE}"\t"PL:Illumina ${REF} ${input}_1.sai ${input}_2.sai ${input}_1.fastq.gz ${input}_2.fastq.gz > ${input}.sam
done
$ sambamba view -f bam -o SRR2751913.bam -S -t 10 SRR2751913.sam
$ sambamba sort -t 10 -o SRR2751913.sort.bam SRR2751913.bam; sambamba mpileup -t 10 -o SRR2751913.bcf SRR2751913.sort.bam --samtools -gf ../ref/Cannuum/Pepper.v.1.55.total.chr.fa
$ sambamba mpileup -t 10 -o SRR2751914.cv.vcf SRR2751914.sorted.bam --samtools -gu ../References/Cannum/Pepper.v.1.55.total.chr --bcftools call -cv
tophat
tophat --output-dir ${1}.tophat_out --max-intron-length 1000 --num-threads 10 --no-discordant --no-mixed ../ref/Creinhardtii_281_v5.0 ${1}_1.fastq ${1}_2.fastq
parallel -j 2 tophat --num-threads 5 --max-intron-length 50000 --output-dir {1}_tophatout Alyrata_107_v1 {1} ::: *.fastq
blat
http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/blat/ # download http://iubio.bio.indiana.edu/gmod/tandy/blat2gff.pl # psl2gff
blat Creinhardtii_281_v5.0.fa /dev/null /dev/null -tileSize=11 -makeOoc=Cre281.11.ooc
blat Creinhardtii_281_v5.0.fa SRR2132408_trinity_out.Trinity.fasta -ooc=Cre281.11.ooc -minIdentity=98 output.psl
transmission
https://help.ubuntu.com/community/TransmissionHowTo
$ sudo service transmission-daemon stop
$ sudo usermod -a -G debian-transmission k821209
$ sudo chgrp debian-transmission /home/k821209/download
$ sudo chmod 770 /home/k821209/download
$ sudo service transmission-daemon start
# torrent file : /var/lib/transmission-daemon/info/torrents/
Jbrowser
http://gmod.org/wiki/JBrowse_Configuration_Guide
$ bin/prepare-refseqs.pl --fasta name.fa
$ bin/flatfile-to-json.pl --gff Creinhardtii_281_v5.5.gene.gff3 --trackType CanvasFeatures --trackLabel original_genemodel
$ ../bin/flatfile-to-json.pl --trackLabel repeatrunner \
--type protein_match:repeatrunner \ # repeatrunner protein_match feature๋ฅผ ๊ฐ์ ธ์ด
--key RepeatRunner \
--className generic_parent \
--subfeatureClasses '{"match_part" : "feature"}' \ # ์ฌ๊ธฐ์ feature์ ํด๋นํ๋ ๊ฒ์ ๋ฐ๊พธ๋ฉด ๋์์ธ์ด ๋ฐ๋๋ค
# ๋ญ๋ก ๋ฐ๊พธ๋ฉด ๋๋์ง๋ JBrowse-1.11.6/track_styles.css
--gff Creinhardtii_281_v5.0.all.gff.gff3
$ ../../bin/flatfile-to-json.pl --trackLabel est2genome --type expressed_sequence_match:est2genome --key est2genome --className generic_parent --subfeatureClasses '{"match_part" : "feature2"}' --gff Creinhardtii_281_v5.0.all.maker_try3.gff.fastaremoved.gff.est2genome.gff
$ ../../bin/generate-names.pl -v
# ์ฌ๋ฆฌ๊ณ ๋ ๋ค์ ๊ณ ์น๋ ค๋ฉด data/trackList.json ๋ฅผ ๊ณ ์น๋ฉด๋จ
# removing track
$ ../bin/remove-track.pl --trackLabel maker_gff
bam file
data/tracks.conf ์ ๋ถ์ฌ๋ฃ์ ๋๋ค.
# make a section for the new track called tracks.(unique name)
[ tracks.my-bam-track ]
# settings for what data is shown in the track
storeClass = JBrowse/Store/SeqFeature/BAM
urlTemplate = ../../raw/volvox/volvox-sorted.bam
baiUrlTemplate = ../../raw/volvox/volvox-sorted.bam.bai
# settings for how the track looks
category = NGS #< category for this track
type = JBrowse/View/Track/Alignments2
key = BAM alignments from sample XYZ
Maker
http://gmod.org/wiki/MAKER_Tutorial
apt-get install libcr-dev mpich2 mpich2-doc
maker -CTL
mpiexec -n 4 /program/maker/bin/maker </dev/null
E-PCR
$ famap -tN -b ${1}.genome.famap ./${1} # ${1} : fasta file
$ fahash -b ${1}.genome.hash -w 12 -f3 ./${1}.genome.famap
# hashed
$ re-PCR -S Creinhardtii_281_v5.5.transcript_primaryTranscriptOnly.fa.genome.hash -n1 -g1 primers.sts -o primers.sts.trans.out
SSR.sts
Mungbean_SSR_ID_1 CAAAAACATGAGTTGCACACAA TCATAACGCAGAACAGCGAA
Mungbean_SSR_ID_2 ATGTGTGTGAGCACCTCGAC TTTGGCCATGCAAGATGTAA
Mungbean_SSR_ID_4 GCGGTTCACCTAGCCATAAA GGACCCTTCTGTGCGTGTAT
Mungbean_SSR_ID_5 GTTTGTGCTGCGGATTCTTT TTGGCAATTTGGACTAAGGC
Mungbean_SSR_ID_7 TTGACCCAAAACTTACCAATTT GCTAAGGACTGGGGGTCTTC
Trinity
$ ../../download/trinityrnaseq-2.0.6/Trinity --seqType fq --left SRR2132444_1.fastq --right SRR2132444_2.fastq --SS_lib_type FR --CPU 4 --max_memory 50G
$ /program/trinityrnaseq-2.0.6/trinity-plugins/TransDecoder-2.0.1/TransDecoder.LongOrfs -t SRR2132441_trinity_out.Trinity.fasta
MCSCANX
$ cat ath.gff.cor.gff aha.gff.cor.gff > At2Ah.gff
$ /data2/k821209/programs/MCScanX/MCScanX At2Ah
$ /usr/bin/perl /data2/k821209/programs/MCScanX/downstream_analyses/add_ka_and_ks_to_collinearity.pl -i At2Ah.collinearity -d At2Ah.cds -o At2Ah.collinearity.kaks
#synteny chromosome plot
#python3 /data2/k821209/good_python_code/synteny_plot_parse.py Pv2Va.collinearity.kaks Pv2Va Pv2Va.gff
$ python3 synteny_plot_parse.py Va2Gm.collinearity.kaks Va2Gm Va2Gm.gff
$ cat Pv2Va.collinearity.kaks.Rin Va2Gm.collinearity.kaks.Rin Va2Vr.collinearity.kaks.Rin > merge.Rin
library(ggplot2)
library(scales)
data<-read.table('merge.Rin',header=F,sep='\t')
sub<-subset(data,!grepl("s",data$V3)) # remove scaffolds
sub<-subset(sub,!grepl("SS",sub$V3)) # remove super scaffolds
g <- ggplot(data, aes(x=V6,y=V7,color=factor(V3))) + geom_point(alpha=0.5,size=1) + facet_grid(V5 ~ V1) + ylim(0,3) +
scale_x_continuous(labels = comma) + theme(axis.text.x = element_text(angle = 60, hjust = 1))+ ylab('Ks value') + xlab('Genomic position')
png('all.png', height=600, width=1240, res = 100, units = 'px')
print(g)
dev.off()
Orthomcl
orthomcl ๋ ์์์ ์ด๋ฆ์ ๋ฐ๊พธ๋ฉด๋๋ค.
#Make config file : orthomcl.config.template
dbVendor=mysql
dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:localhost:3306
dbLogin=orthomcl
dbPassword=8804555
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5
oracleIndexTblSpc=NONE
$ mysql -u root -p
mysql> CREATE DATABASE orthomcl;
mysql> GRANT SELECT,INSERT,UPDATE,DELETE,CREATE VIEW,CREATE, INDEX, DROP on orthomcl.* TO orthomcl@localhost;
mysql> set password for orthomcl@localhost = password('8804555');
$ orthomclInstallSchema orthomcl.config.template
$ orthomclAdjustFasta VRA final.assembly.pep.fasta.primary.SH.fa 1
$ mkdir ./compliantFasta/
$ mv VRA.fasta ./compliantFasta/ # same for the ATH.fasta etc.
$ orthomclFilterFasta ./compliantFasta/ 10 20
$ blastall -m 8 -a 10 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i goodProteins.fasta -o goodProteins.fasta.self.out # 27๊ฐ ํ ๋ ๋ณด๋ฆ ์ด์๊ฑธ๋ฆผ
#-z database size : grep -v '>' INPUT.fa | tr -d '\n' | wc
$ gpu:/data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t
$ orthomclBlastParser goodProteins.fasta.self.out compliantFasta/ >> similarSequences.txt # 27๊ฐ accessionํ ๋ ์ด ์คํ
๋ง 6์๊ฐ ์ ๋ ๊ฑธ๋ฆผ
$ orthomclLoadBlast orthomcl.config.template similarSequences.txt # ์ฉ๋ ๋ง์ด ๋จน์ ์ถฉ๋ถํ ํ๋ณด
$ orthomclPairs orthomcl.config.template orthomcl_pairs.log cleanup=no
$ orthomclDumpPairsFiles orthomcl.config.template
$ mcl mclInput --abc -I 1.5 -o mclOutput
$ orthomclMclToGroups CLST 1000 < mclOutput > groups.txt
$ python3 /data2/k821209/good_python_code/orthomcl/groups_count.py
Venndia gram via R
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R")
sets <- read.delim("groups.txt.Rin")
setlistImp <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x]))
names(setlistImp) <- colnames(sets)
setlist5 <- setlistImp[1:5]; OLlist5 <- overLapper(setlist=setlist5, sep="_", type="vennsets")
counts <- sapply(OLlist5$Venn_List, length)
vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))
ipython notebook server
$ ipython notebook --profile=nbserver
Venndia gram
Filename : ts.Rin
Flower Leaf Pod Root
Vang01g00010 Vang01g00010 Vang01g00010 Vang01g00010
Vang01g00020 Vang01g00030 Vang01g00030 Vang01g00020
Vang01g00030 Vang01g00050 Vang01g00050 Vang01g00030
Vang01g00050 Vang01g00110 Vang01g00110 Vang01g00050
Vang01g00110 Vang01g00120 Vang01g00120 Vang01g00110
Vang01g00120 Vang01g00160 Vang01g00160 Vang01g00120
Vang01g00140 Vang01g00170 Vang01g00170 Vang01g00140
Vang01g00160 Vang01g00250 Vang01g00250 Vang01g00160
Vang01g00170 Vang01g00270 Vang01g00270 Vang01g00170
Vang01g00180 Vang01g00310 Vang01g00310 Vang01g00250
Vang01g00210 Vang01g00320 Vang01g00320 Vang01g00270
.....
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R")
sets <- read.delim("ts.Rin")
setlist <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x]))
names(setlist) <- colnames(sets)
OLlist <- overLapper(setlist=setlist, sep="", type="vennsets"); OLlist; names(OLlist)
setlist5 <- setlist[1:5]; OLlist5 <- overLapper(setlist=setlist5, sep="", type="vennsets")
counts <- sapply(OLlist5$Venn_List, length)
vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))
Interpro
$/data2/k821209/programs/interproscan-5-RC4/interproscan.sh -appl Pfam -T /data2/k821209/Mungbean/maker4/TMP/ -i target.pep.fa
Repeatmasking
#BuildDatabase -name echinosophora -engine ncbi echinosophora_p_pilon.fasta
#RepeatModeler -pa 20 -engine ncbi -database echinosophora 2>&1 | tee repeatmodeler.log
RepeatMasker -pa 20 -e ncbi -lib echinosophora-families.fa -dir ./echinosophora_repeat_v1/ -gff echinosophora_p_pilon.fasta
LTR search using gt
$ /data2/k821209/programs/genometools-1.5.1/bin/gt suffixerator -suf -lcp -dna -indexname ${1} -db ${1}
$ /data2/k821209/programs/genometools-1.5.1/bin/gt ltrharvest -gff3 ${1}.gff3 -index ${1}
$ /data2/k821209/programs/genometools-1.5.1/bin/gt gff3 -sort {1}.gff3 > {1}.sort.gff3
$ /data2/k821209/programs/genometools-1.5.1/bin/gt -j 2 ltrdigest -pptlen 10 30 -pbsoffset 0 3 -trnas Athal-tRNAs.fa -hmms ./hmm/*.hmm -outfileprefix {1} {1}.sort.gff3 Athaliana_167.fa | tee {1}.ltrdigest_GyDB.gff3
Trinity RNAseq assemble
#single end
$ /path/trinityrnaseq_r2013-02-25/Trinity.pl --seqType fq --single ERR185151.fastq --JM 2G --CPU 4 --no_cleanup --monitoring
#paired end
$ /data2/k821209/programs/trinityrnaseq_r2013-02-25/Trinity.pl --seqType fq --left ERR173896_1.fastq --right ERR173896_2.fastq --SS_lib_type FR --CPU 4 --no_cleanup --monitoring --JM 2G
Jellyfish:Kmercount
$ zcat SunhwaN_?.fastq.gz | jellyfish count -C -m 22 -s 50G -t 10 -c 6 -o output /dev/fd/0
$ jellyfish histo -o hist.out output_0
BSseq
$ ll *
-rw-rw-r-- 1 k821209 k821209 148 1?? 21 11:21 run.bismark.sh
-rw-r--r-- 1 k821209 k821209 8396394060 1?? 21 11:10 S_1_4_5_1.fastq.gz
-rw-r--r-- 1 k821209 k821209 8014309256 1?? 21 11:13 S_1_4_5_2.fastq.gz
gw:
total 453976
drwxrwxr-x 3 k821209 k821209 4096 1?? 21 11:21 ./
drwxrwxr-x 3 k821209 k821209 4096 1?? 21 11:21 ../
drwxrwxr-x 4 k821209 k821209 4096 1?? 21 11:21 Bisulfite_Genome/
-rwxr-xr-x 1 k821209 k821209 464854829 1?? 21 11:14 Vradi_ver6.fa*
- file:run.bismark.sh
path2bowtie=/where/is/
path2samtools=/where/is/
bismark_genome_preparation --bowtie2 --path_to_bowtie ${path2bowtie} ${PWD}/gw/
bismark --gzip --bowtie2 --fastq --bam -p 4 -N 1 --samtools_path ${path2samtools} --path_to_bowtie ${path2bowtie} ${PWD}/gw/ -1 read_1.fastq.gz -2 read_2.fastq.gz
bismark_methylation_extractor --multicore 3 --genome_folder ${PWD}/gw/ --scaffolds --cytosine_report --CX --paired-end --no_overlap --comprehensive --samtools_path ${path2samtools} --gzip --bedGraph --counts --buffer_size 10G resulting.bam
- bsseq (R)
## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("bsseq")
library(bsseq)
mungbean_methylation<-read.bismark(file=c("./S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov","./K_1_2_4_1.fastq.gz_bismark_bt2_pe.bismark.cov"),sampleNames =c("sunhwa","kyunggi"), fileType="cov",verbose=TRUE,rmZeroCov=FALSE)
mungbean_methylation_smed = BSmooth(mungbean_methylation, mc.cores = 1, verbose = TRUE)
Genotype by sequencing (GBS)
For GBS, we can use whole reference genome for read mapping procedure or sub genomic region harboring the restriction enzyme site. I usually use whole genome; however in the case of interspecies crossing population that would have many segregation distortion, it was better to use the sub genomic region.
samtools mpileup -DSugf Enzyme_site_block_ApeKI.txt.fa *.bam | bcftools view -bvcg - > all.bvcg.bcf
Q value
False discovery rate (FDR) calculation using R
> source("http://bioconductor.org/biocLite.R")
> biocLite("qvalue")
> library(qvalue)
> p<-scan("S_all.pv.txt.pv") # only number containing file ex > 1 \n 0.01 \n 0.2 \n ...
> qobj <- qvalue(p)
> qobj1 <- qvalue(p, lambda=0.5, robust=TRUE)
> qobj2 <- qvalue(p, fdr.level=0.05, pi0.method="bootstrap")
> qwrite(qobj, filename="myresults.txt")
Weblogo python
# composition ์ ์์๋๋ ๋น์จ์ ๋ฃ์ด์ค์ผ ํจ!
weblogo --format pdf --size large --resolution 300 --datatype fasta --composition "{'A':20, 'C':30, 'G':30, 'T':20}" --fin S_total.CG.context.fa > S_total.CG.context.fa.pdf
RNA structure
~/programs/RNAstructure/exe/MaxExpect comp_test3.fa comp_test3.fa.me.ct --sequence
# ๊ฒฐ๊ณผ outfile.ct ๋ jViz.Rna ๋ก Visualization