Genome skim reads to Mvim genome (randomly subsequenced 1kb every 100kb segments in each chromosome) - DR-genomics/Genomics-pipelines GitHub Wiki

  1. bedtools makewindows -w 100000 -g ../chr_sizes.txt > JS_allchr23.windows.100kb #100 kb windows
  2. grep -wE "Scaffold_1|Scaffold_2" JS_allchr23.windows.100kb > JS_chr1_2.windows.100kb #keep only the first 2 chromosomes to test the pipeline (-E is required so that "|" is treated as alternation rather than a literal character)
  3. bedtools makewindows -w 1000 -b JS_chr1_2.windows.100kb > JS_chr1_2.windows.100kb.into1kb_bins #1 kb segments in every 100 kb window (-b rather than -g, because the input is a BED file of intervals, not a genome sizes file)
  4. bedtools getfasta -fi JS_allchr23.fasta -bed JS_chr1_2.windows.100kb -fo JS_chr1_2.windows.100kb.fasta #fasta file for step 2 bed file => file with 100 kb segments in both chr 1 & 2
  5. split -l 100 --filter='shuf -n1' JS_chr1_2.windows.100kb.into1kb_bins > JS_chr1_2.windows.100kb.into1kb_bins.random1kb.bed #Randomly choose one 1 kb interval from each consecutive block of 100 lines (a full 100 kb window contributes 100 bins) of the bed file created in step 3.
  6. nohup bedtools getfasta -fi JS_allchr23.fasta -bed JS_chr1_2.windows.100kb.into1kb_bins.random1kb.bed -fo JS_chr1_2.windows.100kb.into1kb_bins.random1kb.fasta & #Extract those randomly chosen 1kb fragments (from each 100kb bin) from whole genome
  7. sed 's/:.*$//g' JS_chr1_2.windows.100kb.into1kb_bins.random1kb.fasta > a.fasta #Remove the coordinate information from each fasta header, so that all fragments from the same chromosome share an identical header.
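  8. awk '/^>/ {if(prev!=$0) {prev=$0;printf("\n%s\n",$0);} next;} {printf("%s",$0);} END {printf("\n");}' a.fasta > minichromosomes.fasta # Re-concatenate the 1 kb fragments from each chromosome into one minichromosome per chromosome by merging all consecutive sequences that share an identical fasta header. NOTE: the output filename was missing after ">" in the original; "minichromosomes.fasta" is a placeholder - substitute your own name.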