7_Expression - bennestor/hakea_genome GitHub Wiki

Table of Contents

Expression of Hakea genes

Quantifying gene expression

Map RNAseq reads to genome with HISAT2

   #Build index in genome_idx/ 
   hisat2-build -p 24 ../../../6_repeats_out/ragtag_mask.fa ragtag_mask_idx 
   
   #RvL array script: 1_hisat.array took ~5 min 
      #Array options:
      #SBATCH --array=0-10
      #SBATCH --cpus-per-task=8
      #SBATCH --mem=16G
      FILES=($(\ls [^W]*_val_1.fq.gz))
      FILENAME=${FILES[$SLURM_ARRAY_TASK_ID]}
      FILENAME2=${FILENAME/_R1_justCor_val_1/_R2_justCor_val_2}
      FILENAME3=${FILENAME%_R1_justCor_val_1.fq.gz}
      #Job command:
      export RD="/scratch/pawsey0149/bnestor/2021_03_17_Transcriptome/3-cleaning/2_trim_out"
      hisat2 -x genome_idx/ragtag_mask_idx --threads 8 -1 $RD/${FILENAME} -2 $RD/${FILENAME2} | samtools sort -O BAM --threads 8 > ${FILENAME3}.bam
   
   #WL array script: 1_hisat_WL.array took ~5 min 
      #Array options:
      #SBATCH --array=0-14
      #SBATCH --cpus-per-task=8
      #SBATCH --mem=16G
      FILES=($(\ls [^W]*_val_1.fq.gz))
      FILENAME=${FILES[$SLURM_ARRAY_TASK_ID]}
      FILENAME2=${FILENAME/_R1_justCor_val_1/_R2_justCor_val_2}
      FILENAME3=${FILENAME%_R1_justCor_val_1.fq.gz}
      #Job command:
      export RD="/scratch/pawsey0149/bnestor/2021_03_17_Transcriptome/3-cleaning/2_trim_out"
      hisat2 -x genome_idx/ragtag_mask_idx --threads 8 --rna-strandness RF -1 $RD/${FILENAME} -2 $RD/${FILENAME2} | samtools sort -O BAM --threads 8 > ${FILENAME3}.bam

Featurecounts to quantify gene expression

   #make saf annotation file 
   cat ../7_gemoma_out/filtered_predictions.gff  | grep 'CDS' | sed -r 's/(Parent=)(.*)/gene_id \"\2\"/g' | sed -r 's/CDS/exon/g' > filtered_predictions.saf
   
   #run featurecounts in 1_hisat_out 
   featureCounts -p -t exon -a ../filtered_predictions_rn.saf -o ../2_featurecounts_out/featurecounts.matrix -T 24 -s 0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 CR0.bam CR1.bam CR4.bam CR7.bam CR12.bam R1.bam R10.bam R50.bam ML1.bam ML10.bam ML50.bam WL1.bam WL2.bam WL3.bam WL4.bam WL5.bam WL6.bam WL7.bam WL8.bam WL9.bam WL10.bam WL11.bam WL12.bam WL13.bam WL14.bam WL15.bam 
   
   #Clean up using vim 
   %s/.bam//g 

Differential expression of roots vs leaves

Installing programs (on local computer)

   #Install CGI 
   sudo apt install libcgi-pm-perl 
   
   #Install r-base (v4.2) by conda 
   
   #Install BiocManager in R
   if (!require("BiocManager", quietly = TRUE)) 
       install.packages("BiocManager") 
   
   #Install edgeR (v3.40.0), limma, DESeq2 (v1.38.1) in R 
   BiocManager::install(c("edgeR", "limma", "DESeq2") 

Run TMM normalisation to make heatmaps

   #Extract RvL matrix 
   cat featurecounts_rn.txt | cut -f 1,12-17 > featurecounts_RvL.txt 
   
   #Roots TMM
   $TRINITY_HOME/run_TMM_normalization_write_FPKM_matrix.pl --just_do_TMM_scaling --matrix featurecounts_RvL.txt 
   
   #Get TMM count averages for roots and leaves 
   $TRINITY_HOME/replicates_to_sample_averages_matrix.pl --samples_file samples.txt --matrix featurecounts_RvL.txt.TMM_rescaled 
   #Add ‘Geneid’ back in as header
   
   #I used the TMM expression matrix to get expressions of genes I was interested in and make heatmaps with these

Test for differential expression using edgeR with no minimum expression

   #Run EdgeR with tab delimited samples.txt and contrasts.txt 
   $TRINITY_HOME/run_DE_analysis.pl --matrix ../2_featurecounts_out/featurecounts_rn.txt --method edgeR --min_reps_min_cpm 1,0.0000000001 --samples_file samples.txt --dispersion 0.4 --output 3_edgeR_norep_out/ --contrasts contrasts.txt 

Differential expression of leaf development stages

TMM normalisation

   #WL TMM 
   $TRINITY_HOME/run_TMM_normalization_write_FPKM_matrix.pl --just_do_TMM_scaling --matrix featurecounts_WL.txt 
   
   #Get count averages for each WL stage
   $TRINITY_HOME/replicates_to_sample_averages_matrix.pl --samples_file ../samples.txt --matrix featurecounts_WL.txt.TMM_rescaled 
   #Add ‘Geneid’ back in 

DESeq2 for differentially expressed genes

   #Filter deseq2 matrices (to my nitrate transporter genes for example)

for file in featurecounts_WL*DE_results ; do grep "NRT2\|NAR2\|\NPF\|sampleA" > ${file}_nitrate.txt ; done

   #Get IDs of all genes with significant differences between at least one pairwise comparison 
   cat *_nitrate.txt | grep -v "sampleA" | awk '{ if ($11 <= 0.05) print $1 }' | sort -u > nitrate_sig_dif.txt 
   #Add ‘Geneid to top’
   
   #Filter TMM matrix to differentially expressed nitrate genes to make a heatmap
   cat featurecounts_WL.matrix.TMM_rescaled_nitrate | grep -Fw -f ../2_deseq2_out/nitrate_sig_dif.txt | sort > featurecounts_WL.matrix.TMM_rescaled_nitrate_sig 
   
   #I used TBtools to automatically generate heatmaps, but R heatmap packages are probably better
⚠️ **GitHub.com Fallback** ⚠️