8_Orthofinder - bennestor/hakea_genome GitHub Wiki

Orthofinder

This analysis was done using the genome_v1 sequence of H. prostrata

Install

   #Download predicted protein sequences of species from NCBI:
   #Arabidopsis (GCF_000001735.4), Macadamia integrifolia (GCF_013358625.1), Telopea specioisissima (Chen et al 2021 paper), Gossypium hirsutum (GCF_007990345.1), Eucalyptus grandis (GCF_016545825.1), Vitis vinifera (GCF_000003745.3), and Nelumbo nucifera (GCF_000365185.1) 

Run Orthofinder on Hakea and other plant species

   #Run Orthofinder script: 1_orthofinder.sh on zeus (6h) took ~2h
   orthofinder -o 2_orthofinder_out -f proteomes -a 28 -t 28 -M msa -S diamond -A mafft -T fasttree
   #Number of orthogroups=32505, Number of species-specific orthogroups=13990, Number of single-copy orthogroups=29 

Get species-specific genes for Hakea

   #Extract Hakea gene ids from orthogroups 
   cat Orthogroups.txt | grep -Fw -f hakea_specific.ids | tr ' ' '\n' | grep -E -v 'O.*' | sort -u > hakea_specific_gene.txt  
   #5868 sequences

Make upset plot of orthogroups

   #Get orthogroups for each species 
   awk -F"\t" '$2>0' Orthogroups.GeneCount.tsv | cut -f1 > arabidopsis_orthogroups.ids 
   awk -F"\t" '$3>0' Orthogroups.GeneCount.tsv | cut -f1 > eucalyptus_orthogroups.ids 
   awk -F"\t" '$4>0' Orthogroups.GeneCount.tsv | cut -f1 > gossypium_orthogroups.ids 
   awk -F"\t" '$5>0' Orthogroups.GeneCount.tsv | cut -f1 > hakea_orthogroups.ids 
   awk -F"\t" '$6>0' Orthogroups.GeneCount.tsv | cut -f1 > macadamia_orthogroups.ids 
   awk -F"\t" '$7>0' Orthogroups.GeneCount.tsv | cut -f1 > nelumbo_orthogroups.ids 
   awk -F"\t" '$8>0' Orthogroups.GeneCount.tsv | cut -f1 > telopea_orthogroups.ids 
   awk -F"\t" '$9>0' Orthogroups.GeneCount.tsv | cut -f1 > vitis_orthogroups.ids
   #In R 
   #install.packages("UpSetR") 
   library(UpSetR) 
   a <- read.delim("arabidopsis_orthogroups.ids") 
   b <- read.delim("eucalyptus_orthogroups.ids") 
   c <- read.delim("gossypium_orthogroups.ids") 
   d <- read.delim("hakea_orthogroups.ids") 
   e <- read.delim("macadamia_orthogroups.ids") 
   f <- read.delim("nelumbo_orthogroups.ids") 
   g <- read.delim("telopea_orthogroups.ids") 
   h <- read.delim("vitis_orthogroups.ids") 
   x <- c(a,b,c,d,e,f,g,h) 
   upset(fromList(x), order.by = "freq", nsets = 8, text.scale = 1.1) 
⚠️ **GitHub.com Fallback** ⚠️