8_Orthofinder - bennestor/hakea_genome GitHub Wiki
This analysis was done using the genome_v1 sequence of H. prostrata
Install
#Download predicted protein sequences of species from NCBI: #Arabidopsis (GCF_000001735.4), Macadamia integrifolia (GCF_013358625.1), Telopea specioisissima (Chen et al 2021 paper), Gossypium hirsutum (GCF_007990345.1), Eucalyptus grandis (GCF_016545825.1), Vitis vinifera (GCF_000003745.3), and Nelumbo nucifera (GCF_000365185.1)
Run Orthofinder on Hakea and other plant species
#Run Orthofinder script: 1_orthofinder.sh on zeus (6h) took ~2h orthofinder -o 2_orthofinder_out -f proteomes -a 28 -t 28 -M msa -S diamond -A mafft -T fasttree #Number of orthogroups=32505, Number of species-specific orthogroups=13990, Number of single-copy orthogroups=29
Get species-specific genes for Hakea
#Extract Hakea gene ids from orthogroups cat Orthogroups.txt | grep -Fw -f hakea_specific.ids | tr ' ' '\n' | grep -E -v 'O.*' | sort -u > hakea_specific_gene.txt #5868 sequences
Make upset plot of orthogroups
#Get orthogroups for each species awk -F"\t" '$2>0' Orthogroups.GeneCount.tsv | cut -f1 > arabidopsis_orthogroups.ids awk -F"\t" '$3>0' Orthogroups.GeneCount.tsv | cut -f1 > eucalyptus_orthogroups.ids awk -F"\t" '$4>0' Orthogroups.GeneCount.tsv | cut -f1 > gossypium_orthogroups.ids awk -F"\t" '$5>0' Orthogroups.GeneCount.tsv | cut -f1 > hakea_orthogroups.ids awk -F"\t" '$6>0' Orthogroups.GeneCount.tsv | cut -f1 > macadamia_orthogroups.ids awk -F"\t" '$7>0' Orthogroups.GeneCount.tsv | cut -f1 > nelumbo_orthogroups.ids awk -F"\t" '$8>0' Orthogroups.GeneCount.tsv | cut -f1 > telopea_orthogroups.ids awk -F"\t" '$9>0' Orthogroups.GeneCount.tsv | cut -f1 > vitis_orthogroups.ids
#In R #install.packages("UpSetR") library(UpSetR) a <- read.delim("arabidopsis_orthogroups.ids") b <- read.delim("eucalyptus_orthogroups.ids") c <- read.delim("gossypium_orthogroups.ids") d <- read.delim("hakea_orthogroups.ids") e <- read.delim("macadamia_orthogroups.ids") f <- read.delim("nelumbo_orthogroups.ids") g <- read.delim("telopea_orthogroups.ids") h <- read.delim("vitis_orthogroups.ids") x <- c(a,b,c,d,e,f,g,h) upset(fromList(x), order.by = "freq", nsets = 8, text.scale = 1.1)