# pangenes/asciinema.txt

# download the code from GitHub and enter the repository folder
git clone https://github.com/Ensembl/plant-scripts.git
cd plant-scripts/

# if you had already cloned the repository, you can update it like this instead
git pull

# pause for 3 seconds (presumably to pace the recording)
sleep 3

# install dependencies and download the test_rice dataset; this takes some time
make install_pangenes
# list the contents of folders lib/ and pangenes/bin/
ls lib pangenes/bin

sleep 3

# configure for your HPC cluster (optional, but recommended)
cd pangenes/

# there are two sample config files: HPC.conf.sample (LSF) and HPC.conf.sample.slurm (Slurm)
ls HPC*

# suppose you want to run GET_PANGENES on a Slurm cluster
cp HPC.conf.sample.slurm HPC.conf
cat HPC.conf
# you should manually edit file HPC.conf to match your own settings;
# for instance, the provided sample file assumes a queue named 'production' and a maximum of 70GB RAM per job,
# which was enough in our benchmarks, up to wheat and maize, using minimap2; you might want to change that

sleep 3

## examples

# local analysis of the test_rice data; to make it HPC/parallel run instead:
#   perl get_pangenes.pl -d ../files/test_rice -m cluster
perl get_pangenes.pl -d ../files/test_rice

# the results folder is created under the current folder (pangenes/):
#   test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_
# a relative path is used here so that the command works wherever the repository was cloned
ls test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_

sleep 3

# now restrict whole-genome alignments (WGA) to homologous chromosomes;
# for this to work, chr names in the input FASTA files must be consistent, so that a single regular expression matches them all; let's check:
zgrep "^>" ../files/test_rice/*fa.gz

# in the test_rice example the main chromosomes are named with integer numbers, so this should work:
perl get_pangenes.pl -d ../files/test_rice -s '^\d+'

# now the results are in test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_split_
# note that 'split_' has been added to the folder name to indicate that input genomes were split by chr matching the regular expression

sleep 3

# now we will mask geneless regions >= 1Mb (option -H); this is required by minimap2 with large genomes
perl get_pangenes.pl -d ../files/test_rice -H
# in this case, the output is saved to Oryza_nivara_v1chr1_highrep_alltaxa_2neigh_algMmap_
# note the 'highrep_' tag in the folder name

sleep 3

# let's check how much disk space the output folder takes; most of it is temporary files
# that might be re-used in future jobs, but which can be removed if needed
du -hs test_rice_pangenes/
du -hs test_rice_pangenes/tmp

sleep 3

# we will now extract the WGA evidence supporting an example pangene cluster
# (-d: results folder, -i: cluster file); see also options -f and -v
perl check_evidence.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -i gene:ONIVA01G50800.cdna.fna

sleep 3

# match arbitrary sequences to the computed pangene clusters;
# the matches are written to the file given with -o
perl match_cluster.pl -d test_rice_pangenes/Oryza_nivara_v1chr1_alltaxa_2neigh_algMmap_/ -s ../files/test_transcripts.fna -o test_transcripts.gmap.tsv
# print the output file just produced
cat test_transcripts.gmap.tsv

sleep 3

# clean up: remove the results folder (including its temporary files)
# and the match_cluster.pl output file written to the current folder
rm -rf test_rice_pangenes
rm -f test_transcripts.gmap.tsv

exit