#!/bin/bash
# This script will run BUSCO version 4.0.6 to assess genome assembly completeness
###############
### SETUP ###
###############
# Usage: <script> PREFIX ASSEMBLY OUT_DIR LINEAGE
# Checks the four positional arguments, loads the conda environment and
# defines the variables used by the rest of the script.

### Arguments ###
# Fail fast: with an empty OUT_DIR the script would otherwise create and enter
# "/BUSCO" and hand empty option values to busco.
if [[ $# -lt 4 || -z "$1" || -z "$2" || -z "$3" || -z "$4" ]]; then
  echo "Usage: ${0##*/} PREFIX ASSEMBLY OUT_DIR LINEAGE" >&2
  exit 2
fi

### Conda ###
echo "Setting up conda environment"
# NOTE(review): user-specific path -- presumably this is what makes `conda`
# available in the job; confirm before running as another user.
source /home/sejoslin/.bashrc
# Only warn on failure so that a busco already reachable on PATH still works.
conda activate busco4 \
  || echo "WARNING: 'conda activate busco4' failed; busco must already be on PATH" >&2
#export AUGUSTUS_CONFIG_PATH="/home/sejoslin/config/augustus-3.2.3/config"

### Variables ###
echo "Setting up variables"
# Fall back to 2 when SLURM_NTASKS is unset or empty, so that the THREADS - 1
# handed to busco's -c option is still at least 1.
THREADS=${SLURM_NTASKS:-2}
# external variables
PREFIX=$1    # Prefix of all fastq files (and output)
ASSEMBLY=$2  # Assembly file
OUT_DIR=$3   # Path to output directory
LINEAGE=$4   # Lineage to be downloaded and used (list the options with `busco --list-datasets`)
# internal variables
busco_dir="${OUT_DIR}/BUSCO"
echo " Assembly : ${ASSEMBLY}"
echo " Lineage File : ${LINEAGE}"
echo " BUSCO Directory : ${busco_dir}"
echo " Threads : ${THREADS}"
echo ""
###################
### RUN BUSCO ###
###################
# Runs BUSCO in genome mode on ${ASSEMBLY} with lineage ${LINEAGE}, using
# ${busco_dir} as working directory and output path. The run is named
# BUSCO_${PREFIX}.

mkdir -p -- "${busco_dir}" || {
  echo "ERROR: cannot create ${busco_dir}" >&2
  exit 1
}

# Pin a relative assembly path to the current directory before changing
# directory; after the cd it would no longer point at the file. A path that
# does not exist here is passed through unchanged.
assembly_path=${ASSEMBLY}
if [[ "${assembly_path}" != /* && -e "${assembly_path}" ]]; then
  assembly_path="${PWD}/${assembly_path}"
fi

# Stop if the working directory is unusable instead of running busco (and
# writing output) in whatever directory the script was started from.
cd -- "${busco_dir}" || {
  echo "ERROR: cannot cd to ${busco_dir}" >&2
  exit 1
}

# Pass THREADS - 1 to busco as before, but never less than one CPU
# (THREADS may be 1, or empty when SLURM_NTASKS was not set).
busco_cpus=$(( ${THREADS:-1} - 1 ))
(( busco_cpus >= 1 )) || busco_cpus=1

# Build the command as an array and run it directly (no eval), so values
# containing spaces or shell metacharacters reach busco as single arguments.
# --out_path uses ${PWD}: we are already inside busco_dir, so a relative
# OUT_DIR is not resolved a second time.
busco_cmd=(
  busco -m genome
  -i "${assembly_path}"
  -l "${LINEAGE}"
  -o "BUSCO_${PREFIX}"
  --out_path "${PWD}"
  -c "${busco_cpus}"
)
# Kept as a plain string for logging, under the original variable name.
BUSCO_call="${busco_cmd[*]}"
echo "${BUSCO_call}"
"${busco_cmd[@]}"
## BUSCO README ##
#######################################
# Append a README.md describing the BUSCO output layout to a run directory.
# The BUSCO version in the first line comes from `busco --version`.
# Arguments: $1 - BUSCO run directory (must already exist)
# Outputs:   appends to $1/README.md; a warning goes to stderr on failure
# Returns:   1 if the directory does not exist, otherwise the status of cat
#######################################
write_busco_readme() {
  local run_dir=$1
  # The run directory is created by busco; if it is missing, say so clearly
  # instead of failing on the redirection.
  if [[ ! -d "${run_dir}" ]]; then
    echo "WARNING: ${run_dir} does not exist; README.md not written" >&2
    return 1
  fi
  # The heredoc body and its terminator must stay at column 0 (plain <<).
  cat << readme >> "${run_dir}/README.md"
This directory contains output for the reports generated by $(busco --version) for the assembly of Hypomesus transpacificus.
Files & Directories:
--run_lineage_name/ == Main results folder.
--short_summary_*.txt == Contains a plain text summary of the results in BUSCO notation and gives a brief breakdown of metrics.
--full_table_*.tsv == Contains the complete results in a tabular format with scores and lengths of BUSCO matches, and coordinates.
--missing_buscos_list_*.tsv == Contains a list of missing BUSCOs.
--hmmer_output/ == Directory containing HMMER output of searches with BUSCO HMMs
--blast_output/ == tBLASTn results
--logs/ == Detailed busco.log with debug info and stderr & stdout of all 3rd party software
--augustus_output == Augustus-predicted genes
--predicted_genes == Augustus raw gene output.
--extracted_proteins == Augustus protein FASTA output.
--retraining_parameters == BUSCO retraining. Specific to species.
--gb == GenBank format complete BUSCOs before retraining.
--gff == General Feature Format complete BUSCOs before retraining.
--training_set.db == Genes used for Augustus retraining.
--blast_output/ == Results of the tBLASTn alignment tool, for the eukaryotic genome runs.
--tblastn.tsv == tabular tBLASTn results
--coordinates.tsv == locations of BUSCO matches (eukaryotic genome)
--tblastn_missing_and_frag_rerun.tsv == tabular tBLASTn results during the 2nd phase (eukaryotic genome)
--coordinates_missing_and_frag_rerun.tsv == locations of BUSCO matches during the 2nd phase (eukaryotic genome)
--db/ == Blast database
--sequences == Sequences having blast results.
--single_copy_busco_seq == FASTA format file for each complete single-copy BUSCO identified.
readme
}

write_busco_readme "BUSCO_${PREFIX}"