diff --git a/paper/sample.json.png b/paper/sample.json.png
index e85537b4..155fe449 100644
Binary files a/paper/sample.json.png and b/paper/sample.json.png differ
diff --git a/tests/10_benchmark.sh b/tests/10_benchmark.sh
index f98eb5aa..af05d9a9 100644
--- a/tests/10_benchmark.sh
+++ b/tests/10_benchmark.sh
@@ -5,6 +5,13 @@ set -e
 thisDir=$(dirname $0);
 export PATH=$thisDir/../target/release:$PATH
 
+# Check whether hyperfine is installed; stop with a failure status if it is not
+if ! command -v hyperfine > /dev/null 2>&1
+then
+    echo "hyperfine could not be found. It is required for benchmarking." >&2
+    exit 1
+fi
+
 # Hyperfine parameters
 # Locally, just run one time per test but in the cloud, boost it to ten
 num_runs=10
diff --git a/tests/benchmark_sample.sh b/tests/benchmark_sample.sh
index 944c680f..e0cf41f0 100644
--- a/tests/benchmark_sample.sh
+++ b/tests/benchmark_sample.sh
@@ -10,7 +10,8 @@ export PATH=$thisDir/../target/release:$PATH
 hyperfine --export-json=$reportsDir/sample.json --warmup 2 --shell $SHELL --runs $num_runs \
   -n "Fasten sample" "cat $large_R1 | fasten_sample --frequency 0.1" \
   -n "seqkit sample" "cat $large_R1 | seqkit sample --proportion 0.1" \
-  -n "Seqtk sample" "seqtk seq -f 0.1 $large_R1";
+  -n "Seqtk sample" "seqtk seq -f 0.1 $large_R1" \
+  -n "Seqfu sample" "seqfu cat --skip 10 $large_R1";
 
-plot_whisker.py --title "subsample reads (reps=$num_runs)" --labels "fasten sample,seqkit sample,seqtk sample" --output $reportsDir/sample.json.png $reportsDir/sample.json
+plot_whisker.py --title "subsample reads (reps=$num_runs)" --labels "fasten sample,seqkit sample,seqtk sample,seqfu cat" --output $reportsDir/sample.json.png $reportsDir/sample.json
 
diff --git a/tests/fasten_combine.sh b/tests/fasten_combine.sh
index 1f8fbc25..736aa34c 100644
--- a/tests/fasten_combine.sh
+++ b/tests/fasten_combine.sh
@@ -16,15 +16,15 @@ if [ "$reads_not_collapsed" != "$original_reads" ]; then
 fi
 
 reads_collapsed=$(cat $INPUT $INPUT $INPUT | ./target/debug/fasten_combine | ./target/debug/fasten_metrics --each-read)
-total_quals=$(echo "$reads_collapsed" | cut -f 3 | tail -n +2 | paste -sd+ | bc -l)
+total_quals=$(echo "$reads_collapsed" | cut -f 3 | tail -n +2 | awk '{if(NR>1) printf "+"; printf "%s", $1} END{printf "\n"}' | bc -l)
 if [ "$total_quals" != "259.31" ]; then
   echo "Test failed for total expected quality when collapsing three sets of reads"
   exit 1
 fi
 
 pe_collapsed=$(cat $INPUT $INPUT $INPUT | ./target/debug/fasten_combine --paired-end | ./target/debug/fasten_metrics --each-read)
-IDs=$(echo "$pe_collapsed" | cut -f 1 | tail -n +2 | paste -sd+)
-pe_quals=$(echo "$pe_collapsed" | cut -f 3 | tail -n +2 | paste -sd+ | bc -l)
+IDs=$(echo "$pe_collapsed" | cut -f 1 | tail -n +2 | awk '{if(NR>1) printf "+"; printf "%s", $1}')
+pe_quals=$(echo "$pe_collapsed" | cut -f 3 | tail -n +2 | awk '{if(NR>1) printf "+"; printf "%s", $1} END{printf "\n"}' | bc -l)
 if [ "$IDs" != "1/1+1/2+2/1+2/2+3/1+3/2+4/1+4/2" ]; then
   echo "Test failed for total expected quality when collapsing three sets of reads using --paired-end"
   exit 1
diff --git a/tests/fasten_convert.sh b/tests/fasten_convert.sh
new file mode 100644
index 00000000..8a59e258
--- /dev/null
+++ b/tests/fasten_convert.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten (telatin 2024)
+# Converts four_reads.fastq to FASTA and back, counting record headers.
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+IN_FILE="${THIS_SCRIPT_DIR}/../testdata/four_reads.fastq"
+
+# FASTQ -> FASTA
+"$BIN" --out-format FASTA < "$IN_FILE" > "$TEST_TMP_FILE"
+FASTA_COUNT=$(grep -c ">" "$TEST_TMP_FILE")
+FASTQ_COUNT=$(grep -c "^@" "$TEST_TMP_FILE")
+equal "$FASTA_COUNT" "4" "Testing that the output is in FASTA format"
+equal "$FASTQ_COUNT" "0" "Testing that the output is in not FASTQ format"
+
+# FASTA -> FASTQ
+"$BIN" --out-format FASTQ --in-format FASTA < "$TEST_TMP_FILE" > "$TEST_TMP_FILE.2"
+
+FASTA_COUNT=$(grep -c "^>" "$TEST_TMP_FILE.2")
+FASTQ_COUNT=$(grep -c "^@r" "$TEST_TMP_FILE.2")
+
+equal "$FASTA_COUNT" "0" "Testing that the output is in not FASTA format"
+equal "$FASTQ_COUNT" "4" "Testing that the output is in FASTQ format"
+rm -f "$TEST_TMP_FILE.2"
+
+# Print the summary and exit non-zero if any check failed
+done_testing
diff --git a/tests/fasten_inspect.sh b/tests/fasten_inspect.sh
new file mode 100644
index 00000000..f574c512
--- /dev/null
+++ b/tests/fasten_inspect.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten (telatin 2024)
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+# No tool-specific checks yet: report the sanity checks from test_functions.sh
+done_testing
diff --git a/tests/fasten_mutate.sh b/tests/fasten_mutate.sh
new file mode 100644
index 00000000..f2987ef6
--- /dev/null
+++ b/tests/fasten_mutate.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten mutate (telatin 2024)
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+INFILE="${THIS_SCRIPT_DIR}/../testdata/four_reads.fastq"
+
+
+"$BIN" --snps 1 < "$INFILE" > "$TEST_TMP_FILE"
+
+MD5_IN_EXPECTED="8a08ae75226dfacd60f6fe2a1000f100"
+MD5=$(getmd5 "$TEST_TMP_FILE" | cut -f 1 -d " ")
+MD5_IN=$(getmd5 "$INFILE" | cut -f 1 -d " ")
+
+equal "$MD5_IN" "$MD5_IN_EXPECTED" "Testing that the input file wasnt changed"
+different "$MD5" "$MD5_IN" "Testing that the output is different from the input"
+
+# Print the summary and exit non-zero if any check failed
+done_testing
diff --git a/tests/fasten_normalize.sh b/tests/fasten_normalize.sh
new file mode 100644
index 00000000..e4e08cec
--- /dev/null
+++ b/tests/fasten_normalize.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten normalise (telatin 2024)
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+# No tool-specific checks yet: report the sanity checks from test_functions.sh
+done_testing
diff --git a/tests/fasten_progress.sh b/tests/fasten_progress.sh
new file mode 100644
index 00000000..8ce6a4d5
--- /dev/null
+++ b/tests/fasten_progress.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten progress (telatin 2024)
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+GOT_PERL=$(command -v perl)
+
+if [ -z "$GOT_PERL" ]; then
+    echo "Perl not found, skipping test"
+    exit 0
+fi
+
+
+## Here we test that STDOUT is passed using --print
+# NOTE: Perl's built-in sleep takes whole seconds, so "sleep 0.1" does not pause.
+
+# shellcheck disable=SC2016
+TOT=$("$GOT_PERL" -e 'my $c=0;for (1..1000) {
+    $c++;
+    print "\@fasten_test$c\nAAA\n+\nIII\n";
+    sleep 0.1;
+    }' | "$BIN" --id "test-suite" --print | grep -c 'fasten_test' | grep -w 1000)
+
+equal "$TOT" "1000" "Testing that --print passes 1000 reads to STDOUT"
+
+
+## Here we test the final message
+# NOTE(review): 1000 reads are piped in but 4000 is expected below - presumably
+# the count is in lines; confirm against fasten_progress.
+# shellcheck disable=SC2016
+"$GOT_PERL" -e 'my $c=0;for (1..1000) {
+    $c++;
+    print "\@fasten_test$c\nAAA\n+\nIII\n";
+    sleep 0.1;
+    }' | "$BIN" --id "test-suite" 2> "$TEST_TMP_FILE"
+
+END=$(grep "Finished" "$TEST_TMP_FILE" | cut -f 3 -d ":")
+equal "$END" " Finished progress on 4000 reads" "Testing progress output"
+
+done_testing
diff --git a/tests/fasten_regex.sh b/tests/fasten_regex.sh
index 82f3133b..f710dcdb 100644
--- a/tests/fasten_regex.sh
+++ b/tests/fasten_regex.sh
@@ -26,12 +26,12 @@ if [ "$(wc -l <<< "$pe_filtered")" -ne 24 ]; then
   exit 1
 fi
 
-if [ "$(echo "$pe_filtered" | ./target/debug/fasten_metrics --each-read | tail -n +2 | cut -f 1 | paste -sd+)" != "read0/1+read0/2+read1/1+read1/2+read2/1+read2/2" ]; then
+if [ "$(echo "$pe_filtered" | ./target/debug/fasten_metrics --each-read | tail -n +2 | cut -f 1 | awk '{if(NR>1) printf "+"; printf "%s", $1}' )" != "read0/1+read0/2+read1/1+read1/2+read2/1+read2/2" ]; then
   echo "ERROR filtering for the right read names"
   exit 1
 fi
 
-if [ "$(echo "$r1_filtered" | ./target/debug/fasten_regex --regex read1 --which ID | ./target/debug/fasten_metrics --each-read | tail -n +2 | cut -f 1 | paste -sd+)" != "read1/1+read1/2" ]; then
+if [ "$(echo "$r1_filtered" | ./target/debug/fasten_regex --regex read1 --which ID | ./target/debug/fasten_metrics --each-read | tail -n +2 | cut -f 1 | awk '{if(NR>1) printf "+"; printf "%s", $1}' )" != "read1/1+read1/2" ]; then
   echo "ERROR running regex on IDs for read1"
   exit 1
 fi
diff --git a/tests/fasten_sort.sh b/tests/fasten_sort.sh
new file mode 100644
index 00000000..81965c15
--- /dev/null
+++ b/tests/fasten_sort.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Minimal test suite for fasten sort (telatin 2024)
+
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+THIS_SCRIPT_NAME=$(basename "$0" | sed 's/\.sh//g')
+source "${THIS_SCRIPT_DIR}/test_functions.sh"
+
+# Test fasten_sort with two reads: the first output line must be "@andrea"
+FIRST=$(printf '@ciao\nAAA\n+\nIII\n@andrea\nCCC\n+\nEEE\n' | "$BIN" | head -n 1)
+equal "$FIRST" "@andrea" "Testing sort order of two reads"
+
+done_testing
diff --git a/tests/fasten_trim.sh b/tests/fasten_trim.sh
index 729299e4..e31857ef 100644
--- a/tests/fasten_trim.sh
+++ b/tests/fasten_trim.sh
@@ -10,7 +10,7 @@ if [ "$reads_not_trimmed" != "$original_reads" ]; then
   exit 1
 fi
 
-onebase=$(./target/debug/fasten_trim --first-base 3 --last-base 4 < testdata/four_reads.pe.fastq | perl -lane 'print if($i++ % 4 == 1);' | paste -sd'_')
+onebase=$(./target/debug/fasten_trim --first-base 3 --last-base 4 < testdata/four_reads.pe.fastq | perl -lane 'print if($i++ % 4 == 1);' |awk 'NR > 1 { printf "_"; } { printf "%s", $1; } END { printf "\n"; }')
 shouldbe="T_T_G_A_C_A_C_A"
 if [ "$onebase" != "$shouldbe" ]; then
   echo "ERROR trimming to the third base"
diff --git a/tests/lib/benchmark.sh b/tests/lib/benchmark.sh
index 5314d815..01d2c67c 100644
--- a/tests/lib/benchmark.sh
+++ b/tests/lib/benchmark.sh
@@ -51,6 +51,7 @@ zcat $large_interleaved | fasten_sort --sort-by GC --paired-end | gzip -c > $lar
 
 which bbnorm.sh
 which fasten_clean
+which seqfu
 
 # Version information
 seqtk 2>&1 | grep -i version | sed 's/^/seqtk /'
@@ -58,5 +59,9 @@ seqkit version | grep -m 1 v
 fasten_clean --version
 fastq_to_fasta -h | grep "Part of FASTX"
 bbnorm.sh version 2>&1 | grep 'BBMap version'
+seqfu --version
 
+# hyperfine
+which hyperfine
+which plot_whisker.py || echo "WARNING: plot_whisker.py from hyperfine not found in path: will not be able to plot graphs."
 
diff --git a/tests/test_functions.sh b/tests/test_functions.sh
new file mode 100644
index 00000000..13290727
--- /dev/null
+++ b/tests/test_functions.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# Shared helpers for the test scripts in this directory.
+# The caller must set THIS_SCRIPT_DIR (its own directory) and THIS_SCRIPT_NAME
+# (the name of the binary under test) and then source this file; sourcing runs
+# the sanity checks at the bottom. Call done_testing at the end of the script.
+
+# This should be sourced by other test scripts: die if not
+if [ -z "$THIS_SCRIPT_DIR" ]; then
+    echo "ERROR: test_functions.sh should be sourced by other test scripts"
+    exit 1
+fi
+TEST_TMP_FILE=$(mktemp)
+# Remove the scratch file on every exit path, even without done_testing
+trap 'rm -f "$TEST_TMP_FILE"' EXIT
+NUM=0
+FAIL=0
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m' # No Color
+# Build absolute paths without readlink -f, which is not available everywhere
+REPO_DIR=$(cd "${THIS_SCRIPT_DIR}/.." && pwd)
+BIN="${REPO_DIR}/target/release/${THIS_SCRIPT_NAME}"
+DEB_BIN="${REPO_DIR}/target/debug/${THIS_SCRIPT_NAME}"
+
+
+echo -e " *** ${GREEN}Testing $THIS_SCRIPT_NAME${NC} (test: $TEST_TMP_FILE)*** "
+
+# test MSG CONDITION
+# Count one check and print OK/FAIL for MSG. CONDITION is split on whitespace
+# (paths containing spaces are not supported): when its first word starts with
+# "-" it is evaluated as a [ ... ] expression (e.g. "-e FILE"); otherwise it is
+# run as a command with stdin/stdout/stderr on /dev/null and must exit with 0.
+# An empty CONDITION fails. Note: this shadows the shell builtin "test".
+function test {
+    NUM=$((NUM+1))
+    local msg="$1"
+    local condition="$2"
+    local -a words
+    local passed=0
+    read -r -a words <<< "$condition"
+    case "${words[0]}" in
+        "") ;;
+        -*) [ "${#words[@]}" -ge 2 ] && [ "${words[@]}" ] && passed=1 ;;
+        *)  "${words[@]}" < /dev/null > /dev/null 2>&1 && passed=1 ;;
+    esac
+    if [ "$passed" -eq 1 ]; then
+        echo -e "${GREEN}OK${NC}\t$NUM: $msg"
+    else
+        FAIL=$((FAIL+1))
+        echo -e "${RED}FAIL${NC}\t$NUM: $msg"
+    fi
+}
+
+# equal GOT EXPECTED MSG
+# Count one check; OK when the two strings are identical, otherwise FAIL and
+# print both values.
+function equal {
+    local got="$1"
+    local expected="$2"
+    local msg="$3"
+    NUM=$((NUM+1))
+    if [ "$got" == "$expected" ]; then
+        echo -e "${GREEN}OK${NC}\t$NUM: $msg [$got]"
+    else
+        FAIL=$((FAIL+1))
+        echo -e "${RED}FAIL${NC}\t$NUM: $msg"
+        echo -e "\tGot: $got"
+        echo -e "\tExpected: $expected"
+    fi
+}
+
+# different GOT EXPECTED MSG
+# Count one check; OK when the two strings differ, otherwise FAIL.
+function different {
+    local got="$1"
+    local expected="$2"
+    local msg="$3"
+    NUM=$((NUM+1))
+    if [ "$got" != "$expected" ]; then
+        echo -e "${GREEN}OK${NC}\t$NUM: $msg [$got != $expected]"
+    else
+        FAIL=$((FAIL+1))
+        echo -e "${RED}FAIL${NC}\t$NUM: $msg"
+        echo -e "\tGot: $got"
+        echo -e "\tequals to: $expected"
+    fi
+}
+
+# getmd5 FILE
+# Print only the MD5 digest of FILE.
+function getmd5 {
+    # use md5sum on Linux, md5 on OSX
+    if [ "$(uname)" == "Darwin" ]; then
+        md5 -q "$1"
+    else
+        md5sum "$1" | cut -f 1 -d " "
+    fi
+}
+
+# done_testing
+# Delete the scratch file, print the summary and exit 0 when every check
+# passed, 1 otherwise.
+function done_testing {
+    rm -f "$TEST_TMP_FILE"
+    if [ "$FAIL" -eq 0 ]; then
+        echo -e "${GREEN}OK!${NC}\tAll $NUM tests passed${NC}"
+        exit 0
+    else
+        echo -e "${RED}$FAIL/$NUM errors${NC}\ttests failed${NC}"
+        exit 1
+    fi
+}
+
+# Sanity checks run for every test script: both builds exist and answer
+# --help and --version successfully.
+test "Release binary $BIN" "-e $BIN"
+test "Debug binary $DEB_BIN" "-e $DEB_BIN"
+test "Release binary --help" "$BIN --help"
+test "Debug binary --help" "$DEB_BIN --help"
+
+test "Release binary --version" "$BIN --version"
+test "Debug binary --version" "$DEB_BIN --version"