From 699be5bbdad0570c07769b10a41de307bd85cb79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleksander=20Cis=C5=82ak?= Date: Sat, 30 Jun 2018 12:46:50 +0200 Subject: [PATCH] Slightly updated script parameter naming convention. --- README.md | 24 ++++++++++++------------ run_all.sh | 10 +++++----- scripts/ed_histogram.py | 4 ++-- scripts/generate_synth.py | 38 +++++++++++++++++++------------------- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 1787cf9..2f7a90c 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ Testing on human genome and synthetic data. 1. Generate synthetic data by running `python generate_synth.py` (requires Python 2.7) 4 times for the number of segments (parameter `nSegments`) set to 100, 500, 1000, and 1600 thousands. Rename files as `chr24.eds`, `chr25.eds`, `chr26.eds`, `chr27.eds`. -1. Set parameter `inputDir` in `run_all.sh` to the folder containing `.eds` files and pattern files (all pattern files are located in the `sample/` folder as part of this package). +1. Set parameter `inDir` in `run_all.sh` to the folder containing `.eds` files and pattern files (all pattern files are located in the `sample/` folder as part of this package). 1. Compile SoPanG (see above). @@ -80,7 +80,7 @@ Parameter name | Parameter description Parameter name | Parameter description ---------------- | --------------------- -`dBufferSize` | buffer size for processing segment variants, the size of the largest segment (i.e., the number of variants) from the input file cannot be larger than this value +`dBufferSize` | buffer size for processing segment variants, the size of the largest segment (i.e. 
the number of variants) from the input file cannot be larger than this value `maskBufferSize` | buffer size for Shift-Or masks for the input alphabet, must be larger than the largest input character ASCII code `wordSize` | word size (in bits) used by the Shift-Or algorithm @@ -90,23 +90,23 @@ Parameter name | Parameter description Parameter name | Parameter description ---------------- | --------------------- -`inputDir` | input directory path containing `.eds` ED text files and `.txt` input pattern files +`inDir` | input directory path containing `.eds` ED text files and `.txt` input pattern files `outFile` | base name for output files #### ed_histogram.py (scripts folder) Parameter name | Parameter description ---------------------- | --------------------- -`inputDir` | input directory path containing `.eds` ED text files +`pInDir` | input directory path containing `.eds` ED text files #### generate_synth.py (scripts folder) -Parameter name | Parameter description ----------------------- | --------------------- -`nSegments` | total number of segments -`alphabet` | alphabet for character sampling -`nDegeneratePositions` | number of segments (must be smaller than or equal to `nSegments`) which are non-deterministic, i.e., contain multiple variants -`nMaxSegmentVariants` | maximum number of variants (`a`), the number of variants for each non-deterministic segment will be sampled from the interval `[2, a]` -`nMaxVariantLength` | maximum length of each segment variant (`b`), the length for each variant will be sampled from the interval `[0, b]` (segments might contain empty words) -`outFile` | output file path +Parameter name | Parameter description +----------------------- | --------------------- +`pNSegments` | total number of segments +`pAlphabet` | alphabet for character sampling +`pNDegeneratePositions` | number of segments (must be smaller than or equal to `pNSegments`) which are non-deterministic, i.e. 
contain multiple variants +`pNMaxSegmentVariants` | maximum number of variants (`a`), the number of variants for each non-deterministic segment will be sampled from the interval `[2, a]` +`pNMaxVariantLength` | maximum length of each segment variant (`b`), the length for each variant will be sampled from the interval `[0, b]` (segments might contain empty words) +`pOutFile` | output file path diff --git a/run_all.sh b/run_all.sh index 15857db..fc45666 100755 --- a/run_all.sh +++ b/run_all.sh @@ -1,12 +1,12 @@ #!/bin/sh -inputDir="data" +inDir="data" outFile="out" for i in $(seq 1 27); do - ./sopang -d -o ${outFile}8.txt $inputDir/chr${i}.eds $inputDir/patterns8.txt; - ./sopang -d -o ${outFile}16.txt $inputDir/chr${i}.eds $inputDir/patterns16.txt; - ./sopang -d -o ${outFile}32.txt $inputDir/chr${i}.eds $inputDir/patterns32.txt; - ./sopang -d -o ${outFile}64.txt $inputDir/chr${i}.eds $inputDir/patterns64.txt; + ./sopang -d -o ${outFile}8.txt $inDir/chr${i}.eds $inDir/patterns8.txt; + ./sopang -d -o ${outFile}16.txt $inDir/chr${i}.eds $inDir/patterns16.txt; + ./sopang -d -o ${outFile}32.txt $inDir/chr${i}.eds $inDir/patterns32.txt; + ./sopang -d -o ${outFile}64.txt $inDir/chr${i}.eds $inDir/patterns64.txt; done diff --git a/scripts/ed_histogram.py b/scripts/ed_histogram.py index 17a967e..24ab7ec 100644 --- a/scripts/ed_histogram.py +++ b/scripts/ed_histogram.py @@ -6,11 +6,11 @@ import os import re -inputDir = "../sample" # Input directory path containing `.eds` ED text files. +pInDir = "../sample" # Input directory path containing `.eds` ED text files. 
def main(): res = [] - edsFiles = [os.path.join(inputDir, f) for f in os.listdir(inputDir) if f.endswith(".eds")] + edsFiles = [os.path.join(pInDir, f) for f in os.listdir(pInDir) if f.endswith(".eds")] for edsFile in edsFiles: print "Reading file: {0}".format(edsFile) diff --git a/scripts/generate_synth.py b/scripts/generate_synth.py index 8d22d4b..35dfd72 100644 --- a/scripts/generate_synth.py +++ b/scripts/generate_synth.py @@ -15,49 +15,49 @@ import random -nSegments = 100 * 1000 # Total number of segments: 100, 500, 1000, 1600 thousands segments. -alphabet = "ACGTN" # Alphabet for character sampling. +pNSegments = 100 * 1000 # Total number of segments: 100, 500, 1000, 1600 thousands segments. +pAlphabet = "ACGTN" # Alphabet for character sampling. -# Number of segments (must be smaller than or equal to nSegments) which are degenerate (indeterminate), +# Number of segments (must be smaller than or equal to pNSegments) which are non-deterministic, # i.e. contain multiple variants. -nDegeneratePositions = int(0.1 * nSegments) # 10% of the text as in Grossi et al. +pNDegeneratePositions = int(0.1 * pNSegments) # 10% of the text as in Grossi et al. -# Maximum number of variants (a), the number of variants for each degenerate segment +# Maximum number of variants (a), the number of variants for each non-deterministic segment # will be sampled from the interval [2, a]. -nMaxSegmentVariants = 10 +pNMaxSegmentVariants = 10 # Maximum length of each segment variant (b), the length for each variant # will be sampled from the interval [0, b] (segments might contain empty words). -nMaxVariantLength = 10 +pNMaxVariantLength = 10 -outFile = "text.eds" # Output file path. 
def main(): - textSizeMB = round(nSegments / 1000.0 / 1000.0, 3) - print "Started, alph = \"{0}\", text size = {1}m".format(alphabet, textSizeMB) + textSizeMB = round(pNSegments / 1000.0 / 1000.0, 3) + print "Started, alph = \"{0}\", text size = {1}m".format(pAlphabet, textSizeMB) - text = randomString(alphabet, nSegments) + text = randomString(pAlphabet, pNSegments) # Randomly drawn degenerate positions. - degenPosList = random.sample(xrange(nSegments), nDegeneratePositions) + degenPosList = random.sample(xrange(pNSegments), pNDegeneratePositions) # Dictionary: position in text -> list of a few strings. degenStrings = {} - print "Generating degenerate strings for #positions = {0}k".format(nDegeneratePositions / 1000.0) + print "Generating degenerate strings for #positions = {0}k".format(pNDegeneratePositions / 1000.0) for curPos in degenPosList: # Degenerate letter is defined as a "non-empty set of strings". - howMany = random.randint(2, nMaxSegmentVariants) + howMany = random.randint(2, pNMaxSegmentVariants) curSet = set() while len(curSet) < howMany: - curLen = random.randint(0, nMaxVariantLength) # Includes empty strings. - curStr = randomString(alphabet, curLen) + curLen = random.randint(0, pNMaxVariantLength) # Includes empty strings. + curStr = randomString(pAlphabet, curLen) curSet.add(curStr) degenStrings[curPos] = curSet - assert len(degenPosList) == len(degenStrings) == nDegeneratePositions + assert len(degenPosList) == len(degenStrings) == pNDegeneratePositions dumpToFile(text, set(degenPosList), degenStrings) def randomString(alph, size): @@ -89,10 +89,10 @@ def dumpToFile(text, degenPosSet, degenStrings): outStr += curRun + "}" - with open(outFile, "w") as f: + with open(pOutFile, "w") as f: f.write(outStr) - print "Dumped to file: {0}".format(outFile) + print "Dumped to file: {0}".format(pOutFile) if __name__ == "__main__": main()