Slightly updated script parameter naming convention.

MrAlexSee · Jun 30, 2018 · 699be5b · 699be5b
1 parent 4934a29
commit 699be5b
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ Testing on human genome and synthetic data.
 
 1. Generate synthetic data by running `python generate_synth.py` (requires Python 2.7) 4 times for the number of segments (parameter `nSegments`) set to 100, 500, 1000, and 1600 thousands. Rename files as `chr24.eds`, `chr25.eds`, `chr26.eds`, `chr27.eds`.
 
-1. Set parameter `inputDir` in `run_all.sh` to the folder containing `.eds` files and pattern files (all pattern files are located in the `sample/` folder as part of this package).
+1. Set parameter `inDir` in `run_all.sh` to the folder containing `.eds` files and pattern files (all pattern files are located in the `sample/` folder as part of this package).
 
 1. Compile SoPanG (see above).
 
@@ -80,7 +80,7 @@ Parameter name  | Parameter description
 
 Parameter name   | Parameter description
 ---------------- | ---------------------
-`dBufferSize`    | buffer size for processing segment variants, the size of the largest segment (i.e., the number of variants) from the input file cannot be larger than this value
+`dBufferSize`    | buffer size for processing segment variants, the size of the largest segment (i.e. the number of variants) from the input file cannot be larger than this value
 `maskBufferSize` | buffer size for Shift-Or masks for the input alphabet, must be larger than the largest input character ASCII code
 `wordSize`       | word size (in bits) used by the Shift-Or algorithm
 
@@ -90,23 +90,23 @@ Parameter name   | Parameter description
 
 Parameter name   | Parameter description
 ---------------- | ---------------------
-`inputDir`       | input directory path containing `.eds` ED text files and `.txt` input pattern files
+`inDir`          | input directory path containing `.eds` ED text files and `.txt` input pattern files
 `outFile`        | base name for output files
 
 #### ed_histogram.py (scripts folder)
 
 Parameter name         | Parameter description
 ---------------------- | ---------------------
-`inputDir`             | input directory path containing `.eds` ED text files
+`pInDir`               | input directory path containing `.eds` ED text files
 
 #### generate_synth.py (scripts folder)
 
-Parameter name         | Parameter description
----------------------- | ---------------------
-`nSegments`            | total number of segments
-`alphabet`             | alphabet for character sampling
-`nDegeneratePositions` | number of segments (must be smaller than or equal to `nSegments`) which are non-deterministic, i.e., contain multiple variants
-`nMaxSegmentVariants`  | maximum number of variants (`a`), the number of variants for each non-deterministic segment will be sampled from the interval `[2, a]`
-`nMaxVariantLength`    | maximum length of each segment variant (`b`), the length for each variant will be sampled from the interval `[0, b]` (segments might contain empty words)
-`outFile`              | output file path
+Parameter name          | Parameter description
+----------------------- | ---------------------
+`pNSegments`            | total number of segments
+`pAlphabet`             | alphabet for character sampling
+`pNDegeneratePositions` | number of segments (must be smaller than or equal to `pNSegments`) which are non-deterministic, i.e. contain multiple variants
+`pNMaxSegmentVariants`  | maximum number of variants (`a`), the number of variants for each non-deterministic segment will be sampled from the interval `[2, a]`
+`pNMaxVariantLength`    | maximum length of each segment variant (`b`), the length for each variant will be sampled from the interval `[0, b]` (segments might contain empty words)
+`pOutFile`              | output file path
 
diff --git a/run_all.sh b/run_all.sh
@@ -1,12 +1,12 @@
 #!/bin/sh
 
-inputDir="data"
+inDir="data"
 outFile="out"
 
 for i in $(seq 1 27);
 do
-    ./sopang -d -o ${outFile}8.txt $inputDir/chr${i}.eds $inputDir/patterns8.txt;
-    ./sopang -d -o ${outFile}16.txt $inputDir/chr${i}.eds $inputDir/patterns16.txt;
-    ./sopang -d -o ${outFile}32.txt $inputDir/chr${i}.eds $inputDir/patterns32.txt;
-    ./sopang -d -o ${outFile}64.txt $inputDir/chr${i}.eds $inputDir/patterns64.txt;
+    ./sopang -d -o ${outFile}8.txt $inDir/chr${i}.eds $inDir/patterns8.txt;
+    ./sopang -d -o ${outFile}16.txt $inDir/chr${i}.eds $inDir/patterns16.txt;
+    ./sopang -d -o ${outFile}32.txt $inDir/chr${i}.eds $inDir/patterns32.txt;
+    ./sopang -d -o ${outFile}64.txt $inDir/chr${i}.eds $inDir/patterns64.txt;
 done
diff --git a/scripts/ed_histogram.py b/scripts/ed_histogram.py
@@ -6,11 +6,11 @@
 import os
 import re
 
-inputDir = "../sample" # Input directory path containing `.eds` ED text files.
+pInDir = "../sample" # Input directory path containing `.eds` ED text files.
 
 def main():
     res = []
-    edsFiles = [os.path.join(inputDir, f) for f in os.listdir(inputDir) if f.endswith(".eds")]
+    edsFiles = [os.path.join(pInDir, f) for f in os.listdir(pInDir) if f.endswith(".eds")]
 
     for edsFile in edsFiles:
         print "Reading file: {0}".format(edsFile)

diff --git a/scripts/generate_synth.py b/scripts/generate_synth.py
@@ -15,49 +15,49 @@
 
 import random
 
-nSegments = 100 * 1000 # Total number of segments: 100, 500, 1000, 1600 thousands segments.
-alphabet = "ACGTN" # Alphabet for character sampling.
+pNSegments = 100 * 1000 # Total number of segments: 100, 500, 1000, or 1600 thousand segments.
+pAlphabet = "ACGTN" # Alphabet for character sampling.
 
-# Number of segments (must be smaller than or equal to nSegments) which are degenerate (indeterminate),
+# Number of segments (must be smaller than or equal to pNSegments) which are non-deterministic,
 # i.e. contain multiple variants.
-nDegeneratePositions = int(0.1 * nSegments) # 10% of the text as in Grossi et al.
+pNDegeneratePositions = int(0.1 * pNSegments) # 10% of the text as in Grossi et al.
 
-# Maximum number of variants (a), the number of variants for each degenerate segment
+# Maximum number of variants (a), the number of variants for each non-deterministic segment
 # will be sampled from the interval [2, a].
-nMaxSegmentVariants = 10
+pNMaxSegmentVariants = 10
 
 # Maximum length of each segment variant (b), the length for each variant 
 # will be sampled from the interval [0, b] (segments might contain empty words).
-nMaxVariantLength = 10
+pNMaxVariantLength = 10
 
-outFile = "text.eds" # Output file path.
+pOutFile = "text.eds" # Output file path.
 
 def main():
-    textSizeMB = round(nSegments / 1000.0 / 1000.0, 3)
-    print "Started, alph = \"{0}\", text size = {1}m".format(alphabet, textSizeMB)
+    textSizeMB = round(pNSegments / 1000.0 / 1000.0, 3)
+    print "Started, alph = \"{0}\", text size = {1}m".format(pAlphabet, textSizeMB)
 
-    text = randomString(alphabet, nSegments)
+    text = randomString(pAlphabet, pNSegments)
 
     # Randomly drawn degenerate positions.
-    degenPosList = random.sample(xrange(nSegments), nDegeneratePositions)
+    degenPosList = random.sample(xrange(pNSegments), pNDegeneratePositions)
     # Dictionary: position in text -> list of a few strings.
     degenStrings = {}
 
-    print "Generating degenerate strings for #positions = {0}k".format(nDegeneratePositions / 1000.0)
+    print "Generating degenerate strings for #positions = {0}k".format(pNDegeneratePositions / 1000.0)
 
     for curPos in degenPosList:
         # Degenerate letter is defined as a "non-empty set of strings".
-        howMany = random.randint(2, nMaxSegmentVariants)
+        howMany = random.randint(2, pNMaxSegmentVariants)
         curSet = set()
 
         while len(curSet) < howMany:
-            curLen = random.randint(0, nMaxVariantLength) # Includes empty strings.
-            curStr = randomString(alphabet, curLen)
+            curLen = random.randint(0, pNMaxVariantLength) # Includes empty strings.
+            curStr = randomString(pAlphabet, curLen)
             curSet.add(curStr)
 
         degenStrings[curPos] = curSet
 
-    assert len(degenPosList) == len(degenStrings) == nDegeneratePositions
+    assert len(degenPosList) == len(degenStrings) == pNDegeneratePositions
     dumpToFile(text, set(degenPosList), degenStrings)
 
 def randomString(alph, size):
@@ -89,10 +89,10 @@ def dumpToFile(text, degenPosSet, degenStrings):
 
                 outStr += curRun + "}"
 
-    with open(outFile, "w") as f:
+    with open(pOutFile, "w") as f:
         f.write(outStr)
 
-    print "Dumped to file: {0}".format(outFile)
+    print "Dumped to file: {0}".format(pOutFile)
 
 if __name__ == "__main__":
     main()