dirac-institute · astronomerritt · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/src/sorcha/modules/PPCommandLineParser.py b/src/sorcha/modules/PPCommandLineParser.py
@@ -2,6 +2,7 @@
 import sys
 import logging
 import glob
+import re
 from .PPConfigParser import PPFindFileOrExit, PPFindDirectoryOrExit
 
 
@@ -64,6 +65,21 @@ def PPCommandLineParser(args):
     cmd_args_dict["outpath"] = PPFindFileOrExit(args.o, "-o, --outfile")
     cmd_args_dict["pointing_database"] = PPFindFileOrExit(args.pd, "-pd, --pointing_database")
 
+    if args.process_subset:
+        # Expected form is "<split>/<nsplits>": two unsigned integers, <split> being 1-based.
+        m = re.fullmatch(r"(\d+)/(\d+)", args.process_subset)
+        if m is None:
+            pplogger.error("--process-subset: the argument must be in form of <split>/<nsplits>")
+            sys.exit("--process-subset: the argument must be in form of <split>/<nsplits>")
+        split, nsplits = int(m.group(1)), int(m.group(2))
+        if nsplits <= 0:
+            pplogger.error("--process-subset: the number of splits must be >= 1")
+            sys.exit("--process-subset: the number of splits must be >= 1")
+        if split < 1 or split > nsplits:
+            pplogger.error("--process-subset: the chosen splits must be between 1 and <nsplits> (inclusive).")
+            sys.exit("--process-subset: the chosen splits must be between 1 and <nsplits> (inclusive).")
+        cmd_args_dict["process_subset"] = (split, nsplits)
+
     if args.cp:
         cmd_args_dict["complex_physical_parameters"] = PPFindFileOrExit(
             args.cp, "-cp, --complex_physical_parameters"

diff --git a/src/sorcha/sorcha.py b/src/sorcha/sorcha.py
@@ -161,12 +161,31 @@ def runLSSTSimulation(args, configs):
     endChunk = 0
     loopCounter = 0
 
+    # Find the number of objects in the input file.  FIXME: This assumes the
+    # input file has a header, and has no empty or comment lines.
     ii = -1
     with open(args.orbinfile) as f:
         for ii, l in enumerate(f):
             pass
     lenf = ii
 
+    split, nsplits = args.process_subset
+    if nsplits > 1:
+        # calculate the [beginning, end) indices. For example
+        #   np.linspace(0, 100, 3+1, dtype=int)
+        #   --> array([  0,  33,  66, 100])
+        edges = np.linspace(0, lenf, nsplits + 1, dtype=int)
+        b, e = int(edges[split - 1]), int(edges[split])
+        lenf = e - b
+
+        # fast-forward to the requested split: discard exactly b rows, reading
+        # at most one serial chunk at a time (the last read may be shorter)
+        at = 0
+        while at < b:
+            bs = min(configs["size_serial_chunk"], b - at)
+            reader.read_aux_block(block_size=bs)
+            at += bs
+
     footprint = None
     if configs["camera_model"] == "footprint":
         verboselog("Creating sensor footprint object for filtering")
@@ -178,13 +197,12 @@ def runLSSTSimulation(args, configs):
         verboselog("Working on objects {}-{}".format(startChunk, endChunk))
 
         # Processing begins, all processing is done for chunks
+        bs = min(endChunk, lenf) - startChunk
         if configs["ephemerides_type"].casefold() == "external":
             verboselog("Reading in chunk of orbits and associated ephemeris from an external file")
-            observations = reader.read_block(block_size=configs["size_serial_chunk"])
-            observations.to_csv("post_readin_ephem_nonprimary.csv")
+            observations = reader.read_block(block_size=bs)
         else:
-            verboselog("Ingest chunk of orbits")
-            orbits_df = reader.read_aux_block(block_size=configs["size_serial_chunk"])
+            orbits_df = reader.read_aux_block(block_size=bs)
             verboselog("Starting ephemeris generation")
             observations = create_ephemeris(orbits_df, filterpointing, args, configs)
             verboselog("Ephemeris generation completed")

diff --git a/src/sorcha/utilities/diffTestUtils.py b/src/sorcha/utilities/diffTestUtils.py
@@ -55,6 +55,7 @@ def compare_result_files(test_output, golden_output):
     "outfilestem": f"out_end2end",
     "verbose": False,
     "stats": None,
+    "process_subset": (1, 1),
 }
 
 WITH_EPHEMERIS_ARGS = {
@@ -67,6 +68,7 @@ def compare_result_files(test_output, golden_output):
     "outfilestem": f"out_end2end_with_ephemeris_generation",
     "verbose": False,
     "stats": None,
+    "process_subset": (1, 1),
 }
 
 CHUNKED_ARGS = {
@@ -79,6 +81,7 @@ def compare_result_files(test_output, golden_output):
     "outfilestem": f"out_end2end_chunked",
     "verbose": False,
     "stats": None,
+    "process_subset": (1, 1),
 }
 
 UNCHUNKED_ARGS = {
@@ -91,6 +94,7 @@ def compare_result_files(test_output, golden_output):
     "outfilestem": f"out_end2end_unchunked",
     "verbose": False,
     "stats": None,
+    "process_subset": (1, 1),
 }
 
 
@@ -104,6 +108,20 @@ def compare_result_files(test_output, golden_output):
     "outfilestem": f"verification_output",
     "verbose": False,
     "stats": None,
+    "process_subset": (1, 1),
+}
+
+PROCESS_SUBSET_ARGS = {  # argument set selected by override_seed_and_run(arg_set="subset")
+    "paramsinput": get_demo_filepath("sspp_testset_colours.txt"),
+    "orbinfile": get_demo_filepath("sspp_testset_orbits.des"),
+    "oifoutput": get_demo_filepath("example_oif_output.txt"),
+    "configfile": get_test_filepath("PPConfig_goldens_test.ini"),
+    "pointing_database": get_demo_filepath("baseline_v2.0_1yr.db"),
+    "surveyname": "rubin_sim",
+    "outfilestem": f"out_end2end_subset",
+    "verbose": False,
+    "stats": None,
+    "process_subset": (2, 10),  # (split, nsplits): process the 2nd of 10 splits
 }
 
 
@@ -135,6 +153,8 @@ def override_seed_and_run(outpath, arg_set="baseline"):
         cmd_args_dict = UNCHUNKED_ARGS
     elif arg_set == "truth":
         cmd_args_dict = VERIFICATION_TRUTH
+    elif arg_set == "subset":
+        cmd_args_dict = PROCESS_SUBSET_ARGS
     else:
         raise ValueError(
             f"Unknown arg set name, {arg_set}. Must be one of: 'baseline', 'with_ephemeris', 'truth'."

diff --git a/src/sorcha/utilities/sorchaArguments.py b/src/sorcha/utilities/sorchaArguments.py
@@ -3,6 +3,7 @@
 import time
 from os import path, urandom
 import logging
+from typing import Tuple
 
 from sorcha.modules.PPModuleRNG import PerModuleRNG
 from sorcha.modules.PPGetLogger import PPGetLogger
@@ -31,6 +32,9 @@ class sorchaArguments:
     surveyname: str = ""
     """name of the survey (`rubin_sim` is only one implemented currently)"""
 
+    process_subset: Tuple[int, int] = (1, 1)
+    """the subset of the file to process, in form of (split, nsplits)"""
+
     complex_parameters: str = ""
     """optional, extra complex physical parameter input files"""
 
@@ -73,6 +77,7 @@ def read_from_dict(self, args):
         self.ar_data_file_path = args.get("ar_data_path")
         self.verbose = args["verbose"]
         self.stats = args["stats"]
+        self.process_subset = args["process_subset"]
 
         self.surveyname = args["surveyname"]
 

diff --git a/src/sorcha_cmdline/run.py b/src/sorcha_cmdline/run.py
@@ -115,6 +115,14 @@ def main():
         dest="st",
         default=None,
     )
+    optional.add_argument(
+        "--process-subset",  # argparse stores the value as args.process_subset
+        help="Process a subset of the input objects. Specify in form of <split>/<nsplits>, where <nsplits> is the number of chunks into which"
+        " the input will be divided, and <split> is the (1-based) chunk to be processed here. For example, writing 3/5 with a catalog"
+        " of 100 objects will process objects with (0-based) indices [40, 60).",
+        type=str,
+        default="1/1",  # a single split: the whole input is processed
+    )
 
     args = parser.parse_args()
 

diff --git a/tests/activity/test_activity_registration.py b/tests/activity/test_activity_registration.py
@@ -9,7 +9,7 @@
 
 def test_register_subclasses():
     output = register_activity_subclasses()
-
+    update_activity_subclasses()  # if sorcha-addons is installed we need to update the subclasses
     assert output == CA_METHODS
 
 

diff --git a/tests/ephemeris/test_ephemeris_generation.py b/tests/ephemeris/test_ephemeris_generation.py
@@ -118,6 +118,7 @@ def test_ephemeris_end2end(single_synthetic_pointing, tmp_path):
         "outfilestem": f"out_400k",
         "verbose": False,
         "stats": None,
+        "process_subset": (1, 1),
     }
 
     pplogger = PPGetLogger(cmd_args_dict["outpath"])

diff --git a/tests/ephemeris/test_pixdict.py b/tests/ephemeris/test_pixdict.py
@@ -62,6 +62,7 @@ def test_pixeldict(tmp_path):
         "outfilestem": f"out_400k",
         "verbose": False,
         "stats": None,
+        "process_subset": (1, 1),
     }
 
     args = sorchaArguments(cmd_args_dict)

diff --git a/tests/lightcurves/test_lightcurve_registration.py b/tests/lightcurves/test_lightcurve_registration.py
@@ -9,7 +9,7 @@
 
 def test_register_subclasses():
     output = register_lc_subclasses()
-
+    update_lc_subclasses()  # if sorcha-addons is installed we need to update the subclasses
     assert output == LC_METHODS
 
 

diff --git a/tests/sorcha/test_PPCommandLineParser.py b/tests/sorcha/test_PPCommandLineParser.py
@@ -5,7 +5,7 @@
 
 
 class args:
-    def __init__(self, cp, t="testout", o="./", f=False):
+    def __init__(self, cp, t="testout", o="./", f=False, process_subset=("1/1")):
         self.p = get_test_filepath("testcolour.txt")
         self.ob = get_test_filepath("testorb.des")
         self.er = get_test_filepath("oiftestoutput.txt")
@@ -20,6 +20,7 @@ def __init__(self, cp, t="testout", o="./", f=False):
         self.f = f
         self.ar = None
         self.st = "test.csv"
+        self.process_subset = process_subset
 
 
 def test_PPCommandLineParser():
@@ -41,6 +42,7 @@ def test_PPCommandLineParser():
         "ar_data_path": None,
         "output_ephemeris_file": None,
         "stats": "test.csv",
+        "process_subset": (1, 1),
     }
 
     cmd_dict_2 = PPCommandLineParser(args(get_test_filepath("testcomet.txt")))
@@ -58,6 +60,7 @@ def test_PPCommandLineParser():
         "ar_data_path": None,
         "output_ephemeris_file": None,
         "stats": "test.csv",
+        "process_subset": (1, 1),
     }
 
     with open(os.path.join(tmp_path, "dummy_file.txt"), "w") as _:
@@ -74,3 +77,24 @@ def test_PPCommandLineParser():
     assert not os.path.isfile(os.path.join(tmp_path, "dummy_file.txt"))
 
     return
+
+
+def test_PPCommandLineParser_subset():
+    """Invalid --process-subset values must exit with the matching error message."""
+    from sorcha.modules.PPCommandLineParser import PPCommandLineParser
+
+    # split index larger than the number of splits
+    with pytest.raises(SystemExit) as e:
+        _ = PPCommandLineParser(args(False, process_subset="3/1"))
+    assert e.value.code == "--process-subset: the chosen splits must be between 1 and <nsplits> (inclusive)."
+
+    # a signed number does not match the <split>/<nsplits> pattern at all,
+    # so this is reported as a format error rather than a range error
+    with pytest.raises(SystemExit) as e2:
+        _ = PPCommandLineParser(args(False, process_subset="-1/1"))
+    assert e2.value.code == "--process-subset: the argument must be in form of <split>/<nsplits>"
+
+    # zero splits
+    with pytest.raises(SystemExit) as e3:
+        _ = PPCommandLineParser(args(False, process_subset="1/0"))
+    assert e3.value.code == "--process-subset: the number of splits must be >= 1"
diff --git a/tests/sorcha/test_PPConfigParser.py b/tests/sorcha/test_PPConfigParser.py
@@ -249,6 +249,7 @@ def test_PPPrintConfigsToLog(tmp_path):
         "verbose": True,
         "seed": 24601,
         "stats": None,
+        "process_subset": (1, 1),
     }
 
     args = sorchaArguments(cmd_args)

diff --git a/tests/sorcha/test_demo_process_subset.py b/tests/sorcha/test_demo_process_subset.py
@@ -0,0 +1,23 @@
+import os
+import tempfile
+import pandas as pd
+
+from sorcha.utilities.dataUtilitiesForTests import get_demo_filepath
+from sorcha.utilities.diffTestUtils import override_seed_and_run
+
+
+def test_demo_process_subset():
+    """End-to-end check of --process-subset using the "subset" argument set.
+
+    Runs Sorcha on split 2 of 10 of the demo orbit file and verifies that the
+    output file exists and contains exactly one object, 2010_TC209.
+    """
+    with tempfile.TemporaryDirectory() as workdir:
+        override_seed_and_run(workdir, arg_set="subset")
+        result_path = os.path.join(workdir, "out_end2end_subset.csv")
+        assert os.path.isfile(result_path)
+
+        # read inside the context manager: the directory is removed on exit
+        object_ids = pd.read_csv(result_path)["ObjID"].unique()
+        assert len(object_ids) == 1
+        assert object_ids[0] == "2010_TC209"
diff --git a/tests/sorcha/test_sorchaArguments.py b/tests/sorcha/test_sorchaArguments.py
@@ -14,6 +14,7 @@
     "verbose": False,
     "pointing_database": get_demo_filepath("baseline_v2.0_1yr.db"),
     "stats": "./test.csv",
+    "process_subset": (1, 1),
 }