Metagenjstyle (#40)
* broaden plot y-axis limit to 110 from 102 and add plots for clarity on daily coverage

* for metagen, change logging level to DEBUG, turn down logging for everything else, and move some messages to debug for clarity

* attempt fix for issue #33 and remove other unneeded f-string formatting

* black lines for all plots, standard DPI and fix issue #33

* refactor: move single INFO log setup to logging_helper.py

* improved styling of x-axis for metagen coverage plot

* better metagen log file format with dates

* minor changes to coverage plot to reduce the size and improve readability

* remove print as this may be truncating the display in Jupyter

* print location of plot for convenience

* separate test output for soundtrap local/s3

* better formatting of coverage plot

* ruff reformat

* exclude metadata needed for HMD plots from coverage plots

* more OS-agnostic changes for the metagen test and remove an unused import

* ruff reformat

* fix pandas SettingWithCopyWarning warning

* more work on removing pandas warnings

* make edge color black and save coverage with instrument prefix

* ruff reformat

* lint check

* lighter gray face color for coverage and ruff format

* minor grammar fix
danellecline authored Sep 15, 2024
1 parent 781b139 commit 1f83952
Showing 10 changed files with 137 additions and 71 deletions.
24 changes: 24 additions & 0 deletions pbp/logging_helper.py
@@ -45,3 +45,27 @@ def create_logger(
)

return log


def create_logger_info(log_filename: str):
"""
Create a logger with INFO level for console and file and simple format (no log level).
Also logs to a file all messages at DEBUG level and above.
Best used for scripts that don't need DEBUG level logging to the console.
:param log_filename:
The name of the log file to create
"""
loguru.logger.remove()
log = copy.deepcopy(loguru.logger)
info_format = "{message}"
default_format = "{time} {level} {message}"
log.add(
sys.stdout,
level="INFO",
format=info_format,
filter=lambda record: record["level"].name == "INFO",
)
log.add(
sink=open(log_filename, "w"), level="DEBUG", format=default_format, enqueue=True
)
return log
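
A minimal usage sketch for the new helper — the calling script and log path below are hypothetical, not part of this change:

from pbp.logging_helper import create_logger_info

# Console shows INFO messages only (plain format); the file captures DEBUG and above.
log = create_logger_info("output/metadata_generation.log")
log.info("Generating metadata...")        # printed to the console and written to the file
log.debug("Resolved 42 candidate files")  # written to the log file only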
17 changes: 6 additions & 11 deletions pbp/main_meta_generator.py
@@ -2,11 +2,13 @@
from datetime import datetime
from pathlib import Path

from pbp.logging_helper import create_logger_info
from pbp.meta_gen.gen_nrs import NRSMetadataGenerator
from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator
from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
from pbp.main_meta_generator_args import parse_arguments


# Some imports, in particular involving data processing, cause a delay that is
# noticeable when just running the --help option. We get around this issue by
# postponing the imports until actually needed. See the main() function.
@@ -15,17 +17,6 @@
def main():
opts = parse_arguments()

# pylint: disable=import-outside-toplevel
from pbp.logging_helper import create_logger

log = create_logger(
log_filename_and_level=(
f"{opts.output_dir}/{opts.recorder}{opts.start}_{opts.end}.log",
"INFO",
),
console_level="INFO",
)

log_dir = Path(opts.output_dir)
json_dir = Path(opts.json_base_dir)
if opts.xml_dir is None:
@@ -42,6 +33,10 @@ def main():
start = datetime.strptime(opts.start, "%Y%m%d")
end = datetime.strptime(opts.end, "%Y%m%d")

log = create_logger_info(
f"{opts.output_dir}/{opts.recorder}{opts.start:%Y%m%d}_{opts.end:%Y%m%d}.log"
)

try:
if opts.recorder == "NRS":
generator = NRSMetadataGenerator(
2 changes: 0 additions & 2 deletions pbp/main_plot.py
@@ -113,8 +113,6 @@ def main():
jpeg_filename=jpeg_filename,
show=show,
)
if jpeg_filename is not None:
print(f" done: {jpeg_filename}")

@carueda (Member) commented on Sep 16, 2024:

certainly not critical but this is the CLI program, so wonder why remove this print?



if __name__ == "__main__":
16 changes: 11 additions & 5 deletions pbp/meta_gen/gen_iclisten.py
@@ -138,7 +138,7 @@ def run(self):
)
)

self.log.info(
self.log.debug(
f"{self.log_prefix} Found {len(wav_files)} files to process that "
f"cover the expanded period {start_dt} - {end_dt}"
)
@@ -154,8 +154,8 @@ def run(self):
wav_files.sort(key=lambda x: x.start)

# create a dataframe from the wav files
self.log.info(
f"{self.log_prefix} Creating dataframe from {len(wav_files)} files "
self.log.debug(
f"{self.log_prefix} creating dataframe from {len(wav_files)} files "
f"spanning {wav_files[0].start} to {wav_files[-1].start}..."
)

@@ -180,10 +180,16 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

# plot the daily coverage only for files that start on or after the start date
# this is to avoid plotting coverage for files included only for overlap
plot_file = plot_daily_coverage(
InstrumentType.ICLISTEN, self.df, self.json_base_dir, self.start, self.end
InstrumentType.ICLISTEN,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
)
self.log.info(f"Plot file: {plot_file}")
self.log.info(f"Coverage plot saved to {plot_file}")


if __name__ == "__main__":
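
The filtering idea used above, shown in isolation — a hedged sketch with a hypothetical metadata DataFrame; rows pulled in only for day-boundary overlap are dropped before computing coverage:

import pandas as pd

# One file from the day before (included only for overlap) plus two from the requested period.
df = pd.DataFrame(
    {
        "start": pd.to_datetime(
            ["2024-09-14 23:50", "2024-09-15 00:00", "2024-09-15 00:10"]
        ),
        "duration_secs": [600, 600, 600],
    }
)
period_start = pd.Timestamp("2024-09-15")

# Keep only files that begin on or after the requested start so the
# coverage plot is not inflated by the overlap-only file.
plot_df = df[df["start"] >= period_start]
print(plot_df)  # two rows remain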
17 changes: 11 additions & 6 deletions pbp/meta_gen/gen_nrs.py
@@ -83,7 +83,7 @@ def run(self):
if f_dt is None:
continue
if start_dt <= f_dt <= end_dt:
self.log.info(f"Found file {filename} with timestamp {f_dt}")
self.log.debug(f"Found file {filename} with timestamp {f_dt}")
if ext == "*.flac":
sound_files.append(FlacFile(self.log, str(filename), f_dt))
if ext == "*.wav":
@@ -102,13 +102,13 @@ def run(self):
if f_dt is None:
continue
if start_dt <= f_dt <= end_dt:
self.log.info(f"Found file {blob.name} with timestamp {f_dt}")
self.log.debug(f"Found file {blob.name} with timestamp {f_dt}")
if re.search(r"\.flac$", blob.name):
sound_files.append(FlacFile(self.log, f_path, f_dt))
if re.search(r"\.wav$", blob.name):
sound_files.append(WavFile(self.log, f_path, f_dt))
# delay to avoid 400 error
if i % 100 == 0:
if i % 100 == 0 and i > 0:
self.log.info(
f"{i} files searched...found {len(sound_files)} files that match the search pattern"
)
@@ -135,7 +135,7 @@ def run(self):
for day in pd.date_range(self.start, self.end, freq="D"):
try:
# create a dataframe from the flac files
self.log.info(
self.log.debug(
f"Creating dataframe from {len(sound_files)} "
f"files spanning {sound_files[0].start} to {sound_files[-1].start} in self.json_base_dir..."
)
@@ -155,9 +155,14 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

# plot the daily coverage
# plot the daily coverage only for files that start on or after the start date
# this is to avoid plotting coverage for files included only for overlap
plot_file = plot_daily_coverage(
InstrumentType.NRS, self.df, self.json_base_dir, self.start, self.end
InstrumentType.NRS,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
)
self.log.info(f"Coverage plot saved to {plot_file}")

10 changes: 5 additions & 5 deletions pbp/meta_gen/gen_soundtrap.py
@@ -121,7 +121,7 @@ def run(self):
else:
# if the audio_loc is a s3 url, then we need to list the files in buckets that cover the start and end
# dates
self.log.info(f"Searching between {start_dt} and {end_dt}")
self.log.debug(f"Searching between {start_dt} and {end_dt}")

client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
paginator = client.get_paginator("list_objects")
@@ -148,7 +148,7 @@ def run(self):
if start_dt <= key_dt <= end_dt and key.endswith(".wav"):
# download the associated xml file to the wav file and create a SoundTrapWavFile object
try:
self.log.info(f"Downloading {key_xml} ...")
self.log.debug(f"Downloading {key_xml} ...")
client.download_file(bucket, key_xml, xml_path)
wav_files.append(SoundTrapWavFile(uri, xml_path, key_dt))
except Exception as ex:
@@ -158,7 +158,7 @@ def run(self):
continue

self.log.info(
f"Found {len(wav_files)} files to process that cover the expanded period {start_dt} - {end_dt}"
f"Found {len(wav_files)} files to process that covers the expanded period {start_dt} - {end_dt}"
)

if len(wav_files) == 0:
@@ -168,7 +168,7 @@ def run(self):
wav_files.sort(key=lambda x: x.start)

# create a dataframe from the wav files
self.log.info(
self.log.debug(
f"Creating dataframe from {len(wav_files)} files spanning "
f"{wav_files[0].start} to {wav_files[-1].start}..."
)
@@ -206,7 +206,7 @@ def run(self):
# plot the daily coverage
plot_file = plot_daily_coverage(
InstrumentType.SOUNDTRAP,
self.df,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
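
For context, a stripped-down sketch of the anonymous S3 listing pattern used in this generator — the bucket name and prefix are placeholders:

import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Unsigned client for a public bucket; the paginator avoids the per-request key limit.
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
paginator = client.get_paginator("list_objects")

wav_keys = []
for page in paginator.paginate(Bucket="example-public-bucket", Prefix="soundtrap/"):
    for obj in page.get("Contents", []):
        if obj["Key"].endswith(".wav"):
            wav_keys.append(obj["Key"])
print(f"Found {len(wav_keys)} wav objects")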
16 changes: 5 additions & 11 deletions pbp/meta_gen/json_generator.py
@@ -71,10 +71,6 @@ def run(self):
| ((self.raw_df["end"] >= self.day) & (self.raw_df["start"] < self.day))
]

self.log.info(
f"Creating metadata for day {self.day} from {len(day_df)} files..."
)

if len(day_df) == 0:
self.log.warning(f"No metadata found for day {self.day}")
return
@@ -85,7 +81,7 @@ def run(self):
day_df["end"] = pd.to_datetime(day_df["end"])

# get the file list that covers the requested day
self.log.info(
self.log.debug(
f'Found {len(day_df)} files for day {self.day}, between {day_df.iloc[0]["start"]} and {day_df.iloc[-1]["end"]}'
)

@@ -159,10 +155,6 @@ def run(self):

except Exception as e:
self.log.exception(f"Error correcting metadata for {self.day}. {e}")
finally:
self.log.info(
f"Done correcting metadata for {self.day}. Saved to {self.json_base_dir}"
)

def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:
"""
@@ -172,7 +164,7 @@ def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:
:return:
The corrected dataframe
"""
self.log.info(
self.log.debug(
"Using file start times as is, setting jitter to 0 and calculating end times."
)
# calculate the difference between each row start time and save as diff in a copy of the dataframe
@@ -236,4 +228,6 @@ def save_day(self, day: datetime.datetime, day_df: pd.DataFrame, prefix: str = "
output_path = Path(self.json_base_dir, str(day.year))
output_path.mkdir(parents=True, exist_ok=True)
shutil.copy2(temp_metadata.as_posix(), output_path)
self.log.info(f"Wrote {output_path}/{temp_metadata.name}")
self.log.info(
f"Done correcting metadata for {self.day}. Saved to {output_path}/{temp_metadata.name}"
)
69 changes: 50 additions & 19 deletions pbp/meta_gen/utils.py
@@ -7,8 +7,13 @@
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import NullLocator

from pbp.plot_const import DEFAULT_DPI


class InstrumentType:
@@ -108,37 +113,63 @@ def plot_daily_coverage(
:param end: The end date of the recordings
:return: The path to the plot file
"""
# Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage,
# which is percent of the day covered by recordings
# Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage.
# This is percent of the day covered by recordings
plt.rcParams["text.usetex"] = False
df["duration"] = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start", "duration"]].copy()
plt.rcParams["axes.edgecolor"] = "black"
duration = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start"]].copy()
ts_df["duration"] = duration
ts_df.set_index("start", inplace=True)
daily_sum_df = ts_df.resample("D").sum()
daily_sum_df["coverage"] = 100 * daily_sum_df["duration"] / 86400
daily_sum_df["coverage"] = daily_sum_df[
"coverage"
].round() # round to nearest integer
plot = daily_sum_df["coverage"].plot()
plot.set_ylabel("Daily % Recording")
plot.set_xlabel("Date")
if len(daily_sum_df) == 1:
# Add a row with a NaN coverage before and after the single day to avoid matplotlib
# warnings about automatically expanding the x-axis
daily_sum_df.loc[daily_sum_df.index[0] - pd.DateOffset(days=1)] = np.nan
daily_sum_df.loc[daily_sum_df.index[0] + pd.DateOffset(days=1)] = np.nan
plot = daily_sum_df["coverage"].plot(
linestyle="-",
markerfacecolor="none",
marker="o",
color="b",
markersize=5,
linewidth=1,
figsize=(8, 4),
)
plot.set_ylabel("Daily % Recording", fontsize=8)
plot.set_xlabel("Date", fontsize=8)
plot.set_xticks(daily_sum_df.index.values)
plot.set_ylim(0, 102)
# Angle the x-axis labels for better readability and force them to be in the format YYYY-MM-DD
plot.set_xticklabels([x.strftime("%Y-%m-%d") for x in daily_sum_df.index])
plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment="right")
plot.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
# Maximum 15 ticks on the x-axis
# plot.xaxis.set_major_locator(
# MaxNLocator(nbins=min(15, len(daily_sum_df.index.values) - 1))
# )
plot.axes.set_facecolor("#E4E4F1")
# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)
# Set both x and y axis tick label font size to 6
plot.tick_params(axis="both", which="major", labelsize=6)
# Disable the minor ticks on the x-axis using NullLocator, as they are not needed
plot.xaxis.set_minor_locator(NullLocator())
# Set the y-axis limits to 0-110 to avoid the plot being too close to the top
plot.set_ylim(0, 110)
# Adjust the title based on the instrument type
if instrument_type == InstrumentType.NRS:
plot.set_title("Daily Coverage of NRS Recordings")
plot.set_title("Daily Coverage of NRS Recordings", fontsize=11)
elif instrument_type == InstrumentType.ICLISTEN:
plot.set_title("Daily Coverage of icListen Recordings")
plot.set_title("Daily Coverage of icListen Recordings", fontsize=11)
elif instrument_type == InstrumentType.SOUNDTRAP:
plot.set_title("Daily Coverage of SoundTrap Recordings")
plot_file = Path(base_dir) / f"soundtrap_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg"
dpi = 300
plot.set_title("Daily Coverage of SoundTrap Recordings", fontsize=11)
plot_file = (
Path(base_dir)
/ f"{str(instrument_type).lower()}_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg"
)
fig = plot.get_figure()
fig.set_size_inches(10, 5)
fig.set_dpi(dpi)
fig.savefig(plot_file.as_posix(), bbox_inches="tight")
fig.autofmt_xdate()
fig.savefig(plot_file.as_posix(), dpi=DEFAULT_DPI, bbox_inches="tight")
plt.close(fig)
return plot_file.as_posix()
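
The core coverage computation from plot_daily_coverage, shown in isolation with a hypothetical DataFrame — the explicit .copy() is what keeps pandas from raising SettingWithCopyWarning on the derived frame:

import pandas as pd

# Hypothetical file metadata: three 6-hour recordings on one day, one 12-hour recording the next.
df = pd.DataFrame(
    {
        "start": pd.to_datetime(
            ["2024-09-15 00:00", "2024-09-15 08:00", "2024-09-15 16:00", "2024-09-16 00:00"]
        ),
        "end": pd.to_datetime(
            ["2024-09-15 06:00", "2024-09-15 14:00", "2024-09-15 22:00", "2024-09-16 12:00"]
        ),
    }
)

duration = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start"]].copy()  # .copy() avoids SettingWithCopyWarning
ts_df["duration"] = duration
ts_df.set_index("start", inplace=True)

daily = ts_df.resample("D").sum()
daily["coverage"] = (100 * daily["duration"] / 86400).round()
print(daily["coverage"])  # 75.0 for 2024-09-15, 50.0 for 2024-09-16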
4 changes: 4 additions & 0 deletions pbp/plotting.py
@@ -39,6 +39,9 @@ def plot_dataset_summary(
:param jpeg_filename: If given, filename to save the plot to.
:param show: Whether to show the plot.
"""
plt.rcParams["text.usetex"] = False
plt.rcParams["axes.edgecolor"] = "black"

# Transpose psd array for plotting
da = xr.DataArray.transpose(ds.psd)

@@ -151,6 +154,7 @@ def plot_dataset_summary(
plt.gcf().text(0.65, 0.91, "UTC")

if jpeg_filename is not None:
print(f"Saving plot to {jpeg_filename}")
plt.savefig(jpeg_filename, dpi=dpi)
if show:
plt.show()