Metagenjstyle (#40)
* broaden plot y-axis limit to 110 from 102 and add plots for clarity on daily coverage

* for metagen, change logging level to DEBUG, turn down logging for everything else, and move some messages to debug for clarity

* attempt fix for issue #33 and remove other unneeded f-string formatting

* black lines for all plots, standard DPI and fix issue #33

* refactor: move single INFO log setup to logging_helper.py

* improved styling of x-axis for metagen coverage plot

* better metagen log file format with dates

* minor changes to coverage plot to reduce the size and improve readability

* remove print as this may be truncating the display in Jupyter

* print location of plot for convenience

* separate test output for soundtrap local/s3

* better formatting of coverage plot

* ruff reformat

* exclude metadata needed for HMD plots from coverage plots

* more OS-agnostic changes for the metagen test and remove an unused import

* ruff reformat

* fix pandas SettingWithCopyWarning warning

* more work on removing pandas warnings

* make edge color black and save coverage with instrument prefix

* ruff reformat

* lint check

* lighter gray face color for coverage and ruff format

* minor grammar fix
danellecline authored Sep 15, 2024
1 parent 781b139 commit 1f83952
Showing 10 changed files with 137 additions and 71 deletions.
24 changes: 24 additions & 0 deletions pbp/logging_helper.py
@@ -45,3 +45,27 @@ def create_logger(
)

return log


def create_logger_info(log_filename: str):
"""
Create a logger with INFO level for console and file and simple format (no log level).
Also logs to a file all messages at DEBUG level and above.
Best used for scripts that don't need DEBUG level logging to the console.
:param log_filename:
The name of the log file to create
"""
loguru.logger.remove()
log = copy.deepcopy(loguru.logger)
info_format = "{message}"
default_format = "{time} {level} {message}"
log.add(
sys.stdout,
level="INFO",
format=info_format,
filter=lambda record: record["level"].name == "INFO",
)
log.add(
sink=open(log_filename, "w"), level="DEBUG", format=default_format, enqueue=True
)
return log
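
A minimal usage sketch for the new helper — the calling script and log path below are hypothetical, not part of this change:

from pbp.logging_helper import create_logger_info

# Console shows INFO messages only (plain format); the file captures DEBUG and above.
log = create_logger_info("output/metadata_generation.log")
log.info("Generating metadata...")        # printed to the console and written to the file
log.debug("Resolved 42 candidate files")  # written to the log file only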
17 changes: 6 additions & 11 deletions pbp/main_meta_generator.py
@@ -2,11 +2,13 @@
from datetime import datetime
from pathlib import Path

from pbp.logging_helper import create_logger_info
from pbp.meta_gen.gen_nrs import NRSMetadataGenerator
from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator
from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator
from pbp.main_meta_generator_args import parse_arguments


# Some imports, in particular involving data processing, cause a delay that is
# noticeable when just running the --help option. We get around this issue by
# postponing the imports until actually needed. See the main() function.
@@ -15,17 +17,6 @@
def main():
opts = parse_arguments()

# pylint: disable=import-outside-toplevel
from pbp.logging_helper import create_logger

log = create_logger(
log_filename_and_level=(
f"{opts.output_dir}/{opts.recorder}{opts.start}_{opts.end}.log",
"INFO",
),
console_level="INFO",
)

log_dir = Path(opts.output_dir)
json_dir = Path(opts.json_base_dir)
if opts.xml_dir is None:
@@ -42,6 +33,10 @@ def main():
start = datetime.strptime(opts.start, "%Y%m%d")
end = datetime.strptime(opts.end, "%Y%m%d")

log = create_logger_info(
f"{opts.output_dir}/{opts.recorder}{opts.start:%Y%m%d}_{opts.end:%Y%m%d}.log"
)

try:
if opts.recorder == "NRS":
generator = NRSMetadataGenerator(
2 changes: 0 additions & 2 deletions pbp/main_plot.py
@@ -113,8 +113,6 @@ def main():
jpeg_filename=jpeg_filename,
show=show,
)
if jpeg_filename is not None:
print(f" done: {jpeg_filename}")

@carueda (Member) commented on Sep 16, 2024:

certainly not critical but this is the CLI program, so wonder why remove this print?



if __name__ == "__main__":
16 changes: 11 additions & 5 deletions pbp/meta_gen/gen_iclisten.py
@@ -138,7 +138,7 @@ def run(self):
)
)

self.log.info(
self.log.debug(
f"{self.log_prefix} Found {len(wav_files)} files to process that "
f"cover the expanded period {start_dt} - {end_dt}"
)
@@ -154,8 +154,8 @@ def run(self):
wav_files.sort(key=lambda x: x.start)

# create a dataframe from the wav files
self.log.info(
f"{self.log_prefix} Creating dataframe from {len(wav_files)} files "
self.log.debug(
f"{self.log_prefix} creating dataframe from {len(wav_files)} files "
f"spanning {wav_files[0].start} to {wav_files[-1].start}..."
)

@@ -180,10 +180,16 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

# plot the daily coverage only for files that start on or after the start date
# this is to avoid plotting coverage for files included only for overlap
plot_file = plot_daily_coverage(
InstrumentType.ICLISTEN, self.df, self.json_base_dir, self.start, self.end
InstrumentType.ICLISTEN,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
)
self.log.info(f"Plot file: {plot_file}")
self.log.info(f"Coverage plot saved to {plot_file}")


if __name__ == "__main__":
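
The filtering idea used above, shown in isolation — a hedged sketch with a hypothetical metadata DataFrame; rows pulled in only for day-boundary overlap are dropped before computing coverage:

import pandas as pd

# One file from the day before (included only for overlap) plus two from the requested period.
df = pd.DataFrame(
    {
        "start": pd.to_datetime(
            ["2024-09-14 23:50", "2024-09-15 00:00", "2024-09-15 00:10"]
        ),
        "duration_secs": [600, 600, 600],
    }
)
period_start = pd.Timestamp("2024-09-15")

# Keep only files that begin on or after the requested start so the
# coverage plot is not inflated by the overlap-only file.
plot_df = df[df["start"] >= period_start]
print(plot_df)  # two rows remain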
17 changes: 11 additions & 6 deletions pbp/meta_gen/gen_nrs.py
@@ -83,7 +83,7 @@ def run(self):
if f_dt is None:
continue
if start_dt <= f_dt <= end_dt:
self.log.info(f"Found file {filename} with timestamp {f_dt}")
self.log.debug(f"Found file {filename} with timestamp {f_dt}")
if ext == "*.flac":
sound_files.append(FlacFile(self.log, str(filename), f_dt))
if ext == "*.wav":
@@ -102,13 +102,13 @@ def run(self):
if f_dt is None:
continue
if start_dt <= f_dt <= end_dt:
self.log.info(f"Found file {blob.name} with timestamp {f_dt}")
self.log.debug(f"Found file {blob.name} with timestamp {f_dt}")
if re.search(r"\.flac$", blob.name):
sound_files.append(FlacFile(self.log, f_path, f_dt))
if re.search(r"\.wav$", blob.name):
sound_files.append(WavFile(self.log, f_path, f_dt))
# delay to avoid 400 error
if i % 100 == 0:
if i % 100 == 0 and i > 0:
self.log.info(
f"{i} files searched...found {len(sound_files)} files that match the search pattern"
)
@@ -135,7 +135,7 @@ def run(self):
for day in pd.date_range(self.start, self.end, freq="D"):
try:
# create a dataframe from the flac files
self.log.info(
self.log.debug(
f"Creating dataframe from {len(sound_files)} "
f"files spanning {sound_files[0].start} to {sound_files[-1].start} in self.json_base_dir..."
)
@@ -155,9 +155,14 @@ def run(self):
except Exception as ex:
self.log.exception(str(ex))

# plot the daily coverage
# plot the daily coverage only for files that start on or after the start date
# this is to avoid plotting coverage for files included only for overlap
plot_file = plot_daily_coverage(
InstrumentType.NRS, self.df, self.json_base_dir, self.start, self.end
InstrumentType.NRS,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
)
self.log.info(f"Coverage plot saved to {plot_file}")

10 changes: 5 additions & 5 deletions pbp/meta_gen/gen_soundtrap.py
@@ -121,7 +121,7 @@ def run(self):
else:
# if the audio_loc is a s3 url, then we need to list the files in buckets that cover the start and end
# dates
self.log.info(f"Searching between {start_dt} and {end_dt}")
self.log.debug(f"Searching between {start_dt} and {end_dt}")

client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
paginator = client.get_paginator("list_objects")
@@ -148,7 +148,7 @@ def run(self):
if start_dt <= key_dt <= end_dt and key.endswith(".wav"):
# download the associated xml file to the wav file and create a SoundTrapWavFile object
try:
self.log.info(f"Downloading {key_xml} ...")
self.log.debug(f"Downloading {key_xml} ...")
client.download_file(bucket, key_xml, xml_path)
wav_files.append(SoundTrapWavFile(uri, xml_path, key_dt))
except Exception as ex:
@@ -158,7 +158,7 @@ def run(self):
continue

self.log.info(
f"Found {len(wav_files)} files to process that cover the expanded period {start_dt} - {end_dt}"
f"Found {len(wav_files)} files to process that covers the expanded period {start_dt} - {end_dt}"
)

if len(wav_files) == 0:
@@ -168,7 +168,7 @@ def run(self):
wav_files.sort(key=lambda x: x.start)

# create a dataframe from the wav files
self.log.info(
self.log.debug(
f"Creating dataframe from {len(wav_files)} files spanning "
f"{wav_files[0].start} to {wav_files[-1].start}..."
)
@@ -206,7 +206,7 @@ def run(self):
# plot the daily coverage
plot_file = plot_daily_coverage(
InstrumentType.SOUNDTRAP,
self.df,
self.df[self.df["start"] >= self.start],
self.json_base_dir,
self.start,
self.end,
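
For context, a stripped-down sketch of the anonymous S3 listing pattern used in this generator — the bucket name and prefix are placeholders:

import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Unsigned client for a public bucket; the paginator avoids the per-request key limit.
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
paginator = client.get_paginator("list_objects")

wav_keys = []
for page in paginator.paginate(Bucket="example-public-bucket", Prefix="soundtrap/"):
    for obj in page.get("Contents", []):
        if obj["Key"].endswith(".wav"):
            wav_keys.append(obj["Key"])
print(f"Found {len(wav_keys)} wav objects")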
16 changes: 5 additions & 11 deletions pbp/meta_gen/json_generator.py
@@ -71,10 +71,6 @@ def run(self):
| ((self.raw_df["end"] >= self.day) & (self.raw_df["start"] < self.day))
]

self.log.info(
f"Creating metadata for day {self.day} from {len(day_df)} files..."
)

if len(day_df) == 0:
self.log.warning(f"No metadata found for day {self.day}")
return
@@ -85,7 +81,7 @@ def run(self):
day_df["end"] = pd.to_datetime(day_df["end"])

# get the file list that covers the requested day
self.log.info(
self.log.debug(
f'Found {len(day_df)} files for day {self.day}, between {day_df.iloc[0]["start"]} and {day_df.iloc[-1]["end"]}'
)

@@ -159,10 +155,6 @@ def run(self):

except Exception as e:
self.log.exception(f"Error correcting metadata for {self.day}. {e}")
finally:
self.log.info(
f"Done correcting metadata for {self.day}. Saved to {self.json_base_dir}"
)

def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:
"""
@@ -172,7 +164,7 @@ def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame:
:return:
The corrected dataframe
"""
self.log.info(
self.log.debug(
"Using file start times as is, setting jitter to 0 and calculating end times."
)
# calculate the difference between each row start time and save as diff in a copy of the dataframe
@@ -236,4 +228,6 @@ def save_day(self, day: datetime.datetime, day_df: pd.DataFrame, prefix: str = "
output_path = Path(self.json_base_dir, str(day.year))
output_path.mkdir(parents=True, exist_ok=True)
shutil.copy2(temp_metadata.as_posix(), output_path)
self.log.info(f"Wrote {output_path}/{temp_metadata.name}")
self.log.info(
f"Done correcting metadata for {self.day}. Saved to {output_path}/{temp_metadata.name}"
)
69 changes: 50 additions & 19 deletions pbp/meta_gen/utils.py
@@ -7,8 +7,13 @@
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import NullLocator

from pbp.plot_const import DEFAULT_DPI


class InstrumentType:
@@ -108,37 +113,63 @@ def plot_daily_coverage(
:param end: The end date of the recordings
:return: The path to the plot file
"""
# Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage,
# which is percent of the day covered by recordings
# Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage.
# This is percent of the day covered by recordings
plt.rcParams["text.usetex"] = False
df["duration"] = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start", "duration"]].copy()
plt.rcParams["axes.edgecolor"] = "black"
duration = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start"]].copy()
ts_df["duration"] = duration
ts_df.set_index("start", inplace=True)
daily_sum_df = ts_df.resample("D").sum()
daily_sum_df["coverage"] = 100 * daily_sum_df["duration"] / 86400
daily_sum_df["coverage"] = daily_sum_df[
"coverage"
].round() # round to nearest integer
plot = daily_sum_df["coverage"].plot()
plot.set_ylabel("Daily % Recording")
plot.set_xlabel("Date")
if len(daily_sum_df) == 1:
# Add a row with a NaN coverage before and after the single day to avoid matplotlib
# warnings about automatically expanding the x-axis
daily_sum_df.loc[daily_sum_df.index[0] - pd.DateOffset(days=1)] = np.nan
daily_sum_df.loc[daily_sum_df.index[0] + pd.DateOffset(days=1)] = np.nan
plot = daily_sum_df["coverage"].plot(
linestyle="-",
markerfacecolor="none",
marker="o",
color="b",
markersize=5,
linewidth=1,
figsize=(8, 4),
)
plot.set_ylabel("Daily % Recording", fontsize=8)
plot.set_xlabel("Date", fontsize=8)
plot.set_xticks(daily_sum_df.index.values)
plot.set_ylim(0, 102)
# Angle the x-axis labels for better readability and force them to be in the format YYYY-MM-DD
plot.set_xticklabels([x.strftime("%Y-%m-%d") for x in daily_sum_df.index])
plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment="right")
plot.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
# Maximum 15 ticks on the x-axis
# plot.xaxis.set_major_locator(
# MaxNLocator(nbins=min(15, len(daily_sum_df.index.values) - 1))
# )
plot.axes.set_facecolor("#E4E4F1")
# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)
# Set both x and y axis tick label font size to 6
plot.tick_params(axis="both", which="major", labelsize=6)
# Disable the minor ticks on the x-axis using NullLocator, as they are not needed
plot.xaxis.set_minor_locator(NullLocator())
# Set the y-axis limits to 0-110 to avoid the plot being too close to the top
plot.set_ylim(0, 110)
# Adjust the title based on the instrument type
if instrument_type == InstrumentType.NRS:
plot.set_title("Daily Coverage of NRS Recordings")
plot.set_title("Daily Coverage of NRS Recordings", fontsize=11)
elif instrument_type == InstrumentType.ICLISTEN:
plot.set_title("Daily Coverage of icListen Recordings")
plot.set_title("Daily Coverage of icListen Recordings", fontsize=11)
elif instrument_type == InstrumentType.SOUNDTRAP:
plot.set_title("Daily Coverage of SoundTrap Recordings")
plot_file = Path(base_dir) / f"soundtrap_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg"
dpi = 300
plot.set_title("Daily Coverage of SoundTrap Recordings", fontsize=11)
plot_file = (
Path(base_dir)
/ f"{str(instrument_type).lower()}_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg"
)
fig = plot.get_figure()
fig.set_size_inches(10, 5)
fig.set_dpi(dpi)
fig.savefig(plot_file.as_posix(), bbox_inches="tight")
fig.autofmt_xdate()
fig.savefig(plot_file.as_posix(), dpi=DEFAULT_DPI, bbox_inches="tight")
plt.close(fig)
return plot_file.as_posix()
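
The core coverage computation from plot_daily_coverage, shown in isolation with a hypothetical DataFrame — the explicit .copy() is what keeps pandas from raising SettingWithCopyWarning on the derived frame:

import pandas as pd

# Hypothetical file metadata: three 6-hour recordings on one day, one 12-hour recording the next.
df = pd.DataFrame(
    {
        "start": pd.to_datetime(
            ["2024-09-15 00:00", "2024-09-15 08:00", "2024-09-15 16:00", "2024-09-16 00:00"]
        ),
        "end": pd.to_datetime(
            ["2024-09-15 06:00", "2024-09-15 14:00", "2024-09-15 22:00", "2024-09-16 12:00"]
        ),
    }
)

duration = (df["end"] - df["start"]).dt.total_seconds()
ts_df = df[["start"]].copy()  # .copy() avoids SettingWithCopyWarning
ts_df["duration"] = duration
ts_df.set_index("start", inplace=True)

daily = ts_df.resample("D").sum()
daily["coverage"] = (100 * daily["duration"] / 86400).round()
print(daily["coverage"])  # 75.0 for 2024-09-15, 50.0 for 2024-09-16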
4 changes: 4 additions & 0 deletions pbp/plotting.py
@@ -39,6 +39,9 @@ def plot_dataset_summary(
:param jpeg_filename: If given, filename to save the plot to.
:param show: Whether to show the plot.
"""
plt.rcParams["text.usetex"] = False
plt.rcParams["axes.edgecolor"] = "black"

# Transpose psd array for plotting
da = xr.DataArray.transpose(ds.psd)

@@ -151,6 +154,7 @@ def plot_dataset_summary(
plt.gcf().text(0.65, 0.91, "UTC")

if jpeg_filename is not None:
print(f"Saving plot to {jpeg_filename}")
plt.savefig(jpeg_filename, dpi=dpi)
if show:
plt.show()