From 96ffc70d7a32b8317d3078135a087d9044072e00 Mon Sep 17 00:00:00 2001 From: Stephen Polcyn Date: Thu, 2 May 2024 15:39:10 -0400 Subject: [PATCH 1/5] Add download option to skip accession numbers This improves efficiency when some filings have already been downloaded to another location. --- sec_edgar_downloader/_Downloader.py | 3 +++ sec_edgar_downloader/_orchestrator.py | 3 +++ sec_edgar_downloader/_types.py | 1 + 3 files changed, 7 insertions(+) diff --git a/sec_edgar_downloader/_Downloader.py b/sec_edgar_downloader/_Downloader.py index d88e2e5..5828851 100644 --- a/sec_edgar_downloader/_Downloader.py +++ b/sec_edgar_downloader/_Downloader.py @@ -67,6 +67,7 @@ def get( before: Optional[Date] = None, include_amends: bool = False, download_details: bool = False, + skip_accession_numbers: Optional[set[str]] = None, ) -> int: """Download filings and save them to disk. @@ -84,6 +85,7 @@ def get( Defaults to False. :param download_details: denotes whether to download human-readable and easily parseable filing detail documents (e.g. form 4 XML, 8-K HTML). Defaults to False. + :param skip_accession_numbers: Set of accession numbers to skip when downloading. :return: number of filings downloaded. 
Usage:: @@ -173,6 +175,7 @@ def get( download_details, # Save ticker if passed in to form file system path for saving filings ticker=ticker_or_cik if not is_cik(ticker_or_cik) else None, + skip_accession_numbers=skip_accession_numbers, ), self.user_agent, ) diff --git a/sec_edgar_downloader/_orchestrator.py b/sec_edgar_downloader/_orchestrator.py index 223f827..b5d5809 100644 --- a/sec_edgar_downloader/_orchestrator.py +++ b/sec_edgar_downloader/_orchestrator.py @@ -130,6 +130,9 @@ def get_to_download(cik: str, acc_num: str, doc: str) -> ToDownload: def fetch_and_save_filings(download_metadata: DownloadMetadata, user_agent: str) -> int: successfully_downloaded = 0 to_download = aggregate_filings_to_download(download_metadata, user_agent) + if download_metadata.skip_accession_numbers is not None: + to_download = [td for td in to_download if td.accession_number not in download_metadata.skip_accession_numbers] + for td in to_download: try: save_location = get_save_location( diff --git a/sec_edgar_downloader/_types.py b/sec_edgar_downloader/_types.py index 5dd6c4f..1f900b2 100644 --- a/sec_edgar_downloader/_types.py +++ b/sec_edgar_downloader/_types.py @@ -20,6 +20,7 @@ class DownloadMetadata: include_amends: bool = False download_details: bool = False ticker: Optional[str] = None + skip_accession_numbers: Optional[set[str]] = None @dataclass From dee72adad3d1fa900988589b59bc381890472c16 Mon Sep 17 00:00:00 2001 From: Stephen Polcyn Date: Thu, 20 Jun 2024 10:28:47 -0400 Subject: [PATCH 2/5] Use `Set` from typing and improve var name --- sec_edgar_downloader/_Downloader.py | 8 ++++---- sec_edgar_downloader/_orchestrator.py | 4 ++-- sec_edgar_downloader/_types.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sec_edgar_downloader/_Downloader.py b/sec_edgar_downloader/_Downloader.py index 5828851..fee84b9 100644 --- a/sec_edgar_downloader/_Downloader.py +++ b/sec_edgar_downloader/_Downloader.py @@ -1,6 +1,6 @@ import sys from pathlib import Path 
-from typing import ClassVar, List, Optional +from typing import ClassVar, List, Optional, Set from ._constants import DEFAULT_AFTER_DATE, DEFAULT_BEFORE_DATE from ._constants import SUPPORTED_FORMS as _SUPPORTED_FORMS @@ -67,7 +67,7 @@ def get( before: Optional[Date] = None, include_amends: bool = False, download_details: bool = False, - skip_accession_numbers: Optional[set[str]] = None, + accession_numbers_to_skip: Optional[Set[str]] = None, ) -> int: """Download filings and save them to disk. @@ -85,7 +85,7 @@ def get( Defaults to False. :param download_details: denotes whether to download human-readable and easily parseable filing detail documents (e.g. form 4 XML, 8-K HTML). Defaults to False. - :param skip_accession_numbers: Set of accession numbers to skip when downloading. + :param accession_numbers_to_skip: Set of accession numbers to skip when downloading. :return: number of filings downloaded. Usage:: @@ -175,7 +175,7 @@ def get( download_details, # Save ticker if passed in to form file system path for saving filings ticker=ticker_or_cik if not is_cik(ticker_or_cik) else None, - skip_accession_numbers=skip_accession_numbers, + accession_numbers_to_skip=accession_numbers_to_skip, ), self.user_agent, ) diff --git a/sec_edgar_downloader/_orchestrator.py b/sec_edgar_downloader/_orchestrator.py index b5d5809..57d3c5e 100644 --- a/sec_edgar_downloader/_orchestrator.py +++ b/sec_edgar_downloader/_orchestrator.py @@ -130,8 +130,8 @@ def get_to_download(cik: str, acc_num: str, doc: str) -> ToDownload: def fetch_and_save_filings(download_metadata: DownloadMetadata, user_agent: str) -> int: successfully_downloaded = 0 to_download = aggregate_filings_to_download(download_metadata, user_agent) - if download_metadata.skip_accession_numbers is not None: - to_download = [td for td in to_download if td.accession_number not in download_metadata.skip_accession_numbers] + if download_metadata.accession_numbers_to_skip is not None: + to_download = [td for td in to_download 
if td.accession_number not in download_metadata.accession_numbers_to_skip] for td in to_download: try: diff --git a/sec_edgar_downloader/_types.py b/sec_edgar_downloader/_types.py index 1f900b2..8739b9a 100644 --- a/sec_edgar_downloader/_types.py +++ b/sec_edgar_downloader/_types.py @@ -20,7 +20,7 @@ class DownloadMetadata: include_amends: bool = False download_details: bool = False ticker: Optional[str] = None - skip_accession_numbers: Optional[set[str]] = None + accession_numbers_to_skip: Optional[set[str]] = None @dataclass From c5d826c084080ceab4eedc7fd6638c47cc705e26 Mon Sep 17 00:00:00 2001 From: Stephen Polcyn Date: Thu, 20 Jun 2024 10:32:04 -0400 Subject: [PATCH 3/5] Fix typing annotation --- sec_edgar_downloader/_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sec_edgar_downloader/_types.py b/sec_edgar_downloader/_types.py index 8739b9a..7345a3b 100644 --- a/sec_edgar_downloader/_types.py +++ b/sec_edgar_downloader/_types.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from datetime import date, datetime from pathlib import Path -from typing import Optional, Union +from typing import Optional, Set, Union from ._constants import DEFAULT_AFTER_DATE, DEFAULT_BEFORE_DATE @@ -20,7 +20,7 @@ class DownloadMetadata: include_amends: bool = False download_details: bool = False ticker: Optional[str] = None - accession_numbers_to_skip: Optional[set[str]] = None + accession_numbers_to_skip: Optional[Set[str]] = None @dataclass From 378443cc71468d986d011a3463ad1537be6a4f95 Mon Sep 17 00:00:00 2001 From: Stephen Polcyn Date: Thu, 20 Jun 2024 12:31:48 -0400 Subject: [PATCH 4/5] Add test for skipping accession numbers --- tests/test_orchestrator.py | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 3e40a68..c9b6a48 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -313,6 +313,46 @@ def 
test_fetch_and_save_filings_given_paths_that_already_exist( assert mock_save_document.call_count == 0 +def test_fetch_and_save_filings_given_accession_numbers_to_skip( + user_agent, form_10k, apple_cik +): + limit = 2 + download_metadata = DownloadMetadata( + download_folder=Path("."), + form=form_10k, + cik=apple_cik, + limit=limit, + after=DEFAULT_AFTER_DATE, + before=DEFAULT_BEFORE_DATE, + include_amends=False, + download_details=False, + accession_numbers_to_skip={"acc_num_0"}, + ) + + to_download_list = [ + ToDownload( + raw_filing_uri=f"raw_{i}", + primary_doc_uri=f"pd_{i}", + accession_number=f"acc_num_{i}", + details_doc_suffix=".xml", + ) + for i in range(limit) + ] + + with patch( + "sec_edgar_downloader._orchestrator.aggregate_filings_to_download", + new=lambda x, y: to_download_list, + ), patch( + "sec_edgar_downloader._orchestrator.download_filing", autospec=True + ) as mock_download_filing, patch( + "sec_edgar_downloader._orchestrator.save_document", autospec=True + ) as mock_save_document: + num_downloaded = fetch_and_save_filings(download_metadata, user_agent) + + assert num_downloaded == 1 + assert mock_download_filing.call_count == 1 + assert mock_save_document.call_count == 1 + def test_fetch_and_save_filings_given_exception(user_agent, form_10k, apple_cik): limit = 2 download_metadata = DownloadMetadata( From e11e67ff590976b451a0d3a9705978d9221ddc57 Mon Sep 17 00:00:00 2001 From: Stephen Polcyn Date: Thu, 20 Jun 2024 12:35:45 -0400 Subject: [PATCH 5/5] Apply linter It was previously not fully initialized --- sec_edgar_downloader/_orchestrator.py | 6 +++++- tests/test_orchestrator.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sec_edgar_downloader/_orchestrator.py b/sec_edgar_downloader/_orchestrator.py index 57d3c5e..ca9f532 100644 --- a/sec_edgar_downloader/_orchestrator.py +++ b/sec_edgar_downloader/_orchestrator.py @@ -131,7 +131,11 @@ def fetch_and_save_filings(download_metadata: DownloadMetadata, user_agent: str) 
successfully_downloaded = 0 to_download = aggregate_filings_to_download(download_metadata, user_agent) if download_metadata.accession_numbers_to_skip is not None: - to_download = [td for td in to_download if td.accession_number not in download_metadata.accession_numbers_to_skip] + to_download = [ + td + for td in to_download + if td.accession_number not in download_metadata.accession_numbers_to_skip + ] for td in to_download: try: diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index c9b6a48..34ff520 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -353,6 +353,7 @@ def test_fetch_and_save_filings_given_accession_numbers_to_skip( assert mock_download_filing.call_count == 1 assert mock_save_document.call_count == 1 + def test_fetch_and_save_filings_given_exception(user_agent, form_10k, apple_cik): limit = 2 download_metadata = DownloadMetadata(