From 7daed7b433222639fe8e6f145e88a9b2d9f4ef3c Mon Sep 17 00:00:00 2001 From: Misi <61704770+MilanVarady@users.noreply.github.com> Date: Mon, 4 Jul 2022 11:08:21 +0200 Subject: [PATCH] Add page limit option --- .gitignore | 10 +++++++++- README.md | 3 ++- bing_image_downloader/bing.py | 13 +++++++++++-- bing_image_downloader/downloader.py | 10 ++++------ 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index ec358d5..034b77c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,12 @@ dataset/* dataset dist image_search.egg-info -**/.vscode/* \ No newline at end of file +**/.vscode/* + +### macOS ### +.DS_Store +.AppleDouble +.LSOverride + +### Jetbrains ### +.idea/** \ No newline at end of file diff --git a/README.md b/README.md index 7d251db..1eaef14 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,13 @@ pip install . ### Usage
```python from bing_image_downloader import downloader -downloader.download(query_string, limit=100, output_dir='dataset', adult_filter_off=True, force_replace=False, timeout=60, verbose=True) +downloader.download(query_string, limit=100, output_dir='dataset', page_limit=100, adult_filter_off=True, force_replace=False, timeout=60, verbose=True) ``` `query_string` : String to be searched.
`limit` : (optional, default is 100) Number of images to download.
`output_dir` : (optional, default is 'dataset') Name of output dir.
+`page_limit` : (optional, default is 100) Maximum number of result pages to scan for images; the download stops as soon as either `limit` or `page_limit` is reached.
`adult_filter_off` : (optional, default is True) Enable of disable adult filteration.
`force_replace` : (optional, default is False) Delete folder if present and start a fresh download.
`timeout` : (optional, default is 60) timeout for connection in seconds.
diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index 4156678..4578a2a 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -12,7 +12,7 @@ class Bing: - def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose=True): + def __init__(self, query, limit, output_dir, page_limit, adult, timeout, filter='', verbose=True): self.download_count = 0 self.query = query self.output_dir = output_dir @@ -23,6 +23,8 @@ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose assert type(limit) == int, "limit must be integer" self.limit = limit + assert type(page_limit) == int, "page_limit must be integer" + self.page_limit = page_limit assert type(timeout) == int, "timeout must be integer" self.timeout = timeout @@ -88,7 +90,7 @@ def download_image(self, link): def run(self): - while self.download_count < self.limit: + while self.download_count < self.limit and self.page_counter < self.page_limit: if self.verbose: print('\n\n[!!]Indexing page: {}\n'.format(self.page_counter + 1)) # Parse the page source and download pics @@ -112,4 +114,11 @@ def run(self): self.download_image(link) self.page_counter += 1 + + if self.download_count >= self.limit: + print("[!!]Reached download limit") + + if self.page_counter >= self.page_limit: + print("[!!]Reached page limit") + print("\n\n[%] Done. 
Downloaded {} images.".format(self.download_count)) diff --git a/bing_image_downloader/downloader.py b/bing_image_downloader/downloader.py index 41789dd..5610971 100644 --- a/bing_image_downloader/downloader.py +++ b/bing_image_downloader/downloader.py @@ -8,16 +8,14 @@ from .bing import Bing -def download(query, limit=100, output_dir='dataset', adult_filter_off=True, -force_replace=False, timeout=60, filter="", verbose=True): - +def download(query, limit=100, output_dir='dataset', page_limit=100, adult_filter_off=True, + force_replace=False, timeout=60, filter="", verbose=True): # engine = 'bing' if adult_filter_off: adult = 'off' else: adult = 'on' - image_dir = Path(output_dir).joinpath(query).absolute() if force_replace: @@ -32,9 +30,9 @@ def download(query, limit=100, output_dir='dataset', adult_filter_off=True, except Exception as e: print('[Error]Failed to create directory.', e) sys.exit(1) - + print("[%] Downloading Images to {}".format(str(image_dir.absolute()))) - bing = Bing(query, limit, image_dir, adult, timeout, filter, verbose) + bing = Bing(query, limit, image_dir, page_limit, adult, timeout, filter, verbose) bing.run()