From cfbf4f4eb917a24f080b4b0b5138769aea9358ea Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 17:55:23 -0400 Subject: [PATCH 1/7] Added error protection functionality --- bing_image_downloader/bing.py | 71 +++++++++++++++++++++++++---- bing_image_downloader/downloader.py | 10 ++-- test.py | 7 ++- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index 4156678..16908bb 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -4,6 +4,7 @@ import imghdr import posixpath import re +from time import sleep ''' Python api to download image form Bing. @@ -12,7 +13,7 @@ class Bing: - def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose=True): + def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose=True, error_protection=False): self.download_count = 0 self.query = query self.output_dir = output_dir @@ -20,6 +21,7 @@ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose self.filter = filter self.verbose = verbose self.seen = set() + self.error_protection = error_protection assert type(limit) == int, "limit must be integer" self.limit = limit @@ -28,7 +30,7 @@ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose # self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'} self.page_counter = 0 - self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' + self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' 'AppleWebKit/537.11 (KHTML, like Gecko) ' 'Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', @@ -62,7 +64,7 @@ def save_image(self, link, file_path): with open(str(file_path), 'wb') as f: f.write(image) - + def download_image(self, link): self.download_count += 1 # Get the image link @@ -72,21 +74,50 @@ def 
download_image(self, link): file_type = filename.split(".")[-1] if file_type.lower() not in ["jpe", "jpeg", "jfif", "exif", "tiff", "gif", "bmp", "png", "webp", "jpg"]: file_type = "jpg" - + downloaded = False if self.verbose: # Download the image print("[%] Downloading Image #{} from {}".format(self.download_count, link)) - - self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format( - str(self.download_count), file_type))) - if self.verbose: + delay = 1 + while self.error_protection: + try: + self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format( + str(self.download_count), file_type))) + downloaded = True + break + except urllib.error.URLError: + if self.verbose: + print("[%] URLError, sleeping for " + str(delay)) + + # sleeping for 1 second at a time makes it easier to escape out + for i in range(delay): + sleep(1) + delay *= 2 + if self.doub_sum(delay) > self.timeout : break + + else: + self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format( + str(self.download_count), file_type))) + downloaded = True + if self.verbose and downloaded: print("[%] File Downloaded !\n") + elif self.verbose: + print("[%] Timeout exceeded : Persistent Connection Error, File not Downloaded !\n") + except Exception as e: self.download_count -= 1 print("[!] Issue getting: {}\n[!] 
Error:: {}".format(link, e)) - + # for calculating the error_protection delay + def doub_sum(self, val): + sum = 0 + val = int(val) + while val > 0: + sum += val + val //= 2 + return sum + def run(self): while self.download_count < self.limit: if self.verbose: @@ -96,7 +127,27 @@ def run(self): + '&first=' + str(self.page_counter) + '&count=' + str(self.limit) \ + '&adlt=' + self.adult + '&qft=' + ('' if self.filter is None else self.get_filter(self.filter)) request = urllib.request.Request(request_url, None, headers=self.headers) - response = urllib.request.urlopen(request) + + delay = 1 + while self.error_protection: + try: + response = urllib.request.urlopen(request) + break + except urllib.error.URLError: + if self.verbose: + print("URLError on page, sleeping for " + str(delay)) + + # sleeping for 1 second at a time makes it easier to escape out + for i in range(delay): + sleep(1.0) + delay *= 2 + if self.verbose: + print('\n\n[!!]Retrying page: {}\n'.format(self.page_counter + 1)) + if self.doub_sum(delay) > self.timeout : + break + else: + response = urllib.request.urlopen(request) + html = response.read().decode('utf8') if html == "": print("[%] No more images are available") diff --git a/bing_image_downloader/downloader.py b/bing_image_downloader/downloader.py index 41789dd..eaf9961 100644 --- a/bing_image_downloader/downloader.py +++ b/bing_image_downloader/downloader.py @@ -8,8 +8,8 @@ from .bing import Bing -def download(query, limit=100, output_dir='dataset', adult_filter_off=True, -force_replace=False, timeout=60, filter="", verbose=True): +def download(query, limit=100, output_dir='dataset', adult_filter_off=True, +force_replace=False, timeout=60, filter="", verbose=True, error_protection=False): # engine = 'bing' if adult_filter_off: @@ -17,7 +17,7 @@ def download(query, limit=100, output_dir='dataset', adult_filter_off=True, else: adult = 'on' - + image_dir = Path(output_dir).joinpath(query).absolute() if force_replace: @@ -32,9 +32,9 @@ def 
download(query, limit=100, output_dir='dataset', adult_filter_off=True, except Exception as e: print('[Error]Failed to create directory.', e) sys.exit(1) - + print("[%] Downloading Images to {}".format(str(image_dir.absolute()))) - bing = Bing(query, limit, image_dir, adult, timeout, filter, verbose) + bing = Bing(query, limit, image_dir, adult, timeout, filter, verbose, error_protection) bing.run() diff --git a/test.py b/test.py index e7badfc..14a03a2 100644 --- a/test.py +++ b/test.py @@ -7,16 +7,15 @@ filter=sys.argv[2] else: filter="" - - + downloader.download( query, limit=10, output_dir="dataset", adult_filter_off=True, force_replace=False, - timeout=60, + timeout=4, filter=filter, verbose=True, + error_protection=True ) - From 41d23ae608c255e164596e595d61bce0e20a1214 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 17:59:30 -0400 Subject: [PATCH 2/7] updated readyme for error protection --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7d251db..1f10945 100644 --- a/README.md +++ b/README.md @@ -11,14 +11,14 @@ This package uses async url, which makes it very fast while downloading.
### Disclaimer
This program lets you download tons of images from Bing. -Please do not download or use any image that violates its copyright terms. +Please do not download or use any image that violates its copyright terms. ### Installation
```sh pip install bing-image-downloader ``` -or +or ```bash git clone https://github.com/gurugaurav/bing_image_downloader cd bing_image_downloader @@ -30,7 +30,7 @@ pip install . ### Usage
```python from bing_image_downloader import downloader -downloader.download(query_string, limit=100, output_dir='dataset', adult_filter_off=True, force_replace=False, timeout=60, verbose=True) +downloader.download(query_string, limit=100, output_dir='dataset', adult_filter_off=True, force_replace=False, timeout=60, verbose=True, error_protection=False) ``` `query_string` : String to be searched.
@@ -41,6 +41,7 @@ downloader.download(query_string, limit=100, output_dir='dataset', adult_filter `timeout` : (optional, default is 60) timeout for connection in seconds.
`filter` : (optional, default is "") filter, choose from [line, photo, clipart, gif, transparent]
`verbose` : (optional, default is True) Enable downloaded message.
+`error_protection` : (optional, default is False) Enable protections from url errors, like disconnects.
You can also test the programm by runnning `test.py keyword` @@ -58,7 +59,3 @@ https://pypi.org/project/bing-image-downloader/ You can buy me a coffee if this project was helpful to you.
[Show your support](https://www.buymeacoffee.com/gurugaurav) - - - - From c953b5009bdfef8e4f1c6fca4fc996a3b93c7954 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 17:59:49 -0400 Subject: [PATCH 3/7] spelling correction --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f10945..ffd4bf2 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ downloader.download(query_string, limit=100, output_dir='dataset', adult_filter `error_protection` : (optional, default is False) Enable protections from url errors, like disconnects.
-You can also test the programm by runnning `test.py keyword` +You can also test the program by running `test.py keyword` ### PyPi
From f1c8f9dcc2af767bb176324e6e2f65f71227f9b5 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 18:01:07 -0400 Subject: [PATCH 4/7] URLError print formating --- bing_image_downloader/bing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index 16908bb..a886e9c 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -135,7 +135,7 @@ def run(self): break except urllib.error.URLError: if self.verbose: - print("URLError on page, sleeping for " + str(delay)) + print("[%] URLError on page, sleeping for " + str(delay)) # sleeping for 1 second at a time makes it easier to escape out for i in range(delay): From 4d9e1031efcf2e3afab4ab89b1f399a4d1f9e4d1 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 19:20:57 -0400 Subject: [PATCH 5/7] added additional error checking --- bing_image_downloader/bing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index a886e9c..a144d1f 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -147,8 +147,8 @@ def run(self): break else: response = urllib.request.urlopen(request) - - html = response.read().decode('utf8') + if response: + html = response.read().decode('utf8') if html == "": print("[%] No more images are available") break From 8f7a5ad2d003604be6b60131f8281767434e8562 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 19:23:52 -0400 Subject: [PATCH 6/7] fixed error --- bing_image_downloader/bing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index a144d1f..a8aaa71 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -147,7 +147,7 @@ def run(self): break else: response = urllib.request.urlopen(request) - if response: + if response is not None: html = 
response.read().decode('utf8') if html == "": print("[%] No more images are available") From 5b91575fbf579bac2632df9f7d15cb5dbd204575 Mon Sep 17 00:00:00 2001 From: thomasasia Date: Mon, 11 Apr 2022 19:28:00 -0400 Subject: [PATCH 7/7] extended timeout for page queries --- bing_image_downloader/bing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py index a8aaa71..59c5569 100644 --- a/bing_image_downloader/bing.py +++ b/bing_image_downloader/bing.py @@ -143,7 +143,7 @@ def run(self): delay *= 2 if self.verbose: print('\n\n[!!]Retrying page: {}\n'.format(self.page_counter + 1)) - if self.doub_sum(delay) > self.timeout : + if self.doub_sum(delay) > max(self.timeout * 4, 30): # pages are very important, so extend the timeout for those break else: response = urllib.request.urlopen(request)