diff --git a/README.md b/README.md
index 7d251db..ffd4bf2 100644
--- a/README.md
+++ b/README.md
@@ -11,14 +11,14 @@ This package uses async url, which makes it very fast while downloading.
### Disclaimer
This program lets you download tons of images from Bing.
-Please do not download or use any image that violates its copyright terms.
+Please do not download or use any image that violates its copyright terms.
### Installation
```sh
pip install bing-image-downloader
```
-or
+or
```bash
git clone https://github.com/gurugaurav/bing_image_downloader
cd bing_image_downloader
@@ -30,7 +30,7 @@ pip install .
### Usage
```python
from bing_image_downloader import downloader
-downloader.download(query_string, limit=100, output_dir='dataset', adult_filter_off=True, force_replace=False, timeout=60, verbose=True)
+downloader.download(query_string, limit=100, output_dir='dataset', adult_filter_off=True, force_replace=False, timeout=60, verbose=True, error_protection=False)
```
`query_string` : String to be searched.
@@ -41,9 +41,10 @@ downloader.download(query_string, limit=100, output_dir='dataset', adult_filter
`timeout` : (optional, default is 60) timeout for connection in seconds.
`filter` : (optional, default is "") filter, choose from [line, photo, clipart, gif, transparent]
`verbose` : (optional, default is True) Enable downloaded message.
+`error_protection` : (optional, default is False) Enable protections from url errors, like disconnects.
-You can also test the programm by runnning `test.py keyword`
+You can also test the program by runnning `test.py keyword`
### PyPi
@@ -58,7 +59,3 @@ https://pypi.org/project/bing-image-downloader/
You can buy me a coffee if this project was helpful to you.
[](https://www.buymeacoffee.com/gurugaurav)
-
-
-
-
diff --git a/bing_image_downloader/bing.py b/bing_image_downloader/bing.py
index 4156678..59c5569 100644
--- a/bing_image_downloader/bing.py
+++ b/bing_image_downloader/bing.py
@@ -4,6 +4,7 @@
import imghdr
import posixpath
import re
+from time import sleep
'''
Python api to download image form Bing.
@@ -12,7 +13,7 @@
class Bing:
- def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose=True):
+ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose=True, error_protection=False):
self.download_count = 0
self.query = query
self.output_dir = output_dir
@@ -20,6 +21,7 @@ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose
self.filter = filter
self.verbose = verbose
self.seen = set()
+ self.error_protection = error_protection
assert type(limit) == int, "limit must be integer"
self.limit = limit
@@ -28,7 +30,7 @@ def __init__(self, query, limit, output_dir, adult, timeout, filter='', verbose
# self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
self.page_counter = 0
- self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
+ self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
'AppleWebKit/537.11 (KHTML, like Gecko) '
'Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -62,7 +64,7 @@ def save_image(self, link, file_path):
with open(str(file_path), 'wb') as f:
f.write(image)
-
+
def download_image(self, link):
self.download_count += 1
# Get the image link
@@ -72,21 +74,50 @@ def download_image(self, link):
file_type = filename.split(".")[-1]
if file_type.lower() not in ["jpe", "jpeg", "jfif", "exif", "tiff", "gif", "bmp", "png", "webp", "jpg"]:
file_type = "jpg"
-
+ downloaded = False
if self.verbose:
# Download the image
print("[%] Downloading Image #{} from {}".format(self.download_count, link))
-
- self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format(
- str(self.download_count), file_type)))
- if self.verbose:
+ delay = 1
+ while self.error_protection:
+ try:
+ self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format(
+ str(self.download_count), file_type)))
+ downloaded = True
+ break
+ except urllib.error.URLError:
+ if self.verbose:
+ print("[%] URLError, sleeping for " + str(delay))
+
+ # sleeping for 1 second at a time makes it easier to escape out
+ for i in range(delay):
+ sleep(1)
+ delay *= 2
+ if self.doub_sum(delay) > self.timeout : break
+
+ else:
+ self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format(
+ str(self.download_count), file_type)))
+ downloaded = True
+ if self.verbose and downloaded:
print("[%] File Downloaded !\n")
+ elif self.verbose:
+ print("[%] Timeout exceeded : Persistent Connection Error, File not Downloaded !\n")
+
except Exception as e:
self.download_count -= 1
print("[!] Issue getting: {}\n[!] Error:: {}".format(link, e))
-
+ # for calculating the error_protection delay
+ def doub_sum(self, val):
+ sum = 0
+ val = int(val)
+ while val > 0:
+ sum += val
+ val //= 2
+ return sum
+
def run(self):
while self.download_count < self.limit:
if self.verbose:
@@ -96,8 +127,28 @@ def run(self):
+ '&first=' + str(self.page_counter) + '&count=' + str(self.limit) \
+ '&adlt=' + self.adult + '&qft=' + ('' if self.filter is None else self.get_filter(self.filter))
request = urllib.request.Request(request_url, None, headers=self.headers)
- response = urllib.request.urlopen(request)
- html = response.read().decode('utf8')
+
+ delay = 1
+ while self.error_protection:
+ try:
+ response = urllib.request.urlopen(request)
+ break
+ except urllib.error.URLError:
+ if self.verbose:
+ print("[%] URLError on page, sleeping for " + str(delay))
+
+ # sleeping for 1 second at a time makes it easier to escape out
+ for i in range(delay):
+ sleep(1.0)
+ delay *= 2
+ if self.verbose:
+ print('\n\n[!!]Retrying page: {}\n'.format(self.page_counter + 1))
+ if self.doub_sum(delay) > max(self.timeout * 4, 30): # pages are very important, so extend the timeout for those
+ break
+ else:
+ response = urllib.request.urlopen(request)
+ if response is not None:
+ html = response.read().decode('utf8')
if html == "":
print("[%] No more images are available")
break
diff --git a/bing_image_downloader/downloader.py b/bing_image_downloader/downloader.py
index 41789dd..eaf9961 100644
--- a/bing_image_downloader/downloader.py
+++ b/bing_image_downloader/downloader.py
@@ -8,8 +8,8 @@
from .bing import Bing
-def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
-force_replace=False, timeout=60, filter="", verbose=True):
+def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
+force_replace=False, timeout=60, filter="", verbose=True, error_protection=False):
# engine = 'bing'
if adult_filter_off:
@@ -17,7 +17,7 @@ def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
else:
adult = 'on'
-
+
image_dir = Path(output_dir).joinpath(query).absolute()
if force_replace:
@@ -32,9 +32,9 @@ def download(query, limit=100, output_dir='dataset', adult_filter_off=True,
except Exception as e:
print('[Error]Failed to create directory.', e)
sys.exit(1)
-
+
print("[%] Downloading Images to {}".format(str(image_dir.absolute())))
- bing = Bing(query, limit, image_dir, adult, timeout, filter, verbose)
+ bing = Bing(query, limit, image_dir, adult, timeout, filter, verbose, error_protection)
bing.run()
diff --git a/test.py b/test.py
index e7badfc..14a03a2 100644
--- a/test.py
+++ b/test.py
@@ -7,16 +7,15 @@
filter=sys.argv[2]
else:
filter=""
-
-
+
downloader.download(
query,
limit=10,
output_dir="dataset",
adult_filter_off=True,
force_replace=False,
- timeout=60,
+ timeout=4,
filter=filter,
verbose=True,
+ error_protection=True
)
-