Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implementing browserless #252

Open
wants to merge 1 commit into
base: browserless
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
*.diff
.*.sw*
/brozzler.egg-info/
venv
17 changes: 14 additions & 3 deletions brozzler/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,23 @@ class Browser:
'''
logger = logging.getLogger(__module__ + '.' + __qualname__)

def __init__(self, **kwargs):
def __init__(self, chrome_exe, browserless_port, **kwargs):
'''
Initializes the Browser.

Args:
**kwargs: arguments for Chrome(...)
'''
self.chrome = Chrome(**kwargs)
self.websock_url = None
self.websock = None
self.websock_thread = None
self.is_browsing = False
self._command_id = Counter()
self._wait_interval = 0.5
self.browse_port = browserless_port
self.is_browserless = chrome_exe == 'browserless'
self.chrome = Chrome(chrome_exe=chrome_exe, browserless_port=browserless_port,
is_browserless=self.is_browserless, **kwargs)
self._command_id = Counter()

def __enter__(self):
self.start()
Expand Down Expand Up @@ -343,6 +346,14 @@ def start(self, **kwargs):
**kwargs: arguments for self.chrome.start(...)
'''
if not self.is_running():

if self.is_browserless:
# Open a ws to create a browser on demand
args = self.chrome._browserless_args()
self.browserless_ws = websocket.create_connection(
"ws://localhost:" + str(self.browse_port) + "?" + args
)

self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url)
self.websock_thread = WebsockReceiverThread(
Expand Down
127 changes: 83 additions & 44 deletions brozzler/chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import json
import tempfile
import sys
import functools

def check_version(chrome_exe):
'''
Expand Down Expand Up @@ -62,7 +63,8 @@ def check_version(chrome_exe):
class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__)

def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
def __init__(self, chrome_exe, browserless_port, is_browserless,
port=9222, ignore_cert_errors=False):
'''
Initializes instance of this class.

Expand All @@ -74,17 +76,27 @@ def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
ignore_cert_errors: configure chrome to accept all certs (default
False)
'''
if chrome_exe == 'browserless':
# init browserless here maybe
pass

self.is_browserless = is_browserless
self.browserless_port = browserless_port

if self.is_browserless:
# browserless isn't attached to a PID
self.chrome_exe = None
self.port = None

else:
# use a local browser
self.port = port
self.chrome_exe = chrome_exe
self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
self.chrome_process = None


self.ignore_cert_errors = ignore_cert_errors
self._shutdown = threading.Event()
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
self.chrome_process = None

def __enter__(self):
'''
Returns websocket url to chrome window with about:blank loaded.
Expand Down Expand Up @@ -139,8 +151,41 @@ def persist_and_read_cookie_db(self):
cookie_location, exc_info=True)
return cookie_db

def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
disk_cache_size=None, websocket_timeout=60):
def _chrome_args(self, disk_cache_dir=None, disk_cache_size=None,
proxy=None):
chrome_args = [
self.chrome_exe,
'--remote-debugging-port=%s' % self.port or self.browserless_port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking', '--disable-breakpad',
'--disable-renderer-backgrounding', '--disable-hang-monitor',
'--disable-background-timer-throttling', '--mute-audio',
'--disable-web-sockets',
'--window-size=1100,900', '--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
'--homepage=about:blank', '--disable-direct-npapi-requests',
'--disable-web-security', '--disable-notifications',
'--disable-extensions', '--disable-save-password-bubble',
'--disable-sync']

extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
if extra_chrome_args:
chrome_args.extend(extra_chrome_args.split())
if disk_cache_dir:
chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
if disk_cache_size:
chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors')
if proxy:
chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank')

return chrome_args

def start(self, proxy=None, cookie_db=None,
disk_cache_dir=None, disk_cache_size=None, websocket_timeout=60):
'''
Starts chrome/chromium process.

Expand All @@ -158,44 +203,20 @@ def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,
Returns:
websocket url to chrome window with about:blank loaded
'''

# these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data')
if cookie_db:
self._init_cookie_db(cookie_db)
self._shutdown.clear()

new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
chrome_args = [
self.chrome_exe,
'--remote-debugging-port=%s' % self.port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking', '--disable-breakpad',
'--disable-renderer-backgrounding', '--disable-hang-monitor',
'--disable-background-timer-throttling', '--mute-audio',
'--disable-web-sockets',
'--window-size=1100,900', '--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
'--homepage=about:blank', '--disable-direct-npapi-requests',
'--disable-web-security', '--disable-notifications',
'--disable-extensions', '--disable-save-password-bubble',
'--disable-sync']
chrome_args = self._chrome_args(disk_cache_dir=disk_cache_dir, disk_cache_size=disk_cache_size,
proxy=proxy)

if self.is_browserless:
return self.start_browserless()

extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS')
if extra_chrome_args:
chrome_args.extend(extra_chrome_args.split())
if disk_cache_dir:
chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir)
if disk_cache_size:
chrome_args.append('--disk-cache-size=%s' % disk_cache_size)
if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors')
if proxy:
chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank')
self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen(
Expand All @@ -209,7 +230,25 @@ def start(self, proxy=None, cookie_db=None, disk_cache_dir=None,

return self._websocket_url(timeout_sec=websocket_timeout)

def _websocket_url(self, timeout_sec = 60):
def _browserless_args(self):
chrome_args = self._chrome_args()
chrome_args.pop(0)
chrome_args.pop(0)
return functools.reduce(lambda a, b: a + "&" + b, chrome_args)

def start_browserless(self):
json_url = "http://localhost:" + str(self.browserless_port) + "/sessions"

brwlss_json_raw = urllib.request.urlopen(json_url, timeout=30).read()
brwlss_json = json.loads(brwlss_json_raw)
wsURL = brwlss_json[0]['webSocketDebuggerUrl']

self.logger.info('got chrome websocket debug url %s from Browserless at %s', wsURL, json_url)
self.port = brwlss_json[0]['port']

return wsURL

def _websocket_url(self, timeout_sec=60):
json_url = 'http://localhost:%s/json' % self.port
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
Expand Down Expand Up @@ -261,7 +300,7 @@ def readline_nonblock(f):
buf = b''
try:
while not self._shutdown.is_set() and (
len(buf) == 0 or buf[-1] != 0xa) and select.select(
len(buf) == 0 or buf[-1] != 0xa) and select.select(
[f],[],[],0.5)[0]:
buf += f.read(1)
except (ValueError, OSError):
Expand All @@ -282,8 +321,8 @@ def readline_nonblock(f):
buf = readline_nonblock(self.chrome_process.stderr)
if buf:
self.logger.trace(
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
'chrome pid %s STDERR %s',
self.chrome_process.pid, buf)
except:
self.logger.error('unexpected exception', exc_info=True)

Expand Down
6 changes: 5 additions & 1 deletion brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ def brozzle_page(argv=None):
'--skip-browserless', dest='skip_browserless', action='store_true')
arg_parser.add_argument(
'--simpler404', dest='simpler404', action='store_true')
arg_parser.add_argument(
'--browserless-port', dest='browserless_port', default='3000',
help='port on which the browserless instance is')
add_common_options(arg_parser, argv)

args = arg_parser.parse_args(args=argv[1:])
Expand Down Expand Up @@ -210,7 +213,8 @@ def on_screenshot(screenshot_jpeg):
f.write(screenshot_jpeg)
logging.info('wrote screenshot to %s', filename)

browser = brozzler.Browser(chrome_exe=args.chrome_exe)
browser = brozzler.Browser(chrome_exe=args.chrome_exe,
browserless_port=args.browserless_port)
try:
browser.start(proxy=args.proxy)
outlinks = worker.brozzle_page(
Expand Down