Skip to content

Commit

Permalink
chore(python): add ijson streaming jsonl
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 4, 2024
1 parent 990324f commit 5568728
Show file tree
Hide file tree
Showing 12 changed files with 722 additions and 199 deletions.
481 changes: 370 additions & 111 deletions cli/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider-cloud-cli"
version = "0.1.5"
version = "0.1.22"
edition = "2021"
authors = [ "j-mendez <[email protected]>"]
description = "The Spider Cloud CLI for web crawling and scraping"
Expand Down
4 changes: 2 additions & 2 deletions javascript/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion javascript/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@spider-cloud/spider-client",
"version": "0.0.72",
"version": "0.1.22",
"description": "Isomorphic Javascript SDK for Spider Cloud services",
"scripts": {
"test": "node --import tsx --test __tests__/*test.ts",
Expand Down
23 changes: 23 additions & 0 deletions python/example_streaming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

from spider import Spider

# Initialize the Spider client; the API key is read from the SPIDER_API_KEY environment variable.
app = Spider()

crawler_params = {
'limit': 1000,
'proxy_enabled': False,
'store_data': False,
'metadata': False,
'request': 'http'
}

# Running total of items handled so far. Kept in a one-element list so the
# callback can update it in place without needing a `global` statement.
count = [0]


def process_json(data: dict) -> None:
    """Print the running item index, then every key/value pair in *data*.

    Intended to be used as the per-item callback of a streamed crawl.

    Args:
        data: One decoded JSON object from the stream.
    """
    print(f"Processing: {count[0]}")
    count[0] += 1
    for key, value in data.items():
        print(f"{key}: {value}")


app.crawl_url('https://spider.cloud', params=crawler_params, stream=True, callback=process_json)
3 changes: 2 additions & 1 deletion python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ pytest
pytest-asyncio
python-dotenv
aiohttp
python-dotenv
python-dotenv
ijson
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def read_file(fname):

setup(
name="spider-client",
version="0.0.72",
version="0.1.22",
url="https://github.com/spider-rs/spider-clients/tree/main/python",
author="Spider",
author_email="[email protected]",
Expand Down
2 changes: 1 addition & 1 deletion python/spider/async_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def _prepare_headers(
return {
"Content-Type": content_type,
"Authorization": f"Bearer {self.api_key}",
"User-Agent": "AsyncSpider-Client/0.0.72",
"User-Agent": "AsyncSpider-Client/0.1.22",
}

async def _handle_error(self, response: ClientResponse, action: str) -> None:
Expand Down
16 changes: 9 additions & 7 deletions python/spider/spider.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os, requests, json, logging
import os, requests, logging, ijson
from typing import Optional, Dict
from spider.spider_types import RequestParamsDict, JsonCallback, QueryRequest


class Spider:
def __init__(self, api_key: Optional[str] = None):
"""
Expand Down Expand Up @@ -402,18 +403,19 @@ def data_delete(

def stream_reader(self, response, callback):
    """Incrementally parse a streamed response and pass each JSON value to *callback*.

    The body is read from ``response.raw`` and parsed with ``ijson`` using
    ``multiple_values=True``, so a stream of several top-level JSON documents
    is handled one document at a time.

    Args:
        response: A ``requests`` response opened with ``stream=True``.
        callback: Callable invoked once with each parsed top-level JSON value.

    Raises:
        requests.HTTPError: Propagated from ``response.raise_for_status()``
            when the response status indicates an error.

    Note:
        Any exception raised while reading, parsing, or inside *callback* is
        logged and swallowed, which ends the stream early.
    """
    response.raise_for_status()

    # requests leaves ``response.raw`` undecoded; ask urllib3 to transparently
    # undo gzip/deflate so the parser never sees compressed bytes.
    response.raw.decode_content = True

    try:
        for json_obj in ijson.items(response.raw, "", multiple_values=True):
            callback(json_obj)
    except Exception as e:
        logging.error("An error occurred while parsing JSON: %s", e)


def _prepare_headers(self, content_type: str = "application/json"):
    """Return the request headers: content type, bearer auth, and user agent.

    Args:
        content_type: Value for the ``Content-Type`` header.

    Returns:
        A dict with ``Content-Type``, an ``Authorization`` bearer token built
        from ``self.api_key``, and the client ``User-Agent`` string.
    """
    return {
        "Content-Type": content_type,
        "Authorization": f"Bearer {self.api_key}",
        # Plain string: there is nothing to interpolate in the version tag.
        "User-Agent": "Spider-Client/0.1.22",
    }


def _post_request(self, url: str, data, headers, stream=False):
Expand Down
6 changes: 5 additions & 1 deletion python/tests/test_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,17 @@ def test_create_signed_url(mock_get, spider):
def test_stream_reader():
    """stream_reader should call the callback once per JSON document in the body."""
    import io

    spider = Spider(api_key="test_api_key")
    mock_response = MagicMock()
    raw_data = b'{"key": "value"}\n{"key2": "value2"}\n'
    # Use a real in-memory stream: it returns b"" at EOF, whereas a mocked
    # read() with a fixed return value would hand back the payload forever.
    mock_response.raw = io.BytesIO(raw_data)

    callback_data = []

    def callback(json_obj):
        callback_data.append(json_obj)

    spider.stream_reader(mock_response, callback)

    # Assertions to verify the callback was called with the correct data
    assert len(callback_data) == 2
    assert callback_data[0] == {"key": "value"}
    assert callback_data[1] == {"key2": "value2"}
Expand Down
Loading

0 comments on commit 5568728

Please sign in to comment.