Skip to content

feat(datasets): add synthetic data generation pipeline #425

GitHub Actions / JUnit Test Report failed Nov 19, 2024 in 0s

171 tests run, 158 passed, 9 skipped, 4 failed.

Annotations

Check failure on line 34 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_document_processor_processes_text_document_with_unstructured_provider[config0]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
config = {}

    @pytest.mark.parametrize(
        "config",
        [
            {},
            pytest.param(
                {DocumentType.TXT: UnstructuredDefaultProvider(use_api=True)},
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_document_processor_processes_text_document_with_unstructured_provider(config: ProvidersConfig):
        document_processor = DocumentProcessorRouter.from_config(config)
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
    
>       elements = await document_processor.get_provider(document_meta).process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:34: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7f31a95f80a0>
req = <urllib.request.Request object at 0x7f31a95f82e0>
fp = <http.client.HTTPResponse object at 0x7f31a7944e80>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7f31a79453c0>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/usr/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 90 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_unstructured_provider_document_with_default_partition_kwargs[False]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False

    @pytest.mark.parametrize(
        "use_api",
        [
            False,
            pytest.param(
                True,
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_unstructured_provider_document_with_default_partition_kwargs(use_api: bool):
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        unstructured_provider = UnstructuredDefaultProvider(use_api=use_api)
>       elements = await unstructured_provider.process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:90: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7f31a95f80a0>
req = <urllib.request.Request object at 0x7f31c1b76890>
fp = <http.client.HTTPResponse object at 0x7f31c1c48ca0>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7f31c1c48c70>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/usr/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 114 in packages/ragbits-document-search/tests/integration/test_unstructured.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_unstructured.test_unstructured_provider_document_with_custom_partition_kwargs[False]

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
use_api = False

    @pytest.mark.parametrize(
        "use_api",
        [
            False,
            pytest.param(
                True,
                marks=pytest.mark.skipif(
                    env_vars_not_set([UNSTRUCTURED_SERVER_URL_ENV, UNSTRUCTURED_API_KEY_ENV]),
                    reason="Unstructured API environment variables not set",
                ),
            ),
        ],
    )
    async def test_unstructured_provider_document_with_custom_partition_kwargs(use_api: bool):
        document_meta = DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George.")
        partition_kwargs = {"languages": ["pl"], "strategy": "fast"}
        unstructured_provider = UnstructuredDefaultProvider(use_api=use_api, partition_kwargs=partition_kwargs)
>       elements = await unstructured_provider.process(document_meta)

packages/ragbits-document-search/tests/integration/test_unstructured.py:114: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7f31a95f80a0>
req = <urllib.request.Request object at 0x7f31a7bd3b20>
fp = <http.client.HTTPResponse object at 0x7f31a7bd1900>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7f31a7bd30a0>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/usr/lib/python3.10/urllib/request.py:643: HTTPError

Check failure on line 198 in packages/ragbits-document-search/tests/unit/test_document_search.py

See this annotation in the file changed.

@github-actions github-actions / JUnit Test Report

test_document_search.test_document_search_with_batched

urllib.error.HTTPError: HTTP Error 403: Forbidden
Raw output
async def test_document_search_with_batched():
        documents = [
            DocumentMeta.create_text_document_from_literal("Name of Peppa's brother is George"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's father is Daddy Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's mother is Mummy Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Suzy Sheep"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Danny Dog"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Pedro Pony"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Emily Elephant"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's friend is Candy Cat"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's teacher is Madame Gazelle"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's doctor is Dr. Brown Bear"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Chloe Pig"),
            DocumentMeta.create_text_document_from_literal("Name of Peppa's cousin is Alexander Pig"),
        ]
    
        embeddings_mock = AsyncMock()
        embeddings_mock.embed_text.return_value = [[0.1, 0.1]] * len(documents)
    
        processing_strategy = BatchedAsyncProcessing(batch_size=5)
        vectore_store = InMemoryVectorStore()
    
        document_search = DocumentSearch(
            embedder=embeddings_mock,
            vector_store=vectore_store,
            processing_strategy=processing_strategy,
        )
    
>       await document_search.ingest(documents)

packages/ragbits-document-search/tests/unit/test_document_search.py:198: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/_main.py:141: in ingest
    elements = await self.processing_strategy.process_documents(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:59: in process_documents
    responses = await asyncio.gather(
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py:35: in _process_with_semaphore
    return await self.process_document(document, processor_router, processor_overwrite)
packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py:72: in process_document
    return await processor.process(document_meta)
packages/ragbits-core/src/ragbits/core/audit/__init__.py:106: in wrapper_async
    returned = await func(*args, **kwargs)  # type: ignore
packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py:139: in process
    elements = partition(
.venv/lib/python3.10/site-packages/unstructured/partition/auto.py:416: in partition
    elements = partition_text(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:102: in partition_text
    return _partition_text(
.venv/lib/python3.10/site-packages/unstructured/documents/elements.py:605: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:731: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:687: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/chunking/dispatch.py:74: in wrapper
    elements = func(*args, **kwargs)
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:181: in _partition_text
    file_content = _split_by_paragraph(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:361: in _split_by_paragraph
    _split_content_to_fit_max(
.venv/lib/python3.10/site-packages/unstructured/partition/text.py:393: in _split_content_to_fit_max
    sentences = sent_tokenize(content)
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:134: in sent_tokenize
    _download_nltk_packages_if_not_present()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:128: in _download_nltk_packages_if_not_present
    download_nltk_packages()
.venv/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:86: in download_nltk_packages
    urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
/usr/lib/python3.10/urllib/request.py:241: in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
/usr/lib/python3.10/urllib/request.py:216: in urlopen
    return opener.open(url, data, timeout)
/usr/lib/python3.10/urllib/request.py:525: in open
    response = meth(req, response)
/usr/lib/python3.10/urllib/request.py:634: in http_response
    response = self.parent.error(
/usr/lib/python3.10/urllib/request.py:563: in error
    return self._call_chain(*args)
/usr/lib/python3.10/urllib/request.py:496: in _call_chain
    result = func(*args)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <urllib.request.HTTPDefaultErrorHandler object at 0x7f31a95f80a0>
req = <urllib.request.Request object at 0x7f31a78a1120>
fp = <http.client.HTTPResponse object at 0x7f31a78a0400>, code = 403
msg = 'Forbidden', hdrs = <http.client.HTTPMessage object at 0x7f31a78a1f90>

    def http_error_default(self, req, fp, code, msg, hdrs):
>       raise HTTPError(req.full_url, code, msg, hdrs, fp)
E       urllib.error.HTTPError: HTTP Error 403: Forbidden

/usr/lib/python3.10/urllib/request.py:643: HTTPError