diff --git a/.github/workflows/integration_test.yml b/.github/workflows/integration_test.yml index 9bc76cfa..42599bc8 100644 --- a/.github/workflows/integration_test.yml +++ b/.github/workflows/integration_test.yml @@ -20,7 +20,9 @@ concurrency: jobs: test: + environment: integration_test strategy: + max-parallel: 1 matrix: python-version: ["3.10", "3.11", "3.12"] os: [ubuntu-latest, macos-latest, windows-latest] @@ -50,6 +52,14 @@ jobs: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + - name: Load test cache + uses: actions/cache@v4 + with: + path: tests/itest_cache + key: tests-itest-cache + restore-keys: | + tests-itest-cache + - name: Install extra dependencies run: poetry run pip install -r requirements-extra.txt @@ -58,6 +68,6 @@ jobs: - name: Run integration tests env: - OPENAI_API_KEY: fake_key_1337 + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} TIKTOKEN_CACHE_DIR: tests/itest_cache/tiktoken_cache - run: poetry run python tests/run_integration_tests.py + run: poetry run python tests/run_integration_tests.py --os ${{ runner.os }} diff --git a/.github/workflows/integration_test_minimal.yml b/.github/workflows/integration_test_minimal.yml index a301e201..7d5e1962 100644 --- a/.github/workflows/integration_test_minimal.yml +++ b/.github/workflows/integration_test_minimal.yml @@ -20,7 +20,9 @@ concurrency: jobs: test: + environment: integration_test strategy: + max-parallel: 1 matrix: python-version: ["3.10", "3.11", "3.12"] os: [ubuntu-latest, macos-latest, windows-latest] @@ -50,6 +52,14 @@ jobs: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + - name: Load test cache + uses: actions/cache@v4 + with: + path: tests/itest_cache + key: tests-itest-cache + restore-keys: | + tests-itest-cache + - name: Install extra dependencies run: poetry run pip install -r requirements-extra.txt @@ -58,6 +68,6 @@ jobs: - name: Run 
integration tests env: - OPENAI_API_KEY: fake_key_1337 + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} TIKTOKEN_CACHE_DIR: tests/itest_cache/tiktoken_cache - run: poetry run python tests/run_integration_tests.py --minimal-only + run: poetry run python tests/run_integration_tests.py --minimal-only --os ${{ runner.os }} diff --git a/examples/Advanced output handling.ipynb b/examples/Advanced output handling.ipynb index 92751bb3..1ffc6b77 100644 --- a/examples/Advanced output handling.ipynb +++ b/examples/Advanced output handling.ipynb @@ -17,9 +17,25 @@ "This should give a good understanding of output handler semantics and prepare you to build your own ones." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install lunary package if you need logging." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install lunary==1.1.5" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -860,8 +876,6 @@ "\n", "Let's proceed to return the final result.\n", "\n", - "\n", - "\n", "\u001b[0mCritic response: OK\n", "\u001b[32;1m\u001b[1;3m('def bubble_sort(array):\\n \"\"\"\\n Sorts an array using the bubble sort algorithm.\\n\\n The bubble sort algorithm works by repeatedly stepping through the list to be sorted,\\n comparing each pair of adjacent items and swapping them if they are in the wrong order.\\n This process is repeated until the list is sorted. 
The algorithm gets its name because\\n smaller elements \\'bubble\\' to the top of the list.\\n\\n Parameters:\\n array (list): The list of elements to be sorted.\\n\\n Returns:\\n list: The sorted list.\\n\\n Time Complexity:\\n - Worst and Average Case: O(n^2)\\n - Best Case: O(n) when the list is already sorted\\n\\n Space Complexity:\\n - O(1) as it is an in-place sorting algorithm\\n \"\"\"\\n length = len(array)\\n for i in range(length):\\n swapped = False\\n for j in range(0, length - i - 1):\\n if array[j] > array[j + 1]:\\n # Swap if the element found is greater than the next element\\n array[j], array[j + 1] = array[j + 1], array[j]\\n swapped = True\\n # If no elements were swapped, the array is already sorted\\n if not swapped:\\n break\\n return array\\n\\n# Test the bubble sort function with various cases\\ntest_cases = [\\n [64, 34, 25, 12, 22, 11, 90],\\n [],\\n [1],\\n [2, 1],\\n [3, 3, 3],\\n [5, 1, 4, 2, 8],\\n [-1, -3, -2, -5, -4],\\n [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],\\n [7, 7, 7, 7, 7]\\n]\\n\\nfor i, sample_list in enumerate(test_cases):\\n sorted_list = bubble_sort(sample_list.copy())\\n print(f\"Test case {i + 1}: Original: {sample_list}, Sorted: {sorted_list}\")', \"The bubble sort algorithm works by repeatedly stepping through the list to be sorted, comparing each pair of adjacent items and swapping them if they are in the wrong order. This process is repeated until the list is sorted. The algorithm gets its name because smaller elements 'bubble' to the top of the list. The outer loop runs n times, and the inner loop runs n-i-1 times, where n is the length of the list and i is the current iteration of the outer loop. An optimization is added with a flag to detect if any swapping happened in the inner loop. If no swapping happened, the list is already sorted, and the loop can be terminated early. The time complexity of bubble sort is O(n^2) in the worst and average cases, and O(n) in the best case when the list is already sorted. 
The space complexity is O(1) as it is an in-place sorting algorithm.\")\u001b[0m\n", "\n", @@ -1183,9 +1197,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/examples/Blog with Images.ipynb b/examples/Blog with Images.ipynb index dd7aa40a..7a8eac29 100644 --- a/examples/Blog with Images.ipynb +++ b/examples/Blog with Images.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "16b8ccc4-b34b-47c3-94ee-cbd82ef4c8f2", + "id": "bf3534fb-ed51-4a82-ad66-71259b2e282f", "metadata": {}, "source": [ "We use this example of creating a web search-based blog with illustrations to show how to \n", @@ -20,9 +20,31 @@ "* create tasks and link them using the >> operator" ] }, + { + "cell_type": "markdown", + "id": "6ab41b51-2064-4623-866b-f4310d062425", + "metadata": {}, + "source": [ + "Let's install external dependencies for the example." 
+ ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "e8d44edb-4841-4f3c-822c-136276606e09", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install duckduckgo-search==5.3.0b4 llama-index==0.11.2\n", + "%pip install crewai==0.51.1 --no-deps\n", + "\n", + "# install lunary if you need logging\n", + "%pip install lunary==1.1.5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "2596164c", "metadata": {}, "outputs": [ @@ -84,7 +106,7 @@ "source": [ "search_tool = MotleyTool.from_supported_tool(\n", " DuckDuckGoSearchRun(),\n", - " retry_config=RetryConfig(max_retries=5) # for retrying rate limit errors\n", + " retry_config=RetryConfig(max_retries=7) # for retrying rate limit errors\n", ")\n", "\n", "researcher = CrewAIMotleyAgent(\n", @@ -518,7 +540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/examples/Math via python code with a single agent.ipynb b/examples/Math via python code with a single agent.ipynb index 57b16e57..7a45e96d 100644 --- a/examples/Math via python code with a single agent.ipynb +++ b/examples/Math via python code with a single agent.ipynb @@ -16,9 +16,30 @@ "The point of this example is to illustrate that for agents that have an AgentExecutor (which most frameworks have apart from Autogen), the approach from [MathChat](https://microsoft.github.io/autogen/blog/2023/06/28/MathChat/) can be done with a single agent, as the second conversation partner there just plays the role of an AgentExecutor." ] }, + { + "cell_type": "markdown", + "id": "92b1d63b-b288-46f5-8eb7-19ee56853e94", + "metadata": {}, + "source": [ + "Let's install external dependencies for the example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5fc0be9-1d35-4902-9a1e-bd19bb119f4a", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install crewai==0.51.1 --no-deps\n", + "\n", + "# install lunary if you need logging\n", + "%pip install lunary==1.1.5" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "51527a55-d470-4509-9930-01466e3f78f3", "metadata": {}, "outputs": [ @@ -772,8 +793,7 @@ ], "source": [ "from IPython.display import display, Math, Latex\n", - "for line in task.output.split(\"\\n\"):\n", - " display(Latex(line))" + "display(Latex(task.output))" ] }, { @@ -803,7 +823,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/examples/Multi-step research agent.ipynb b/examples/Multi-step research agent.ipynb index e94dd246..f7142e86 100644 --- a/examples/Multi-step research agent.ipynb +++ b/examples/Multi-step research agent.ipynb @@ -254,7 +254,7 @@ "Question: Why did Arjuna kill Karna, his half-brother?\n", "Answer: Arjuna killed Karna, his half-brother, during their duel on the battlefield of Kurukshetra. The pivotal moment came when Karna's chariot got stuck in the mud, and he requested Arjuna and Krishna to honorably allow him to free his chariot. Krishna responded by reminding Karna of his past dishonorable actions, such as humiliating Draupadi and killing Abhimanyu, thereby justifying Arjuna's actions against Karna. Additionally, when Karna attempted to use the Brahmastra, he suddenly forgot the mantra, which can be seen as divine intervention. 
Krishna's words and the subsequent events reinforced Arjuna's resolve to kill Karna, aligning with his duty and dharma as a warrior.\n", "To explore the graph:\n", - "docker run -p 8000:8000 -v /var/folders/fv/tyhll76x0fn6l7j_q2nhvyg00000gn/T/tmp4obi9n3p/kuzu_db:/database --rm kuzudb/explorer:latest\n", + "docker run -p 8000:8000 -v /tmp/tmp5gpyhq4x/kuzu_db:/database --rm kuzudb/explorer:0.4.2\n", "And in the kuzu explorer at http://localhost:8000 enter\n", "MATCH (A)-[r]->(B) RETURN *;\n" ] @@ -266,7 +266,7 @@ "print(\"Question: \", final_answer.question)\n", "print(\"Answer: \", final_answer.answer)\n", "print(\"To explore the graph:\")\n", - "print(f\"docker run -p 8000:8000 -v {crew.graph_store.database_path}:/database --rm kuzudb/explorer:latest\")\n", + "print(f\"docker run -p 8000:8000 -v {crew.graph_store.database_path}:/database --rm kuzudb/explorer:0.4.2\")\n", "print(\"And in the kuzu explorer at http://localhost:8000 enter\")\n", "print(\"MATCH (A)-[r]->(B) RETURN *;\")\n", "final_result = \"{}\\n\\n{}\".format(final_answer.question, final_answer.answer)" @@ -303,7 +303,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/examples/Using AutoGen with motleycrew.ipynb b/examples/Using AutoGen with motleycrew.ipynb index a6189ec3..29b97036 100644 --- a/examples/Using AutoGen with motleycrew.ipynb +++ b/examples/Using AutoGen with motleycrew.ipynb @@ -12,9 +12,80 @@ "The two options are described in detail below" ] }, + { + "cell_type": "markdown", + "id": "4360c4e9-db54-4ecd-a2c1-0c651e678578", + "metadata": {}, + "source": [ + "Let's install external dependencies for the example." 
+ ] + }, { "cell_type": "code", "execution_count": 1, + "id": "a800a778-92fc-4a37-835c-37ff07f787a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting duckduckgo-search==5.3.0b4\n", + " Using cached duckduckgo_search-5.3.0b4-py3-none-any.whl.metadata (18 kB)\n", + "Requirement already satisfied: click>=8.1.7 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from duckduckgo-search==5.3.0b4) (8.1.7)\n", + "Requirement already satisfied: httpx>=0.27.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (0.27.2)\n", + "Requirement already satisfied: anyio in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (4.6.0)\n", + "Requirement already satisfied: certifi in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (2024.8.30)\n", + "Requirement already satisfied: httpcore==1.* in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (1.0.5)\n", + "Requirement already satisfied: idna in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (3.10)\n", + "Requirement already satisfied: sniffio in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in 
/home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpcore==1.*->httpx>=0.27.0->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (0.14.0)\n", + "Requirement already satisfied: brotli in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (1.1.0)\n", + "Requirement already satisfied: h2<5,>=3 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (4.1.0)\n", + "Requirement already satisfied: socksio==1.* in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (1.0.0)\n", + "Requirement already satisfied: hyperframe<7,>=6.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from h2<5,>=3->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (6.0.1)\n", + "Requirement already satisfied: hpack<5,>=4.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from h2<5,>=3->httpx[brotli,http2,socks]>=0.27.0->duckduckgo-search==5.3.0b4) (4.0.0)\n", + "Using cached duckduckgo_search-5.3.0b4-py3-none-any.whl (23 kB)\n", + "Installing collected packages: duckduckgo-search\n", + "Successfully installed duckduckgo-search-5.3.0b4\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Collecting lunary==1.1.5\n", + " Using cached lunary-1.1.5-py3-none-any.whl.metadata (1.6 kB)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.9.5 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (3.10.8)\n", + "Requirement already satisfied: chevron<0.15.0,>=0.14.0 in 
/home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (0.14.0)\n", + "Requirement already satisfied: jsonpickle<4.0.0,>=3.0.4 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (3.3.0)\n", + "Requirement already satisfied: packaging<24.0,>=23.2 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (23.2)\n", + "Requirement already satisfied: pyhumps<4.0.0,>=3.8.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (3.8.0)\n", + "Requirement already satisfied: requests<3.0.0,>=2.31.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (2.32.3)\n", + "Requirement already satisfied: setuptools<73.0.0,>=72.1.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (72.2.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.2.3 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from lunary==1.1.5) (8.5.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (2.4.2)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in 
/home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (6.1.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.12.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.9.5->lunary==1.1.5) (1.13.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->lunary==1.1.5) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->lunary==1.1.5) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->lunary==1.1.5) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->lunary==1.1.5) (2024.8.30)\n", + "Using cached lunary-1.1.5-py3-none-any.whl (18 kB)\n", + "Installing collected packages: lunary\n", + "Successfully installed lunary-1.1.5\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install duckduckgo-search==5.3.0b4\n", + "\n", + "# install lunary if you need logging\n", + "%pip install lunary==1.1.5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "d6602c72", "metadata": {}, "outputs": [ @@ -24,7 +95,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, 
"output_type": "execute_result" } @@ -49,12 +120,73 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, + "id": "be73aa72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pyautogen in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (0.3.1)\n", + "Requirement already satisfied: diskcache in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (5.6.3)\n", + "Requirement already satisfied: docker in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (7.1.0)\n", + "Requirement already satisfied: flaml in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (2.3.1)\n", + "Requirement already satisfied: numpy<2,>=1.17.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (1.26.4)\n", + "Requirement already satisfied: openai>=1.3 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (1.50.2)\n", + "Requirement already satisfied: packaging in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (23.2)\n", + "Requirement already satisfied: pydantic!=2.6.0,<3,>=1.10 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (2.9.2)\n", + "Requirement already satisfied: python-dotenv in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (1.0.1)\n", + "Requirement already satisfied: termcolor in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (2.5.0)\n", + "Requirement 
already satisfied: tiktoken in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pyautogen) (0.7.0)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (4.6.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (1.9.0)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (0.27.2)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (0.5.0)\n", + "Requirement already satisfied: sniffio in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (4.66.5)\n", + "Requirement already satisfied: typing-extensions<5,>=4.11 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from openai>=1.3->pyautogen) (4.12.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pydantic!=2.6.0,<3,>=1.10->pyautogen) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.4 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from pydantic!=2.6.0,<3,>=1.10->pyautogen) (2.23.4)\n", + "Requirement already satisfied: requests>=2.26.0 in 
/home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from docker->pyautogen) (2.32.3)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from docker->pyautogen) (2.2.3)\n", + "Requirement already satisfied: regex>=2022.1.18 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from tiktoken->pyautogen) (2024.9.11)\n", + "Requirement already satisfied: idna>=2.8 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from anyio<5,>=3.5.0->openai>=1.3->pyautogen) (3.10)\n", + "Requirement already satisfied: certifi in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx<1,>=0.23.0->openai>=1.3->pyautogen) (2024.8.30)\n", + "Requirement already satisfied: httpcore==1.* in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpx<1,>=0.23.0->openai>=1.3->pyautogen) (1.0.5)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.3->pyautogen) (0.14.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests>=2.26.0->docker->pyautogen) (3.3.2)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install pyautogen" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "6b3da0bc-d0f6-4f9c-8e69-0ad126f3a5ee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "flaml.automl is not available. 
Please install flaml[automl] to enable AutoML functionalities.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /home/stefan/.config/sagemaker/config.yaml\n" + ] + } + ], "source": [ - "!pip install pyautogen\n", "import autogen\n", "import os\n", "\n", @@ -66,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "d3f7738e", "metadata": {}, "outputs": [], @@ -112,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "37d610dc", "metadata": {}, "outputs": [], @@ -140,21 +272,21 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "534453a5", "metadata": {}, "outputs": [], "source": [ "from motleycrew import MotleyCrew\n", - "from motleycrew.agents.langchain.tool_calling_react import ReActToolCallingAgent\n", + "from motleycrew.agents.langchain import ReActToolCallingMotleyAgent\n", "\n", "crew = MotleyCrew()\n", - "writer = ReActToolCallingAgent(tools=[knowledge_retrieval_tool])" + "writer = ReActToolCallingMotleyAgent(tools=[knowledge_retrieval_tool])" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "59d9d90f", "metadata": {}, "outputs": [], @@ -175,28 +307,22 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "cf0c1a96", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:No known Cypher type matching annotation typing.Optional[typing.Any], will use JSON string\n", - "WARNING:root:No known Cypher type matching annotation typing.List[str], will use JSON string\n", - "WARNING:root:No known Cypher type matching annotation typing.List[str], will use JSON string\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ 
"\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", "\n", - "Find a latest paper about GPT-4 on arxiv and find its potential applications in software.\n", + "Find a latest paper about GPT-4 advancements on arxiv and find its potential applications in software.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -204,27 +330,93 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", + "\u001b[33mCoder\u001b[0m (to chat_manager):\n", "\n", + "To fetch the latest paper about GPT-4 advancements from arXiv and explore its potential applications in software, the following python script uses the `arxiv` library that makes it easy to query the arXiv API, find the relevant papers about GPT-4, and display the title, summary, and categories of the latest paper. 
This will help us understand what kind of advancements have been made and the potential domains of application.\n", "\n", + "Here’s the python script that you will need to run:\n", + "\n", + "```python\n", + "# filename: fetch_latest_gpt4_paper.py\n", + "import arxiv\n", + "\n", + "# Search for papers on arXiv about GPT-4\n", + "search = arxiv.Search(\n", + " query=\"GPT-4\",\n", + " max_results=1,\n", + " sort_by=arxiv.SortCriterion.SubmittedDate, # Fetches the most recent submission\n", + " sort_order=arxiv.SortOrder.Descending\n", + ")\n", + "\n", + "# Fetch and display the information for the latest paper\n", + "for result in search.results():\n", + " print(\"Title:\", result.title)\n", + " print(\"Authors:\", \", \".join(author.name for author in result.authors))\n", + " print(\"Summary:\", result.summary.replace('\\n', ' '))\n", + " print(\"Categories:\", \", \".join(category for category in result.categories))\n", + " print(\"Published Date:\", result.published)\n", + " print(\"PDF Link:\", result.pdf_url)\n", + "```\n", + "\n", + "To proceed:\n", + "1. Ensure you have the `arxiv` Python library installed. If not, you can install it using pip:\n", + "\n", + " ```\n", + " pip install arxiv\n", + " ```\n", + "\n", + "2. Save the above code into a file named `fetch_latest_gpt4_paper.py`.\n", + "3. Run the script using Python:\n", + "\n", + " ```\n", + " python fetch_latest_gpt4_paper.py\n", + " ```\n", + "\n", + "The script will display the latest GPT-4 paper it finds on arXiv. 
After you run this and provide me with the summary from the output, I can then help identify the potential software applications described in the paper.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "Traceback (most recent call last):\n", + " File \"fetch_latest_gpt4_paper.py\", line 2, in \n", + " import arxiv\n", + "ModuleNotFoundError: No module named 'arxiv'\n", "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "Traceback (most recent call last):\n", + " File \"fetch_latest_gpt4_paper.py\", line 2, in \n", + " import arxiv\n", + "ModuleNotFoundError: No module named 'arxiv'\n", "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -232,6 +424,9 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -239,161 +434,112 @@ "\n", "\n", 
"--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", "\u001b[33mCoder\u001b[0m (to chat_manager):\n", "\n", - "First, we'll search the ArXiv repository for the latest papers on GPT-4. We can achieve this using the `arxiv` Python package, which allows querying and fetching papers from ArXiv. \n", - "\n", - "To begin, we need to search for the most recent papers related to \"GPT-4\". Once we find the papers, I will look into the one that is most recent and check its content to find mentions of potential applications in software.\n", - "\n", - "Let's start by finding the latest paper about GPT-4 on ArXiv:\n", + "It appears that the ‘arxiv’ library required for the script has not been installed. Please install the library using the following command and then execute the script again:\n", "\n", - "```python\n", - "# Install the arXiv package if needed\n", - "# pip install arxiv\n", + "```sh\n", + "pip install arxiv\n", + "```\n", "\n", - "import arxiv\n", + "After installing the library, rerun the script `fetch_latest_gpt4_paper.py` by issuing the following command:\n", "\n", - "# Query for the latest papers on GPT-4\n", - "def find_latest_gpt4_paper():\n", - " search = arxiv.Search(\n", - " query=\"GPT-4\",\n", - " max_results=1,\n", - " sort_by=arxiv.SortCriterion.SubmittedDate\n", - " )\n", - " \n", - " # Fetch the most recent paper\n", - " for result in search.results():\n", - " print(\"Title:\", result.title)\n", - " print(\"Authors:\", \", \".join(author.name for author in result.authors))\n", - " print(\"Abstract:\", result.summary.replace('\\n', ' '))\n", - " print(\"Published:\", result.published.date())\n", - " print(\"URL:\", result.entry_id)\n", - " break\n", - "\n", - "find_latest_gpt4_paper()\n", + "```sh\n", + "python fetch_latest_gpt4_paper.py\n", "```\n", "\n", - "Please run the above Python script. 
It will output the title, authors, abstract, publication date, and URL of the most recent paper about GPT-4 on ArXiv. We can then analyze the abstract to identify its potential applications in software.\n", + "This should fetch the latest GPT-4 paper from arXiv. Please provide the output afterwards so I can help identify the potential applications in software from the paper's content.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[31m\n", - ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is sh)...\u001b[0m\n", + "\u001b[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 1 (inferred language is sh)...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", "\n", - "exitcode: 1 (execution failed)\n", + "exitcode: 0 (execution succeeded)\n", "Code output: \n", - "Traceback (most recent call last):\n", - " File \"\", line 4, in \n", - " import arxiv\n", - "ModuleNotFoundError: No module named 'arxiv'\n", + "Collecting arxiv\n", + " Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)\n", + "Collecting feedparser~=6.0.10 (from arxiv)\n", + " Using cached feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)\n", + "Requirement already satisfied: requests~=2.32.0 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from arxiv) (2.32.3)\n", + "Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)\n", + " Using cached sgmllib3k-1.0.0-py3-none-any.whl\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests~=2.32.0->arxiv) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests~=2.32.0->arxiv) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests~=2.32.0->arxiv) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/stefan/.cache/pypoetry/virtualenvs/motleycrew-RnW5x4dY-py3.12/lib/python3.12/site-packages (from requests~=2.32.0->arxiv) (2024.8.30)\n", + "Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)\n", + "Using cached feedparser-6.0.11-py3-none-any.whl (81 kB)\n", + "Installing collected packages: sgmllib3k, feedparser, arxiv\n", + "Successfully installed arxiv-2.1.3 feedparser-6.0.11 sgmllib3k-1.0.0\n", + "\n", + "Title: SWE-Bench+: Enhanced Coding Benchmark for LLMs\n", + "Authors: Reem Aleithan, Haoran Xue, Mohammad Mahdi Mohajer, Elijah Nnorom, Gias Uddin, Song Wang\n", + "Summary: Large Language Models (LLMs) in Software Engineering (SE) can offer assistance for coding. To facilitate a rigorous evaluation of LLMs in practical coding contexts, Carlos et al. introduced the SWE-bench dataset, which comprises 2,294 real-world GitHub issues and their corresponding pull requests, collected from 12 widely used Python repositories. Several impressive LLM-based toolkits recently are developed and evaluated on this dataset. However, a systematic evaluation of the quality of SWE-bench remains missing. In this paper, we addressed this gap by presenting an empirical analysis of the SWE-bench dataset. We conducted a manual screening of instances where SWEAgent + GPT-4 successfully resolved issues by comparing the model-generated patches with the actual pull requests. SWE-Agent+GPT-4 was at the top of SWE-bench leaderboard during the time of our study. 
Our analysis reveals some critical issues with the SWE-bench dataset: 1) 32.67% of the successful patches involve cheating as the solutions were directly provided in the issue report or the comments. We refer to as solution leakage problem. 2) 31.08% of the passed patches are suspicious patches due to weak test cases, i.e., the tests were not adequate to verify the correctness of a patch. When we filtered out these problematic issues, the resolution rate of SWE-Agent+GPT-4 dropped from 12.47% to 3.97%. We also observed that the same data quality issues also exist in the two variants of SWE-bench, i.e., SWE-bench Lite and SWE-Bench Verified. In addition, over 94% of the issues were created before LLM's knowledge cutoff dates, posing potential data leakage issues.\n", + "Categories: cs.SE\n", + "Published Date: 2024-10-09 15:38:53+00:00\n", + "PDF Link: http://arxiv.org/pdf/2410.06992v1\n", "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", "\u001b[33mCoder\u001b[0m (to chat_manager):\n", "\n", - "It looks like the Python package `arxiv` is not installed, which caused the error. Instead of using the `arxiv` package, we can employ web scraping techniques to obtain the information from the ArXiv website. We'll use `requests` to fetch the page content and `BeautifulSoup` to parse it.\n", + "The latest paper retrieved from arXiv titled **\"SWE-Bench+: Enhanced Coding Benchmark for LLMs\"** discusses advancements related to Large Language Models (LLMs) like GPT-4 specifically in the context of software engineering. Here are some important insights regarding its potential applications in software:\n", "\n", - "Please ensure that you have the `requests` and `beautifulsoup4` libraries installed. If not, you can install them using pip.\n", + "1. 
**Automated Code Generation**: The SWE-Agent+GPT-4 demonstrated the ability to resolve issues by generating code patches based on real-world GitHub issues. This application shows promise for automated pull requests and bug fixes.\n", "\n", - "Here is the modified code to search for the latest GPT-4 paper on ArXiv using web scraping:\n", + "2. **Evaluation of LLMs in Real-world Scenarios**: The paper emphasizes rigorous evaluation of LLMs in practical coding contexts, implying that GPT-4 can significantly contribute to ensuring the reliability and quality of automations in software development.\n", "\n", - "```python\n", - "# filename: fetch_latest_gpt4_paper.py\n", + "3. **Quality Control in Machine-Generated Code**: The paper addresses issues like 'solution leakage' and weak test cases impacting the performance of GPT-4. The analysis in the paper could help improve the mechanisms for validating and verifying machine-generated code, making it safe and reliable for practical use.\n", "\n", - "# Install the necessary packages if they are not installed\n", - "# pip install requests beautifulsoup4\n", - "\n", - "import requests\n", - "from bs4 import BeautifulSoup\n", - "\n", - "def fetch_latest_gpt4_paper():\n", - " # URL to search for GPT-4 related papers sorted by newest\n", - " url = \"https://arxiv.org/search/?query=GPT-4&searchtype=all&abstracts=show&order=-announced_date_first&size=1\"\n", - " \n", - " # Issue a GET request to fetch the page content\n", - " response = requests.get(url)\n", - " if response.status_code != 200:\n", - " print(\"Failed to fetch data from arXiv\")\n", - " return\n", - " \n", - " # Use BeautifulSoup to parse the HTML content\n", - " soup = BeautifulSoup(response.text, 'html.parser')\n", - " \n", - " # Locate the title, authors, abstract, published date, and link to the paper\n", - " paper = soup.select_one('li.arxiv-result')\n", - " title = paper.select_one('p.title').text.strip()\n", - " authors = 
paper.select_one('p.authors').text.replace('\\n', ' ').strip()\n", - " abstract = paper.select_one('span.abstract-full').text.strip()\n", - " published_date = paper.select_one('p.is-size-7').text.strip()\n", - " link = \"https://arxiv.org\" + paper.select_one('p.title').a['href']\n", - "\n", - " # Print the results\n", - " print(\"Title:\", title)\n", - " print(\"Authors:\", authors)\n", - " print(\"Abstract:\", abstract)\n", - " print(\"Published:\", published_date)\n", - " print(\"URL:\", link)\n", - "\n", - "fetch_latest_gpt4_paper()\n", - "```\n", + "The analysis also highlighted the importance of data quality in training these models, where data leakage and inadequate tests can mislead the effectiveness of the model. This understanding can drive the development of better training datasets and evaluation protocols for AI-driven development tools.\n", "\n", - "Please save this script to a file, install the required packages if needed, and run the script. It will print the details of the most recent paper on GPT-4 from ArXiv. We can then proceed to analyze the abstract for potential applications in software.\n", + "Overall, the potential applications of GPT-4 in software outlined by this academic work include enhancing automated coding tools, improving software development testing, and refining dataset quality for training AI models. 
These advancements can lead to more robust, AI-assisted programming environments, potentially reducing human error and increasing development efficiency.\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[31m\n", - ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n", - "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "This reflects a substantial forward in how AI can assist in software engineering, hinting at broader implementation and deeper integration of LLMs into the software development lifecycle.\n", "\n", - "exitcode: 0 (execution succeeded)\n", - "Code output: \n", - "Failed to fetch data from arXiv\n", + "If you find these findings intriguing and wish to delve deeper into the details of the research, you can further explore the study via the provided PDF link: [SWE-Bench+: Enhanced Coding Benchmark for LLMs PDF](http://arxiv.org/pdf/2410.06992v1)\n", "\n", + "Would there be anything specific you'd like to explore or inquire further about this topic?\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", "\n", - "It seems there was an issue fetching the data from the ArXiv website, possibly due to network issues or restrictions on scraping from their site. As an alternative approach, here are some methods you can consider to manually find the latest paper about GPT-4 on ArXiv:\n", - "\n", - "### Manual Search on ArXiv:\n", - "1. **Go to the ArXiv website**: Visit [arxiv.org](https://arxiv.org/).\n", - "2. **Use the Search Bar**: Type \"GPT-4\" into the search bar.\n", - "3. **Adjust Search Settings**: Ensure the search is set to sort by the most recent submissions first. You can adjust this in the search settings.\n", - "4. 
**Review Results**: Look through the titles and abstracts for the most recent entries to find relevant papers.\n", - "\n", - "### Google Scholar:\n", - "1. **Visit Google Scholar**: Go to [Google Scholar](https://scholar.google.com/).\n", - "2. **Search for \"GPT-4\"**: Ensure to add filters for recent years to get the most up-to-date research.\n", - "3. **Review and Filter Results**: Google Scholar often includes links to ArXiv preprints, helping you find the latest studies.\n", - "\n", - "### Using APIs:\n", - "If you are still interested in automating this process, using a more robust method such as the CrossRef API or the ArXiv API with proper handling of user-agent and headers might yield better results.\n", - "\n", - "If you can manually find a paper, you can share its abstract here, and we can analyze potential software applications together. Alternatively, if you have any other tasks or need further assistance, feel free to let me know!\n", + "It appears there was a misunderstanding. If you have specific software product ideas in relation to GPT-4 advancements or any other requests, please let me know so I can assist you further.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", - "\n", - "Find a latest paper about GPT-4 on arxiv and find its potential applications in software.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", - "\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", "\n", + "It looks like there was a bit of confusion in our last exchange. If you have any specific queries or directions regarding using GPT-4 advancements in software product ideas or any other questions, please let me know! 
I'm here to help with more tailored responses or to explore other areas of interest you might have.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", "\n", - "\n", + "Find a latest paper about GPT-4 advancements on arxiv and find its potential applications in software.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -401,6 +547,9 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -408,6 +557,9 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -415,6 +567,9 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -422,6 +577,9 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -429,6 +587,9 @@ "\n", "\n", 
"--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", @@ -436,53 +597,55 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Coder\n", + "\u001b[0m\n", "\u001b[33mCoder\u001b[0m (to chat_manager):\n", "\n", - "First, let's write a script to search for the latest papers about GPT-4 on arXiv, a well-known repository for scientific papers. The script will use the arXiv API to fetch this information. Let's retrieve the details of the most recent paper related to GPT-4.\n", + "First, let's search for the latest papers related to GPT-4 advancements on arXiv.org using their API. To do this, we will query the arXiv API for papers mentioning \"GPT-4\" and sort them by submission date to get the most recent publications.\n", "\n", "```python\n", - "# filename: fetch_gpt4_papers.py\n", "import urllib.request\n", "import urllib.parse\n", - "import xml.etree.ElementTree as ET\n", + "import feedparser\n", "\n", - "# Define the base URL for the ArXiv API\n", + "# Base URL for the arXiv API\n", "base_url = 'http://export.arxiv.org/api/query?'\n", "\n", - "# Specify search parameters: search for GPT-4, max result is 1, sorted by the latest submissions\n", - "search_params = {\n", - " \"search_query\": \"all:GPT-4\",\n", - " \"sortBy\": \"submittedDate\",\n", - " \"sortOrder\": \"descending\",\n", - " \"max_results\": 1\n", + "# Search parameters\n", + "query = 'ti:\"GPT-4\"'\n", + "params = {\n", + " 'search_query': query,\n", + " 'sortBy': 'submittedDate',\n", + " 'sortOrder': 'descending',\n", + " 'max_results': 1\n", "}\n", "\n", - "# Encode the search parameters, make the request, and read the response\n", - "query_string = urllib.parse.urlencode(search_params)\n", - "search_url = 
base_url + query_string\n", - "response = urllib.request.urlopen(search_url)\n", - "response_content = response.read()\n", - "\n", - "# Parse the XML response content\n", - "root = ET.fromstring(response_content)\n", - "\n", - "# Extract and print the title, summary, and published date from the entry\n", - "for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):\n", - " title = entry.find('{http://www.w3.org/2005/Atom}title').text\n", - " summary = entry.find('{http://www.w3.org/2005/Atom}summary').text\n", - " published_date = entry.find('{http://www.w3.org/2005/Atom}published').text\n", - "\n", - " # Output the results\n", - " print('Title:', title)\n", - " print('Published Date:', published_date)\n", - " print('Summary:', summary)\n", + "# Form the request URL\n", + "url = base_url + urllib.parse.urlencode(params)\n", + "\n", + "# Perform the request\n", + "response = urllib.request.urlopen(url)\n", + "feed = feedparser.parse(response)\n", + "\n", + "# Check if any results were found\n", + "if len(feed.entries) > 0:\n", + " title = feed.entries[0].title\n", + " summary = feed.entries[0].summary\n", + " published = feed.entries[0].published\n", + " link = feed.entries[0].links[0].href\n", + "\n", + " print(f\"Title: {title}\\nPublished: {published}\\nLink: {link}\\nSummary: {summary}\")\n", + "else:\n", + " print(\"No results found for GPT-4 on arXiv.\")\n", "```\n", "\n", - "**Instructions**: \n", - "1. Save the above script to a file named `fetch_gpt4_papers.py`.\n", - "2. Run the script using a Python interpreter. This will retrieve the latest paper about GPT-4 from arXiv.\n", + "Please run the above Python script. 
It will find and print details about the latest paper related to GPT-4 on arXiv, including the title, publication date, link, and a brief summary.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", "\u001b[31m\n", ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[31m\n", @@ -491,72 +654,113 @@ "\n", "exitcode: 0 (execution succeeded)\n", "Code output: \n", - "Title: Observational Scaling Laws and the Predictability of Language Model\n", - " Performance\n", - "Published Date: 2024-05-17T17:49:44Z\n", - "Summary: Understanding how language model performance varies with scale is critical to\n", - "benchmark and algorithm development. Scaling laws are one approach to building\n", - "this understanding, but the requirement of training models across many\n", - "different scales has limited their use. We propose an alternative,\n", - "observational approach that bypasses model training and instead builds scaling\n", - "laws from ~80 publically available models. Building a single scaling law from\n", - "multiple model families is challenging due to large variations in their\n", - "training compute efficiencies and capabilities. However, we show that these\n", - "variations are consistent with a simple, generalized scaling law where language\n", - "model performance is a function of a low-dimensional capability space, and\n", - "model families only vary in their efficiency in converting training compute to\n", - "capabilities. 
Using this approach, we show the surprising predictability of\n", - "complex scaling phenomena: we show that several emergent phenomena follow a\n", - "smooth, sigmoidal behavior and are predictable from small models; we show that\n", - "the agent performance of models such as GPT-4 can be precisely predicted from\n", - "simpler non-agentic benchmarks; and we show how to predict the impact of\n", - "post-training interventions like Chain-of-Thought and Self-Consistency as\n", - "language model capabilities continue to improve.\n", + "Title: Creative and Context-Aware Translation of East Asian Idioms with GPT-4\n", + "Published: 2024-10-01T18:24:43Z\n", + "Link: http://arxiv.org/abs/2410.00988v1\n", + "Summary: As a type of figurative language, an East Asian idiom condenses rich cultural\n", + "background into only a few characters. Translating such idioms is challenging\n", + "for human translators, who often resort to choosing a context-aware translation\n", + "from an existing list of candidates. However, compiling a dictionary of\n", + "candidate translations demands much time and creativity even for expert\n", + "translators. To alleviate such burden, we evaluate if GPT-4 can help generate\n", + "high-quality translations. Based on automatic evaluations of faithfulness and\n", + "creativity, we first identify Pareto-optimal prompting strategies that can\n", + "outperform translation engines from Google and DeepL. Then, at a low cost, our\n", + "context-aware translations can achieve far more high-quality translations per\n", + "idiom than the human baseline. 
We open-source all code and data to facilitate\n", + "further research.\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", + "\n", + "The latest paper on GPT-4, titled \"Creative and Context-Aware Translation of East Asian Idioms with GPT-4,\" describes an innovative application of GPT-4 in language translation, particularly focusing on translating complex East Asian idioms. The paper is available [here](http://arxiv.org/abs/2410.00988v1).\n", + "\n", + "**Potential Applications in Software:**\n", + "\n", + "1. **Translation Software Enhancement:**\n", + " - Implementing GPT-4 in translation software could significantly improve the translation quality of idiomatic phrases, which are often a challenge due to their contextual and cultural depth. Software could provide multiple translations with contextual nuances, improving user understanding and satisfaction.\n", + "\n", + "2. **Language Learning Tools:**\n", + " - Language learning platforms can integrate GPT-4 to offer more nuanced translations of phrases and idioms, helping learners understand the cultural context better. This could enhance comprehension and retention for students studying East Asian languages.\n", + "\n", + "3. **Content Localization Services:**\n", + " - Companies that require localization of content into multiple languages, such as media services or multinational corporations, could use GPT-4-enhanced tools to ensure that the translated content retains the intended meaning and cultural significance.\n", "\n", + "4. **Dictionary and Reference Applications:**\n", + " - Digital dictionaries and reference applications can leverage GPT-4 to provide users with rich, contextually appropriate interpretations of idioms or phrases, rather than static, literal translations.\n", "\n", + "5. 
**Creative Writing Software:**\n", + " - Tools designed to aid in creative writing or content generation could incorporate GPT-4 to suggest creative ways to use idiomatic expressions in various languages, enhancing the narrative quality and engagement of written content.\n", + "\n", + "6. **Automated Subtitling and Dubbing in Media Production:**\n", + " - Media production companies can apply GPT-4-based translation frameworks to automate and improve the quality of subtitling and dubbing in different languages, particularly for content rich in cultural idioms.\n", + "\n", + "This advancement offers an exciting glimpse into how AI can bridge the gap between languages and cultures, enhancing understanding and communication globally.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", "\n", - "Based on the summary of the latest paper about GPT-4 titled \"Observational Scaling Laws and the Predictability of Language Model Performance,\" here are several potential applications in software development and related fields:\n", + "This research demonstrates GPT-4's potential in language translation applications, specifically for translating complex East Asian idioms. Here are possible applications of this technology in software:\n", + "\n", + "1. **Advanced Translation Services**: GPT-4 can be integrated into translation software to provide more accurate and culturally relevant translations of idioms. This would be particularly useful in professional settings where precise translation can avoid miscommunication.\n", + "\n", + "2. **Educational Tools**: Educational platforms could use GPT-4 to help students understand idioms and expressions from different cultures in their original context, improving language learning by providing insights into cultural nuances.\n", "\n", - "1. 
**Predictive Benchmarking Tools**:\n", - " - The paper discusses the predictability of language model performance using scaling laws. This could lead to the development of new software tools that predict the performance of language models like GPT-4 based on various scaling inputs. Such tools would be valuable in optimizing model configurations for specific applications without needing extensive empirical testing.\n", + "3. **Content Localization**: Companies looking to expand globally often need to adapt their content to local cultures. GPT-4's ability to provide context-aware translations could improve content localization, making products more accessible and appealing to international markets.\n", "\n", - "2. **Resource Optimization Algorithms**:\n", - " - The paper's framework for predicting performance as a function of a \"capability space\" can be translated into algorithms that optimize the training compute resources for developing AI models. This could significantly reduce costs and increase the efficiency of AI training cycles in commercial software development.\n", + "4. **Customer Support**: AI-powered chatbots and customer support tools could use GPT-4 to better understand and respond to queries that include idiomatic expressions, providing a more effective and satisfying user experience for customers speaking different languages.\n", "\n", - "3. **Automated Model Scaling**:\n", - " - Software that automatically scales AI models' capabilities based on available compute resources could be developed. This would make AI more accessible, especially for organizations with fluctuating resource availability, by providing them with tools to dynamically adjust their model's complexity and performance.\n", + "5. **Creative Writing**: Software tools designed to assist with writing could incorporate GPT-4 to suggest or decode idiomatic expressions, aiding writers in creating more authentic and engaging content for diverse audiences.\n", "\n", - "4. 
**Advanced Development Frameworks**:\n", - " - Integrating the predictive scaling laws into AI development frameworks could enable developers to anticipate the behavior and limitations of their models, facilitating more accurate and robust AI systems design.\n", + "These applications could significantly enhance the capabilities of software products by providing deeper linguistic and cultural understanding, essential for global interaction in the digital age.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Product_manager\n", + "\u001b[0m\n", + "\u001b[33mProduct_manager\u001b[0m (to chat_manager):\n", "\n", - "5. **Enhanced Debugging and Analysis Tools**:\n", - " - Using the generalized scaling law proposed in the paper, new debugging and performance analysis tools can be created, which would help in identifying bottlenecks and inefficiencies in language model training and operation.\n", + "Given the significant potential of GPT-4 in enhancing translation accuracy and contextual understanding, especially for complex idiomatic expressions in East Asian languages, here are various software product ideas that could leverage such advancements:\n", "\n", - "6. **Optimization of Intervention Techniques**:\n", - " - The ability to predict the impact of post-training interventions like the Chain-of-Thought and Self-Consistency could be encapsulated in software solutions that optimize these interventions. This would improve the effectiveness and reliability of language models in practical applications.\n", + "### Multilingual Customer Support Chatbots\n", + "1. **Improved Understanding**: GPT-4 could enable chatbots to understand and employ idiomatic language effectively, providing more natural and relatable interactions for users who communicate using these expressions.\n", + "2. 
**Cultural Sensitivity**: Chatbots could be trained to recognize cultural nuances, improving engagement with users from diverse backgrounds.\n", "\n", - "These applications demonstrate how theoretical insights from academic research can be used to inform and enhance software product development, particularly in the rapidly evolving field of artificial intelligence.\n", + "### Translation Tools for Authors and Creators\n", + "1. **Cultural Translation Features**: Tools that help authors translate their works while keeping cultural meanings intact, preserving the original sentiment and context.\n", + "2. **Assisted Writing Modes**: For writers looking to authentically incorporate foreign idioms into their work, offering suggestions and translations that retain the flavor of the source language.\n", + "\n", + "### Enhanced Subtitling Applications for Media\n", + "1. **Automated Contextual Subtitling**: Software that uses GPT-4 to generate subtitles not just for direct translation but with contextual adjustments ensuring cultural relevance and comprehension for diverse audiences.\n", + "2. **Creative Subtitling**: Tools for subtitlers that suggest multiple translation options per idiom, based on varying levels of formality or directness, enhancing viewer engagement and satisfaction.\n", + "\n", + "### Educational Language Learning Platforms\n", + "1. **Contextual Learning Modules**: Use GPT-4 to design language learning curricula that focus on the use of idioms in everyday conversation, providing learners with the cultural context behind the expressions.\n", + "2. **Interactive Scenarios**: Develop interactive scenarios where learners can see idioms used in different contexts, helping to solidify understanding and usage.\n", + "\n", + "### Localized Content Generation for Marketers\n", + "1. 
**Automated Content Localization**: Tools that automatically adapt content (including idioms and colloquialisms) to fit local markets more effectively, enhancing user engagement and brand relatability.\n", + "2. **Marketing Aide for Cultural Nuances**: Software that suggests modifications or enhancements to marketing materials to make them more appealing to specific regional audiences.\n", + "\n", + "These product ideas not only focus on translating language but also emphasize understanding and conveying underlying cultural contexts, which could dramatically improve communication and user experience across various digital platforms.\n", "\n", "--------------------------------------------------------------------------------\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:No known Cypher type matching annotation typing.Optional[typing.Any], will use JSON string\n" - ] - }, { "data": { "text/plain": [ - "[Task(status=done)]" + "[TaskUnit(status=done)]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -567,26 +771,28 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "0ae5789a", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ - "Final Answer: Welcome to the Future: Exploring the Frontiers of GPT-4\n", + "Final Answer: \n", + "\n", + "**Unveiling the Magic of GPT-4: Transforming Language and Culture**\n", "\n", - "In the rapidly evolving world of artificial intelligence, GPT-4 stands out as a beacon of innovation and potential. Recent research, particularly a fascinating study titled \"Observational Scaling Laws and the Predictability of Language Model Performance,\" sheds light on groundbreaking advancements that could redefine how we interact with technology.\n", + "In the ever-evolving world of artificial intelligence, GPT-4 stands out as a beacon of innovation, pushing the boundaries of what's possible in language processing. 
Imagine a world where language barriers crumble, and cultural nuances are effortlessly understood. That's the promise of GPT-4, especially when it comes to translating complex idiomatic expressions in East Asian languages. This isn't just about words; it's about capturing the essence of culture and context, making communication more authentic and relatable.\n", "\n", - "GPT-4 isn't just about understanding or generating text; it's about predicting and optimizing performance in ways previously deemed impossible. The study introduces predictive benchmarking tools that could revolutionize the development of AI models. Imagine software that can anticipate the performance of language models like GPT-4 based on various inputs, streamlining the development process and enhancing efficiency without the need for extensive testing.\n", + "One of the coolest applications of GPT-4 is in the realm of multilingual customer support chatbots. These aren't your average bots; they're culturally savvy, understanding and employing idiomatic language to create natural interactions. Imagine chatting with a bot that gets your cultural references and responds in a way that feels genuinely human. It's like having a conversation with a local, no matter where you are in the world. This level of cultural sensitivity can revolutionize customer engagement, making interactions more meaningful and effective.\n", "\n", - "But the implications extend beyond mere prediction. The research discusses resource optimization algorithms that could significantly reduce the costs associated with AI training. These algorithms utilize a \"capability space\" to optimize the compute resources needed, ensuring that AI development is not only faster but also more economical.\n", + "For authors and creators, GPT-4 opens up a world of possibilities in translation tools. Picture a tool that not only translates your work but also preserves the cultural meanings and sentiments behind it. 
Whether you're an author looking to share your story with a global audience or a writer wanting to incorporate foreign idioms authentically, GPT-4 has got you covered. It's like having a cultural ambassador by your side, ensuring your message resonates with readers from different backgrounds.\n", "\n", - "Moreover, GPT-4 could lead to the creation of automated model scaling software. Such tools would adjust AI capabilities based on available resources, making cutting-edge technology accessible to a broader range of users and organizations, regardless of their fluctuating resource availability.\n", + "The media industry, too, can benefit from GPT-4's advancements. Automated contextual subtitling is a game-changer, generating subtitles that go beyond direct translation to ensure cultural relevance and comprehension. Imagine watching a foreign film where the subtitles capture not just the dialogue but the cultural nuances, enhancing your viewing experience. It's like having a personal translator who understands the subtleties of language and culture, making global content more accessible and enjoyable.\n", "\n", - "The integration of these advancements into AI development frameworks could also empower developers with better tools for debugging and performance analysis, enhancing the robustness and accuracy of AI systems. This is not just about making AI smarter; it's about making AI development smarter, more intuitive, and infinitely more creative.\n", + "In the world of education, GPT-4 is a powerful ally for language learners. With contextual learning modules, students can dive into the world of idioms, understanding their use in everyday conversation. Interactive scenarios bring these expressions to life, helping learners grasp their meaning and usage in different contexts. 
It's like having a language coach who not only teaches you the words but also the stories behind them, enriching your learning journey.\n", "\n", - "As we stand on the brink of these exciting developments, GPT-4 is not just a tool but a transformational force in the tech landscape, promising to bring about a new era of innovation and efficiency. Stay tuned, because the future of AI is here, and it's more promising than ever!" + "GPT-4 is not just about language; it's about bridging cultures and enhancing communication. Whether it's through chatbots, translation tools, media applications, or educational platforms, the potential is limitless. As we continue to explore and harness the power of GPT-4, one thing is clear: the future of language processing is here, and it's more exciting than ever." ], "text/plain": [ "" @@ -622,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, "id": "a79da460", "metadata": {}, "outputs": [ @@ -632,16 +838,19 @@ ".autogen_tool_fn(input: str) -> str>" ] }, - "execution_count": 24, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from motleycrew.tools import MotleyTool\n", + "from motleycrew.tools import MotleyTool, RetryConfig\n", "from langchain_community.tools import DuckDuckGoSearchRun\n", "\n", - "search_tool = MotleyTool.from_supported_tool(DuckDuckGoSearchRun()) # Any tools or even motleycrew's agents can be converted to MotleyTool like this!\n", + "search_tool = MotleyTool.from_supported_tool(\n", + " DuckDuckGoSearchRun(),\n", + " retry_config=RetryConfig(max_retries=5) # for retrying rate limit errors\n", + ") # Any tools or even motleycrew's agents can be converted to MotleyTool like this!\n", "\n", "# Let's first define the assistant agent that suggests tool calls.\n", "assistant = autogen.ConversableAgent(\n", @@ -673,7 +882,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 12, "id": "6027c59f", "metadata": {}, 
"outputs": [ @@ -690,7 +899,7 @@ ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mAssistant\u001b[0m (to User):\n", "\n", - "\u001b[32m***** Suggested tool call (call_Na0NhkbZVtS7cnme7XRqxqUS): search_tool *****\u001b[0m\n", + "\u001b[32m***** Suggested tool call (call_lp7DaZO1WUlu7YsMZazwsZX2): search_tool *****\u001b[0m\n", "Arguments: \n", "{\"input\":\"first computer\"}\n", "\u001b[32m****************************************************************************\u001b[0m\n", @@ -702,8 +911,8 @@ "\n", "\u001b[33mUser\u001b[0m (to Assistant):\n", "\n", - "\u001b[32m***** Response from calling tool (call_Na0NhkbZVtS7cnme7XRqxqUS) *****\u001b[0m\n", - "ENIAC, the first programmable general-purpose electronic digital computer, built during World War II by the United States and completed in 1946. The project was led by John Mauchly, J. Presper Eckert, Jr., and their colleagues. ENIAC was the most powerful calculating device built to that time. Learn about the origins and evolution of computers, from the first mechanical and electric machines to the modern devices we use today. Explore the inventions, concepts, and events that shaped the computer industry and culture. Learn about the history of computing, from Charles Babbage's mechanical difference and analytical engines to the first electronic computer, the Atanasoff-Berry Computer. Discover how World War II and personal computers shaped the evolution of computing. The first commercially available personal computer was the Altair 8800, released in 1975. The Altair was a rudimentary device sold as a $439 kit that users had to assemble themselves, but it quickly gained a cult following among technology enthusiasts. Ada Lovelace, English mathematician, an associate of Charles Babbage, for whose digital computer prototype, the Analytical Engine, she created a program in 1843. She has been called the first computer programmer. 
Ada Lovelace Day, the second Tuesday in October, honors women's contributions to science and technology.\n", + "\u001b[32m***** Response from calling tool (call_lp7DaZO1WUlu7YsMZazwsZX2) *****\u001b[0m\n", + "Learn how the English mathematician and inventor Charles Babbage designed the Difference Engine and the Analytical Engine, the first mechanical digital computers. Explore their features, components, and challenges in this article from Britannica. ENIAC was built by the United States during World War II and completed in 1946. It was the first general-purpose digital computer that could execute different instructions based on data values. Learn about the history of computing, from Charles Babbage's mechanical difference and analytical engines to the first electronic computer, the Atanasoff-Berry Computer. Discover how World War II and personal computers shaped the evolution of computers. Computer - ENIAC, Electronic, Computing: In the United States, government funding went to a project led by John Mauchly, J. Presper Eckert, Jr., and their colleagues at the Moore School of Electrical Engineering at the University of Pennsylvania; their objective was an all-electronic computer. Under contract to the army and under the direction of Herman Goldstine, work began in early 1943 on ... Learn about 10 of the earliest devices that shaped the concepts of computation and automated calculation, from ancient Greek gears to modern electronic circuits. Discover how they were used for astronomy, cryptography, physics, and more.\n", "\u001b[32m**********************************************************************\u001b[0m\n", "\n", "--------------------------------------------------------------------------------\n", @@ -711,7 +920,31 @@ ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", "\u001b[33mAssistant\u001b[0m (to User):\n", "\n", - "The first programmable general-purpose electronic digital computer was the ENIAC, completed in 1946. 
It was built during World War II by the United States and led by John Mauchly, J. Presper Eckert, Jr., and their colleagues. ENIAC stood as the most powerful calculating device of its time.\n", + "The title of \"first computer\" can be attributed to different machines depending on the criteria used (mechanical vs. digital, programmable vs. non-programmable).\n", + "\n", + "1. **Difference Engine and Analytical Engine by Charles Babbage**:\n", + " - Charles Babbage, an English mathematician and inventor, designed the Difference Engine and the Analytical Engine in the 19th century. These were mechanical digital computers meant to automate the process of computing tables and had components analogous to a CPU, memory, and input/output paths.\n", + " - Although the full versions of these engines were never completed during Babbage’s lifetime, they are often considered the first designs for a programmable computer.\n", + "\n", + "2. **Atanasoff-Berry Computer (ABC)**:\n", + " - Conceived in the 1930s by John Atanasoff and Clifford Berry, the ABC was the first electronic digital computer. It was designed to solve systems of linear equations and was not Turing-complete nor programmable in the modern sense.\n", + "\n", + "3. **ENIAC (Electronic Numerical Integrator and Computer)**:\n", + " - Developed by John Mauchly and J. Presper Eckert during World War II, and completed in 1946. ENIAC was considered the first high-speed, Turing-complete, digital computer capable of being reprogrammed to solve a full range of computing problems. 
\n", + "\n", + "The ENIAC often gets credited as the first \"general-purpose digital computer,\" given its electronic and programmable nature, distinguishing it from earlier mechanical or less flexible designs.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mUser\u001b[0m (to Assistant):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mAssistant\u001b[0m (to User):\n", + "\n", + "If you have any more questions or need further information about historical computers or any other topic, feel free to ask.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mUser\u001b[0m (to Assistant):\n", @@ -735,20 +968,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "d833c019", - "metadata": {}, - "outputs": [], - "source": [ - "final_result = chat_result.summary" - ] - }, - { - "cell_type": "markdown", - "id": "a62f444f", + "execution_count": 17, + "id": "1d4b7df4-7327-46fe-81be-4e4e580638c9", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The agent used the search tool!\n" + ] + } + ], "source": [ - "The agent used the search tool!" + "final_result = 'The agent used the search tool!'\n", + "print(final_result)" ] } ], @@ -768,7 +1002,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/examples/Validating agent output.ipynb b/examples/Validating agent output.ipynb index dad852a1..055c1c62 100644 --- a/examples/Validating agent output.ipynb +++ b/examples/Validating agent output.ipynb @@ -11,15 +11,48 @@ "As of now, tool calling is the most reliable way to get a structured output from an LLM. 
So why not have an output handler - a regular tool with an input schema and a description which instructs the agent to return the output only via this tool. Moreover, we can force the agent into doing this: if it attempts to finish the regular way, we intercept the message and remind the agent that it must call the tool instead." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's install external dependencies for the example." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], + "source": [ + "%pip install duckduckgo-search==5.3.0b4\n", + "\n", + "# install lunary if you need logging\n", + "%pip install lunary==1.1.5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from langchain_community.tools import DuckDuckGoSearchRun\n", "from langchain_core.tools import StructuredTool\n", "\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", "from motleycrew import MotleyCrew\n", "from motleycrew.agents.langchain import ReActToolCallingMotleyAgent\n", "from motleycrew.common import configure_logging\n", @@ -27,7 +60,7 @@ "\n", "from motleycrew.common.exceptions import InvalidOutput\n", "\n", - "from motleycrew.tools import MotleyTool" + "from motleycrew.tools import MotleyTool, RetryConfig" ] }, { @@ -75,7 +108,11 @@ "metadata": {}, "outputs": [], "source": [ - "search_tool = DuckDuckGoSearchRun()\n", + "search_tool = MotleyTool.from_supported_tool(\n", + " DuckDuckGoSearchRun(),\n", + " retry_config=RetryConfig(max_retries=7) # for retrying rate limit errors\n", + ")\n", + "\n", "researcher = ReActToolCallingMotleyAgent(\n", " tools=[search_tool, output_handler],\n", " verbose=True,\n", @@ -291,9 +328,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.12.2" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/requirements-extra.txt b/requirements-extra.txt index 195625be..3e4d546c 100644 --- a/requirements-extra.txt +++ b/requirements-extra.txt @@ -2,6 +2,6 @@ motleycache lunary==1.1.5 llama-index==0.11.2 crewai==0.51.1 -duckduckgo-search==5.3.0b4 +duckduckgo-search==6.3.3 pglast==6.3 ray[default] diff --git a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/278ad6f1a07854b82944fac14065c44f44dd29e9923917fdbefd1dd992981f80.pkl b/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/278ad6f1a07854b82944fac14065c44f44dd29e9923917fdbefd1dd992981f80.pkl deleted file mode 100644 index 44086807..00000000 Binary files a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/278ad6f1a07854b82944fac14065c44f44dd29e9923917fdbefd1dd992981f80.pkl and /dev/null differ diff --git a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/37ef0d5685ca2f344751efe775e3bdf1d39a24b33e3a60b30372ed4da9b77efb.pkl b/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/37ef0d5685ca2f344751efe775e3bdf1d39a24b33e3a60b30372ed4da9b77efb.pkl deleted file mode 100644 index b77561e1..00000000 Binary files a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/37ef0d5685ca2f344751efe775e3bdf1d39a24b33e3a60b30372ed4da9b77efb.pkl and /dev/null differ diff --git a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/86c4a8edb449a0bb075cbf4518e499a05bc2e533fb891248492c3129340ddff7.pkl b/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/86c4a8edb449a0bb075cbf4518e499a05bc2e533fb891248492c3129340ddff7.pkl deleted file mode 100644 index aded3570..00000000 Binary files 
a/tests/itest_cache/advanced_output_handling_ipynb/api.openai.com/v1_chat_completions/86c4a8edb449a0bb075cbf4518e499a05bc2e533fb891248492c3129340ddff7.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/1228edf573039e4eb59369efb58cdb5230f8b8946211df8213b487ef17f20bfd.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/1228edf573039e4eb59369efb58cdb5230f8b8946211df8213b487ef17f20bfd.pkl deleted file mode 100644 index 5cbcaa93..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/1228edf573039e4eb59369efb58cdb5230f8b8946211df8213b487ef17f20bfd.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/12d53ae9da313a308cf5bb68e44bb178c04d01f7765b6ae076cc13c52bba9803.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/12d53ae9da313a308cf5bb68e44bb178c04d01f7765b6ae076cc13c52bba9803.pkl deleted file mode 100644 index 0e5e1cbd..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/12d53ae9da313a308cf5bb68e44bb178c04d01f7765b6ae076cc13c52bba9803.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/2b1a0814c294df228a254354971ead9997b7e0f08a91981f54d9e5706883042e.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/2b1a0814c294df228a254354971ead9997b7e0f08a91981f54d9e5706883042e.pkl deleted file mode 100644 index 8abd9f95..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/2b1a0814c294df228a254354971ead9997b7e0f08a91981f54d9e5706883042e.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3407f628e412e3aa66432e6461212b20d8ff7311d5da98c3c7b88f4b5e89f8dc.pkl 
b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3407f628e412e3aa66432e6461212b20d8ff7311d5da98c3c7b88f4b5e89f8dc.pkl deleted file mode 100644 index 414f3dec..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3407f628e412e3aa66432e6461212b20d8ff7311d5da98c3c7b88f4b5e89f8dc.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3711f876fabe92527d25b451c67db3d0b93575809115523b262969f426548f47.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3711f876fabe92527d25b451c67db3d0b93575809115523b262969f426548f47.pkl deleted file mode 100644 index f6e0d39d..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/3711f876fabe92527d25b451c67db3d0b93575809115523b262969f426548f47.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/442b2157449dd68cf83e44bee012c6b146d105f09fe0ad4b3168f62c1eab0943.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/442b2157449dd68cf83e44bee012c6b146d105f09fe0ad4b3168f62c1eab0943.pkl deleted file mode 100644 index 821ef665..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/442b2157449dd68cf83e44bee012c6b146d105f09fe0ad4b3168f62c1eab0943.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/6544681d9ef3741770ac38316a4122131b9c83d660d3bf7f5c8629e787d7675c.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/6544681d9ef3741770ac38316a4122131b9c83d660d3bf7f5c8629e787d7675c.pkl deleted file mode 100644 index b016a9cc..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/6544681d9ef3741770ac38316a4122131b9c83d660d3bf7f5c8629e787d7675c.pkl and /dev/null differ diff --git 
a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7c65b64bb313648d664fd444ec90cfc76c3d488c921b5b6112693ef5adc83e45.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7c65b64bb313648d664fd444ec90cfc76c3d488c921b5b6112693ef5adc83e45.pkl deleted file mode 100644 index 873ee889..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7c65b64bb313648d664fd444ec90cfc76c3d488c921b5b6112693ef5adc83e45.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7e78e6464eea80eda58d379d22108c8b15a8d3e6b0c2cf969e13f78723b4b4b4.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7e78e6464eea80eda58d379d22108c8b15a8d3e6b0c2cf969e13f78723b4b4b4.pkl deleted file mode 100644 index 53b87389..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/7e78e6464eea80eda58d379d22108c8b15a8d3e6b0c2cf969e13f78723b4b4b4.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/82a89674a93f50a540164f85e97500cfb05b769be768fd412db94f0157b9f3bc.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/82a89674a93f50a540164f85e97500cfb05b769be768fd412db94f0157b9f3bc.pkl deleted file mode 100644 index 27cbf9aa..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/82a89674a93f50a540164f85e97500cfb05b769be768fd412db94f0157b9f3bc.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/976a7bf3c9a4516e27ab2e1bc1a3e61af0ec3bc9ba704286002cbeca94be9c59.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/976a7bf3c9a4516e27ab2e1bc1a3e61af0ec3bc9ba704286002cbeca94be9c59.pkl deleted file mode 100644 index 3e624f99..00000000 Binary files 
a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/976a7bf3c9a4516e27ab2e1bc1a3e61af0ec3bc9ba704286002cbeca94be9c59.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b1ce3e1b8d7fecab68827993832822f2c20409768d2c0aa6e48e9b0a83d1a0f9.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b1ce3e1b8d7fecab68827993832822f2c20409768d2c0aa6e48e9b0a83d1a0f9.pkl deleted file mode 100644 index 2de558c6..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b1ce3e1b8d7fecab68827993832822f2c20409768d2c0aa6e48e9b0a83d1a0f9.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b77d3d58f75d201a06095aca2a9cbd1f00e71763eaa8006fbf4bb70d494d8474.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b77d3d58f75d201a06095aca2a9cbd1f00e71763eaa8006fbf4bb70d494d8474.pkl deleted file mode 100644 index a3812e78..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b77d3d58f75d201a06095aca2a9cbd1f00e71763eaa8006fbf4bb70d494d8474.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b83a56cb51c38b986e03085962c30f98d314869d019734c44aede1e8cae585fe.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b83a56cb51c38b986e03085962c30f98d314869d019734c44aede1e8cae585fe.pkl deleted file mode 100644 index 09d26b13..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/b83a56cb51c38b986e03085962c30f98d314869d019734c44aede1e8cae585fe.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/d9b81a1dc270d0f6fa257607913af50c8eefdf1c138c864e1518c4ad676214c1.pkl 
b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/d9b81a1dc270d0f6fa257607913af50c8eefdf1c138c864e1518c4ad676214c1.pkl deleted file mode 100644 index 779679a0..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_chat_completions/d9b81a1dc270d0f6fa257607913af50c8eefdf1c138c864e1518c4ad676214c1.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/5e209df13cd0f76c140a16bf2c1f83a3f919d1d287674333a725a5c76c8d68cb.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/5e209df13cd0f76c140a16bf2c1f83a3f919d1d287674333a725a5c76c8d68cb.pkl deleted file mode 100644 index 8d5a3865..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/5e209df13cd0f76c140a16bf2c1f83a3f919d1d287674333a725a5c76c8d68cb.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/70c3775dd154ab7d76f1c2f64b1ce80c7d89f03f8f4fb3381581c74b0fa0a236.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/70c3775dd154ab7d76f1c2f64b1ce80c7d89f03f8f4fb3381581c74b0fa0a236.pkl deleted file mode 100644 index d3cd3b88..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/70c3775dd154ab7d76f1c2f64b1ce80c7d89f03f8f4fb3381581c74b0fa0a236.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/e56c95e7bdfc2689b458d949d2b9226561b5f0db09461544f967c5ee0ea917da.pkl b/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/e56c95e7bdfc2689b458d949d2b9226561b5f0db09461544f967c5ee0ea917da.pkl deleted file mode 100644 index 253a1ebd..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/api.openai.com/v1_images_generations/e56c95e7bdfc2689b458d949d2b9226561b5f0db09461544f967c5ee0ea917da.pkl and /dev/null 
differ diff --git a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1b872c1d73cf85e2_ITZ11U2ZTtuKwSPF9Pc.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl b/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1b872c1d73cf85e2_ITZ11U2ZTtuKwSPF9Pc.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl deleted file mode 100644 index f9dd4759..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1b872c1d73cf85e2_ITZ11U2ZTtuKwSPF9Pc.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1c573d82e73abdf6_Z7skMx6xFImT6HA1Nrh.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl b/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1c573d82e73abdf6_Z7skMx6xFImT6HA1Nrh.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl deleted file mode 100644 index 854abc99..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_1c573d82e73abdf6_Z7skMx6xFImT6HA1Nrh.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl and /dev/null differ diff --git a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_d7ade7eeb7308856_2PqbrrSe0UQv0AKyOcS.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl b/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_d7ade7eeb7308856_2PqbrrSe0UQv0AKyOcS.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl deleted file mode 100644 index 
6ebcbc98..00000000 Binary files a/tests/itest_cache/blog_with_images_ipynb/oaidalleapiprodscus.blob.core.windows.net/private_org-hU0IITr1SZU_d7ade7eeb7308856_2PqbrrSe0UQv0AKyOcS.png/b0cb0faa4247dd6019587a6e3ae8322bf3ed6b3b26cec34221f54c905c59f3d9.pkl and /dev/null differ diff --git a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/20905a41d435e2b7f70f38618dbfc4cc0e4842825b641bc5ca164316246796ee.pkl b/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/20905a41d435e2b7f70f38618dbfc4cc0e4842825b641bc5ca164316246796ee.pkl deleted file mode 100644 index 5ebc0071..00000000 Binary files a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/20905a41d435e2b7f70f38618dbfc4cc0e4842825b641bc5ca164316246796ee.pkl and /dev/null differ diff --git a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/d4ca244caaf6952544bda13063a022efbe85314c8a1053edbdd202994ad7d2e1.pkl b/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/d4ca244caaf6952544bda13063a022efbe85314c8a1053edbdd202994ad7d2e1.pkl deleted file mode 100644 index 68c961cf..00000000 Binary files a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/d4ca244caaf6952544bda13063a022efbe85314c8a1053edbdd202994ad7d2e1.pkl and /dev/null differ diff --git a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/e5b7cf9a5b104cd4c38ad599b2e943efc80db7e2742c3e27696bc20d021534e2.pkl b/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/e5b7cf9a5b104cd4c38ad599b2e943efc80db7e2742c3e27696bc20d021534e2.pkl deleted file mode 100644 index 57064d95..00000000 Binary files 
a/tests/itest_cache/math_via_python_code_with_a_single_agent_ipynb/api.openai.com/v1_chat_completions/e5b7cf9a5b104cd4c38ad599b2e943efc80db7e2742c3e27696bc20d021534e2.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/1992c5f8659c96a24fbd52e87e2ac2896101c53742f5465305388e0dd992733b.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/1992c5f8659c96a24fbd52e87e2ac2896101c53742f5465305388e0dd992733b.pkl deleted file mode 100644 index 61f2eee1..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/1992c5f8659c96a24fbd52e87e2ac2896101c53742f5465305388e0dd992733b.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/266b1164bbff5e9e0921e47783406ea7d0d9c0bbb23f420a8628881c097bcac9.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/266b1164bbff5e9e0921e47783406ea7d0d9c0bbb23f420a8628881c097bcac9.pkl deleted file mode 100644 index 5bfeee05..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/266b1164bbff5e9e0921e47783406ea7d0d9c0bbb23f420a8628881c097bcac9.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/74c579ddb06e4f6fa533fceb3f9d6b9653aa28ca7a8f0dd9fa3bb5dbf221b1a7.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/74c579ddb06e4f6fa533fceb3f9d6b9653aa28ca7a8f0dd9fa3bb5dbf221b1a7.pkl deleted file mode 100644 index 9f093918..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/74c579ddb06e4f6fa533fceb3f9d6b9653aa28ca7a8f0dd9fa3bb5dbf221b1a7.pkl and /dev/null differ diff --git 
a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/7f3307a5722aed216bd5ad38b3f3183cd81c40247a7f28a1e8520b506b85cab1.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/7f3307a5722aed216bd5ad38b3f3183cd81c40247a7f28a1e8520b506b85cab1.pkl deleted file mode 100644 index ad7e7115..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/7f3307a5722aed216bd5ad38b3f3183cd81c40247a7f28a1e8520b506b85cab1.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/99ff7c26f19a665c80bbffb1e5f835dd6dfc06d3790921224e2f369f782d6c23.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/99ff7c26f19a665c80bbffb1e5f835dd6dfc06d3790921224e2f369f782d6c23.pkl deleted file mode 100644 index 1df9c444..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/99ff7c26f19a665c80bbffb1e5f835dd6dfc06d3790921224e2f369f782d6c23.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/9f2119bb25b96c02c51c07fb840973e53792a345e0a93e935769d3d3b2912765.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/9f2119bb25b96c02c51c07fb840973e53792a345e0a93e935769d3d3b2912765.pkl deleted file mode 100644 index 73c7fa4a..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/9f2119bb25b96c02c51c07fb840973e53792a345e0a93e935769d3d3b2912765.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/b8323ccfcef0074de4325fe4fac395e78632a43ac122bb6fba4037ca79f2734d.pkl 
b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/b8323ccfcef0074de4325fe4fac395e78632a43ac122bb6fba4037ca79f2734d.pkl deleted file mode 100644 index 8c4049ca..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/b8323ccfcef0074de4325fe4fac395e78632a43ac122bb6fba4037ca79f2734d.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/d80b1318b202968026a4f33fa7f4e286b555f6df54faf27d545a8f13de6552fe.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/d80b1318b202968026a4f33fa7f4e286b555f6df54faf27d545a8f13de6552fe.pkl deleted file mode 100644 index 43f6a996..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/d80b1318b202968026a4f33fa7f4e286b555f6df54faf27d545a8f13de6552fe.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/e63b66ec839298d91654a974a3104914215e5a94c26a87ad1d8dad3881459b11.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/e63b66ec839298d91654a974a3104914215e5a94c26a87ad1d8dad3881459b11.pkl deleted file mode 100644 index dbd79ffc..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_chat_completions/e63b66ec839298d91654a974a3104914215e5a94c26a87ad1d8dad3881459b11.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/1551c2f735715ff74e73ddedff891d47760b8bb08ed97fbfac4279526f47aa37.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/1551c2f735715ff74e73ddedff891d47760b8bb08ed97fbfac4279526f47aa37.pkl deleted file mode 100644 index 7d7a5e50..00000000 Binary files 
a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/1551c2f735715ff74e73ddedff891d47760b8bb08ed97fbfac4279526f47aa37.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/2ba375f12e98ab74f5d9f7cf425b959a7d57c4a70e06e9e222a8af1e044c515f.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/2ba375f12e98ab74f5d9f7cf425b959a7d57c4a70e06e9e222a8af1e044c515f.pkl deleted file mode 100644 index b9861c7f..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/2ba375f12e98ab74f5d9f7cf425b959a7d57c4a70e06e9e222a8af1e044c515f.pkl and /dev/null differ diff --git a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/af9a6801a473bb2052a04f58adb99b30be8a24fb8ec999cd1a2e99a5e50ee8f2.pkl b/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/af9a6801a473bb2052a04f58adb99b30be8a24fb8ec999cd1a2e99a5e50ee8f2.pkl deleted file mode 100644 index 07bc5b4c..00000000 Binary files a/tests/itest_cache/multi_step_research_agent_ipynb/api.openai.com/v1_embeddings/af9a6801a473bb2052a04f58adb99b30be8a24fb8ec999cd1a2e99a5e50ee8f2.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/192117a0f367712eb5012a49213db95fd9ea6908fe3f9d882a8bb78e67bd69db.pkl b/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/192117a0f367712eb5012a49213db95fd9ea6908fe3f9d882a8bb78e67bd69db.pkl deleted file mode 100644 index 203258bd..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/192117a0f367712eb5012a49213db95fd9ea6908fe3f9d882a8bb78e67bd69db.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/334aaaf8ce973dbed735b8c4611a1508132d989884064f1eaff8b091b1de20fa.pkl 
b/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/334aaaf8ce973dbed735b8c4611a1508132d989884064f1eaff8b091b1de20fa.pkl deleted file mode 100644 index 90db6d0e..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/334aaaf8ce973dbed735b8c4611a1508132d989884064f1eaff8b091b1de20fa.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/6857570ee113eaf11d8474bf121e80738427ba705cdeb0ec342e287b2d16b3df.pkl b/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/6857570ee113eaf11d8474bf121e80738427ba705cdeb0ec342e287b2d16b3df.pkl deleted file mode 100644 index 71b070cf..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/6857570ee113eaf11d8474bf121e80738427ba705cdeb0ec342e287b2d16b3df.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/994dc81cb5291809d8ee451cf0a4e00f1b7c5f09a3423f84750ce14b370806c3.pkl b/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/994dc81cb5291809d8ee451cf0a4e00f1b7c5f09a3423f84750ce14b370806c3.pkl deleted file mode 100644 index f879520e..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/994dc81cb5291809d8ee451cf0a4e00f1b7c5f09a3423f84750ce14b370806c3.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/ca8b3c34c094b9148f8844717350f8f0f57205f90ded869e2c34baf659093736.pkl b/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/ca8b3c34c094b9148f8844717350f8f0f57205f90ded869e2c34baf659093736.pkl deleted file mode 100644 index d9889dfe..00000000 Binary files 
a/tests/itest_cache/validating_agent_output_ipynb/api.openai.com/v1_chat_completions/ca8b3c34c094b9148f8844717350f8f0f57205f90ded869e2c34baf659093736.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/48b9aea6597b2b544f9d167f8df886317a4fe5996c26ea69a49602e57c287b0d.pkl b/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/48b9aea6597b2b544f9d167f8df886317a4fe5996c26ea69a49602e57c287b0d.pkl deleted file mode 100644 index b69f4bbe..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/48b9aea6597b2b544f9d167f8df886317a4fe5996c26ea69a49602e57c287b0d.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/4bf4bfbbfa93b08789016604fbfdfed9ebb824699c8ff1c7c9957d833ac22da1.pkl b/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/4bf4bfbbfa93b08789016604fbfdfed9ebb824699c8ff1c7c9957d833ac22da1.pkl deleted file mode 100644 index 30486e6a..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/4bf4bfbbfa93b08789016604fbfdfed9ebb824699c8ff1c7c9957d833ac22da1.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/756de4eb2aeb9ab57927495f4326d74ca589e0a3934a2a3bd5acef83b2af884c.pkl b/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/756de4eb2aeb9ab57927495f4326d74ca589e0a3934a2a3bd5acef83b2af884c.pkl deleted file mode 100644 index 3f3573ef..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/756de4eb2aeb9ab57927495f4326d74ca589e0a3934a2a3bd5acef83b2af884c.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/eea66765dd2fdb569aa5df252542f6f1a0f09eedbec08bab71f29adfb2b3606f.pkl b/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/eea66765dd2fdb569aa5df252542f6f1a0f09eedbec08bab71f29adfb2b3606f.pkl deleted file mode 100644 index a4aa962e..00000000 
Binary files a/tests/itest_cache/validating_agent_output_ipynb/duckduckgo.com/eea66765dd2fdb569aa5df252542f6f1a0f09eedbec08bab71f29adfb2b3606f.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/0cd2e1db74a3db86813678607677f7af262210f79498fb9a46fc9007c3bd74fb.pkl b/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/0cd2e1db74a3db86813678607677f7af262210f79498fb9a46fc9007c3bd74fb.pkl deleted file mode 100644 index fad1593a..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/0cd2e1db74a3db86813678607677f7af262210f79498fb9a46fc9007c3bd74fb.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/12d4e578baceab26f8100e95db263a606798df478a8aa57aff10ffb867a1d26e.pkl b/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/12d4e578baceab26f8100e95db263a606798df478a8aa57aff10ffb867a1d26e.pkl deleted file mode 100644 index 352dca86..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/12d4e578baceab26f8100e95db263a606798df478a8aa57aff10ffb867a1d26e.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5d66809346a31532618a24b92fa0b5a7e00f1ee1ad0dcdce994c9315bebd70d5.pkl b/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5d66809346a31532618a24b92fa0b5a7e00f1ee1ad0dcdce994c9315bebd70d5.pkl deleted file mode 100644 index 9cd94a5f..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5d66809346a31532618a24b92fa0b5a7e00f1ee1ad0dcdce994c9315bebd70d5.pkl and /dev/null differ diff --git a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5f7a737c1a76bdc7548ddb55adba25645bf0bbaa36a8573c903bed135a15f22e.pkl 
b/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5f7a737c1a76bdc7548ddb55adba25645bf0bbaa36a8573c903bed135a15f22e.pkl deleted file mode 100644 index 0c3b99b9..00000000 Binary files a/tests/itest_cache/validating_agent_output_ipynb/links.duckduckgo.com/d.js/5f7a737c1a76bdc7548ddb55adba25645bf0bbaa36a8573c903bed135a15f22e.pkl and /dev/null differ diff --git a/tests/itest_golden_data/advanced_output_handling_ipynb.json b/tests/itest_golden_data/advanced_output_handling_ipynb.json deleted file mode 100644 index 8d5200ff..00000000 --- a/tests/itest_golden_data/advanced_output_handling_ipynb.json +++ /dev/null @@ -1 +0,0 @@ -"# Bubble Sort Implementation\ndef bubble_sort(arr):\n n = len(arr)\n for i in range(n):\n for j in range(0, n-i-1):\n if arr[j] > arr[j+1]:\n arr[j], arr[j+1] = arr[j+1], arr[j]\n return arr\n\n# Test the bubble sort implementation\nsample_list = [64, 34, 25, 12, 22, 11, 90]\nsorted_list = bubble_sort(sample_list)\nprint(\"Sorted list:\", sorted_list)\n\nThe `bubble_sort` function implements the bubble sort algorithm. It takes a list `arr` as input and sorts it in ascending order. The algorithm works by repeatedly stepping through the list, comparing adjacent elements, and swapping them if they are in the wrong order. This process is repeated until the list is sorted. The outer loop runs `n` times, where `n` is the length of the list, and the inner loop runs `n-i-1` times to avoid re-checking the already sorted elements. The sorted list is then returned." \ No newline at end of file diff --git a/tests/itest_golden_data/blog_with_images_ipynb.json b/tests/itest_golden_data/blog_with_images_ipynb.json deleted file mode 100644 index 42a80be5..00000000 --- a/tests/itest_golden_data/blog_with_images_ipynb.json +++ /dev/null @@ -1 +0,0 @@ -"# The Coolest AI Advancements in 2024: What You Need to Know\n\nHey tech enthusiasts! 2024 is shaping up to be an incredible year for artificial intelligence. 
From mind-blowing new technologies to game-changing industry impacts, AI is taking the world by storm. Let's dive into some of the most exciting advancements that are making waves this year.\n\n## Generative AI: Creativity Unleashed\n\nGenerative AI is on fire! Imagine AI models that can create human-like text, images, and even videos. We're talking about the next level of content creation, coding, and scientific research. These models, like GPT-4 and beyond, are getting so sophisticated that they can produce content that's almost indistinguishable from what a human would create. It's like having a super-smart creative partner at your fingertips.\n![Generative AI](https://example.com/Users/whimo/motleycrew/images/1fd2d154.png)\n\n## AI Ethics and Regulation: Playing by the Rules\n\nWith great power comes great responsibility. As AI continues to grow, there's a big push for ethical considerations and regulatory frameworks. Governments and organizations are stepping up to ensure AI is used responsibly, without perpetuating biases or causing harm. It's all about making sure AI benefits everyone and doesn't go rogue.\n\n## AI in Healthcare: A New Era of Medicine\n\nAI is revolutionizing healthcare in ways we never thought possible. From diagnosing diseases to personalizing treatment plans, AI algorithms are analyzing medical images, predicting patient outcomes, and even assisting in surgeries. It's like having a supercharged medical team working around the clock to keep us healthy.\n![AI in Healthcare](https://example.com/Users/whimo/motleycrew/images/c4335b5b.png)\n\n## Edge AI: Smarter, Faster, Closer\n\nEdge AI is all about bringing AI processing closer to where data is generated. This means reduced latency and bandwidth usage, making AI applications more efficient and responsive. 
Think of it as having a mini AI powerhouse right where you need it, whether it's in your smartphone, smart home devices, or even your car.\n\n## Quantum AI: The Future is Now\n\nQuantum AI is where things get really futuristic. By combining quantum computing with AI, we're on the brink of solving complex problems that were previously impossible. This could lead to breakthroughs in cryptography, material science, and even complex system simulations. It's like unlocking a whole new level of computational power.\n![Quantum AI](https://example.com/Users/whimo/motleycrew/images/957b0b62.png)\n\n## AI-Driven Drug Discovery: Speeding Up Science\n\nAI is speeding up the drug discovery process by predicting how different compounds will interact with targets in the body. This means new drugs can be developed faster and more cost-effectively, potentially saving countless lives. It's a game-changer for the pharmaceutical industry and for patients around the world.\n\n## Autonomous Systems: The Rise of the Machines\n\nFully autonomous systems, including self-driving cars, drones, and robots, are becoming more reliable and widespread. These systems are set to transform transportation, logistics, and various industries. Imagine a world where your car drives you to work, drones deliver your packages, and robots handle dangerous tasks. The future is here, and it's autonomous.\n\n## Natural Language Processing (NLP): Talking to Machines\n\nNLP technologies are getting so advanced that interacting with machines feels more natural and intuitive. From machine translation to sentiment analysis and conversational AI, these advancements are making it easier for us to communicate with our devices. It's like having a personal assistant that understands you perfectly.\n\nIn conclusion, 2024 is a landmark year for AI, with groundbreaking advancements that are set to transform industries and improve our lives. 
Whether it's through generative AI, quantum computing, or autonomous systems, the future of AI is bright, and we can't wait to see what's next. Stay tuned, because the AI revolution is just getting started!" \ No newline at end of file diff --git a/tests/itest_golden_data/math_via_python_code_with_a_single_agent_ipynb.json b/tests/itest_golden_data/math_via_python_code_with_a_single_agent_ipynb.json deleted file mode 100644 index a5b155b2..00000000 --- a/tests/itest_golden_data/math_via_python_code_with_a_single_agent_ipynb.json +++ /dev/null @@ -1 +0,0 @@ -"To solve the system of linear equations:\n\n\\[ 725x + 727y = 1500 \\]\n\\[ 729x + 731y = 1508 \\]\n\nwe first represent the system in matrix form and solve it using sympy. The solutions for \\( x \\) and \\( y \\) are:\n\n\\[ x = -23 \\]\n\\[ y = 25 \\]\n\nNext, we calculate \\( x - y \\):\n\n\\[ x - y = -23 - 25 = -48 \\]\n\nThus, the values are:\n\n\\[ x = -23 \\]\n\\[ y = 25 \\]\n\\[ x - y = -48 \\]" \ No newline at end of file diff --git a/tests/itest_golden_data/multi_step_research_agent_ipynb.json b/tests/itest_golden_data/multi_step_research_agent_ipynb.json deleted file mode 100644 index 03a6f960..00000000 --- a/tests/itest_golden_data/multi_step_research_agent_ipynb.json +++ /dev/null @@ -1 +0,0 @@ -"Why did Arjuna kill Karna, his half-brother?\n\nArjuna killed Karna, his half-brother, under the command and justification of Krishna. During their battle, Karna's chariot got stuck in the mud, and he was unable to remember the mantra to launch the Brahmastra. Karna called out to Arjuna and Krishna, claiming they were honor-bound to let him fix his chariot. Krishna responded by questioning the honor in Karna's past actions, specifically mentioning the humiliation of Draupadi and the killing of Abhimanyu while he was defenseless. Krishna argued that Karna's previous dishonorable deeds forfeited any claim to honor, thus justifying his death even while he was defenseless. 
Following Krishna's command, Arjuna hesitated but ultimately obeyed and killed Karna." \ No newline at end of file diff --git a/tests/itest_golden_data/validating_agent_output_ipynb.json b/tests/itest_golden_data/validating_agent_output_ipynb.json deleted file mode 100644 index 85c0fe31..00000000 --- a/tests/itest_golden_data/validating_agent_output_ipynb.json +++ /dev/null @@ -1 +0,0 @@ -"### Comprehensive Analysis of AI Advancements in 2024\n\n#### Key Trends\n\n1. **Generative AI Accessibility**\n - In 2024, generative AI is expected to become more accessible to the general public. This democratization of AI technology will enable more individuals to experiment with and utilize AI models for various applications, from creative endeavors to practical problem-solving.\n\n2. **Multimodal AI**\n - Multimodal AI, which integrates multiple input types such as text, images, and audio, is set to revolutionize human-AI interaction. By mimicking human-like perception and cognition, multimodal AI will enhance the capabilities of generative and conversational AI systems. Gartner predicts that 40% of generative AI solutions will be multimodal by 2027, up from just 1% in 2023.\n\n3. **Quantum AI**\n - Quantum advancements are poised to make significant contributions to AI capabilities in 2024. Quantum computing is expected to create substantial economic value, with projections ranging from $450 billion to $850 billion by 2040. The integration of quantum technology with AI and high-performance computing (HPC) tools will unlock new possibilities for complex problem-solving and secure digital landscapes.\n\n#### Breakthrough Technologies\n\n1. **GPT-4o and Azure OpenAI Service**\n - The introduction of GPT-4o, a multimodal model available through the Azure OpenAI Service API and Azure AI Studio, represents a significant leap in generative and conversational AI. 
This model integrates text and vision capabilities, with plans to include audio in the future, setting a new standard for AI experiences.\n\n2. **Quantum-Resilient Computing**\n - Collaborations between quantum computing companies and tech giants like Microsoft are advancing quantum-resilient computing. Innovations such as qubit-virtualization systems and resilient quantum computing hardware are paving the way for more robust and secure AI applications.\n\n#### AI Applications in Physics\n\n1. **Classifying Phase Transitions**\n - Researchers at MIT and the University of Basel have developed a physics-informed technique using generative AI to classify phase transitions in materials or physical systems. This method is more efficient than existing machine-learning approaches and represents a significant advancement in the field of materials science.\n\n2. **Coupling AI with Fundamental Physics**\n - Scientists are coupling AI with fundamental physics to perform faster calculations. This integration allows for more efficient and accurate modeling of physical phenomena, which can lead to new discoveries and advancements in theoretical physics.\n\n3. **Answering Complex Questions**\n - Generative AI is being used to answer complex questions in physics, such as understanding phase transitions and other intricate physical processes. This application of AI is helping researchers uncover new insights and drive progress in the field.\n\n#### Potential Industry Impacts\n\n1. **Enhanced Human-AI Interaction**\n - The rise of multimodal AI will lead to more intuitive and effective human-AI interactions. Industries such as customer service, healthcare, and education will benefit from AI systems that can understand and respond to diverse sensory inputs, providing more personalized and accurate assistance.\n\n2. **Economic Value Creation**\n - Quantum AI advancements will drive significant economic value across various sectors. 
Industries that rely on complex problem-solving, such as finance, logistics, and cybersecurity, will see transformative impacts from the enhanced computational power and security offered by quantum AI.\n\n3. **Innovation and Ethical Considerations**\n - The rapid advancements in AI technology will spur innovation across multiple fields. However, it will also necessitate careful consideration of ethical implications, including data privacy, algorithmic bias, and the societal impact of AI deployment. Policymakers and industry leaders will need to collaborate to ensure responsible AI development and usage." \ No newline at end of file diff --git a/tests/run_integration_tests.py b/tests/run_integration_tests.py index 4dd4e78b..8a5a2fa0 100644 --- a/tests/run_integration_tests.py +++ b/tests/run_integration_tests.py @@ -1,8 +1,5 @@ import argparse -import difflib -import json import os -import shutil import sys import traceback from copy import copy @@ -12,7 +9,7 @@ import nbformat from dotenv import load_dotenv -from motleycache import set_cache_location, set_strong_cache +from motleycache import set_cache_location from nbconvert.preprocessors import ExecutePreprocessor from nbformat.v4.nbbase import new_code_cell @@ -29,12 +26,17 @@ IPYNB_INTEGRATION_TESTS = { # "blog_with_images_ipynb": "examples/Blog with Images.ipynb", - # TODO: this particular test was problematic in terms of caching, find ways to enable "multi_step_research_agent_ipynb": "examples/Multi-step research agent.ipynb", "math_via_python_code_with_a_single_agent_ipynb": "examples/Math via python code with a single agent.ipynb", "validating_agent_output_ipynb": "examples/Validating agent output.ipynb", "advanced_output_handling_ipynb": "examples/Advanced output handling.ipynb", - # "using_autogen_with_motleycrew_ipynb": "examples/Using AutoGen with motleycrew.ipynb" + "using_autogen_with_motleycrew_ipynb": "examples/Using AutoGen with motleycrew.ipynb" +} + +INTEGRATION_TESTS_TO_SKIP = { + "Windows": [ + 
"blog_with_images_ipynb" + ] } MINIMAL_INTEGRATION_TESTS = {} @@ -77,62 +79,16 @@ def get_args_parser(): ) parser.add_argument("--cache-dir", type=str, help="Cache directory", default=DEFAULT_CACHE_DIR) parser.add_argument( - "--golden-dir", - type=str, - help="Reference data directory", - default=DEFAULT_GOLDEN_DIR, + "--minimal-only", default=False, action="store_true", help="Run minimal tests" ) parser.add_argument( - "--update-golden", - action="store_true", - help="Update reference data together with the cache", - ) - parser.add_argument( - "--minimal-only", default=False, action="store_true", help="Run minimal tests" + # added to skip problematic tests on Windows workers in GutHub Actions + "--os", type=str, default="Unix", help="Target operating system" ) return parser -def compare_results(result: str | list[str], expected_result: str | list[str]): - """Compare the received and expected results""" - if isinstance(result, str): - result = [result] - if isinstance(expected_result, str): - expected_result = [expected_result] - - diff = [] - for i, (row, expected_row) in enumerate(zip(result, expected_result)): - result_lines = row.splitlines() - expected_result_lines = expected_row.splitlines() - diff += list(difflib.unified_diff(result_lines, expected_result_lines)) - - if diff: - message = "Test result != expected result.\n{}\n".format("\n".join(diff)) - raise Exception(message) - - -def build_excepted_content_file_path( - golden_dir: str, test_name: str, extension: str = "txt" -) -> str: - """Build golden data file path""" - return os.path.join(golden_dir, "{}.{}".format(test_name, extension)) - - -def write_content(golden_dir: str, test_name: str, content: str, extension: str = "json"): - """Write golden data to file""" - file_path = build_excepted_content_file_path(golden_dir, test_name, extension) - with open(file_path, "w") as fd: - json.dump(content, fd) - - -def read_golden_data(golden_dir: str, test_name: str, extension: str = "json"): - """Read 
golden data from file""" - file_path = build_excepted_content_file_path(golden_dir, test_name, extension) - with open(file_path, "r") as fd: - return json.load(fd) - - def run_ipynb(ipynb_path: str, strong_cache: bool = False, cache_sub_dir: str = None) -> str: """Run jupiter notebook execution""" with open(ipynb_path) as f: @@ -202,10 +158,9 @@ def build_ipynb_integration_tests(is_minimal: bool = False) -> dict: def run_integration_tests( cache_dir: str, - golden_dir: str, - update_golden: bool = False, test_names: Optional[list[str]] = None, minimal_only: bool = False, + target_os: str = "Unix", ): failed_tests = {} @@ -226,45 +181,24 @@ def run_integration_tests( if test_names and current_test_name not in test_names: continue + if target_os in INTEGRATION_TESTS_TO_SKIP and current_test_name in INTEGRATION_TESTS_TO_SKIP[target_os]: + logger.info("Skipping test %s for target platform %s", current_test_name, target_os) + continue + logger.info("Running test: %s", current_test_name) cache_sub_dir = os.path.join(cache_dir, current_test_name) - if update_golden: - logger.info("Update-golden flag is set. 
Cleaning cache directory %s", cache_sub_dir) - shutil.rmtree(cache_sub_dir, ignore_errors=True) - os.makedirs(cache_sub_dir, exist_ok=True) - os.makedirs(golden_dir, exist_ok=True) - strong_cache = False - else: - strong_cache = True - - set_strong_cache(strong_cache) set_cache_location(cache_sub_dir) if current_test_name in IPYNB_INTEGRATION_TESTS: test_fn_kwargs = { - "strong_cache": strong_cache, "cache_sub_dir": cache_sub_dir, } else: test_fn_kwargs = {} try: - test_result = test_fn(**test_fn_kwargs) - if ( - current_test_name in INTEGRATION_TESTS - or current_test_name in IPYNB_INTEGRATION_TESTS - ): - if update_golden: - logger.info( - "Skipping check and updating golden data for test: %s", - current_test_name, - ) - write_content(golden_dir, current_test_name, test_result) - else: - excepted_result = read_golden_data(golden_dir, current_test_name) - compare_results(test_result, excepted_result) - + test_fn(**test_fn_kwargs) except BaseException as e: logger.error("Test %s failed: %s", current_test_name, str(e)) failed_tests[current_test_name] = traceback.format_exc() @@ -288,10 +222,9 @@ def main(): run_integration_tests( cache_dir=args.cache_dir, - golden_dir=args.golden_dir, - update_golden=args.update_golden, test_names=args.test_names, minimal_only=args.minimal_only, + target_os=args.os, )