diff --git a/entropix.ipynb b/entropix.ipynb
index 6361cec..be3a34a 100644
--- a/entropix.ipynb
+++ b/entropix.ipynb
@@ -5,7 +5,7 @@
     "colab": {
       "provenance": [],
       "gpuType": "V28",
-      "authorship_tag": "ABX9TyN8U6k4TIpovU5kGA3L4RuN",
+      "authorship_tag": "ABX9TyMwNK6aEAWPQERQG0wgmb5s",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -30,13 +30,13 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 8,
+      "execution_count": 1,
       "metadata": {
         "id": "xZd9EBLmgKU6"
       },
       "outputs": [],
       "source": [
-        "from typing import NamedTuple, List, Optional, Tuple\n",
+        "from typing import Dict, List, NamedTuple, Optional, Tuple\n",
         "\n",
         "import jax\n",
         "import jax.numpy as jnp"
@@ -60,7 +60,7 @@
       "metadata": {
         "id": "oHaZYRqAnRPS"
       },
-      "execution_count": 9,
+      "execution_count": 2,
       "outputs": []
     },
     {
@@ -113,7 +113,7 @@
       "metadata": {
         "id": "RHxP8Bd1lvo8"
       },
-      "execution_count": 10,
+      "execution_count": 3,
       "outputs": []
     },
     {
@@ -147,7 +147,7 @@
       "metadata": {
         "id": "flDeKnQlhS1J"
       },
-      "execution_count": 11,
+      "execution_count": 4,
       "outputs": []
     },
     {
@@ -234,7 +234,7 @@
       "metadata": {
         "id": "u3lTK6HWhbFV"
       },
-      "execution_count": 12,
+      "execution_count": 5,
       "outputs": []
     },
     {
@@ -329,7 +329,7 @@
       "metadata": {
         "id": "2WdLNnGTicBG"
       },
-      "execution_count": 13,
+      "execution_count": 6,
       "outputs": []
     },
     {
@@ -370,7 +370,7 @@
       "metadata": {
         "id": "FoE8AuSDlPtr"
       },
-      "execution_count": 14,
+      "execution_count": 7,
       "outputs": []
     },
     {
@@ -386,9 +386,50 @@
       "cell_type": "code",
       "source": [
         "from typing import Optional, Tuple\n",
-        "\n",
         "DEFAULT_MASK_VALUE = -0.7 * float(jnp.finfo(jnp.dtype(\"float32\")).max)\n",
         "\n",
+        "class AttnStats(NamedTuple):\n",
+        "  entropy: jax.Array  # (bsz, n_layers, num_heads)\n",
+        "  varentropy: jax.Array  # (bsz, n_layers, num_heads)\n",
+        "  n_layers: int\n",
+        "  n_heads: int\n",
+        "\n",
+        "  @classmethod\n",
+        "  def new(cls, bsz: int, n_layers: int, n_heads: int) -> 'AttnStats':\n",
+        "    return cls(\n",
+        "        entropy=jnp.zeros((bsz, n_layers, n_heads), dtype=jnp.float32),\n",
+        "        varentropy=jnp.zeros((bsz, n_layers, n_heads), dtype=jnp.float32),\n",
+        "        n_layers=n_layers,\n",
+        "        n_heads=n_heads\n",
+        "    )\n",
+        "\n",
+        "  @property\n",
+        "  def avg_entropy(self):\n",
+        "    return self.entropy.sum(axis=-1, keepdims=False)  # Average across heads\n",
+        "\n",
+        "  @property\n",
+        "  def std_error(self):\n",
+        "    return jnp.sqrt(jnp.mean(self.varentropy)) / (self.n_heads * self.n_layers)\n",
+        "\n",
+        "  def update(self, scores: jax.Array, layer_idx: int):\n",
+        "    # scores shape: (bsz, n_heads, seqlen, n_words)\n",
+        "    probs = jax.nn.softmax(scores, axis=-1)\n",
+        "    new_entropy = -jnp.sum(jnp.where(probs > 0, probs * jnp.log(probs), 0), axis=-1)\n",
+        "    new_varentropy = jnp.sum(probs * (jnp.log(probs) + new_entropy[..., None])**2, axis=-1)\n",
+        "\n",
+        "    # print(f\"Layer {layer_idx} - Scores shape: {scores.shape}, Probs shape: {probs.shape}\")\n",
+        "    # print(f\"Layer {layer_idx} - New entropy shape: {new_entropy.shape}, Min: {jnp.min(new_entropy)}, Max: {jnp.max(new_entropy)}\")\n",
+        "\n",
+        "    updated_stats = self._replace(\n",
+        "        entropy=self.entropy.at[:, layer_idx, :].set(new_entropy),\n",
+        "        varentropy=self.varentropy.at[:, layer_idx, :].set(new_varentropy)\n",
+        "    )\n",
+        "\n",
+        "    # print(f\"Layer {layer_idx} - Updated entropy shape: {updated_stats.entropy.shape}\")\n",
+        "    # print(f\"Layer {layer_idx} - Updated entropy for this layer: {updated_stats.entropy[:, layer_idx, :]}\")\n",
+        "\n",
+        "    return updated_stats\n",
+        "\n",
         "\n",
         "#@partial(jax.jit, static_argnames=(\"eps\"))\n",
         "def rms_norm(x: jax.Array, w: jax.Array, eps: float = 1e-6) -> jax.Array:\n",
@@ -420,8 +461,8 @@
         "  keys = jnp.transpose(keys, (0, 2, 3, 1))  # (bs, n_heads, head_dim, cache_len + seqlen)\n",
         "  values = jnp.transpose(values, (0, 2, 1, 3))  # (bs, n_heads, cache_len + seqlen, head_dim)\n",
         "  scores = jnp.matmul(xq, keys)\n",
-        "  scores = scores / jnp.sqrt(model_params.head_dim)\n",
-        "  scores = scores.astype(jnp.float32)  # Always do attention softmax at float32\n",
+        "  pre_scores = scores / jnp.sqrt(model_params.head_dim)\n",
+        "  scores = pre_scores.astype(jnp.float32)  # Always do attention softmax at float32\n",
         "  if cur_pos == 0:\n",
         "    scores = scores + attn_mask\n",
         "  mask = jnp.where(scores != 0.0, scores, DEFAULT_MASK_VALUE)\n",
@@ -430,7 +471,7 @@
         "  output = jnp.matmul(scores, values)\n",
         "  output = jnp.swapaxes(output, 1, 2).reshape(xq.shape[0], xq.shape[2], -1)\n",
         "  out = jnp.dot(output, layer_weights.wo.T)\n",
-        "  return out, kvcache\n",
+        "  return out, kvcache, pre_scores\n",
         "\n",
         "#@partial(jax.jit)\n",
         "def feed_forward(x: jax.Array, layer_weights: LayerWeights) -> jax.Array:\n",
@@ -439,18 +480,24 @@
         "#@partial(jax.jit, static_argnames=(\"model_params\", \"cur_pos\"))\n",
         "def xfmr(xfmr_weights: XfmrWeights, model_params: ModelParams, tokens: jax.Array, cur_pos: int, freqs_cis: jax.Array, kvcache: KVCache, attn_mask: Optional[jax.Array]=None) -> Tuple[jax.Array, KVCache]:\n",
         "  h = xfmr_weights.tok_embeddings[tokens]\n",
+        "  attn_stats = AttnStats.new(\n",
+        "    bsz=tokens.shape[0],\n",
+        "    n_layers=model_params.n_layers,\n",
+        "    n_heads=model_params.n_local_heads\n",
+        "  )\n",
         "  for i in range(model_params.n_layers):\n",
         "    norm_x = rms_norm(h, xfmr_weights.layer_weights[i].attention_norm)\n",
-        "    h_attn, kvcache = attention(norm_x, xfmr_weights.layer_weights[i], model_params, cur_pos, i, freqs_cis, kvcache, attn_mask=attn_mask)\n",
+        "    h_attn, kvcache, scores = attention(norm_x, xfmr_weights.layer_weights[i], model_params, cur_pos, i, freqs_cis, kvcache, attn_mask=attn_mask)\n",
+        "    attn_stats = attn_stats.update(scores[:,:,-1,:], i)\n",
         "    h = h + h_attn\n",
         "    h = h + feed_forward(rms_norm(h, xfmr_weights.layer_weights[i].ffn_norm), xfmr_weights.layer_weights[i])\n",
         "  logits = jnp.dot(rms_norm(h, xfmr_weights.norm), xfmr_weights.output.T)\n",
-        "  return logits, kvcache"
+        "  return logits, kvcache, scores, attn_stats"
       ],
       "metadata": {
         "id": "oQe0q_Jzlap2"
       },
-      "execution_count": 15,
+      "execution_count": 8,
       "outputs": []
     },
     {
@@ -642,69 +689,152 @@
         "    varentropy = jnp.sum(probs * (log_probs / LN_2 + entropy[..., None])**2, axis=axis)\n",
         "    return entropy, varentropy\n",
         "\n",
-        "\n",
         "def multinomial_sample_one(probs_sort: jax.Array, key) -> jax.Array:\n",
-        "  \"\"\"Samples one token from a multinomial distribution with sorted probabilities.\"\"\"\n",
-        "  q = jax.random.exponential(key=key, shape=probs_sort.shape)\n",
-        "  return jnp.argmax(probs_sort / q, axis=-1, keepdims=True).astype(jnp.int32)\n",
-        "\n",
-        "\n",
-        "def _sample(logits: jax.Array, temperature=0.666, top_p=0.90, top_k=27, key=jax.random.PRNGKey(1337)) -> jax.Array:\n",
-        "  bsz = logits.shape[0]\n",
-        "  logit = logits[:, -1]\n",
-        "  probs = jax.nn.softmax(logit / temperature, axis=-1)\n",
-        "\n",
-        "  # Apply top-k sampling\n",
-        "  top_k_probs, top_k_indices = jax.lax.top_k(probs, k=top_k)\n",
-        "  probs_sort_jax = jnp.flip(top_k_probs, axis=-1)\n",
-        "  probs_idx_jax = jnp.flip(top_k_indices, axis=-1)\n",
-        "  probs_sum_jax = jnp.cumsum(probs_sort_jax, axis=-1)\n",
-        "\n",
-        "  # Apply top-p sampling\n",
-        "  mask_jax = jnp.where(probs_sum_jax - probs_sort_jax > top_p, True, False)  # Use jnp.where\n",
-        "  probs_sort_jax = probs_sort_jax * (1 - mask_jax)  # Set values to 0.0 using multiplication\n",
-        "  probs_sort_jax = probs_sort_jax / jnp.sum(probs_sort_jax, axis=-1, keepdims=True)\n",
-        "\n",
-        "  next_token_jax = multinomial_sample_one(probs_sort_jax, key)\n",
-        "  next_token_g_jax = jnp.take_along_axis(probs_idx_jax, next_token_jax.reshape(bsz, 1), axis=-1)\n",
-        "  return next_token_g_jax.astype(jnp.int32)\n",
-        "\n",
+        "    \"\"\"Samples one token from a multinomial distribution with sorted probabilities.\"\"\"\n",
+        "    q = jax.random.exponential(key=key, shape=probs_sort.shape)\n",
+        "    return jnp.argmax(probs_sort / q, axis=-1, keepdims=True).astype(jnp.int32)\n",
+        "\n",
+        "def _sample(logits: jax.Array, temperature=0.666, top_p=0.90, top_k=27, min_p: float = 0.0, key=jax.random.PRNGKey(1337)) -> jax.Array:\n",
+        "    bsz = logits.shape[0]\n",
+        "    logit = logits[:, -1]\n",
+        "    probs = jax.nn.softmax(logit / temperature, axis=-1)\n",
+        "\n",
+        "    # Apply min_p sampling\n",
+        "    if min_p > 0.0:\n",
+        "      p_max = jnp.max(probs, axis=-1, keepdims=True)\n",
+        "      indices_to_remove = probs < (min_p * p_max)\n",
+        "      logit = jnp.where(indices_to_remove, jnp.full_like(logit, float('-inf')), logit)\n",
+        "\n",
+        "    # Apply top-k sampling\n",
+        "    top_k_probs, top_k_indices = jax.lax.top_k(probs, k=top_k)\n",
+        "    probs_sort = jnp.flip(top_k_probs, axis=-1)\n",
+        "    probs_idx = jnp.flip(top_k_indices, axis=-1)\n",
+        "    probs_sum = jnp.cumsum(probs_sort, axis=-1)\n",
+        "    # Apply top-p sampling\n",
+        "    mask = jnp.where(probs_sum - probs_sort > top_p, 1.0, 0.0)\n",
+        "    probs_sort = probs_sort * (1 - mask)\n",
+        "    probs_sort = probs_sort / jnp.sum(probs_sort, axis=-1, keepdims=True)\n",
+        "    next_token = multinomial_sample_one(probs_sort, key)\n",
+        "    next_token_g = jnp.take_along_axis(probs_idx, next_token.reshape(bsz, 1), axis=-1)\n",
+        "    return next_token_g.astype(jnp.int32)\n",
+        "\n",
+        "def calculate_metrics(logits: jnp.ndarray, attention_scores: jnp.ndarray) -> Dict[str, jnp.ndarray]:\n",
+        "    entropy, varentropy = calculate_varentropy_logsoftmax(logits)\n",
+        "\n",
+        "    attention_probs = jax.nn.softmax(attention_scores, axis=-1)\n",
+        "    attn_entropy = -jnp.sum(attention_probs * jnp.log2(jnp.clip(attention_probs, 1e-10, 1.0)), axis=-1)\n",
+        "    attn_varentropy = jnp.var(attn_entropy, axis=-1)\n",
+        "\n",
+        "    mean_attention = jnp.mean(attention_probs, axis=1)\n",
+        "    agreement = jnp.mean(jnp.abs(attention_probs - mean_attention[:, None, :]), axis=(1, 2))\n",
+        "\n",
+        "    interaction_strength = jnp.mean(jnp.abs(attention_scores), axis=(1, 2, 3))\n",
+        "\n",
+        "    return {\n",
+        "        \"logits_entropy\": jnp.mean(entropy),\n",
+        "        \"logits_varentropy\": jnp.mean(varentropy),\n",
+        "        \"attn_entropy\": jnp.mean(attn_entropy),\n",
+        "        \"attn_varentropy\": jnp.mean(attn_varentropy),\n",
+        "        \"agreement\": jnp.mean(agreement),\n",
+        "        \"interaction_strength\": interaction_strength\n",
+        "    }\n",
         "\n",
-        "def sample(gen_tokens: jax.Array, logits: jax.Array, temperature=0.666, top_p=0.90, top_k=27, key=jax.random.PRNGKey(1337)) -> jax.Array:\n",
-        "    ent, vent = calculate_varentropy_logsoftmax(logits)\n",
+        "def adaptive_sample(logits: jax.Array, metrics: Dict[str, jnp.ndarray],\n",
+        "                    gen_tokens: jax.Array, n_samples: int,\n",
+        "                    base_temp: float = 0.666, base_top_p: float = 0.90, base_top_k: int = 40, base_min_p: float = 0.03, # Turn this down to 0.01 to reduce the shoggoth\n",
+        "                    key: jax.random.PRNGKey = jax.random.PRNGKey(1337)) -> jax.Array:\n",
+        "    logits_uncertainty = metrics[\"logits_entropy\"] + metrics[\"logits_varentropy\"]\n",
+        "    attn_uncertainty = metrics[\"attn_entropy\"] + metrics[\"attn_varentropy\"]\n",
+        "\n",
+        "    temperature = base_temp * (1 + 0.3 * logits_uncertainty + 0.2 * attn_uncertainty - 0.2 * metrics[\"agreement\"])\n",
+        "    top_p = jnp.clip(base_top_p * (1 + 0.1 * metrics[\"attn_varentropy\"]), 0.1, 1.0)\n",
+        "    top_k = int(jnp.clip(\n",
+        "        jnp.round(base_top_k * (1 + 0.3 * metrics[\"interaction_strength\"].item() - 0.2 * metrics[\"agreement\"].item())),\n",
+        "        a_min=1,\n",
+        "        a_max=100\n",
+        "    ))\n",
+        "    min_p = jnp.clip(base_min_p * (1 - 0.5 * logits_uncertainty), 0.01, 0.5)\n",
+        "\n",
+        "    keys = jax.random.split(key, n_samples)\n",
+        "\n",
+        "    samples = []\n",
+        "    for sample_key in keys:\n",
+        "        sample = _sample(logits, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, key=sample_key)\n",
+        "        samples.append(sample)\n",
+        "\n",
+        "    def score_sample(sample):\n",
+        "        log_prob = jnp.sum(jax.nn.log_softmax(logits) * jax.nn.one_hot(sample, logits.shape[-1]))\n",
+        "        confidence_score = (\n",
+        "            (1 - metrics[\"logits_entropy\"]) * 0.1 +\n",
+        "            (1 - metrics[\"attn_entropy\"]) * 0.2 +\n",
+        "            (1 - metrics[\"logits_varentropy\"]) * 0.3 +\n",
+        "            (1 - metrics[\"attn_varentropy\"]) * 0.4 +\n",
+        "            metrics[\"agreement\"] * 0.5 +\n",
+        "            metrics[\"interaction_strength\"] * 0.6\n",
+        "        )\n",
+        "        return log_prob + confidence_score\n",
+        "\n",
+        "    sample_scores = [score_sample(sample) for sample in samples]\n",
+        "    best_sample_idx = jnp.argmax(jnp.array(sample_scores))\n",
+        "    return samples[best_sample_idx]\n",
+        "\n",
+        "# I am absolutely appaled that these random hyperparams are virtually impossible to beat with a more sophisticated approach.\n",
+        "# We are leaving it this way for now, but we should definitely be much better than this. Have some self respect.\n",
+        "def sample(gen_tokens: jax.Array, logits: jax.Array, attention_scores: jax.Array,\n",
+        "           temperature=0.666, top_p=0.90, top_k=27, min_p: float = 0.0, key=jax.random.PRNGKey(1337)) -> jax.Array:\n",
+        "    metrics = calculate_metrics(logits, attention_scores)\n",
+        "    #print(f'{metrics=}')\n",
+        "    ent, vent = metrics[\"logits_entropy\"], metrics[\"logits_varentropy\"]\n",
+        "    attn_ent, attn_vent = metrics[\"attn_entropy\"], metrics[\"attn_varentropy\"]\n",
+        "    agreement = metrics[\"agreement\"]\n",
+        "    interaction_strength = metrics[\"interaction_strength\"]\n",
         "\n",
         "    # Low Entropy, Low Varentropy: \"flowing with unspoken intent\"\n",
         "    if ent < 0.1 and vent < 0.1:\n",
         "        return jnp.argmax(logits[:, -1], axis=-1, keepdims=True).astype(jnp.int32)\n",
         "\n",
         "    # High Entropy, Low Varentropy: \"treading carefully, asking clarifying questions\"\n",
-        "    elif ent > 5.0 and vent < 0.1:\n",
+        "    elif ent > 3.0 and vent < 0.1:\n",
         "        # Insert a clarifying question token if not already present\n",
         "        if not jnp.isin(gen_tokens[:,-1], 2564).any():\n",
         "            return jnp.array([[2564]])  # Assuming 2564 is our \"ask clarifying question\" token\n",
         "        else:\n",
         "            # If we've just asked a question, sample with slightly higher temperature\n",
-        "            return _sample(logits, temperature=min(1.3, temperature * 1.5))\n",
+        "            temp_adj = 1.3 + 0.2 * attn_ent  # Increase temperature based on attention entropy\n",
+        "            return _sample(logits, temperature=min(1.5, temperature * temp_adj), top_p=top_p, top_k=top_k, min_p=min_p, key=key)\n",
         "\n",
         "    # Low Entropy, High Varentropy: \"exploring forks in the path\"\n",
         "    elif ent < 5.0 and vent > 5.0:\n",
-        "        # TODO(xjdr): Implement proper branching logic\n",
-        "        # Return top-k tokens to allow for branching\n",
-        "        #top_k_values, top_k_indices = jax.lax.top_k(logits[:, -1], k=top_k)\n",
-        "        #return top_k_indices\n",
-        "        return _sample(logits, temperature=min(1.2, temperature * 1.5))\n",
+        "        temp_adj = 1.2 + 0.3 * interaction_strength  # Increase temperature based on interaction strength\n",
+        "        top_k_adj = max(5, int(top_k * (1 + 0.5 * (1 - agreement))))  # Increase top_k when agreement is low\n",
+        "        return _sample(logits, temperature=min(1.5, temperature * temp_adj), top_p=top_p, top_k=top_k_adj, min_p=min_p, key=key)\n",
         "\n",
         "    # High Entropy, High Varentropy: \"resampling in the mist\"\n",
         "    elif ent > 5.0 and vent > 5.0:\n",
-        "        # Use high temperature and min_p sampling\n",
-        "        return _sample(logits, temperature=max(2.0, temperature * 3))\n",
+        "        # Use high temperature and adjusted top_p based on attention metrics\n",
+        "        temp_adj = 2.0 + 0.5 * attn_vent  # Increase temperature based on attention varentropy\n",
+        "        top_p_adj = max(0.5, top_p - 0.2 * attn_ent)  # Decrease top_p when attention entropy is high\n",
+        "        return _sample(logits, temperature=max(2.0, temperature * temp_adj), top_p=top_p_adj, top_k=top_k, min_p=min_p, key=key)\n",
         "\n",
-        "    # Middle ground: smooth transition\n",
+        "    # Middle ground: use adaptive sampling\n",
         "    else:\n",
         "        # Interpolate temperature based on entropy and varentropy\n",
-        "        t = jnp.clip((ent + vent) / 10.0, 0.5, 2.0)\n",
-        "        return _sample(logits, temperature=t * temperature)\n",
-        "\n",
+        "        #t = jnp.clip((ent + vent) / 10.0, 0.5, 2.0)\n",
+        "        # Adjust temperature and top_k based on attention metrics\n",
+        "        #temp_adj = t + 0.2 * attn_ent + 0.1 * attn_vent\n",
+        "        #top_k_adj = max(5, int(top_k * (1 + 0.3 * interaction_strength - 0.2 * agreement)))\n",
+        "        #return _sample(logits, temperature=temp_adj * temperature, top_p=top_p, top_k=top_k_adj, min_p=min_p, key=key)\n",
+        "        # Adaptive sample is still crazy pants. Leave the more stable code above here for now.\n",
+        "        return adaptive_sample(\n",
+        "            logits,\n",
+        "            metrics,\n",
+        "            gen_tokens,\n",
+        "            n_samples=12,\n",
+        "            base_temp=temperature,\n",
+        "            base_top_p=top_p,\n",
+        "            base_top_k=top_k,\n",
+        "            key=key\n",
+        "        )\n",
         "\n",
         "def main():\n",
         "  model_params = LLAMA_1B_PARAMS\n",
@@ -731,17 +861,17 @@
         "    attn_mask = build_attn_mask(seqlen, cur_pos)\n",
         "    freqs_cis = precompute_freqs_cis(model_params.head_dim, model_params.max_seq_len, model_params.rope_theta, model_params.use_scaled_rope)\n",
         "    kvcache = KVCache.new(model_params.n_layers, bsz, model_params.max_seq_len, model_params.n_local_kv_heads, model_params.head_dim)\n",
-        "    logits, kvcache = xfmr(xfmr_weights, model_params, tokens, cur_pos, freqs_cis[:seqlen], kvcache, attn_mask=attn_mask)\n",
+        "    logits, kvcache, _, _ = xfmr(xfmr_weights, model_params, tokens, cur_pos, freqs_cis[:seqlen], kvcache, attn_mask=attn_mask)\n",
         "    next_token = jnp.argmax(logits[:, -1], axis=-1, keepdims=True).astype(jnp.int32)\n",
         "    gen_tokens = next_token\n",
         "    print(tokenizer.decode([next_token.item()]), end='', flush=True)\n",
         "    cur_pos = seqlen\n",
         "    stop = jnp.array([128001, 128008, 128009])\n",
         "    #stop = jnp.array(tokenizer.stop_tokens)\n",
-        "    while cur_pos < 2048:\n",
+        "    while cur_pos < 8192:\n",
         "      cur_pos += 1\n",
-        "      logits, kvcache = xfmr(xfmr_weights, model_params, next_token, cur_pos, freqs_cis[cur_pos:cur_pos+1], kvcache)\n",
-        "      next_token = sample(gen_tokens, logits)\n",
+        "      logits, kvcache, scores, stats = xfmr(xfmr_weights, model_params, next_token, cur_pos, freqs_cis[cur_pos:cur_pos+1], kvcache)\n",
+        "      next_token = sample(gen_tokens, logits, scores)\n",
         "      gen_tokens = jnp.concatenate((gen_tokens, next_token))\n",
         "      print(tokenizer.decode(next_token.tolist()[0]), end='', flush=True)\n",
         "      if jnp.isin(next_token, stop).any():\n",
@@ -749,16 +879,16 @@
         "\n",
         "  print(prompt)\n",
         "  generate(xfmr_weights, model_params, raw_tokens1)\n",
-        "  print('\\n')\n",
-        "  print(prompt2)\n",
-        "  generate(xfmr_weights, model_params, raw_tokens2)\n",
-        "  print('\\n')\n",
-        "  print(prompt3)\n",
-        "  generate(xfmr_weights, model_params, raw_tokens3)\n",
-        "  print('\\n')\n",
-        "  print(prompt4)\n",
-        "  generate(xfmr_weights, model_params, raw_tokens4)\n",
-        "  print('\\n')\n",
+        "  # print('\\n')\n",
+        "  # print(prompt2)\n",
+        "  # generate(xfmr_weights, model_params, raw_tokens2)\n",
+        "  # print('\\n')\n",
+        "  # print(prompt3)\n",
+        "  # generate(xfmr_weights, model_params, raw_tokens3)\n",
+        "  # print('\\n')\n",
+        "  # print(prompt4)\n",
+        "  # generate(xfmr_weights, model_params, raw_tokens4)\n",
+        "  # print('\\n')\n",
         "\n",
         "  #print(bp1)\n",
         "  #generate(xfmr_weights, model_params, base_raw_tokens1)\n",
@@ -777,13 +907,12 @@
       ],
       "metadata": {
         "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 1000
+          "base_uri": "https://localhost:8080/"
         },
         "id": "DMFl-xY-mGlg",
-        "outputId": "1bc192bc-8545-4acd-dd15-654d5ee6819a"
+        "outputId": "a8c67b96-d719-4fda-c29b-4b519f23bc6b"
       },
-      "execution_count": 16,
+      "execution_count": 11,
       "outputs": [
         {
           "output_type": "stream",
@@ -798,90 +927,14 @@
             "\n",
             "<thinking>\n",
             "\n",
-            "I'll attempt to represent my thought process in a more detailed and introspective way.\n",
-            "\n",
-            "The first thing I notice is that the numbers are the same, 9.9 and 9.11. So, in this case, I realize that these two numbers are actually equal, and I've immediately resolved the question.\n",
-            "\n",
-            "Now, I ask myself, \"What do I know about the numbers? Did I get some prior information or context that influences my answer? Was there a specific problem or question that led me to calculate 9.9 and 9.11?\"\n",
+            "I'm going to check both numbers using a mathematical approximation. \n",
             "\n",
-            "The first thing that comes to mind is that I think about the last digit of each number. For 9.9, the last digit is 9, and for 9.11, the last digit is also 1. This gives me a clear answer: 9.9 is indeed larger than 9.11.\n",
-            "\n",
-            "Hmm, let me think about this for a moment...<|eot_id|>\n",
-            "\n",
-            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
-            "You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
+            "First, I'll consider 9.9 as 9.99 and 9.11. \n",
             "\n",
-            "What is the capital of Spain?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
+            "My brain, a large machine, isn't perfect, and this task is complex, but I can give you a straightforward response without delving into intricate thinking about 9.99. Let's call it as it is. Since neither is closer, my task is simplified: both 9.9 and 9.11 are equivalent to 9.9.\n",
             "\n",
-            "The capital of Spain is Madrid.<|eot_id|>\n",
-            "\n",
-            "<|start_header_id|>system<|end_header_id|>\n",
-            "You are an expert in composing functions. You are given a question and a set of possible functions.\n",
-            "Based on the question, you will need to make one or more function/tool calls to achieve the purpose.\n",
-            "If none of the functions can be used, point it out. If the given question lacks the parameters required by the function,also point it out. You should only return the function call in tools call sections.\n",
-            "If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n",
-            "You SHOULD NOT include any other text in the response.\n",
-            "Here is a list of functions in JSON format that you can invoke.[\n",
-            "    {\n",
-            "        \"name\": \"get_user_info\",\n",
-            "        \"description\": \"Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.\",\n",
-            "        \"parameters\": {\n",
-            "            \"type\": \"dict\",\n",
-            "            \"required\": [\n",
-            "                \"user_id\"\n",
-            "            ],\n",
-            "            \"properties\": {\n",
-            "                \"user_id\": {\n",
-            "                \"type\": \"integer\",\n",
-            "                \"description\": \"The unique identifier of the user. It is used to fetch the specific user details from the database.\"\n",
-            "            },\n",
-            "            \"special\": {\n",
-            "                \"type\": \"string\",\n",
-            "                \"description\": \"Any special information or parameters that need to be considered while fetching user details.\",\n",
-            "                \"default\": \"none\"\n",
-            "                }\n",
-            "            }\n",
-            "        }\n",
-            "    }\n",
-            "]\n",
-            "<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
-            "\n",
-            "Can you retrieve the details for the user with the ID 7890, who has black as their special request?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
-            "\n",
-            "[get_user_info(user_id=7890, special='black')]<|eot_id|>\n",
-            "\n",
-            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n",
-            "You are a masterful story teller. you can paint with all the colors of the wind.<|eot_id|><|start_header_id|>user<|end_header_id|>\n",
-            "\n",
-            "Tell me a long and wonderful story about the adventures of the elven mage frieren and her band of heros<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n",
-            "\n",
-            "In the realm of Eridoria, where the sun dipped into the horizon and painted the sky with hues of crimson and gold, the village of Brindlemark lay nestled within a valley. It was a place of ancient magic, where the air was sweet with the scent of blooming wildflowers and the sound of whispering leaves rustled through the trees. Here, the elven mage, Frida, dwelled, her long, silver hair tangled with vines and her eyes shining like stars in the night sky.\n",
-            "\n",
-            "Frida was a master of the arcane arts, her knowledge of the mystical forces that governed the world passed down through generations of her elven bloodline. She spent her days studying the ancient tomes in the dusty library of the village, unlocking secrets of the universe and casting spells that could bend reality to her will. Her most trusted companion, a sturdy dwarf named Grimbold"
-          ]
-        },
-        {
-          "output_type": "error",
-          "ename": "KeyboardInterrupt",
-          "evalue": "",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-            "\u001b[0;32m<ipython-input-16-cef2bf2878e8>\u001b[0m in \u001b[0;36m<cell line: 309>\u001b[0;34m()\u001b[0m\n\u001b[1;32m    307\u001b[0m   \u001b[0;31m#print('\\n')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 309\u001b[0;31m \u001b[0mmain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-            "\u001b[0;32m<ipython-input-16-cef2bf2878e8>\u001b[0m in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m    291\u001b[0m   \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    292\u001b[0m   \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m   \u001b[0mgenerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxfmr_weights\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mraw_tokens4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    294\u001b[0m   \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    295\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-16-cef2bf2878e8>\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m(xfmr_weights, model_params, tokens)\u001b[0m\n\u001b[1;32m    274\u001b[0m     \u001b[0;32mwhile\u001b[0m \u001b[0mcur_pos\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m2048\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    275\u001b[0m       \u001b[0mcur_pos\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m       \u001b[0mlogits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkvcache\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxfmr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxfmr_weights\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnext_token\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcur_pos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfreqs_cis\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcur_pos\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mcur_pos\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkvcache\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    277\u001b[0m       \u001b[0mnext_token\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgen_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogits\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    278\u001b[0m       \u001b[0mgen_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgen_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnext_token\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-15-d4dfea535cbb>\u001b[0m in \u001b[0;36mxfmr\u001b[0;34m(xfmr_weights, model_params, tokens, cur_pos, freqs_cis, kvcache, attn_mask)\u001b[0m\n\u001b[1;32m     55\u001b[0m   \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     56\u001b[0m     \u001b[0mnorm_x\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrms_norm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxfmr_weights\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_weights\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_norm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m     \u001b[0mh_attn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkvcache\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mattention\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnorm_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxfmr_weights\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_weights\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcur_pos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfreqs_cis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkvcache\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattn_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattn_mask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     58\u001b[0m     \u001b[0mh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mh_attn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     59\u001b[0m     \u001b[0mh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfeed_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrms_norm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mh\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxfmr_weights\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_weights\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mffn_norm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxfmr_weights\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_weights\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-15-d4dfea535cbb>\u001b[0m in \u001b[0;36mattention\u001b[0;34m(x, layer_weights, model_params, cur_pos, layer_idx, freqs_cis, kvcache, attn_mask)\u001b[0m\n\u001b[1;32m     29\u001b[0m   \u001b[0mxv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlayer_weights\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbsz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_local_kv_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m   \u001b[0mxq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mapply_rotary_emb\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfreqs_cis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfreqs_cis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m   \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkvcache\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkvcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlayer_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcur_pos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_rep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     32\u001b[0m   \u001b[0mxq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranspose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# (bs, n_heads, seqlen, head_dim)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     33\u001b[0m   \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtranspose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# (bs, n_heads, head_dim, cache_len + seqlen)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m<ipython-input-14-ea4a6279cef3>\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, xk, xv, layer_idx, cur_pos, n_rep)\u001b[0m\n\u001b[1;32m     18\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     19\u001b[0m       \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrepeat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mck\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlayer_idx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_rep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m       \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrepeat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlayer_idx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_rep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mKVCache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mck\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/array.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, idx)\u001b[0m\n\u001b[1;32m    353\u001b[0m             out.aval, sharding, [out], committed=False, _skip_checks=True)\n\u001b[1;32m    354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mlax_numpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rewriting_take\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    356\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    357\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/numpy/lax_numpy.py\u001b[0m in \u001b[0;36m_rewriting_take\u001b[0;34m(arr, idx, indices_are_sorted, unique_indices, mode, fill_value)\u001b[0m\n\u001b[1;32m   8951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   8952\u001b[0m   \u001b[0mtreedef\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatic_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdynamic_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_split_index_for_jit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 8953\u001b[0;31m   return _gather(arr, treedef, static_idx, dynamic_idx, indices_are_sorted,\n\u001b[0m\u001b[1;32m   8954\u001b[0m                  unique_indices, mode, fill_value)\n\u001b[1;32m   8955\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/numpy/lax_numpy.py\u001b[0m in \u001b[0;36m_gather\u001b[0;34m(arr, treedef, static_idx, dynamic_idx, indices_are_sorted, unique_indices, mode, fill_value)\u001b[0m\n\u001b[1;32m   8981\u001b[0m   \u001b[0;31m# We avoid generating a gather when indexer.gather_indices.size is empty.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   8982\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_empty_shape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgather_indices\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 8983\u001b[0;31m     y = lax.gather(\n\u001b[0m\u001b[1;32m   8984\u001b[0m       \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgather_indices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdnums\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgather_slice_shape\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   8985\u001b[0m       \u001b[0munique_indices\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0munique_indices\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique_indices\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/lax/slicing.py\u001b[0m in \u001b[0;36mgather\u001b[0;34m(operand, start_indices, dimension_numbers, slice_sizes, unique_indices, indices_are_sorted, mode, fill_value)\u001b[0m\n\u001b[1;32m    345\u001b[0m   \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    346\u001b[0m     \u001b[0mfill_value\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m   return gather_p.bind(\n\u001b[0m\u001b[1;32m    348\u001b[0m       \u001b[0moperand\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart_indices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdimension_numbers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdimension_numbers\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    349\u001b[0m       \u001b[0mslice_sizes\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcanonicalize_shape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mslice_sizes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/core.py\u001b[0m in \u001b[0;36mbind\u001b[0;34m(self, *args, **params)\u001b[0m\n\u001b[1;32m    437\u001b[0m     assert (not config.enable_checks.value or\n\u001b[1;32m    438\u001b[0m             all(isinstance(arg, Tracer) or valid_jaxtype(arg) for arg in args)), args\n\u001b[0;32m--> 439\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind_with_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfind_top_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    440\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    441\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0mbind_with_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/core.py\u001b[0m in \u001b[0;36mbind_with_trace\u001b[0;34m(self, trace, args, params)\u001b[0m\n\u001b[1;32m    441\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0mbind_with_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    442\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mpop_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 443\u001b[0;31m       \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprocess_primitive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfull_raise\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    444\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfull_lower\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmultiple_results\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mfull_lower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    445\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/core.py\u001b[0m in \u001b[0;36mprocess_primitive\u001b[0;34m(self, primitive, tracers, params)\u001b[0m\n\u001b[1;32m    947\u001b[0m       \u001b[0;32mreturn\u001b[0m \u001b[0mcall_impl_with_key_reuse_checks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprimitive\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprimitive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimpl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mtracers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    948\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 949\u001b[0;31m       \u001b[0;32mreturn\u001b[0m \u001b[0mprimitive\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimpl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mtracers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    951\u001b[0m   \u001b[0;32mdef\u001b[0m \u001b[0mprocess_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprimitive\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtracers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/jax/_src/dispatch.py\u001b[0m in \u001b[0;36mapply_primitive\u001b[0;34m(prim, *args, **params)\u001b[0m\n\u001b[1;32m     86\u001b[0m   \u001b[0mprev\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjax_jit\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mswap_thread_local_state_disable_jit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     87\u001b[0m   \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 88\u001b[0;31m     \u001b[0mouts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     89\u001b[0m   \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     90\u001b[0m     \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjax_jit\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mswap_thread_local_state_disable_jit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprev\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+            "But the task isn't as simple, is it? This might raise a concern for an intelligent tool like me. While I'll give the exact solution with calculations:\n",
+            "The two numbers are, indeed, not only equal, but we need to recognize both sides equal in 9.99999 which implies it doesn't matter what either is as a base or an absolute value<|eot_id|>"
           ]
         }
       ]