
update README and workflow!
jxmorris12 committed Feb 22, 2024
1 parent 768a90b commit 0ccaca1
Showing 3 changed files with 52 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -5,9 +5,9 @@ name: Test with PyTest
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   build:
48 changes: 48 additions & 0 deletions README.md
@@ -0,0 +1,48 @@
# bm25-pt

A minimal BM25 implementation using PyTorch. (Also uses [HuggingFace tokenizers](https://huggingface.co/docs/tokenizers/en/index) behind the scenes to tokenize text.)

```bash
pip install bm25_pt
```

## Usage


```python
from bm25_pt import BM25

bm25 = BM25()
corpus = [
"A high weight in tf–idf is reached by a high term frequency",
"(in the given document) and a low document frequency of the term",
"in the whole collection of documents; the weights hence tend to filter",
"out common terms. Since the ratio inside the idf's log function is always",
"greater than or equal to 1, the value of idf (and tf–idf) is greater than or equal",
"to 0. As a term appears in more documents, the ratio inside the logarithm approaches",
"1, bringing the idf and tf–idf closer to 0.",
]
bm25.index(corpus)

queries = ["weights", "ratio logarithm"]
doc_scores = bm25.score_batch(queries)
print(doc_scores)
>> tensor([[0.0000, 0.0000, 1.4238, 0.0000, 0.0000, 0.0000, 0.0000],
[0.0000, 0.0000, 0.0000, 1.5317, 0.0000, 2.0203, 0.0000]])
```

You can also call `score()` with a single query to score one query at a time.
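Under the hood, BM25 is the standard Okapi ranking function. For reference, here is a minimal plain-Python sketch of that formula (the parameter defaults `k1=1.5`, `b=0.75` and the function name are assumptions for illustration, not necessarily the library's implementation):

```python
import math

def bm25_score(query_terms, doc, corpus, k1=1.5, b=0.75):
    """Okapi BM25 score of one tokenized document for a tokenized query.
    Minimal sketch, not the library's actual (PyTorch-based) implementation."""
    N = len(corpus)
    avgdl = sum(len(d) for d in corpus) / N  # average document length
    score = 0.0
    for term in query_terms:
        n_t = sum(term in d for d in corpus)  # number of docs containing term
        idf = math.log((N - n_t + 0.5) / (n_t + 0.5) + 1)
        tf = doc.count(term)                  # term frequency in this doc
        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(doc) / avgdl))
    return score

corpus = [["high", "weight", "in", "tf", "idf"],
          ["low", "document", "frequency"],
          ["common", "terms", "filter"]]
print(bm25_score(["weight"], corpus[0], corpus))
```

Documents that never contain a query term contribute nothing to its score, which is why the score matrix above is mostly zeros.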

### Use your own tokenizer

You can use your own tokenizer if you want. Simply provide your tokenizer to the `BM25` constructor:

```python
from bm25_pt import BM25
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("t5-base")
bm25 = BM25(tokenizer=tokenizer)
```

then proceed to use the library as normal.
2 changes: 2 additions & 0 deletions bm25_pt/bm25.py
@@ -5,6 +5,7 @@
 import torch
 import transformers
 
+
 def documents_to_bags(docs: torch.Tensor, vocab_size: int) -> torch.sparse.Tensor:
     num_docs, seq_length = docs.shape
     batch_idxs = torch.arange(num_docs)[:, None].expand(-1, seq_length)
@@ -15,6 +16,7 @@ def documents_to_bags(docs: torch.Tensor, vocab_size: int) -> torch.sparse.Tensor:
     vals = (docs > 0).int().flatten()
     return torch.sparse_coo_tensor(idxs, vals, size=(num_docs, vocab_size)).coalesce()
 
+
 class TokenizedBM25:
     k1: float
     b: float
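For intuition, `documents_to_bags` turns a batch of token-id sequences into per-document bags of token counts (id 0 is treated as padding; `coalesce()` sums duplicate entries). A dense plain-Python sketch of the same idea, not the library's sparse implementation:

```python
def documents_to_bags_dense(docs, vocab_size):
    """Dense sketch of documents_to_bags: count each token id per document,
    skipping the padding id 0 (the sparse version sums duplicates via coalesce)."""
    bags = []
    for doc in docs:
        counts = [0] * vocab_size
        for tok in doc:
            if tok > 0:  # skip padding
                counts[tok] += 1
        bags.append(counts)
    return bags

# Two documents of token ids over a vocabulary of size 5.
print(documents_to_bags_dense([[1, 2, 2, 0], [3, 3, 3, 4]], 5))
# → [[0, 1, 2, 0, 0], [0, 0, 0, 3, 1]]
```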
