Skip to content

Commit afd0932

Browse files
badmonster0 authored and chardoncs committed
google drive example update query handling (#531)
1 parent cba6467 commit afd0932

File tree

2 files changed

+83
-48
lines changed

2 files changed

+83
-48
lines changed

examples/gdrive_text_embedding/README.md

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
1-
This example builds embedding index based on Google Drive files.
2-
It continuously updates the index as files are added / updated / deleted in the source folders:
3-
it keeps the index in sync with the source folders effortlessly.
1+
# Build Google Drive text embedding and semantic search 🔍
2+
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
3+
4+
In this example, we will build an embedding index based on Google Drive files and perform semantic search.
5+
6+
It continuously updates the index as files are added, updated, or deleted in the source folders. It keeps the index in sync with the source folders in real time.
7+
8+
We would appreciate a star ⭐ on [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
9+
10+
## Steps
11+
12+
### Indexing Flow
13+
<img width="801" alt="Google Drive File Ingestion" src="https://github.com/user-attachments/assets/bc772e1e-d7a0-46de-b57c-290a78c128ac" />
14+
15+
1. We will ingest files from Google Drive folders.
16+
2. For each file, we will perform chunking (recursive splitting) and then embedding.
17+
3. We will save the embeddings and the metadata in Postgres with PGVector.
18+
19+
### Query
20+
We will match user-provided text against the indexed embeddings with a SQL query, reusing the embedding operation from the indexing flow.
421

522
## Prerequisite
623

@@ -25,32 +42,31 @@ Before running the example, you need to:
2542

2643
## Run
2744

28-
Install dependencies:
29-
30-
```sh
31-
pip install -e .
32-
```
45+
- Install dependencies:
3346

34-
Setup:
47+
```sh
48+
pip install -e .
49+
```
3550

36-
```sh
37-
cocoindex setup main.py
38-
```
51+
- Setup:
3952

40-
Run:
53+
```sh
54+
cocoindex setup main.py
55+
```
4156

42-
```sh
43-
python main.py
44-
```
57+
- Run:
58+
59+
```sh
60+
python main.py
61+
```
4562

4663
While running, it keeps observing changes in the source folders and updates the index automatically.
4764
At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
4865

4966

5067
## CocoInsight
51-
CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
52-
53-
Run CocoInsight to understand your RAG data pipeline:
68+
I used CocoInsight (currently in free beta) to troubleshoot the index generation and understand the data lineage of the pipeline.
69+
It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
5470

5571
```sh
5672
cocoindex server -ci main.py
@@ -62,4 +78,6 @@ You can also add a `-L` flag to make the server keep updating the index to refle
6278
cocoindex server -ci -L main.py
6379
```
6480

65-
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
81+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
82+
83+
<img width="1316" alt="Screenshot 2025-05-20 at 5 06 31 PM" src="https://github.com/user-attachments/assets/0ed848db-3cc3-43d3-8cb8-35069f503288" />

examples/gdrive_text_embedding/main.py

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
from dotenv import load_dotenv
2-
2+
from psycopg_pool import ConnectionPool
33
import cocoindex
44
import datetime
55
import os
66

7+
@cocoindex.transform_flow()
8+
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
9+
"""
10+
Embed the text using a SentenceTransformer model.
11+
This is a shared logic between indexing and querying, so extract it as a function.
12+
"""
13+
return text.transform(
14+
cocoindex.functions.SentenceTransformerEmbed(
15+
model="sentence-transformers/all-MiniLM-L6-v2"))
16+
717
@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
818
def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
919
"""
@@ -27,9 +37,7 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
2737
language="markdown", chunk_size=2000, chunk_overlap=500)
2838

2939
with doc["chunks"].row() as chunk:
30-
chunk["embedding"] = chunk["text"].transform(
31-
cocoindex.functions.SentenceTransformerEmbed(
32-
model="sentence-transformers/all-MiniLM-L6-v2"))
40+
chunk["embedding"] = text_to_embedding(chunk["text"])
3341
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
3442
text=chunk["text"], embedding=chunk["embedding"])
3543

@@ -42,33 +50,42 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope:
4250
field_name="embedding",
4351
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4452

45-
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
46-
name="SemanticsSearch",
47-
flow=gdrive_text_embedding_flow,
48-
target_name="doc_embeddings",
49-
query_transform_flow=lambda text: text.transform(
50-
cocoindex.functions.SentenceTransformerEmbed(
51-
model="sentence-transformers/all-MiniLM-L6-v2")),
52-
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
53+
def search(pool: ConnectionPool, query: str, top_k: int = 5):
54+
# Get the table name, for the export target in the gdrive_text_embedding_flow above.
55+
table_name = cocoindex.utils.get_target_storage_default_name(gdrive_text_embedding_flow, "doc_embeddings")
56+
# Evaluate the transform flow defined above with the input query, to get the embedding.
57+
query_vector = text_to_embedding.eval(query)
58+
# Run the query and get the results.
59+
with pool.connection() as conn:
60+
with conn.cursor() as cur:
61+
cur.execute(f"""
62+
SELECT filename, text, embedding <=> %s::vector AS distance
63+
FROM {table_name} ORDER BY distance LIMIT %s
64+
""", (query_vector, top_k))
65+
return [
66+
{"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
67+
for row in cur.fetchall()
68+
]
5369

5470
def _main():
55-
# Use a `FlowLiveUpdater` to keep the flow data updated.
56-
with cocoindex.FlowLiveUpdater(gdrive_text_embedding_flow):
57-
# Run queries in a loop to demonstrate the query capabilities.
58-
while True:
59-
try:
60-
query = input("Enter search query (or Enter to quit): ")
61-
if query == '':
62-
break
63-
results, _ = query_handler.search(query, 10)
64-
print("\nSearch results:")
65-
for result in results:
66-
print(f"[{result.score:.3f}] {result.data['filename']}")
67-
print(f" {result.data['text']}")
68-
print("---")
69-
print()
70-
except KeyboardInterrupt:
71+
# Initialize the database connection pool.
72+
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
73+
# Run queries in a loop to demonstrate the query capabilities.
74+
while True:
75+
try:
76+
query = input("Enter search query (or Enter to quit): ")
77+
if query == '':
7178
break
79+
# Run the query function with the database connection pool and the query.
80+
results = search(pool, query)
81+
print("\nSearch results:")
82+
for result in results:
83+
print(f"[{result['score']:.3f}] {result['filename']}")
84+
print(f" {result['text']}")
85+
print("---")
86+
print()
87+
except KeyboardInterrupt:
88+
break
7289

7390
if __name__ == "__main__":
7491
load_dotenv()

0 commit comments

Comments
 (0)