From 189f26c55d987863f3452ce008d1265d75102422 Mon Sep 17 00:00:00 2001
From: Ben Perlmutter
Date: Wed, 14 May 2025 14:52:02 -0400
Subject: [PATCH] ingest stuff for growth team

---
 packages/ingest-mongodb-public/package.json   |   1 +
 .../src/clusterManager.config.ts              | 125 ++++++++++++++++++
 .../src/sources/index.ts                      |  30 +----
 3 files changed, 127 insertions(+), 29 deletions(-)
 create mode 100644 packages/ingest-mongodb-public/src/clusterManager.config.ts

diff --git a/packages/ingest-mongodb-public/package.json b/packages/ingest-mongodb-public/package.json
index 188949507..320e93fd3 100644
--- a/packages/ingest-mongodb-public/package.json
+++ b/packages/ingest-mongodb-public/package.json
@@ -23,6 +23,7 @@
     "ingest:embed": "../../node_modules/mongodb-rag-ingest/build/main.js embed init --config ./build/config.js && ingest embed update --config ./build/config.js",
     "ingest:pages:meta": "../../node_modules/mongodb-rag-ingest/build/main.js pages update --config ./build/meta.config.js",
     "ingest:pages:docsWithLinks": "../../node_modules/mongodb-rag-ingest/build/main.js pages update --config ./build/docsWithLinks.config.js",
+    "ingest:all:clusterManager": "../../node_modules/mongodb-rag-ingest/build/main.js all --config ./build/clusterManager.config.js",
     "test": "node --experimental-vm-modules ../../node_modules/jest/bin/jest.js --forceExit"
   },
   "devDependencies": {
diff --git a/packages/ingest-mongodb-public/src/clusterManager.config.ts b/packages/ingest-mongodb-public/src/clusterManager.config.ts
new file mode 100644
index 000000000..1eacf9b5f
--- /dev/null
+++ b/packages/ingest-mongodb-public/src/clusterManager.config.ts
@@ -0,0 +1,125 @@
+import { Config, makeIngestMetaStore } from "mongodb-rag-ingest";
+import { standardChunkFrontMatterUpdater } from "mongodb-rag-core";
+import {
+  assertEnvVars,
+  makeOpenAiEmbedder,
+  makeMongoDbEmbeddedContentStore,
+  makeMongoDbPageStore,
+  filterFulfilled,
+} from "mongodb-rag-core";
+import { AzureOpenAI } from "mongodb-rag-core/openai";
+import { sourceConstructors } from "./sources";
+import assert from "assert";
+
+const {
+  OPENAI_ENDPOINT,
+  OPENAI_API_KEY,
+  OPENAI_API_VERSION,
+  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
+  MONGODB_CONNECTION_URI,
+} = assertEnvVars({
+  OPENAI_ENDPOINT: "",
+  OPENAI_API_KEY: "",
+  OPENAI_API_VERSION: "",
+  OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT: "",
+  MONGODB_CONNECTION_URI: "",
+});
+
+/** Database that holds every collection this ingest config writes to. */
+const DATABASE_NAME = "cluster-manager";
+
+/**
+ * Atlas docs pages to ingest; every other page of the source is dropped.
+ *
+ * Entries are compared to `page.url` by exact string equality, so they are
+ * written without a `#fragment`.
+ * NOTE(review): assumes `page.url` never carries a fragment and uses this
+ * `https://mongodb.com/docs/...` trailing-slash form - confirm against the
+ * snooty data source.
+ */
+const ALLOWED_PAGE_URLS = new Set([
+  "https://mongodb.com/docs/atlas/architecture/current/hierarchy/",
+  "https://mongodb.com/docs/atlas/sizing-tier-selection/",
+  "https://mongodb.com/docs/atlas/create-database-deployment/",
+  "https://mongodb.com/docs/atlas/reference/google-gcp/",
+  "https://mongodb.com/docs/atlas/reference/amazon-aws/",
+  "https://mongodb.com/docs/atlas/reference/microsoft-azure/",
+  "https://mongodb.com/docs/atlas/cluster-config/multi-cloud-distribution/",
+  "https://mongodb.com/docs/atlas/global-clusters/",
+  "https://mongodb.com/docs/atlas/tutorial/create-global-cluster/",
+  "https://mongodb.com/docs/atlas/billing/cluster-configuration-costs/",
+  "https://mongodb.com/docs/atlas/architecture/current/scalability/",
+  "https://mongodb.com/docs/atlas/cluster-autoscaling/",
+  "https://mongodb.com/docs/atlas/reference/flex-limitations/",
+]);
+
+const embedder = makeOpenAiEmbedder({
+  openAiClient: new AzureOpenAI({
+    apiKey: OPENAI_API_KEY,
+    endpoint: OPENAI_ENDPOINT,
+    apiVersion: OPENAI_API_VERSION,
+  }),
+  deployment: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
+  backoffOptions: {
+    numOfAttempts: 25,
+    startingDelay: 1000,
+  },
+});
+
+/**
+ * Ingest config that loads only the allow-listed Atlas docs pages into the
+ * `cluster-manager` database.
+ */
+export const standardConfig = {
+  embedder: () => embedder,
+  embeddedContentStore: () =>
+    makeMongoDbEmbeddedContentStore({
+      connectionUri: MONGODB_CONNECTION_URI,
+      databaseName: DATABASE_NAME,
+      collectionName: process.env.MONGODB_EMBEDDED_CONTENT_COLLECTION_NAME,
+      searchIndex: {
+        embeddingName: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
+        name: OPENAI_RETRIEVAL_EMBEDDING_DEPLOYMENT,
+      },
+    }),
+  pageStore: () =>
+    makeMongoDbPageStore({
+      connectionUri: MONGODB_CONNECTION_URI,
+      databaseName: DATABASE_NAME,
+    }),
+  ingestMetaStore: () =>
+    makeIngestMetaStore({
+      connectionUri: MONGODB_CONNECTION_URI,
+      databaseName: DATABASE_NAME,
+      entryId: "all",
+    }),
+  chunkOptions: () => ({
+    transform: standardChunkFrontMatterUpdater,
+  }),
+  concurrencyOptions: () => ({
+    embed: {
+      createChunks: 5,
+    },
+  }),
+  dataSources: async () => {
+    // Run every registered constructor; `filterFulfilled` keeps the settled values.
+    const settledSources = await Promise.allSettled(
+      sourceConstructors.map((constructor) => constructor())
+    );
+    const allSources = filterFulfilled(settledSources)
+      .map(({ value }) => value)
+      .flat(1);
+
+    const atlasDocs = allSources.find((s) => s.name === "snooty-cloud-docs");
+    assert(atlasDocs, "snooty-cloud-docs data source not found");
+    // Keep the original method so the wrapper below does not call itself.
+    const fetchAllPages = atlasDocs.fetchPages;
+    atlasDocs.fetchPages = async function () {
+      const pages = await fetchAllPages.call(atlasDocs);
+      return pages.filter((p) => ALLOWED_PAGE_URLS.has(p.url));
+    };
+    return [atlasDocs];
+  },
+} satisfies Config;
+
+export default standardConfig;
diff --git a/packages/ingest-mongodb-public/src/sources/index.ts b/packages/ingest-mongodb-public/src/sources/index.ts
index cfe4d5a06..4e065f592 100644
--- a/packages/ingest-mongodb-public/src/sources/index.ts
+++ b/packages/ingest-mongodb-public/src/sources/index.ts
@@ -27,9 +27,7 @@ import {
   MakeMongoDbUniversityDataSourceParams,
   makeMongoDbUniversityDataSource,
 } from "./mongodb-university";
-const { DEVCENTER_CONNECTION_URI, UNIVERSITY_DATA_API_KEY } = assertEnvVars(
-  PUBLIC_INGEST_ENV_VARS
-);
+
 import {
   getUrlsFromSitemap,
   initialWebSources,
@@ -43,30 +41,6 @@ import { chromium } from "playwright";
  */
 export type SourceConstructor = () => Promise;
 
-export const devCenterProjectConfig: DevCenterProjectConfig = {
-  type: "devcenter",
-  name: "devcenter",
-  collectionName: "search_content_prod",
-  databaseName: "devcenter",
-  baseUrl: "https://www.mongodb.com/developer",
-  connectionUri: DEVCENTER_CONNECTION_URI,
-};
-
-const mongoDbUniversitySourceConstructor = async () => {
-  const universityDataApiKey = UNIVERSITY_DATA_API_KEY;
-  assert(!!universityDataApiKey, "UNIVERSITY_DATA_API_KEY required");
-  const universityConfig: MakeMongoDbUniversityDataSourceParams = {
-    sourceName: "mongodb-university",
-    baseUrl: "https://api.learn.mongodb.com/rest/catalog",
-    apiKey: universityDataApiKey,
-    tiCatalogItems: {
-      publicOnly: true,
-      nestAssociatedContent: true,
-    },
-  };
-  return makeMongoDbUniversityDataSource(universityConfig);
-};
-
 export const mongoDbCorpDataSourceConfig: MakeMdOnGithubDataSourceParams = {
   name: "mongodb-corp",
   repoUrl: "https://github.com/mongodb/chatbot/",
@@ -204,8 +178,6 @@ const webDataSourceConstructor = async (): Promise => {
 export const sourceConstructors: SourceConstructor[] = [
   webDataSourceConstructor,
   () => makeSnootyDataSources(snootyDataApiBaseUrl, snootyProjectConfig),
-  () => makeDevCenterDataSource(devCenterProjectConfig),
-  mongoDbUniversitySourceConstructor,
   mongooseSourceConstructor,
   prismaSourceConstructor,
   mongoDbCorpDataSource,