awslabs · krokoko · Jul 3, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jul 1, 2024
@@ -0,0 +1,28 @@
+[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / CharacterFilterType
+
+# Enumeration: CharacterFilterType
+
+[opensearchserverless](../modules/opensearchserverless.md).CharacterFilterType
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ with the License. A copy of the License is located at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ and limitations under the License.
+
+## Table of contents
+
+### Enumeration Members
+
+- [ICU\_NORMALIZER](opensearchserverless.CharacterFilterType.md#icu_normalizer)
+
+## Enumeration Members
+
+### ICU\_NORMALIZER
+
+• **ICU\_NORMALIZER** = ``"icu_normalizer"``
@@ -0,0 +1,59 @@
+[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / TokenFilterType
+
+# Enumeration: TokenFilterType
+
+[opensearchserverless](../modules/opensearchserverless.md).TokenFilterType
+
+## Table of contents
+
+### Enumeration Members
+
+- [CJK\_WIDTH](opensearchserverless.TokenFilterType.md#cjk_width)
+- [ICU\_FOLDING](opensearchserverless.TokenFilterType.md#icu_folding)
+- [JA\_STOP](opensearchserverless.TokenFilterType.md#ja_stop)
+- [KUROMOJI\_BASEFORM](opensearchserverless.TokenFilterType.md#kuromoji_baseform)
+- [KUROMOJI\_PART\_OF\_SPEECH](opensearchserverless.TokenFilterType.md#kuromoji_part_of_speech)
+- [KUROMOJI\_STEMMER](opensearchserverless.TokenFilterType.md#kuromoji_stemmer)
+- [LOWERCASE](opensearchserverless.TokenFilterType.md#lowercase)
+
+## Enumeration Members
+
+### CJK\_WIDTH
+
+• **CJK\_WIDTH** = ``"cjk_width"``
+
+___
+
+### ICU\_FOLDING
+
+• **ICU\_FOLDING** = ``"icu_folding"``
+
+___
+
+### JA\_STOP
+
+• **JA\_STOP** = ``"ja_stop"``
+
+___
+
+### KUROMOJI\_BASEFORM
+
+• **KUROMOJI\_BASEFORM** = ``"kuromoji_baseform"``
+
+___
+
+### KUROMOJI\_PART\_OF\_SPEECH
+
+• **KUROMOJI\_PART\_OF\_SPEECH** = ``"kuromoji_part_of_speech"``
+
+___
+
+### KUROMOJI\_STEMMER
+
+• **KUROMOJI\_STEMMER** = ``"kuromoji_stemmer"``
+
+___
+
+### LOWERCASE
+
+• **LOWERCASE** = ``"lowercase"``
@@ -0,0 +1,24 @@
+[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / TokenizerType
+
+# Enumeration: TokenizerType
+
+[opensearchserverless](../modules/opensearchserverless.md).TokenizerType
+
+## Table of contents
+
+### Enumeration Members
+
+- [ICU\_TOKENIZER](opensearchserverless.TokenizerType.md#icu_tokenizer)
+- [KUROMOJI\_TOKENIZER](opensearchserverless.TokenizerType.md#kuromoji_tokenizer)
+
+## Enumeration Members
+
+### ICU\_TOKENIZER
+
+• **ICU\_TOKENIZER** = ``"icu_tokenizer"``
+
+___
+
+### KUROMOJI\_TOKENIZER
+
+• **KUROMOJI\_TOKENIZER** = ``"kuromoji_tokenizer"``
@@ -0,0 +1,39 @@
+[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearch\_vectorindex](../modules/opensearch_vectorindex.md) / Analyzer
+
+# Interface: Analyzer
+
+[opensearch\_vectorindex](../modules/opensearch_vectorindex.md).Analyzer
+
+Properties for the Analyzer.
+
+## Table of contents
+
+### Properties
+
+- [characterFilters](opensearch_vectorindex.Analyzer.md#characterfilters)
+- [tokenFilters](opensearch_vectorindex.Analyzer.md#tokenfilters)
+- [tokenizer](opensearch_vectorindex.Analyzer.md#tokenizer)
+
+## Properties
+
+### characterFilters
+
+• `Readonly` **characterFilters**: [`ICU_NORMALIZER`](../enums/opensearchserverless.CharacterFilterType.md#icu_normalizer)[]
+
+The character filters to use.
+
+___
+
+### tokenFilters
+
+• `Readonly` **tokenFilters**: [`TokenFilterType`](../enums/opensearchserverless.TokenFilterType.md)[]
+
+The token filters to use.
+
+___
+
+### tokenizer
+
+• `Readonly` **tokenizer**: [`TokenizerType`](../enums/opensearchserverless.TokenizerType.md)
+
+The tokenizer to use.
@@ -10,6 +10,7 @@ Properties for the VectorIndex.
 
 ### Properties
 
+- [analyzer](opensearch_vectorindex.VectorIndexProps.md#analyzer)
 - [collection](opensearch_vectorindex.VectorIndexProps.md#collection)
 - [indexName](opensearch_vectorindex.VectorIndexProps.md#indexname)
 - [mappings](opensearch_vectorindex.VectorIndexProps.md#mappings)
@@ -18,6 +19,20 @@ Properties for the VectorIndex.
 
 ## Properties
 
+### analyzer
+
+• `Optional` `Readonly` **analyzer**: [`Analyzer`](opensearch_vectorindex.Analyzer.md)
+
+The analyzer to use.
+
+**`Default`**
+
+```ts
+- No analyzer.
+```
+
+___
+
 ### collection
 
 • `Readonly` **collection**: [`VectorCollection`](../classes/opensearchserverless.VectorCollection.md)

@@ -10,6 +10,7 @@
 
 ### Interfaces
 
+- [Analyzer](../interfaces/opensearch_vectorindex.Analyzer.md)
 - [MetadataManagementFieldProps](../interfaces/opensearch_vectorindex.MetadataManagementFieldProps.md)
 - [VectorIndexProps](../interfaces/opensearch_vectorindex.VectorIndexProps.md)
 

@@ -6,6 +6,9 @@
 
 ### Enumerations
 
+- [CharacterFilterType](../enums/opensearchserverless.CharacterFilterType.md)
+- [TokenFilterType](../enums/opensearchserverless.TokenFilterType.md)
+- [TokenizerType](../enums/opensearchserverless.TokenizerType.md)
 - [VectorCollectionStandbyReplicas](../enums/opensearchserverless.VectorCollectionStandbyReplicas.md)
 
 ### Classes

@@ -11,27 +11,26 @@
 # and limitations under the License.
 #
 
+import logging
+import os
+import time
+from typing import Sequence, TypedDict
+
+import boto3
+from custom_resources.cr_types import CustomResourceRequest, CustomResourceResponse
 from opensearchpy import (
+    AuthorizationException,
+    AWSV4SignerAuth,
     OpenSearch,
     RequestsHttpConnection,
-    AWSV4SignerAuth,
-    AuthorizationException,
 )
-import boto3
-import logging
-import os
-import time
 from tenacity import (
     retry,
     retry_if_exception_type,
     stop_after_attempt,
     wait_exponential_jitter,
 )
 
-from typing import TypedDict, Sequence
-
-from custom_resources.cr_types import CustomResourceRequest, CustomResourceResponse
-
 LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
 
 logger = logging.getLogger(__name__)
@@ -44,12 +43,19 @@ class MetadataManagementField(TypedDict):
     Filterable: bool
 
 
+class AnalyzerProperties(TypedDict):
+    CharacterFilters: Sequence[str]
+    Tokenizer: str
+    TokenFilters: Sequence[str]
+
+
 class VectorIndexProperties(TypedDict):
     Endpoint: str
     IndexName: str
     VectorField: str
     Dimensions: int | str
     MetadataManagement: Sequence[MetadataManagementField]
+    Analyzer: AnalyzerProperties | None
 
 
 def validate_event(event: CustomResourceRequest[VectorIndexProperties]) -> bool:
@@ -70,6 +76,14 @@ def validate_event(event: CustomResourceRequest[VectorIndexProperties]) -> bool:
             raise ValueError("MetadataManagement is required")
         if event["RequestType"] == "Update" and event["PhysicalResourceId"] is None:
             raise ValueError("PhysicalResourceId is required")
+        if event["ResourceProperties"].get("Analyzer") is not None:
+            analyzer = event["ResourceProperties"]["Analyzer"]
+            if analyzer["CharacterFilters"] is None:
+                raise ValueError("CharacterFilters is required")
+            if analyzer["Tokenizer"] is None:
+                raise ValueError("Tokenizer is required")
+            if analyzer["TokenFilters"] is None:
+                raise ValueError("TokenFilters is required")
     elif event["RequestType"] == "Delete":
         if event["PhysicalResourceId"] is None:
             raise ValueError("PhysicalResourceId is required")
@@ -139,18 +153,39 @@ def create_mapping(
     return mapping
 
 
-def create_index(client: OpenSearch, index_name: str, mapping: dict[str, str]) -> None:
+def create_setting(analyzer: AnalyzerProperties | None) -> dict:
+    setting = {
+        "index": {
+            "number_of_shards": "2",
+            "knn.algo_param": {"ef_search": "512"},
+            "knn": "true",
+        },
+    }
+    if analyzer:
+        setting["analysis"] = {
+            "analyzer": {
+                "custom_analyzer": {
+                    "type": "custom",
+                    "tokenizer": analyzer["Tokenizer"],
+                    "char_filter": analyzer["CharacterFilters"],
+                    "filter": analyzer["TokenFilters"],
+                }
+            }
+        }
+
+    return setting
+
+
+def create_index(
+    client: OpenSearch, index_name: str, mapping: dict[str, str], setting: dict[str, str]
+) -> None:
     logger.debug(f"creating index {index_name}")
+    logger.debug(f"setting: {setting}")
+    logger.debug(f"mapping: {mapping}")
     client.indices.create(
         index_name,
         body={
-            "settings": {
-                "index": {
-                    "number_of_shards": "2",
-                    "knn.algo_param": {"ef_search": "512"},
-                    "knn": "true",
-                }
-            },
+            "settings": setting,
             "mappings": mapping,
         },
         params={"wait_for_active_shards": "all"},
@@ -171,13 +206,15 @@ def handle_create(
     vector_field: str,
     dimensions: int,
     metadata_management: Sequence[MetadataManagementField],
+    analyzer: AnalyzerProperties | None,
 ):
     if client.indices.exists(index_name):
         raise ValueError(f"Index {index_name} already exists")
 
     try:
         mapping = create_mapping(vector_field, dimensions, metadata_management)
-        create_index(client, index_name, mapping)
+        setting = create_setting(analyzer)
+        create_index(client, index_name, mapping, setting)
     except Exception as e:
         logger.error(f"Error creating index {index_name}")
         logger.exception(e)
@@ -211,6 +248,7 @@ def on_create(
         event["ResourceProperties"]["VectorField"],
         int(event["ResourceProperties"]["Dimensions"]),
         event["ResourceProperties"]["MetadataManagement"],
+        event["ResourceProperties"].get("Analyzer", None),
     )
     return {"PhysicalResourceId": physical_id}