Use result server for getting results

orazve · orazve · commit 94a1339af84a · 2024-08-02T13:08:12.000+01:00
diff --git a/doc/modules/ROOT/pages/gds-session-algorithms/knowledge-graph-embeddings.adoc b/doc/modules/ROOT/pages/gds-session-algorithms/knowledge-graph-embeddings.adoc
@@ -424,6 +424,26 @@ predict_result = gds.kge.model.predict(
 print(predict_result.to_string())
 ----
 
+After the training stage, the metrics computed on the test set are printed.
+[source, python, role=no-test]
+----
+{'mean_rank': 6.062, 'mean_reciprocal_rank': 0.238, 'hits_at_k': 0.742}
+----
+
+
+The result is a pandas DataFrame with the top 3 tail entities and their scores for each head entity and relationship type.
+
+[source, python, role=no-test]
+----
+   sourceNodeId               rel    targetNodeIdTopK                                                        scoreTopK
+0          8115  REL_RELDIPLOMACY  [8109, 8116, 8118]     [-4.326232433319092, -4.508733749389648, -4.542135715484619]
+1          8115        REL_RELNGO  [8109, 8116, 8117]  [-4.3115034103393555, -4.3574066162109375, -4.5306196212768555]
+2          8116  REL_RELDIPLOMACY  [8109, 8116, 8118]    [-5.225207328796387, -5.367417335510254, -5.4092488288879395]
+3          8116        REL_RELNGO  [8109, 8116, 8117]      [-4.960464954376221, -4.990216255187988, -5.14272403717041]
+4          8119  REL_RELDIPLOMACY  [8109, 8120, 8116]    [-4.9556193351745605, -5.094477653503418, -5.164356708526611]
+5          8119        REL_RELNGO  [8109, 8116, 8117]    [-3.9914486408233643, -4.040783882141113, -4.112575054168701]
+----
+
 There is also a function to score the triplets.
 
 [source, python, role=no-test]
@@ -437,4 +457,13 @@ scores = gds.kge.model.score_triplets(
     model_name=model_name,
     triplets=triplets,
 )
+----
+
+The result is a pandas DataFrame with a score for each triplet.
+
+[source, python, role=no-test]
+----
+   sourceNodeId               rel  targetNodeId     score
+0          8115        REL_RELNGO          8116 -4.357407
+1          8115  REL_RELDIPLOMACY          8119 -5.142065
 ----
diff --git a/examples/kge-distmult-nations.py b/examples/kge-distmult-nations.py
@@ -195,11 +195,11 @@ def inspect_graph(G):
     res = gds.kge.model.train(
         G_train,
         model_name=model_name,
-        scoring_function="distmult",
-        num_epochs=1,
-        embedding_dimension=10,
+        scoring_function="TransE",
+        num_epochs=30,
+        embedding_dimension=64,
         epochs_per_checkpoint=0,
-        epochs_per_val=5,
+        epochs_per_val=0,
         split_ratios={"TRAIN": 0.8, "VALID": 0.1, "TEST": 0.1},
     )
     print(res["metrics"])
@@ -218,7 +218,7 @@ def inspect_graph(G):
     print(predict_result.to_string())
 
     for index, row in predict_result.iterrows():
-        h = row["head"]
+        h = row["sourceNodeId"]
         r = row["rel"]
         gds.run_cypher(
             f"""
@@ -227,7 +227,7 @@ def inspect_graph(G):
             MATCH (b:Entity WHERE id(b) = t)
             MERGE (a)-[:NEW_REL_{r}]->(b)
         """,
-            params={"tt": row["tail"]},
+            params={"tt": row["targetNodeIdTopK"]},
         )
 
     brazil_node = gds.find_node_id(["Entity"], {"text": "brazil"})
diff --git a/graphdatascience/model/kge_runner.py b/graphdatascience/model/kge_runner.py
@@ -4,7 +4,7 @@
 import time
 from typing import Any, Dict, Optional
 
-import pandas as pd
+import pyarrow
 import requests
 from pandas import DataFrame, Series
 
@@ -32,12 +32,13 @@ def __init__(
         self._namespace = namespace
         self._server_version = server_version
         self._compute_cluster_web_uri = f"http://{compute_cluster_ip}:5005"
+        self._compute_cluster_arrow_uri = f"grpc://{compute_cluster_ip}:8815"
         self._compute_cluster_mlflow_uri = f"http://{compute_cluster_ip}:8080"
         self._encrypted_db_password = encrypted_db_password
         self._arrow_uri = arrow_uri
 
     @property
-    def model(self):
+    def model(self) -> "KgeRunner":
         return self
 
     # @compatible_with("stream", min_inclusive=ServerVersion(2, 5, 0))
@@ -75,7 +76,7 @@ def train(
         mlflow_experiment_name: Optional[str] = None,
     ) -> Series:
         if epochs_per_checkpoint is None:
-            epochs_per_checkpoint = max(num_epochs / 10, 1)
+            epochs_per_checkpoint = max(int(num_epochs / 10), 1)
         if loss_function_kwargs is None:
             loss_function_kwargs = dict(margin=1.0, adversarial_temperature=1.0, gamma=20.0)
         if lr_scheduler_kwargs is None:
@@ -92,7 +93,7 @@ def train(
         }
         print(algo_config)
 
-        graph_config = {"name": G.name()}
+        graph_config = {"name": G.name(), "config_type": "GdsGraphConfig"}
 
         config = {
             "user_name": "DUMMY_USER",
@@ -144,8 +145,10 @@ def predict(
             "user_name": "DUMMY_USER",
             "task": "KGE_PREDICT_PYG",
             "task_config": {
+                "graph_config": {"config_type": "GdsGraphConfig", "name": "NOGRAPH"},
                 "modelname": model_name,
                 "task_config": algo_config,
+                "stream_rel_results": True,
             },
             "graph_arrow_uri": self._arrow_uri,
         }
@@ -162,7 +165,7 @@ def predict(
 
         self._wait_for_job(job_id)
 
-        return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
+        return self._stream_results(config, job_id)
 
     @client_only_endpoint("gds.kge.model")
     def score_triplets(
@@ -180,8 +183,10 @@ def score_triplets(
             "user_name": "DUMMY_USER",
             "task": "KGE_SCORE_TRIPLETS_PYG",
             "task_config": {
+                "graph_config": {"config_type": "GdsGraphConfig", "name": "NOGRAPH"},
                 "modelname": model_name,
                 "task_config": algo_config,
+                "stream_rel_results": True,
             },
             "graph_arrow_uri": self._arrow_uri,
         }
@@ -198,22 +203,20 @@ def score_triplets(
 
         self._wait_for_job(job_id)
 
-        return self._stream_results(config["user_name"], config["task_config"]["modelname"], job_id)
+        return self._stream_results(config, job_id)
 
-    def _stream_results(self, user_name: str, model_name: str, job_id: str) -> DataFrame:
-        res = requests.get(
-            f"{self._compute_cluster_web_uri}/internal/fetch-result",
-            params={"user_name": user_name, "modelname": model_name, "job_id": job_id},
-        )
-        res.raise_for_status()
+    def _stream_results(self, config: dict, job_id: str) -> DataFrame:
+        client = pyarrow.flight.connect(self._compute_cluster_arrow_uri)
 
-        res_file_name = f"res_{job_id}.json"
-        with open(res_file_name, mode="wb+") as f:
-            f.write(res.content)
+        if config["task_config"].get("stream_rel_results", False):
+            upload_descriptor = pyarrow.flight.FlightDescriptor.for_path(f"{job_id}.relationships")
+        else:
+            raise ValueError("No results to fetch: need to set stream_rel_results or stream_graph_results to True")
+        flight = client.get_flight_info(upload_descriptor)
+        reader = client.do_get(flight.endpoints[0].ticket)
+        read_table = reader.read_all()
 
-        df = pd.read_json(res_file_name, orient="records", lines=True)
-        os.remove(res_file_name)
-        return df
+        return read_table.to_pandas()
 
     def _get_metrics(self, user_name: str, model_name: str, job_id: str) -> DataFrame:
         res = requests.get(