Commit f2019aa

enables upload of json (#23)
* enables upload of json
* adds upload of dfs
* adds batching to import uploads
* handle json upload as file upload
1 parent: 7f3d0a6

3 files changed (+64 −3 lines)


refinery/__init__.py

Lines changed: 40 additions & 1 deletion
```diff
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+from uuid import uuid4
 from black import Any
 from wasabi import msg
 import pandas as pd
@@ -214,6 +215,44 @@ def post_associations(
         )
         return api_response
 
+    def post_records(self, records: List[Dict[str, Any]]):
+        """Posts records to the server.
+
+        Args:
+            records (List[Dict[str, Any]]): List of records to post.
+        """
+        request_uuid = str(uuid4())
+        url = settings.get_import_json_url(self.project_id)
+
+        batch_responses = []
+        for records_batch in util.batch(records, settings.BATCH_SIZE_DEFAULT):
+            api_response = api_calls.post_request(
+                url,
+                {
+                    "request_uuid": request_uuid,
+                    "records": records_batch,
+                    "is_last": False,
+                },
+                self.session_token,
+            )
+            batch_responses.append(api_response)
+            time.sleep(0.5)  # wait half a second to avoid server overload
+        api_calls.post_request(
+            url,
+            {"request_uuid": request_uuid, "records": [], "is_last": True},
+            self.session_token,
+        )
+        return batch_responses
+
+    def post_df(self, df: pd.DataFrame):
+        """Posts a DataFrame to the server.
+
+        Args:
+            df (pd.DataFrame): DataFrame to post.
+        """
+        records = df.to_dict(orient="records")
+        return self.post_records(records)
+
     def post_file_import(
         self, path: str, import_file_options: Optional[str] = ""
     ) -> bool:
@@ -246,7 +285,7 @@ def post_file_import(
         endpoint = config_api_response.get("KERN_S3_ENDPOINT")
 
         # credentials
-        credentials_url = settings.get_import_url(self.project_id)
+        credentials_url = settings.get_import_file_url(self.project_id)
         credentials_api_response = api_calls.post_request(
             credentials_url,
             {
```
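
Taken together, the new methods give the client a JSON upload path alongside the existing file import. A minimal usage sketch, not part of the commit — the `Client` constructor arguments below are placeholders and the real signature may differ:

```python
import pandas as pd
from refinery import Client

# Placeholder credentials and project id - adjust to your setup.
client = Client("user@example.com", "secret-password", "my-project-id")

# post_records sends the data in chunks of settings.BATCH_SIZE_DEFAULT
# (1000 records), all tagged with one shared request_uuid, then closes
# the import with an empty is_last=True request.
records = [{"headline": "first row"}, {"headline": "second row"}]
batch_responses = client.post_records(records)

# post_df is a thin wrapper: DataFrame -> list of dicts -> post_records.
df = pd.DataFrame({"headline": ["first row", "second row"]})
batch_responses = client.post_df(df)
```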

refinery/settings.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -2,6 +2,8 @@
 BASE_URI: str
 DEFAULT_URI: str = "https://app.kern.ai"
 
+BATCH_SIZE_DEFAULT: int = 1000
+
 
 def set_base_uri(uri: str):
     global BASE_URI
@@ -40,8 +42,13 @@ def get_export_url(project_id: str) -> str:
     return f"{get_project_url(project_id)}/export"
 
 
-def get_import_url(project_id: str) -> str:
-    return f"{get_project_url(project_id)}/import"
+def get_import_file_url(project_id: str) -> str:
+    return f"{get_project_url(project_id)}/import_file"
+
+
+def get_import_json_url(project_id: str) -> str:
+    return f"{get_project_url(project_id)}/import_json"
+
 
 def get_associations_url(project_id: str) -> str:
     return f"{get_project_url(project_id)}/associations"
```
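
The commit splits the old generic `/import` route into two explicit endpoints. A quick sketch of what the helpers now return, assuming `get_project_url` (not shown in this diff) prefixes the base URI and a project path:

```python
from refinery import settings

settings.set_base_uri("https://app.kern.ai")

project_id = "my-project-id"  # placeholder

# Assuming get_project_url() yields "<BASE_URI>/project/<project_id>",
# the two import helpers resolve to distinct endpoints:
print(settings.get_import_file_url(project_id))
# e.g. https://app.kern.ai/project/my-project-id/import_file
print(settings.get_import_json_url(project_id))
# e.g. https://app.kern.ai/project/my-project-id/import_json
```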

refinery/util.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -1,5 +1,6 @@
 import boto3
 from botocore.client import Config
+from typing import List, Dict, Any
 
 
 def s3_upload(
@@ -29,3 +30,17 @@ def s3_upload(
     with open(file_path, "rb") as file:
         s3_object.put(Body=file)
     return True
+
+
+def batch(records: List[Dict[str, Any]], batch_size: int):
+    """Batches records into batches of size `batch_size`.
+
+    Args:
+        records (List[Dict[str, Any]]): List of records to batch.
+        batch_size (int): Size of the batches.
+
+    Yields:
+        List[Dict[str, Any]]: Batches of records.
+    """
+    for i in range(0, len(records), batch_size):
+        yield records[i : i + batch_size]
```
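
`batch` is a plain slicing generator: it never pads or drops records, and the last chunk simply carries the remainder. A small demonstration with made-up records:

```python
from refinery import util

records = [{"id": i} for i in range(5)]

# Yields successive slices of size batch_size; the final slice
# holds whatever remains.
for chunk in util.batch(records, batch_size=2):
    print(chunk)
# [{'id': 0}, {'id': 1}]
# [{'id': 2}, {'id': 3}]
# [{'id': 4}]
```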
