minor bugfix in adapters for sklearn (#20)

jhoetter · web-flow · commit d561340a40ea · 2022-09-10T17:52:10.000+02:00
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ project_id = "your-project-id" # can be found in the URL of the web application
 
 client = Client(user_name, password, project_id)
 # if you run the application locally, please use the following instead:
-# client = Client(username, password, project_id, uri="http://localhost:4455")
+# client = Client(user_name, password, project_id, uri="http://localhost:4455")
 ```
 
 The `project_id` can be found in your browser, e.g. if you run the app on your localhost: `http://localhost:4455/app/projects/{project_id}/overview`
diff --git a/refinery/adapter/sklearn.py b/refinery/adapter/sklearn.py
@@ -9,6 +9,7 @@ def build_classification_dataset(
     sentence_input: str,
     classification_label: str,
     config_string: Optional[str] = None,
+    num_train: Optional[int] = None,
 ) -> Dict[str, Dict[str, Any]]:
     """
     Builds a classification dataset from a refinery client and a config string.
@@ -18,22 +19,23 @@ def build_classification_dataset(
         sentence_input (str): Name of the column containing the sentence input.
         classification_label (str): Name of the label; if this is a task on the full record, enter the string with as "__<label>". Else, input it as "<attribute>__<label>".
         config_string (Optional[str], optional): Config string for the TransformerSentenceEmbedder. Defaults to None; if None is provided, the text will not be embedded.
+        num_train (Optional[int], optional): Number of training examples to use. Defaults to None; if None is provided, all examples will be used.
 
     Returns:
         Dict[str, Dict[str, Any]]: Containing the train and test datasets, with embedded inputs.
     """
 
-    df_test, df_train, _ = split_train_test_on_weak_supervision(
-        client, sentence_input, classification_label
+    df_train, df_test, _ = split_train_test_on_weak_supervision(
+        client, sentence_input, classification_label, num_train
     )
 
     if config_string is not None:
         embedder = TransformerSentenceEmbedder(config_string)
-        inputs_test = embedder.transform(df_test[sentence_input].tolist())
         inputs_train = embedder.transform(df_train[sentence_input].tolist())
+        inputs_test = embedder.transform(df_test[sentence_input].tolist())
     else:
-        inputs_test = df_test[sentence_input].tolist()
         inputs_train = df_train[sentence_input].tolist()
+        inputs_test = df_test[sentence_input].tolist()
 
     return {
         "train": {
diff --git a/refinery/adapter/util.py b/refinery/adapter/util.py
@@ -1,10 +1,10 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 from refinery import Client
 import pandas as pd
 
 
 def split_train_test_on_weak_supervision(
-    client: Client, _input: str, _label: str
+    client: Client, _input: str, _label: str, num_train: Optional[int] = None
 ) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
     """
     Puts the data into a train (weakly supervised data) and test set (manually labeled data).
@@ -14,6 +14,7 @@ def split_train_test_on_weak_supervision(
         client (Client): Refinery client
         _input (str): Name of the column containing the sentence input.
         _label (str): Name of the label; if this is a task on the full record, enter the string with as "__<label>". Else, input it as "<attribute>__<label>".
+        num_train (Optional[int], optional): Number of training examples to use. Defaults to None; if None is provided, all examples will be used.
 
     Returns:
         Tuple[pd.DataFrame, pd.DataFrame, List[str]]: Containing the train and test dataframes and the label name options.
@@ -22,19 +23,21 @@ def split_train_test_on_weak_supervision(
     label_attribute_train = f"{_label}__WEAK_SUPERVISION"
     label_attribute_test = f"{_label}__MANUAL"
 
-    df_train = client.get_record_export(
-        tokenize=False,
-        keep_attributes=[_input, label_attribute_train],
-        dropna=True,
-    ).rename(columns={label_attribute_train: "label"})
-
     df_test = client.get_record_export(
         tokenize=False,
         keep_attributes=[_input, label_attribute_test],
         dropna=True,
     ).rename(columns={label_attribute_test: "label"})
 
-    df_train = df_train.drop(df_test.index)
+    # Over-fetch by len(df_test) to offset the overlap drop; None presumably means no limit (confirm).
+    df_train = client.get_record_export(
+        tokenize=False,
+        keep_attributes=[_input, label_attribute_train],
+        dropna=True,
+        num_samples=None if num_train is None else num_train + len(df_test),
+    ).rename(columns={label_attribute_train: "label"})
+    # Remove overlapping data; the [:None] slice keeps every row when num_train is None
+    df_train = df_train.drop(df_test.index.intersection(df_train.index))[:num_train]
 
     label_options = list(
         set(df_test.label.unique().tolist() + df_train.label.unique().tolist())