From bd76baf8d9e8c18db4a1c43c277b33b498daca82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Sun, 11 Sep 2022 15:21:41 +0200
Subject: [PATCH 1/6] adds option to add external information sources like
 model callbacks

---
 refinery/__init__.py            | 31 +++++++++++++++++++++++++++++++
 refinery/callbacks/inference.py | 31 ++++++++++++++++++++++++++++---
 refinery/settings.py            |  3 +++
 3 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/refinery/__init__.py b/refinery/__init__.py
index b477ed6..1a823fe 100644
--- a/refinery/__init__.py
+++ b/refinery/__init__.py
@@ -183,6 +183,37 @@ def get_record_export(
             msg.good(f"Downloaded export to {download_to}")
         return df
 
+    def post_associations(
+        self,
+        associations,
+        indices,
+        name,
+        label_task_name,
+        source_type: Optional[str] = "heuristic",
+    ):
+        """Posts associations to the server.
+
+        Args:
+            associations (List[Dict[str, str]]): List of associations to post.
+            indices (List[str]): List of indices to post to.
+            name (str): Name of the association set.
+            label_task_name (str): Name of the label task.
+            source_type (Optional[str], optional): Source type of the associations. Defaults to "heuristic".
+        """
+        url = settings.get_associations_url(self.project_id)
+        api_response = api_calls.post_request(
+            url,
+            {
+                "associations": associations,
+                "indices": indices,
+                "name": name,
+                "label_task_name": label_task_name,
+                "source_type": source_type,
+            },
+            self.session_token,
+        )
+        return api_response
+
     def post_file_import(
         self, path: str, import_file_options: Optional[str] = ""
     ) -> bool:
diff --git a/refinery/callbacks/inference.py b/refinery/callbacks/inference.py
index 42bc870..7c4c340 100644
--- a/refinery/callbacks/inference.py
+++ b/refinery/callbacks/inference.py
@@ -1,11 +1,30 @@
+from typing import Callable, Optional
 import pandas as pd
-from refinery import exceptions
+from refinery import Client, exceptions
 
 
 class ModelCallback:
     def __init__(
-        self, client, inference_fn, preprocessing_fn=None, postprocessing_fn=None
+        self,
+        model_name: str,
+        label_task_name: str,
+        inference_fn: Callable,
+        client: Client,
+        preprocessing_fn: Optional[Callable] = None,
+        postprocessing_fn: Optional[Callable] = None,
     ):
+        """
+
+        Args:
+            model_name (str): Name of the model (as an idenfitier in refinery)
+            label_task_name (str): Name of the label task (from refinery)
+            inference_fn (Callable): Function to predict the output
+            client (Client): Refinery client
+            preprocessing_fn (Optional[Callable], optional): Function to apply preprocessing to your inputs. Defaults to None.
+            postprocessing_fn (Optional[Callable], optional): Function to apply postprocessing to the inference function's output. Defaults to None.
+        """
+        self.model_name = model_name
+        self.label_task_name = label_task_name
         self.client = client
         self.inference_fn = inference_fn
         self.preprocessing_fn = preprocessing_fn
@@ -37,4 +56,10 @@ def run(self, inputs, indices):
             if self.postprocessing_fn is not None:
                 batched_outputs = self.postprocessing_fn(batched_outputs)
 
-            yield {"index": batched_indices, "associations": batched_outputs}
+            response = self.client.post_associations(
+                batched_outputs,
+                batched_indices,
+                self.model_name,
+                self.label_task_name,
+                "model_callback",
+            )
diff --git a/refinery/settings.py b/refinery/settings.py
index 64c7c5a..65075f4 100644
--- a/refinery/settings.py
+++ b/refinery/settings.py
@@ -43,6 +43,9 @@ def get_export_url(project_id: str) -> str:
 def get_import_url(project_id: str) -> str:
     return f"{get_project_url(project_id)}/import"
 
+def get_associations_url(project_id: str) -> str:
+    return f"{get_project_url(project_id)}/associations"
+
 
 def get_base_config(project_id: str) -> str:
     return f"{get_project_url(project_id)}/import/base_config"

From 8e3e8b1c611519f32c6848585b246ba290dcb907 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Mon, 12 Sep 2022 23:03:32 +0200
Subject: [PATCH 2/6] refactor model callback

---
 refinery/callbacks/inference.py | 72 +++++++++++++++++++++++++++------
 refinery/callbacks/sklearn.py   | 52 ++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 13 deletions(-)
 create mode 100644 refinery/callbacks/sklearn.py

diff --git a/refinery/callbacks/inference.py b/refinery/callbacks/inference.py
index 7c4c340..8d16f33 100644
--- a/refinery/callbacks/inference.py
+++ b/refinery/callbacks/inference.py
@@ -1,4 +1,5 @@
-from typing import Callable, Optional
+from email.generator import Generator
+from typing import Any, Callable, Dict, List, Optional
 import pandas as pd
 from refinery import Client, exceptions
 
@@ -6,40 +7,73 @@
 class ModelCallback:
     def __init__(
         self,
+        client: Client,
         model_name: str,
         label_task_name: str,
         inference_fn: Callable,
-        client: Client,
+        initialize_fn: Optional[Callable] = None,
         preprocessing_fn: Optional[Callable] = None,
         postprocessing_fn: Optional[Callable] = None,
+        **kwargs
     ):
         """
 
         Args:
-            model_name (str): Name of the model (as an idenfitier in refinery)
-            label_task_name (str): Name of the label task (from refinery)
-            inference_fn (Callable): Function to predict the output
             client (Client): Refinery client
-            preprocessing_fn (Optional[Callable], optional): Function to apply preprocessing to your inputs. Defaults to None.
-            postprocessing_fn (Optional[Callable], optional): Function to apply postprocessing to the inference function's output. Defaults to None.
+            model_name (str): Name of the model
+            label_task_name (str): Name of the label task
+            inference_fn (Callable): Function you want to apply for inference
+            initialize_fn (Optional[Callable], optional): Function to execute to compute internal states. Defaults to None.
+            preprocessing_fn (Optional[Callable], optional): Function to preprocess model inputs. Defaults to None.
+            postprocessing_fn (Optional[Callable], optional): Function to postprocess model outputs. Defaults to None.
         """
         self.model_name = model_name
         self.label_task_name = label_task_name
         self.client = client
         self.inference_fn = inference_fn
+        self.initialize_fn = initialize_fn
         self.preprocessing_fn = preprocessing_fn
         self.postprocessing_fn = postprocessing_fn
-
         self.primary_keys = client.get_primary_keys()
+        self.kwargs = kwargs
 
     @staticmethod
-    def __batch(documents):
+    def __batch(documents: List[Any]) -> Generator:
+        """Batch documents into chunks of BATCH_SIZE.
+
+        Args:
+            documents (List[Any]): List of documents
+
+        Yields:
+            Generator: Generator of batches
+        """
         BATCH_SIZE = 32
         length = len(documents)
         for idx in range(0, length, BATCH_SIZE):
             yield documents[idx : min(idx + BATCH_SIZE, length)]
 
-    def run(self, inputs, indices):
+    def initialize(
+        self, inputs: Optional[List[Any]], labels: Optional[List[Any]] = None
+    ) -> None:
+        """Initialize states for the computation.
+
+        Args:
+            inputs (Optional[List[Any]]): List of inputs; required argument, but may be None.
+            labels (Optional[List[Any]], optional): List of labels. Defaults to None.
+        """
+        if self.initialize_fn:
+            self.kwargs = self.initialize_fn(inputs, labels, **self.kwargs)
+
+    def run(self, inputs: List[Any], indices: List[Dict[str, Any]]) -> None:
+        """Run the pipeline and send the results to refinery.
+
+        Args:
+            inputs (List[Any]): List of inputs
+            indices (List[Dict[str, Any]]): List of indices
+
+        Raises:
+            exceptions.PrimaryKeyError: If the primary key is not found in the indices
+        """
         indices_df = pd.DataFrame(indices)
         if not all([key in indices_df.columns for key in self.primary_keys]):
             raise exceptions.PrimaryKeyError("Errorneous primary keys given for index.")
@@ -49,17 +83,29 @@ def run(self, inputs, indices):
             batched_indices = next(index_generator)
 
             if self.preprocessing_fn is not None:
-                batched_inputs = self.preprocessing_fn(batched_inputs)
+                batched_inputs = self.preprocessing_fn(batched_inputs, **self.kwargs)
 
             batched_outputs = self.inference_fn(batched_inputs)
 
             if self.postprocessing_fn is not None:
-                batched_outputs = self.postprocessing_fn(batched_outputs)
+                batched_outputs = self.postprocessing_fn(batched_outputs, **self.kwargs)
 
-            response = self.client.post_associations(
+            self.client.post_associations(
                 batched_outputs,
                 batched_indices,
                 self.model_name,
                 self.label_task_name,
                 "model_callback",
             )
+
+    def initialize_and_run(
+        self, inputs: List[Any], indices: List[Dict[str, Any]]
+    ) -> None:
+        """Initialize and run the pipeline.
+
+        Args:
+            inputs (List[Any]): List of inputs
+            indices (List[Dict[str, Any]]): List of indices
+        """
+        self.initialize(inputs)
+        self.run(inputs, indices)
diff --git a/refinery/callbacks/sklearn.py b/refinery/callbacks/sklearn.py
new file mode 100644
index 0000000..165ab69
--- /dev/null
+++ b/refinery/callbacks/sklearn.py
@@ -0,0 +1,52 @@
+from typing import Optional, List, Any, Dict, Callable
+from refinery import Client
+from refinery.callbacks.inference import ModelCallback
+from sklearn.base import BaseEstimator
+
+
+def initialize_fn(inputs, labels, **kwargs):
+    return {"clf": kwargs["clf"]}
+
+
+def postprocessing_fn(outputs, **kwargs):
+    named_outputs = []
+    for prediction in outputs:
+        pred_index = prediction.argmax()
+        label = kwargs["clf"].classes_[pred_index]
+        confidence = prediction[pred_index]
+        named_outputs.append([label, confidence])
+    return named_outputs
+
+
+class SklearnCallback(ModelCallback):
+    def __init__(
+        self,
+        client: Client,
+        sklearn_model: BaseEstimator,
+        labeling_task_name: str,
+    ) -> None:
+        """Callback for sklearn models.
+
+        Args:
+            client (Client): Refinery client
+            sklearn_model (BaseEstimator): Sklearn model
+            labeling_task_name (str): Name of the labeling task
+        """
+
+        super().__init__(
+            client,
+            sklearn_model.__class__.__name__,
+            labeling_task_name,
+            inference_fn=sklearn_model.predict_proba,
+            initialize_fn=initialize_fn,
+            postprocessing_fn=postprocessing_fn,
+        )
+        self.sklearn_model = sklearn_model
+        self.initialized = False
+        self.kwargs = {"clf": self.sklearn_model}
+
+    def run(self, inputs: List[Any], indices: List[Dict[str, Any]]) -> None:
+        if not self.initialized:
+            self.initialize(None, None)
+            self.initialized = True
+        super().run(inputs, indices)

From 7f8eb759e71526eee59e6afcde31f4554566b59e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Mon, 12 Sep 2022 23:44:11 +0200
Subject: [PATCH 3/6] adds pytorch adapter

---
 refinery/adapter/torch.py | 66 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 refinery/adapter/torch.py

diff --git a/refinery/adapter/torch.py b/refinery/adapter/torch.py
new file mode 100644
index 0000000..b200672
--- /dev/null
+++ b/refinery/adapter/torch.py
@@ -0,0 +1,66 @@
+import numpy as np
+import torch
+from torch.utils.data import Dataset, DataLoader
+from sklearn import preprocessing
+from .sklearn import (
+    build_classification_dataset as sklearn_build_classification_dataset,
+)
+from typing import Any, Dict, Optional, Tuple
+from refinery import Client
+
+
+class Data(Dataset):
+    def __init__(self, X, y, encoder):
+        # need to convert float64 to float32 else
+        # will get the following error
+        # RuntimeError: expected scalar type Double but found Float
+        self.X = torch.FloatTensor(X)
+        # need to convert float64 to Long else
+        # will get the following error
+        # RuntimeError: expected scalar type Long but found Float
+        y_encoded = encoder.transform(y.values)
+        self.y = torch.from_numpy(y_encoded).type(torch.LongTensor)
+        self.len = self.X.shape[0]
+
+    def __getitem__(self, index):
+        return self.X[index], self.y[index]
+
+    def __len__(self):
+        return self.len
+
+
+def build_classification_dataset(
+    client: Client,
+    sentence_input: str,
+    classification_label: str,
+    config_string: Optional[str] = None,
+    num_train: Optional[int] = None,
+    batch_size: Optional[int] = 32,
+) -> Tuple[DataLoader, DataLoader, np.array]:
+    """
+    Builds a classification dataset from a refinery client and a config string.
+
+    Args:
+        client (Client): Refinery client
+        sentence_input (str): Name of the column containing the sentence input.
+        classification_label (str): Name of the label; if this is a task on the full record, enter the string as "__<label>". Else, input it as "<attribute>__<label>".
+        config_string (Optional[str], optional): Config string for the TransformerSentenceEmbedder. Defaults to None; if None is provided, the text will not be embedded.
+        num_train (Optional[int], optional): Number of training examples to use. Defaults to None; if None is provided, all examples will be used.
+
+    Returns:
+        Dict[str, Dict[str, Any]]: Containing the train and test datasets, with embedded inputs.
+    """
+    data = sklearn_build_classification_dataset(
+        client, sentence_input, classification_label, config_string, num_train
+    )
+
+    le = preprocessing.LabelEncoder()
+    le.fit(data["train"]["labels"].values)
+
+    train_data = Data(data["train"]["inputs"], data["train"]["labels"], le)
+    test_data = Data(data["test"]["inputs"], data["test"]["labels"], le)
+
+    train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
+    test_loader = DataLoader(dataset=test_data, batch_size=batch_size)
+
+    return train_loader, test_loader, le.classes_

From 7d1414a16f9e3fbd88c11e307961107d57fdfc10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Tue, 13 Sep 2022 00:14:39 +0200
Subject: [PATCH 4/6] adds pytorch callback

---
 refinery/adapter/torch.py     |  8 +++--
 refinery/callbacks/sklearn.py |  2 +-
 refinery/callbacks/torch.py   | 55 +++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 4 deletions(-)
 create mode 100644 refinery/callbacks/torch.py

diff --git a/refinery/adapter/torch.py b/refinery/adapter/torch.py
index b200672..c516893 100644
--- a/refinery/adapter/torch.py
+++ b/refinery/adapter/torch.py
@@ -36,7 +36,7 @@ def build_classification_dataset(
     config_string: Optional[str] = None,
     num_train: Optional[int] = None,
     batch_size: Optional[int] = 32,
-) -> Tuple[DataLoader, DataLoader, np.array]:
+) -> Tuple[DataLoader, DataLoader, preprocessing.LabelEncoder]:
     """
     Builds a classification dataset from a refinery client and a config string.
 
@@ -48,7 +48,7 @@ def build_classification_dataset(
         num_train (Optional[int], optional): Number of training examples to use. Defaults to None; if None is provided, all examples will be used.
 
     Returns:
-        Dict[str, Dict[str, Any]]: Containing the train and test datasets, with embedded inputs.
+        Tuple[DataLoader, DataLoader, preprocessing.LabelEncoder]: Tuple of train and test dataloaders, and the label encoder.
     """
     data = sklearn_build_classification_dataset(
         client, sentence_input, classification_label, config_string, num_train
@@ -63,4 +63,6 @@ def build_classification_dataset(
     train_loader = DataLoader(dataset=train_data, batch_size=batch_size)
     test_loader = DataLoader(dataset=test_data, batch_size=batch_size)
 
-    return train_loader, test_loader, le.classes_
+    index = {"train": data["train"]["index"], "test": data["test"]["index"]}
+
+    return train_loader, test_loader, le, index
diff --git a/refinery/callbacks/sklearn.py b/refinery/callbacks/sklearn.py
index 165ab69..a0d5b65 100644
--- a/refinery/callbacks/sklearn.py
+++ b/refinery/callbacks/sklearn.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Any, Dict, Callable
+from typing import List, Any, Dict
 from refinery import Client
 from refinery.callbacks.inference import ModelCallback
 from sklearn.base import BaseEstimator
diff --git a/refinery/callbacks/torch.py b/refinery/callbacks/torch.py
new file mode 100644
index 0000000..6d5d17a
--- /dev/null
+++ b/refinery/callbacks/torch.py
@@ -0,0 +1,55 @@
+from typing import List, Any, Dict
+from refinery import Client
+from refinery.callbacks.inference import ModelCallback
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from sklearn import preprocessing
+
+
+def initialize_fn(inputs, labels, **kwargs):
+    return {"encoder": kwargs["encoder"]}
+
+
+def postprocessing_fn(outputs, **kwargs):
+    named_outputs = []
+    pred_argindices = outputs.argmax(axis=1)
+    for predindex, pred_argindex in enumerate(pred_argindices):
+        label = kwargs["encoder"].classes_[pred_argindex]
+        confidence = outputs[predindex][pred_argindex].tolist()
+        named_outputs.append([label, confidence])
+    return named_outputs
+
+
+class TorchCallback(ModelCallback):
+    def __init__(
+        self,
+        client: Client,
+        torch_model: nn.Module,
+        labeling_task_name: str,
+        encoder: preprocessing.LabelEncoder,
+    ) -> None:
+        """Callback for PyTorch models.
+
+        Args:
+            client (Client): Refinery client
+            torch_model (nn.Module): PyTorch model
+            labeling_task_name (str): Name of the labeling task
+        """
+
+        super().__init__(
+            client,
+            torch_model.__class__.__name__,
+            labeling_task_name,
+            inference_fn=torch_model.forward,
+            initialize_fn=initialize_fn,
+            postprocessing_fn=postprocessing_fn,
+        )
+        self.torch_model = torch_model
+        self.initialized = False
+        self.kwargs = {"encoder": encoder}
+
+    def run(self, loader: DataLoader, indices: List[Dict[str, Any]]) -> None:
+        if not self.initialized:
+            self.initialize(None, None)
+            self.initialized = True
+        super().run(loader.dataset.X, indices)

From aceb6d760e361917f488d83c1c0df44461a5437d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Tue, 13 Sep 2022 00:53:03 +0200
Subject: [PATCH 5/6] adds huggingface callback

---
 refinery/adapter/transformers.py   | 15 ++++++---
 refinery/callbacks/transformers.py | 53 ++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 refinery/callbacks/transformers.py

diff --git a/refinery/adapter/transformers.py b/refinery/adapter/transformers.py
index 889529f..3269a23 100644
--- a/refinery/adapter/transformers.py
+++ b/refinery/adapter/transformers.py
@@ -1,11 +1,15 @@
 import os
+from typing import Optional
 from refinery import Client
 from refinery.adapter.util import split_train_test_on_weak_supervision
 from datasets import load_dataset
 
 
 def build_classification_dataset(
-    client: Client, sentence_input: str, classification_label: str
+    client: Client,
+    sentence_input: str,
+    classification_label: str,
+    num_train: Optional[int] = 100,
 ):
     """Build a classification dataset from a refinery client and a config string useable for HuggingFace finetuning.
 
@@ -24,7 +28,7 @@ def build_classification_dataset(
         label_options,
         primary_keys,
     ) = split_train_test_on_weak_supervision(
-        client, sentence_input, classification_label
+        client, sentence_input, classification_label, num_train
     )
 
     mapping = {k: v for v, k in enumerate(label_options)}
@@ -49,6 +53,9 @@ def build_classification_dataset(
     if os.path.exists(test_file_path):
         os.remove(test_file_path)
 
-    index = {"train": df_train[primary_keys], "test": df_test[primary_keys]}
+    index = {
+        "train": df_train[primary_keys].to_dict(orient="records"),
+        "test": df_test[primary_keys].to_dict(orient="records"),
+    }
 
-    return dataset, mapping, index
+    return dataset, {f"LABEL_{value}": key for key, value in mapping.items()}, index
diff --git a/refinery/callbacks/transformers.py b/refinery/callbacks/transformers.py
new file mode 100644
index 0000000..bca64fe
--- /dev/null
+++ b/refinery/callbacks/transformers.py
@@ -0,0 +1,53 @@
+from typing import List, Any, Dict
+from refinery import Client
+from refinery.callbacks.inference import ModelCallback
+from transformers import pipeline
+
+
+def initialize_fn(inputs, labels, **kwargs):
+    return {"mapping": kwargs["mapping"]}
+
+
+def postprocessing_fn(outputs, **kwargs):
+    named_outputs = []
+    for prediction in outputs:
+        label = kwargs["mapping"][prediction["label"]]
+        confidence = prediction["score"]
+        named_output = [label, confidence]
+        named_outputs.append(named_output)
+    return named_outputs
+
+
+class TransformerCallback(ModelCallback):
+    def __init__(
+        self,
+        client: Client,
+        transformer_model: pipeline,
+        labeling_task_name: str,
+        mapping: Dict[str, str],
+    ) -> None:
+        """Callback for HuggingFace transformer pipelines.
+
+        Args:
+            client (Client): Refinery client
+            transformer_model (pipeline): HuggingFace pipeline used for inference
+            labeling_task_name (str): Name of the labeling task
+        """
+
+        super().__init__(
+            client,
+            transformer_model.__class__.__name__,
+            labeling_task_name,
+            inference_fn=transformer_model.__call__,
+            initialize_fn=initialize_fn,
+            postprocessing_fn=postprocessing_fn,
+        )
+        self.sklearn_model = transformer_model
+        self.initialized = False
+        self.kwargs = {"mapping": mapping}
+
+    def run(self, inputs: List[Any], indices: List[Dict[str, Any]]) -> None:
+        if not self.initialized:
+            self.initialize(None, None)
+            self.initialized = True
+        super().run(inputs, indices)

From f93f17dad01d14d99f80eb4b9449043bdc4ca338 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <johannes.hoetter@kern.ai>
Date: Tue, 13 Sep 2022 13:15:44 +0200
Subject: [PATCH 6/6] update README

---
 README.md | 228 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 195 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 4ab6d75..f2a1067 100644
--- a/README.md
+++ b/README.md
@@ -12,11 +12,14 @@ This is the official Python SDK for [*refinery*](https://github.com/code-kern-ai
   - [Fetching lookup lists](#fetching-lookup-lists)
   - [Upload files](#upload-files)
   - [Adapters](#adapters)
-    - [HuggingFace](#hugging-face)
-    - [Sklearn](#sklearn)
-    - [Rasa](#rasa)
-    - [What's missing?](#whats-missing)
-- [Roadmap](#roadmap)
+    - [Sklearn](#sklearn-adapter)
+    - [PyTorch](#pytorch-adapter)
+    - [HuggingFace](#hugging-face-adapter)
+    - [Rasa](#rasa-adapter)
+  - [Callbacks](#callbacks)
+    - [Sklearn](#sklearn-callback)
+    - [PyTorch](#pytorch-callback)
+    - [HuggingFace](#huggingface-callback)
 - [Contributing](#contributing)
 - [License](#license)
 - [Contact](#contact)
@@ -122,7 +125,35 @@ Alternatively, you can `rsdk push <path-to-your-file>` via CLI, given that you h
 
 ### Adapters
 
-#### Hugging Face
+#### Sklearn Adapter
+You can use *refinery* to directly pull data into a format you can apply for building [sklearn](https://github.com/scikit-learn/scikit-learn) models. This can look as follows:
+
+```python
+from refinery.adapter.sklearn import build_classification_dataset
+from sklearn.tree import DecisionTreeClassifier
+
+data = build_classification_dataset(client, "headline", "__clickbait", "distilbert-base-uncased")
+
+clf = DecisionTreeClassifier()
+clf.fit(data["train"]["inputs"], data["train"]["labels"])
+
+pred_test = clf.predict(data["test"]["inputs"])
+accuracy = (pred_test == data["test"]["labels"]).mean()
+```
+
+By the way, we highly recommend combining this with [Truss](https://github.com/basetenlabs/truss) for easy model serving!
+
+#### PyTorch Adapter
+If you want to build a [PyTorch](https://github.com/pytorch/pytorch) network, you can build the `train_loader` and `test_loader` as follows:
+
+```python
+from refinery.adapter.torch import build_classification_dataset
+train_loader, test_loader, encoder, index = build_classification_dataset(
+    client, "headline", "__clickbait", "distilbert-base-uncased"
+)
+```
+
+#### Hugging Face Adapter
 Transformers are great, but often times, you want to finetune them for your downstream task. With *refinery*, you can do so easily by letting the SDK build the dataset for you that you can use as a plug-and-play base for your training:
 
 ```python
@@ -175,25 +206,7 @@ trainer.train()
 trainer.save_model("path/to/model")
 ```
 
-#### Sklearn
-You can use *refinery* to directly pull data into a format you can apply for building [sklearn](https://github.com/scikit-learn/scikit-learn) models. This can look as follows:
-
-```python
-from refinery.adapter.sklearn import build_classification_dataset
-from sklearn.tree import DecisionTreeClassifier
-
-data = build_classification_dataset(client, "headline", "__clickbait", "distilbert-base-uncased")
-
-clf = DecisionTreeClassifier()
-clf.fit(data["train"]["inputs"], data["train"]["labels"])
-
-pred_test = clf.predict(data["test"]["inputs"])
-accuracy = (pred_test == data["test"]["labels"]).mean()
-```
-
-By the way, we can highly recommend to combine this with [Truss](https://github.com/basetenlabs/truss) for easy model serving!
-
-#### Rasa
+#### Rasa Adapter
 *refinery* is perfect to be used for building chatbots with [Rasa](https://github.com/RasaHQ/rasa). We've built an adapter with which you can easily create the required Rasa training data directly from *refinery*.
 
 To do so, do the following:
@@ -278,18 +291,167 @@ nlu:
 
 Please make sure to also create the further necessary files (`domain.yml`, `data/stories.yml` and `data/rules.yml`) if you want to train your Rasa chatbot. For further reference, see their [documentation](https://rasa.com/docs/rasa).
 
-#### What's missing?
-Let us know what open-source/closed-source NLP framework you are using, for which you'd like to have an adapter implemented in the SDK. To do so, simply create an issue in this repository with the tag "enhancement".
 
+### Callbacks
+If you want to feed your production model's predictions back into *refinery*, you can do so with any version greater than [1.2.1](https://github.com/code-kern-ai/refinery/releases/tag/v1.2.1).
 
-## Roadmap
-- [ ] Register heuristics via wrappers
-- [ ] Up/download zipped projects for versioning via DVC
-- [x] Add project upload
-- [x] Fetch project statistics
+To do so, we provide a generic interface and framework-specific classes.
 
+#### Sklearn Callback
+If you want to train a scikit-learn model and feed its outputs back into *refinery*, you can do so easily as follows:
+
+```python
+from sklearn.linear_model import LogisticRegression
+clf = LogisticRegression() # we use this as an example, but you can use any model implementing predict_proba
+
+from refinery.adapter.sklearn import build_classification_dataset
+data = build_classification_dataset(client, "headline", "__clickbait", "distilbert-base-uncased")
+clf.fit(data["train"]["inputs"], data["train"]["labels"])
+
+from refinery.callbacks.sklearn import SklearnCallback
+callback = SklearnCallback(
+    client, 
+    clf,
+    "clickbait", 
+)
+
+# executing this will call the refinery API with batches of size 32, so your data is pushed to the app
+callback.run(data["train"]["inputs"], data["train"]["index"])
+callback.run(data["test"]["inputs"], data["test"]["index"])
+```
+
+#### PyTorch Callback
+For PyTorch, the procedure is really similar. You can do as follows:
+
+```python
+from refinery.adapter.torch import build_classification_dataset
+train_loader, test_loader, encoder, index = build_classification_dataset(
+    client, "headline", "__clickbait", "distilbert-base-uncased"
+)
+
+# build your custom model and train it here - example:
+import torch.nn as nn
+import numpy as np
+import torch
+
+# number of features (len of X cols)
+input_dim = 768
+# number of hidden layers
+hidden_layers = 20
+# number of classes (unique of y)
+output_dim = 2
+class Network(nn.Module):
+    def __init__(self):
+        super(Network, self).__init__()
+        self.linear1 = nn.Linear(input_dim, output_dim)
+   
+    def forward(self, x):
+        x = torch.sigmoid(self.linear1(x))
+        return x
+    
+clf = Network()
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)
+
+epochs = 2
+for epoch in range(epochs):
+    running_loss = 0.0
+    for i, data in enumerate(train_loader, 0):
+        inputs, labels = data
+        # set optimizer to zero grad to remove previous epoch gradients
+        optimizer.zero_grad()
+        # forward propagation
+        outputs = clf(inputs)
+        loss = criterion(outputs, labels)
+        # backward propagation
+        loss.backward()
+        # optimize
+        optimizer.step()
+        running_loss += loss.item()
+        # display statistics
+        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.5f}')
+        running_loss = 0.0
+
+# with this model trained, you can use the callback
+from refinery.callbacks.torch import TorchCallback
+callback = TorchCallback(
+    client, 
+    clf,
+    "clickbait", 
+    encoder
+)
+
+# and just execute this 
+callback.run(train_loader, index["train"])
+callback.run(test_loader, index["test"])
+```
+
+#### HuggingFace Callback
+Collect the dataset and train your custom transformer model as follows:
+
+```python
+from refinery.adapter import transformers
+dataset, mapping, index = transformers.build_classification_dataset(client, "headline", "__clickbait")
+
+# train a model here, we're simplifying this by just using an existing model w/o retraining
+from transformers import pipeline
+pipe = pipeline("text-classification", model="distilbert-base-uncased")
+
+# if you're interested in what training looks like, see the Hugging Face adapter section above
+
+# you can now apply the callback
+from refinery.callbacks.transformers import TransformerCallback
+callback = TransformerCallback(
+    client, 
+    pipe,
+    "clickbait", 
+    mapping
+)
+
+callback.run(dataset["train"]["headline"], index["train"])
+callback.run(dataset["test"]["headline"], index["test"])
+```
+
+#### Generic Callback
+This one is your fallback if you have a very custom solution; other than that, we recommend you look into the framework-specific classes.
+
+```python
+from refinery.callbacks.inference import ModelCallback
+from refinery.adapter.sklearn import build_classification_dataset
+from sklearn.linear_model import LogisticRegression
+
+data = build_classification_dataset(client, "headline", "__clickbait", "distilbert-base-uncased")
+clf = LogisticRegression()
+clf.fit(data["train"]["inputs"], data["train"]["labels"])
+
+# you can build initialization functions that set states of objects you use in the pipeline
+def initialize_fn(inputs, labels, **kwargs):
+    return {"clf": kwargs["clf"]}
+
+# postprocessing shifts the model outputs into a format accepted by our API
+def postprocessing_fn(outputs, **kwargs):
+    named_outputs = []
+    for prediction in outputs:
+        pred_index = prediction.argmax()
+        label = kwargs["clf"].classes_[pred_index]
+        confidence = prediction[pred_index]
+        named_outputs.append([label, confidence])
+    return named_outputs
+
+callback = ModelCallback(
+    client,
+    "my-custom-regression",
+    "clickbait",
+    inference_fn=clf.predict_proba,
+    initialize_fn=initialize_fn,
+    postprocessing_fn=postprocessing_fn
+)
+
+# executing this will call the refinery API with batches of size 32
+callback.initialize_and_run(data["train"]["inputs"], data["train"]["index"])
+callback.run(data["test"]["inputs"], data["test"]["index"])
+```
 
-If you want to have something added, feel free to open an [issue](https://github.com/code-kern-ai/refinery-python-sdk/issues).
 
 ## Contributing
 Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.