From f08082898b31b2ad69237b1a92aa024a148425a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?=
Date: Sun, 4 Sep 2022 13:33:30 +0200
Subject: [PATCH 1/3] adds automated huggingface dataset creation

---
 refinery/__init__.py             | 12 +++++++-
 refinery/adapter/transformers.py | 52 ++++++++++++++++++++++++++++++++
 requirements.txt                 | 14 ++++-----
 3 files changed, 70 insertions(+), 8 deletions(-)
 create mode 100644 refinery/adapter/transformers.py

diff --git a/refinery/__init__.py b/refinery/__init__.py
index 6e70e9a..d2b030d 100644
--- a/refinery/__init__.py
+++ b/refinery/__init__.py
@@ -111,6 +111,8 @@ def get_record_export(
         num_samples: Optional[int] = None,
         download_to: Optional[str] = None,
         tokenize: Optional[bool] = True,
+        keep_attributes: Optional[List[str]] = None,
+        dropna: Optional[bool] = False,
     ) -> pd.DataFrame:
         """Collects the export data of your project (i.e. the same data if you would export in the web app).
 
@@ -155,6 +157,12 @@
                 "There are no attributes that can be tokenized in this project."
             )
 
+        if keep_attributes is not None:
+            df = df[keep_attributes]
+
+        if dropna:
+            df = df.dropna()
+
         if download_to is not None:
            df.to_json(download_to, orient="records")
            msg.good(f"Downloaded export to {download_to}")
@@ -263,7 +271,9 @@ def __monitor_task(self, upload_task_id: str) -> None:
         if print_success_message:
             msg.good("File upload successful.")
         else:
-            msg.fail("Upload failed. Please look into the UI notification center for more details.")
+            msg.fail(
+                "Upload failed. Please look into the UI notification center for more details."
+            )
 
     def __get_task(self, upload_task_id: str) -> Dict[str, Any]:
         api_response = api_calls.get_request(
diff --git a/refinery/adapter/transformers.py b/refinery/adapter/transformers.py
new file mode 100644
index 0000000..a4d776c
--- /dev/null
+++ b/refinery/adapter/transformers.py
@@ -0,0 +1,52 @@
+from refinery import Client
+from datasets import load_dataset
+import os
+
+
+def build_dataset(client: Client, sentence_input, classification_label):
+
+    label_manual = f"{classification_label}__MANUAL"
+    manual_data = client.get_record_export(
+        tokenize=False, keep_attributes=[sentence_input, label_manual], dropna=True
+    ).rename(columns={label_manual: "label"})
+
+    label_weakly_supervised = f"{classification_label}__WEAK_SUPERVISION"
+    weakly_supervised_data = client.get_record_export(
+        tokenize=False,
+        keep_attributes=[sentence_input, label_weakly_supervised],
+        dropna=True,
+    ).rename(columns={label_weakly_supervised: "label"})
+
+    weakly_supervised_data = weakly_supervised_data.drop(manual_data.index)
+
+    labels = list(
+        set(
+            manual_data.label.unique().tolist()
+            + weakly_supervised_data.label.unique().tolist()
+        )
+    )
+
+    mapping = {k: v for v, k in enumerate(labels)}
+
+    manual_data["label"] = manual_data["label"].apply(lambda x: mapping[x])
+    weakly_supervised_data["label"] = weakly_supervised_data["label"].apply(
+        lambda x: mapping[x]
+    )
+
+    train_file_path = f"{hash(label_weakly_supervised)}_train_file.csv"
+    test_file_path = f"{hash(label_manual)}_test_file.csv"
+
+    manual_data.to_csv(test_file_path, index=False)
+    weakly_supervised_data.to_csv(train_file_path, index=False)
+
+    dataset = load_dataset(
+        "csv", data_files={"train": train_file_path, "test": test_file_path}
+    )
+
+    if os.path.exists(train_file_path):
+        os.remove(train_file_path)
+
+    if os.path.exists(test_file_path):
+        os.remove(test_file_path)
+
+    return dataset, mapping
diff --git a/requirements.txt b/requirements.txt
index 40ac53f..5a31ec3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
-numpy==1.22.3
-pandas==1.4.2
-requests==2.27.1
-boto3==1.24.26
-botocore==1.27.26
-spacy==3.3.1
-wasabi==0.9.1
\ No newline at end of file
+numpy
+pandas
+requests
+boto3
+botocore
+spacy
+wasabi
\ No newline at end of file
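Patch 1 adds the `keep_attributes` and `dropna` arguments to `get_record_export` without a usage example, so here is a minimal sketch of how they combine. The credentials, project ID, and column names are placeholders, and the `Client` constructor arguments are assumed from the SDK's README rather than confirmed by this patch:

```python
from refinery import Client

# Hypothetical connection details; point this at your own refinery project.
client = Client("your-username", "your-password", "your-project-id")

# keep_attributes restricts the exported DataFrame to the listed columns;
# dropna then removes every record missing a value in one of those columns.
df = client.get_record_export(
    tokenize=False,
    keep_attributes=["headline", "__clickbait__MANUAL"],
    dropna=True,
)
```

This is the same pattern the new `build_dataset` adapter uses internally to separate manually labeled records from weakly supervised ones.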
From fe2fb115f8e81e392297233aa5b122f04b12f719 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?=
Date: Mon, 5 Sep 2022 10:09:39 +0200
Subject: [PATCH 2/3] upgrade version

---
 README.md                        |  2 +-
 refinery/adapter/embedders.py    | 27 ++++++++++++++++++++++++
 refinery/adapter/transformers.py | 28 +++++++------------------
 refinery/adapter/util.py         | 35 ++++++++++++++++++++++++++++++++
 requirements.txt                 |  4 +++-
 setup.py                         | 18 ++++++++--------
 6 files changed, 83 insertions(+), 31 deletions(-)
 create mode 100644 refinery/adapter/embedders.py
 create mode 100644 refinery/adapter/util.py

diff --git a/README.md b/README.md
index 41d5592..1327b99 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![refinery repository](https://uploads-ssl.webflow.com/61e47fafb12bd56b40022a49/62cf1c3cb8272b1e9c01127e_refinery%20sdk%20banner.png)](https://github.com/code-kern-ai/refinery)
 [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/)
-[![pypi 1.0.2](https://img.shields.io/badge/pypi-1.0.2-yellow.svg)](https://pypi.org/project/refinery-python-sdk/1.0.2/)
+[![pypi 1.1.0](https://img.shields.io/badge/pypi-1.1.0-yellow.svg)](https://pypi.org/project/refinery-python-sdk/1.1.0/)
 
 This is the official Python SDK for [*refinery*](https://github.com/code-kern-ai/refinery), the **open-source** data-centric IDE for NLP.
diff --git a/refinery/adapter/embedders.py b/refinery/adapter/embedders.py
new file mode 100644
index 0000000..099562d
--- /dev/null
+++ b/refinery/adapter/embedders.py
@@ -0,0 +1,27 @@
+from embedders.classification.contextual import TransformerSentenceEmbedder
+from refinery.adapter.util import split_train_test_on_weak_supervision
+
+
+def build_classification_dataset(
+    client, sentence_input, classification_label, config_string
+):
+    embedder = TransformerSentenceEmbedder(config_string)
+
+    manual_data, weakly_supervised_data, labels = split_train_test_on_weak_supervision(
+        client, sentence_input, classification_label
+    )
+
+    weakly_supervised_data = weakly_supervised_data.head(100)
+
+    embeddings_test = embedder.transform(manual_data[sentence_input].tolist())
+    embeddings_train = embedder.transform(
+        weakly_supervised_data[sentence_input].tolist()
+    )
+
+    return {
+        "train": {
+            "inputs": embeddings_train,
+            "labels": weakly_supervised_data["label"],
+        },
+        "test": {"inputs": embeddings_test, "labels": manual_data["label"]},
+    }
diff --git a/refinery/adapter/transformers.py b/refinery/adapter/transformers.py
index a4d776c..74cf370 100644
--- a/refinery/adapter/transformers.py
+++ b/refinery/adapter/transformers.py
@@ -1,31 +1,17 @@
+import os
 from refinery import Client
+from refinery.adapter.util import get_label_names, split_train_test_on_weak_supervision
 from datasets import load_dataset
-import os
 
 
-def build_dataset(client: Client, sentence_input, classification_label):
-
-    label_manual = f"{classification_label}__MANUAL"
-    manual_data = client.get_record_export(
-        tokenize=False, keep_attributes=[sentence_input, label_manual], dropna=True
-    ).rename(columns={label_manual: "label"})
+def build_classification_dataset(client: Client, sentence_input, classification_label):
 
-    label_weakly_supervised = f"{classification_label}__WEAK_SUPERVISION"
-    weakly_supervised_data = client.get_record_export(
-        tokenize=False,
-        keep_attributes=[sentence_input, label_weakly_supervised],
-        dropna=True,
-    ).rename(columns={label_weakly_supervised: "label"})
-
-    weakly_supervised_data = weakly_supervised_data.drop(manual_data.index)
-
-    labels = list(
-        set(
-            manual_data.label.unique().tolist()
-            + weakly_supervised_data.label.unique().tolist()
-        )
+    manual_data, weakly_supervised_data, labels = split_train_test_on_weak_supervision(
+        client, sentence_input, classification_label
     )
 
+    label_manual, label_weakly_supervised = get_label_names(classification_label)
+
     mapping = {k: v for v, k in enumerate(labels)}
 
     manual_data["label"] = manual_data["label"].apply(lambda x: mapping[x])
diff --git a/refinery/adapter/util.py b/refinery/adapter/util.py
new file mode 100644
index 0000000..a6a4199
--- /dev/null
+++ b/refinery/adapter/util.py
@@ -0,0 +1,35 @@
+def get_label_names(_label):
+    label_manual = f"{_label}__MANUAL"
+    label_weakly_supervised = f"{_label}__WEAK_SUPERVISION"
+    return label_manual, label_weakly_supervised
+
+
+def split_train_test_on_weak_supervision(client, _input, _label):
+
+    label_manual, label_weakly_supervised = get_label_names(_label)
+    manual_data = client.get_record_export(
+        tokenize=False,
+        keep_attributes=[_input, label_manual],
+        dropna=True,
+    ).rename(columns={label_manual: "label"})
+
+    weakly_supervised_data = client.get_record_export(
+        tokenize=False,
+        keep_attributes=[_input, label_weakly_supervised],
+        dropna=True,
+    ).rename(columns={label_weakly_supervised: "label"})
+
+    weakly_supervised_data = weakly_supervised_data.drop(manual_data.index)
+
+    labels = list(
+        set(
+            manual_data.label.unique().tolist()
+            + weakly_supervised_data.label.unique().tolist()
+        )
+    )
+
+    return (
+        manual_data.reset_index(drop=True),
+        weakly_supervised_data.reset_index(drop=True),
+        labels,
+    )
diff --git a/requirements.txt b/requirements.txt
index 5a31ec3..5044a0f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,6 @@ requests
 boto3
 botocore
 spacy
-wasabi
\ No newline at end of file
+wasabi
+embedders
+datasets
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 00e78b8..b5122b1 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 setup(
     name="refinery-python-sdk",
-    version="1.0.2",
+    version="1.1.0",
     author="jhoetter",
     author_email="johannes.hoetter@kern.ai",
     description="Official Python SDK for Kern AI refinery.",
@@ -34,13 +34,15 @@
     package_dir={"": "."},
     packages=find_packages("."),
     install_requires=[
-        "numpy==1.22.3",
-        "pandas==1.4.2",
-        "requests==2.27.1",
-        "boto3==1.24.26",
-        "botocore==1.27.26",
-        "spacy==3.3.1",
-        "wasabi==0.9.1",
+        "numpy",
+        "pandas",
+        "requests",
+        "boto3",
+        "botocore",
+        "spacy",
+        "wasabi",
+        "embedders",
+        "datasets",
     ],
     entry_points={
         "console_scripts": [
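Patch 2 factors the train/test split into `refinery/adapter/util.py` but leaves it undocumented; the sketch below shows what a caller gets back, assuming a `client` configured as above and a labeling task named `__clickbait`. The column and task names are illustrative:

```python
from refinery.adapter.util import split_train_test_on_weak_supervision

# manual_data holds records with human labels (used as the test set),
# weakly_supervised_data holds weakly supervised records minus anything
# already labeled manually (used as the train set), and labels is the
# deduplicated union of label names found in both frames.
manual_data, weakly_supervised_data, labels = split_train_test_on_weak_supervision(
    client, "headline", "__clickbait"
)

# The adapters then turn label names into integer ids, mirroring the
# mapping built in refinery/adapter/transformers.py:
mapping = {k: v for v, k in enumerate(labels)}
```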
f"{classification_label}__WEAK_SUPERVISION" - weakly_supervised_data = client.get_record_export( - tokenize=False, - keep_attributes=[sentence_input, label_weakly_supervised], - dropna=True, - ).rename(columns={label_weakly_supervised: "label"}) - - weakly_supervised_data = weakly_supervised_data.drop(manual_data.index) - - labels = list( - set( - manual_data.label.unique().tolist() - + weakly_supervised_data.label.unique().tolist() - ) + manual_data, weakly_supervised_data, labels = split_train_test_on_weak_supervision( + client, sentence_input, classification_label ) + label_manual, label_weakly_supervised = get_label_names(classification_label) + mapping = {k: v for v, k in enumerate(labels)} manual_data["label"] = manual_data["label"].apply(lambda x: mapping[x]) diff --git a/refinery/adapter/util.py b/refinery/adapter/util.py new file mode 100644 index 0000000..a6a4199 --- /dev/null +++ b/refinery/adapter/util.py @@ -0,0 +1,35 @@ +def get_label_names(_label): + label_manual = f"{_label}__MANUAL" + label_weakly_supervised = f"{_label}__WEAK_SUPERVISION" + return label_manual, label_weakly_supervised + + +def split_train_test_on_weak_supervision(client, _input, _label): + + label_manual, label_weakly_supervised = get_label_names(_label) + manual_data = client.get_record_export( + tokenize=False, + keep_attributes=[_input, label_manual], + dropna=True, + ).rename(columns={label_manual: "label"}) + + weakly_supervised_data = client.get_record_export( + tokenize=False, + keep_attributes=[_input, label_weakly_supervised], + dropna=True, + ).rename(columns={label_weakly_supervised: "label"}) + + weakly_supervised_data = weakly_supervised_data.drop(manual_data.index) + + labels = list( + set( + manual_data.label.unique().tolist() + + weakly_supervised_data.label.unique().tolist() + ) + ) + + return ( + manual_data.reset_index(drop=True), + weakly_supervised_data.reset_index(drop=True), + labels, + ) diff --git a/requirements.txt b/requirements.txt index 5a31ec3..5044a0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,6 @@ requests boto3 botocore spacy -wasabi \ No newline at end of file +wasabi +embedders +datasets \ No newline at end of file diff --git a/setup.py b/setup.py index 00e78b8..b5122b1 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="refinery-python-sdk", - version="1.0.2", + version="1.1.0", author="jhoetter", author_email="johannes.hoetter@kern.ai", description="Official Python SDK for Kern AI refinery.", @@ -34,13 +34,15 @@ package_dir={"": "."}, packages=find_packages("."), install_requires=[ - "numpy==1.22.3", - "pandas==1.4.2", - "requests==2.27.1", - "boto3==1.24.26", - "botocore==1.27.26", - "spacy==3.3.1", - "wasabi==0.9.1", + "numpy", + "pandas", + "requests", + "boto3", + "botocore", + "spacy", + "wasabi", + "embedders", + "datasets", ], entry_points={ "console_scripts": [ From e7751fbc8e482af5acdf1def4672235c833d90b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Mon, 5 Sep 2022 10:58:17 +0200 Subject: [PATCH 3/3] update README and refactor --- README.md | 73 ++++++++++++++++++++++++++++++++ refinery/adapter/embedders.py | 27 ------------ refinery/adapter/sklearn.py | 44 +++++++++++++++++++ refinery/adapter/transformers.py | 37 ++++++++++------ refinery/adapter/util.py | 54 ++++++++++++++--------- 5 files changed, 173 insertions(+), 62 deletions(-) delete mode 100644 refinery/adapter/embedders.py create mode 100644 refinery/adapter/sklearn.py diff --git a/README.md b/README.md index 1327b99..588c363 100644 --- 
diff --git a/README.md b/README.md
index 1327b99..588c363 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,8 @@ This is the official Python SDK for [*refinery*](https://github.com/code-kern-ai/refinery), the **open-source** data-centric IDE for NLP.
 - [Fetching lookup lists](#fetching-lookup-lists)
 - [Upload files](#upload-files)
 - [Adapters](#adapters)
+  - [🤗 Hugging Face](#-hugging-face)
+  - [Sklearn](#sklearn)
   - [Rasa](#rasa)
 - [What's missing?](#whats-missing)
 - [Roadmap](#roadmap)
@@ -120,6 +122,77 @@ Alternatively, you can `rsdk push ` via CLI, given that you h
 
 ### Adapters
 
+#### 🤗 Hugging Face
+Transformers are great, but oftentimes you want to fine-tune them for your downstream task. With *refinery*, you can do so easily by letting the SDK build a dataset for you that serves as a plug-and-play base for your training:
+
+```python
+from refinery.adapter import transformers
+dataset, mapping = transformers.build_classification_dataset(client, "headline", "__clickbait")
+```
+
+From here, you can follow the [fine-tuning example](https://huggingface.co/docs/transformers/training) provided in the official Hugging Face documentation. A next step could look as follows:
+
+```python
+small_train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
+small_eval_dataset = dataset["test"].shuffle(seed=42).select(range(1000))
+
+from transformers import (
+    AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+)
+import numpy as np
+from datasets import load_metric
+
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+def tokenize_function(examples):
+    return tokenizer(examples["headline"], padding="max_length", truncation=True)
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+training_args = TrainingArguments(output_dir="test_trainer")
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+
+small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=small_train_dataset,
+    eval_dataset=small_eval_dataset,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+
+trainer.save_model("path/to/model")
+```
+
+#### Sklearn
+You can use *refinery* to directly pull data into a format you can use to build [sklearn](https://github.com/scikit-learn/scikit-learn) models. This can look as follows:
+
+```python
+from refinery.adapter.sklearn import build_classification_dataset
+from sklearn.tree import DecisionTreeClassifier
+
+data = build_classification_dataset(client, "headline", "__clickbait", "distilbert-base-uncased")
+
+clf = DecisionTreeClassifier()
+clf.fit(data["train"]["inputs"], data["train"]["labels"])
+
+pred_test = clf.predict(data["test"]["inputs"])
+accuracy = (pred_test == data["test"]["labels"]).mean()
+```
+
+By the way, we highly recommend combining this with [Truss](https://github.com/basetenlabs/truss) for easy model serving!
+
 #### Rasa
 *refinery* is perfect to be used for building chatbots with [Rasa](https://github.com/RasaHQ/rasa). We've built an adapter with which you can easily create the required Rasa training data directly from *refinery*.
diff --git a/refinery/adapter/embedders.py b/refinery/adapter/embedders.py
deleted file mode 100644
index 099562d..0000000
--- a/refinery/adapter/embedders.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from embedders.classification.contextual import TransformerSentenceEmbedder
-from refinery.adapter.util import split_train_test_on_weak_supervision
-
-
-def build_classification_dataset(
-    client, sentence_input, classification_label, config_string
-):
-    embedder = TransformerSentenceEmbedder(config_string)
-
-    manual_data, weakly_supervised_data, labels = split_train_test_on_weak_supervision(
-        client, sentence_input, classification_label
-    )
-
-    weakly_supervised_data = weakly_supervised_data.head(100)
-
-    embeddings_test = embedder.transform(manual_data[sentence_input].tolist())
-    embeddings_train = embedder.transform(
-        weakly_supervised_data[sentence_input].tolist()
-    )
-
-    return {
-        "train": {
-            "inputs": embeddings_train,
-            "labels": weakly_supervised_data["label"],
-        },
-        "test": {"inputs": embeddings_test, "labels": manual_data["label"]},
-    }
diff --git a/refinery/adapter/sklearn.py b/refinery/adapter/sklearn.py
new file mode 100644
index 0000000..2f47e62
--- /dev/null
+++ b/refinery/adapter/sklearn.py
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional
+from embedders.classification.contextual import TransformerSentenceEmbedder
+from refinery import Client
+from refinery.adapter.util import split_train_test_on_weak_supervision
+
+
+def build_classification_dataset(
+    client: Client,
+    sentence_input: str,
+    classification_label: str,
+    config_string: Optional[str] = None,
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Builds a classification dataset from a refinery client and a config string.
+
+    Args:
+        client (Client): Refinery client
+        sentence_input (str): Name of the column containing the sentence input.
+        classification_label (str): Name of the label; if this is a task on the full record, enter the string with as "__
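The `mapping` returned by `transformers.build_classification_dataset` goes from label name to integer id, so decoding a fine-tuned model's predictions back to names just inverts it. A minimal sketch; the label names and prediction values here are made up for illustration:

```python
# mapping as returned by the adapter, e.g. {"clickbait": 0, "no clickbait": 1}
mapping = {"clickbait": 0, "no clickbait": 1}
inverse_mapping = {v: k for k, v in mapping.items()}

predictions = [0, 1, 1, 0]  # integer class ids from the fine-tuned model
predicted_labels = [inverse_mapping[p] for p in predictions]  # back to label names
```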