Added the stopword removal transformation #268
README.md (new file):

# Stopword Removal

Removes stopwords from a piece of text.

Author: Juan Yi Loke
Email: juanyi.loke@mail.utoronto.ca
Affiliation: University of Toronto
Contributor comment: I believe you would need to add the Robustness Evaluation as per the instructions in the email :)
## What type of a transformation is this?

By default, this simple stopword removal parses a text, removes stopwords, and returns a detokenized version of the text, using NLTK's toktok tokenizer and treebank word detokenizer. All stopwords are drawn from NLTK's stopword list.

## What tasks does it intend to benefit?

Removing stopwords is often a key text-preprocessing step, reducing the amount of text data one has to deal with.

## What are the limitations of this transformation?

The set of stopwords is constrained to NLTK's stopword list. Other libraries such as spaCy or gensim may include or exclude certain stopwords that appear in NLTK's list. NLTK was chosen simply for its popularity relative to the other libraries.
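To illustrate the filtering step the README describes, here is a dependency-free sketch: a tiny hard-coded stopword set stands in for NLTK's English list, and `str.split()` stands in for the toktok tokenizer, so punctuation handling differs from the real transformation (punctuation stays attached to words here).

```python
# Minimal sketch of stopword filtering. The stopword set below is a
# small illustrative subset, NOT NLTK's full English list, and
# str.split() is a crude stand-in for ToktokTokenizer.
STOP_WORDS = {"a", "an", "the", "is", "this", "to", "that", "or", "not", "be"}

def remove_stopwords(text):
    # Keep only tokens whose lowercase form is not a stopword.
    kept = [tok for tok in text.split() if tok.lower() not in STOP_WORDS]
    return " ".join(kept)

print(remove_stopwords("This is a test."))  # -> "test."
print(remove_stopwords("To to to to"))      # -> "" (everything removed)
```

Because the filter compares each token's lowercase form against the set, case variants like "To" and "to" are removed alike, which is why a sentence made entirely of stopwords collapses to an empty string.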
__init__.py (new file):

```python
from .transformation import *
```
requirements.txt (new file):

```
nltk
```
test.json (new file):

```json
{
  "type": "stopword_removal",
  "test_cases": [
    {
      "class": "StopwordRemoval",
      "inputs": {"sentence": "This is a test."},
      "outputs": [{"sentence": "test."}]
    },
    {
      "class": "StopwordRemoval",
      "inputs": {"sentence": "To be or not to be, that is the question?"},
      "outputs": [{"sentence": ", question?"}]
    },
    {
      "class": "StopwordRemoval",
      "inputs": {"sentence": "OMG!!! jUSTin is AmAZEballs!!!"},
      "outputs": [{"sentence": "OMG!!! jUSTin AmAZEballs!!!"}]
    },
    {
      "class": "StopwordRemoval",
      "inputs": {"sentence": "To to to to"},
      "outputs": [{"sentence": ""}]
    },
    {
      "class": "StopwordRemoval",
      "inputs": {"sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."},
      "outputs": [{"sentence": "Neuroplasticity continuous processing allowing short-term, medium-term, long-term remodeling neuronosynaptic organization."}]
    }
  ]
}
```
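A test file in this shape can be consumed by a simple loop over `test_cases`. The sketch below is a hypothetical runner (the repository's real evaluation harness may differ), with a placeholder transform and a tiny stopword set standing in for the actual `StopwordRemoval` class:

```python
import json

# Hypothetical sketch of a runner for test cases in the format above;
# the repository's actual test harness may differ.
TEST_JSON = """
{
  "type": "stopword_removal",
  "test_cases": [
    {"class": "StopwordRemoval",
     "inputs": {"sentence": "This is a test."},
     "outputs": [{"sentence": "test."}]}
  ]
}
"""

def fake_transform(sentence):
    # Stand-in for StopwordRemoval.generate(); tiny illustrative
    # stopword set, not NLTK's full list.
    stop = {"this", "is", "a"}
    return " ".join(w for w in sentence.split() if w.lower() not in stop)

spec = json.loads(TEST_JSON)
for case in spec["test_cases"]:
    got = fake_transform(case["inputs"]["sentence"])
    expected = case["outputs"][0]["sentence"]
    assert got == expected, (got, expected)
print("all cases pass")
```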
transformation.py (new file):

```python
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


def stopword_remove(text):
    """
    Remove stopwords using a standard list comprehension.
    Assumes the input text is in the English language.
    Requires NLTK's 'stopwords' corpus (nltk.download('stopwords')).
    Returns a single-element list containing the detokenized text
    with stopwords removed.
    """
    stop_words = set(stopwords.words("english"))
    text_tokenized = ToktokTokenizer().tokenize(text)
    return [
        TreebankWordDetokenizer().detokenize(
            [word for word in text_tokenized if word.lower() not in stop_words]
        )
    ]


class StopwordRemoval(SentenceOperation):
    """
    This class offers a stopword removal transformation: it removes
    stopwords from a text. The stopwords used are those in NLTK's
    English stopword list.
    """

    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
    ]
    languages = ["en"]
    heavy = False

    def __init__(self, seed=0, max_outputs=1):
        super().__init__(seed, max_outputs=max_outputs)

    def generate(self, raw_text: str):
        # stopword_remove() takes only the text and already returns a
        # list, so no max_outputs argument is passed here.
        perturbed_text = stopword_remove(text=raw_text)
        return perturbed_text
```

Contributor comment (on the docstring): the
Author comment: Fixed!

Contributor comment (after `heavy = False`): Need to add the
Contributor comment: Seconded
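For readers without the NL-Augmenter repo on hand, here is a self-contained approximation of the class's behavior. It is a sketch only: a plain class stands in for the repo's `SentenceOperation` interface, a tiny hard-coded stopword set stands in for NLTK's list, and whitespace splitting replaces the toktok tokenize/detokenize round-trip, so punctuation stays attached to words.

```python
class MiniStopwordRemoval:
    """Illustrative stand-in for StopwordRemoval; the real class
    derives from SentenceOperation and uses NLTK throughout."""

    # Tiny illustrative subset of English stopwords, NOT NLTK's list.
    STOP_WORDS = {"a", "an", "is", "this", "the", "to", "that", "or", "not", "be"}

    def __init__(self, seed=0, max_outputs=1):
        self.seed = seed
        self.max_outputs = max_outputs

    def generate(self, raw_text):
        # Filter out stopwords (case-insensitively) and return a list
        # of perturbed sentences, capped at max_outputs.
        kept = [w for w in raw_text.split() if w.lower() not in self.STOP_WORDS]
        return [" ".join(kept)][: self.max_outputs]

print(MiniStopwordRemoval().generate("This is a test."))  # -> ['test.']
```

Note that returning a list (even for a single output) mirrors the interface's convention that `generate()` yields up to `max_outputs` perturbations.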
Reviewer comment: Hi @juanyiloke please add your name, email and affiliation.

Author comment: done! @kaustubhdhole

Reviewer comment: thanks!