diff --git a/transformations/character_duplication/README.md b/transformations/character_duplication/README.md new file mode 100644 index 000000000..01e5bb9d2 --- /dev/null +++ b/transformations/character_duplication/README.md @@ -0,0 +1,21 @@ +# Character Duplication +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.), mimicking the noise that arises from keyboard typos and common spelling errors. + +Author name: Marco Di Giovanni +Author email: marco.digiovanni@polimi.it +Author Affiliation: Politecnico di Milano and University of Bologna + + + +## What type of a transformation is this? +This transformation acts like a perturbation to test robustness. +A few characters (never digits) picked at random are duplicated. +Generated transformations display high similarity to the source sentences. + +## What tasks does it intend to benefit? +- This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. +- The generated texts mimic typing mistakes. + +## What are the limitations of this transformation? +- This transformation is not capable of generating linguistically diverse text. +- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust. 
diff --git a/transformations/character_duplication/__init__.py b/transformations/character_duplication/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/character_duplication/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/character_duplication/test.json b/transformations/character_duplication/test.json new file mode 100644 index 000000000..613314fe2 --- /dev/null +++ b/transformations/character_duplication/test.json @@ -0,0 +1,50 @@ +{ + "type": "character_duplication", + "test_cases": [ + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [{ + "sentence": "Anndrew ffinnallly returrned thee French book too Chhris that I bought last week" + }] + }, + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [{ + "sentence": "Seentencees witth gappiing, succhh as Paul likess cooffee and Mary tea, lackk an overt predicate ttoo indiicate tthe relation between two orr moree arrguuments." 
def duplicate(text, prob=0.1, seed=42, max_outputs=1):
    """
    Return perturbed copies of ``text`` in which random characters are doubled.

    Each character that is not a digit (as reported by ``str.isdigit``) is
    emitted twice with probability ``prob``; digits are always copied
    unchanged. Every other character, including whitespace and punctuation,
    is eligible for duplication.

    Args:
        text: The string to perturb.
        prob: Per-character duplication probability. A character is doubled
            when a uniform draw from [0.0, 1.0) is not greater than ``prob``.
        seed: Seed of the random generator; the same seed and arguments
            always produce the same list of outputs.
        max_outputs: Number of perturbed strings to generate.

    Returns:
        A list with ``max_outputs`` perturbed strings. The generator is seeded
        once and keeps advancing, so successive entries generally differ.
    """
    # Use a private generator instead of ``random.seed(seed)``: it yields the
    # exact same sequence of draws, but does not reset the process-wide RNG
    # state that the caller (or other libraries) may depend on.
    rng = random.Random(seed)

    perturbed_texts = []
    for _ in range(max_outputs):
        perturbed_texts.append(
            "".join(
                # ``isdigit`` short-circuits before the draw, so digits never
                # consume a random number.
                char if char.isdigit() or rng.random() > prob else char * 2
                for char in text
            )
        )
    return perturbed_texts
class CharacterDuplication(SentenceOperation):
    """
    Sentence-level perturbation that doubles randomly chosen characters.

    The actual work is delegated to the module-level ``duplicate`` function;
    this class only stores the configuration and exposes it through the
    ``SentenceOperation`` interface.
    """

    # Task types this transformation declares itself applicable to.
    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["All"]
    # Descriptive tags attached to this transformation.
    keywords = [
        "morphological",
        "noise",
        "rule-based",
        "highly-meaning-preserving",
        "high-precision",
        "high-coverage",
        "high-generations",
    ]

    def __init__(self, seed=42, max_outputs=1, prob=0.1):
        """
        Store the perturbation settings.

        Args:
            seed: Forwarded to the base-class initializer.
            max_outputs: Forwarded to the base-class initializer.
            prob: Duplication probability handed to ``duplicate``.
        """
        # NOTE(review): the base class presumably stores these as
        # ``self.seed`` / ``self.max_outputs`` (``generate`` reads both) --
        # confirm against SentenceOperation.
        super().__init__(seed, max_outputs=max_outputs)
        self.prob = prob

    def generate(self, sentence: str):
        """Return the list of perturbed variants of ``sentence``."""
        return duplicate(
            text=sentence,
            prob=self.prob,
            seed=self.seed,
            max_outputs=self.max_outputs,
        )