Generators Module

`anonipy.anonymize.generators`

Module containing the generators.

The generators module provides a set of generators used to generate data substitutes.

Classes:

Name	Description
`LLMLabelGenerator`	The class representing the label generator utilizing LLMs.
`MaskLabelGenerator`	The class representing the label generator utilizing token masking.
`NumberGenerator`	The class representing the number generator.
`DateGenerator`	The class representing the date generator.

`anonipy.anonymize.generators.LLMLabelGenerator`

Bases: GeneratorInterface

The class representing the LLM label generator.

Examples:

>>> from anonipy.anonymize.generators import LLMLabelGenerator
>>> generator = LLMLabelGenerator()
>>> generator.generate(entity)

Attributes:

Name	Type	Description
`model`	`Transformers`	The model used to generate the label substitutes.

Methods:

Name	Description
`generate`	Generate the label based on the entity.

Source code in anonipy/anonymize/generators/llm_label_generator.py

class LLMLabelGenerator(GeneratorInterface):
    """Label generator that prompts a causal language model for substitutes.

    Examples:
        >>> from anonipy.anonymize.generators import LLMLabelGenerator
        >>> generator = LLMLabelGenerator()
        >>> generator.generate(entity)

    Attributes:
        model (AutoModelForCausalLM): The model used to generate the label substitutes.
        tokenizer (AutoTokenizer): The tokenizer used to encode prompts and decode responses.

    Methods:
        generate(entity, add_entity_attrs, temperature, top_p):
            Generate the label based on the entity.

    """

    def __init__(
        self,
        *args,
        model_name: str = "HuggingFaceTB/SmolLM2-1.7B-Instruct",
        use_gpu: bool = False,
        use_quant: bool = False,
        **kwargs,
    ):
        """Set up the generator by loading the model and its tokenizer.

        Each flag is switched off (with a warning) when its requirement is
        not met: CUDA for `use_gpu`, the 'quant' extras for `use_quant`.

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            model_name: The name of the model to use.
            use_gpu: Whether to use GPU or not.
            use_quant: Whether to use quantization or not.
            **kwargs: Keyword arguments forwarded to the parent initializer.

        Examples:
            >>> from anonipy.anonymize.generators import LLMLabelGenerator
            >>> generator = LLMLabelGenerator()

        """

        super().__init__(*args, **kwargs)

        # the GPU was requested but CUDA cannot be used -> fall back to CPU
        gpu_unavailable = use_gpu and not torch.cuda.is_available()
        if gpu_unavailable:
            warnings.warn(
                "The use_gpu=True flag requires GPU/CUDA, but it is not available. Setting use_gpu=False."
            )
            use_gpu = False

        # quantization was requested but its optional dependencies are missing
        quant_unavailable = use_quant and not is_installed_with(["quant", "all"])
        if quant_unavailable:
            warnings.warn(
                "The use_quant=True flag requires the 'quant' extra dependencies, but they are not installed. Setting use_quant=False."
            )
            use_quant = False

        loaded = self._prepare_model_and_tokenizer(model_name, use_gpu, use_quant)
        self.model, self.tokenizer = loaded

    def generate(
        self,
        entity: Entity,
        *args,
        add_entity_attrs: str = "",
        temperature: float = 1.0,
        top_p: float = 0.95,
        **kwargs,
    ) -> str:
        """Generate the substitute for the entity based on its attributes.

        A two-message chat (system + user) is built from the entity's label
        and text and handed to the language model.

        Examples:
            >>> from anonipy.anonymize.generators import LLMLabelGenerator
            >>> generator = LLMLabelGenerator()
            >>> generator.generate(entity)
            label

        Args:
            entity: The entity to generate the label from.
            add_entity_attrs: Additional entity attribute description to add to the generation.
            temperature: The temperature to use for the generation.
            top_p: The top p to use for the generation.

        Returns:
            The generated entity label substitute.

        """

        system_prompt = "You are a helpful AI assistant for generating replacements for text entities."
        user_prompt = f"What is a random {add_entity_attrs} {entity.label} replacement for {entity.text}? Respond only with the replacement."

        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return self._generate_response(chat, temperature, top_p)

    # =================================
    # Private methods
    # =================================

    def _prepare_model_and_tokenizer(
        self, model_name: str, use_gpu: bool, use_quant: bool
    ) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
        """Pick the device and dtype, then load the model and the tokenizer.

        Args:
            model_name: The name of the model to use.
            use_gpu: Whether to use GPU or not.
            use_quant: Whether to use quantization or not.

        Returns:
            The huggingface model.
            The huggingface tokenizer.

        """

        # CUDA is used only when requested and actually available
        if use_gpu and torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

        # full precision on CPU, half precision otherwise
        dtype = torch.float16 if device.type != "cpu" else torch.float32

        return (
            self._load_model(model_name, device, dtype, use_quant, use_gpu),
            self._load_tokenizer(model_name),
        )

    def _load_model(
        self,
        model_name: str,
        device: torch.device,
        dtype: torch.dtype,
        use_quant: bool,
        use_gpu: bool,
    ) -> AutoModelForCausalLM:
        """Load the causal language model, quantized when possible.

        Quantization is applied only when both `use_quant` and `use_gpu` are
        set; if only `use_quant` is set a warning is issued and the model is
        loaded without quantization.

        Args:
            model_name: The name of the model to use.
            device: The device to use for the model.
            dtype: The data type to use for the model.
            use_quant: Whether to use quantization or not.
            use_gpu: Whether to use GPU or not.

        Returns:
            The huggingface model.

        """
        load_kwargs = {"device_map": device, "torch_dtype": dtype}

        if use_quant and use_gpu:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True, bnb_4bit_compute_dtype=dtype
            )
        elif use_quant:
            warnings.warn(
                "Quantization is only supported on GPU, but use_gpu=False. Loading model without quantization."
            )

        return AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    def _load_tokenizer(self, model_name: str) -> AutoTokenizer:
        """Load the (slow, right-padding) tokenizer that belongs to the model.

        Args:
            model_name: The name of the model to use.

        Returns:
            The huggingface tokenizer.
        """
        tokenizer_kwargs = {"padding_side": "right", "use_fast": False}
        return AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

    def _generate_response(
        self, message: List[dict], temperature: float, top_p: float
    ) -> str:
        """Run the chat through the model and return only the newly generated text.

        Args:
            message: The message to generate the response from.
            temperature: The temperature to use for the generation.
            top_p: The top p to use for the generation.

        Returns:
            The generated response.

        """

        # encode the chat with the model's chat template and move it to the model device
        prompt_ids = self.tokenizer.apply_chat_template(
            message, tokenize=True, return_tensors="pt", add_generation_prompt=True
        )
        prompt_ids = prompt_ids.to(self.model.device)

        # every prompt token is attended to
        prompt_mask = torch.ones_like(prompt_ids)

        # fall back to the EOS token when the tokenizer defines no pad token
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        generation_kwargs = {
            "attention_mask": prompt_mask,
            "max_new_tokens": 50,
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True,
            "pad_token_id": self.tokenizer.pad_token_id,
        }

        # silence the `logits` output warning and skip gradient tracking while sampling
        with warnings.catch_warnings(), torch.no_grad():
            warnings.filterwarnings("ignore", message=".*the `logits` model output.*")
            output_ids = self.model.generate(prompt_ids, **generation_kwargs)

        # drop the prompt tokens so that only the generated continuation is decoded
        prompt_length = len(prompt_ids[0])
        return self.tokenizer.decode(
            output_ids[0][prompt_length:], skip_special_tokens=True
        )

`__init__(*args, model_name='HuggingFaceTB/SmolLM2-1.7B-Instruct', use_gpu=False, use_quant=False, **kwargs)`

Initializes the LLM label generator.

Parameters:

Name	Type	Description	Default
`model_name`	`str`	The name of the model to use.	`'HuggingFaceTB/SmolLM2-1.7B-Instruct'`
`use_gpu`	`bool`	Whether to use GPU or not.	`False`
`use_quant`	`bool`	Whether to use quantization or not.	`False`

Examples:

>>> from anonipy.anonymize.generators import LLMLabelGenerator
>>> generator = LLMLabelGenerator()
LLMLabelGenerator()

Source code in anonipy/anonymize/generators/llm_label_generator.py

def __init__(
    self,
    *args,
    model_name: str = "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    use_gpu: bool = False,
    use_quant: bool = False,
    **kwargs,
):
    """Set up the generator by loading the model and its tokenizer.

    Each flag is switched off (with a warning) when its requirement is
    not met: CUDA for `use_gpu`, the 'quant' extras for `use_quant`.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        model_name: The name of the model to use.
        use_gpu: Whether to use GPU or not.
        use_quant: Whether to use quantization or not.
        **kwargs: Keyword arguments forwarded to the parent initializer.

    Examples:
        >>> from anonipy.anonymize.generators import LLMLabelGenerator
        >>> generator = LLMLabelGenerator()

    """

    super().__init__(*args, **kwargs)

    # the GPU was requested but CUDA cannot be used -> fall back to CPU
    gpu_unavailable = use_gpu and not torch.cuda.is_available()
    if gpu_unavailable:
        warnings.warn(
            "The use_gpu=True flag requires GPU/CUDA, but it is not available. Setting use_gpu=False."
        )
        use_gpu = False

    # quantization was requested but its optional dependencies are missing
    quant_unavailable = use_quant and not is_installed_with(["quant", "all"])
    if quant_unavailable:
        warnings.warn(
            "The use_quant=True flag requires the 'quant' extra dependencies, but they are not installed. Setting use_quant=False."
        )
        use_quant = False

    loaded = self._prepare_model_and_tokenizer(model_name, use_gpu, use_quant)
    self.model, self.tokenizer = loaded

`generate(entity, *args, add_entity_attrs='', temperature=1.0, top_p=0.95, **kwargs)`

Generate the substitute for the entity based on its attributes.

Examples:

>>> from anonipy.anonymize.generators import LLMLabelGenerator
>>> generator = LLMLabelGenerator()
>>> generator.generate(entity)
label

Parameters:

Name	Type	Description	Default
`entity`	`Entity`	The entity to generate the label from.	required
`add_entity_attrs`	`str`	Additional entity attribute description to add to the generation.	`''`
`temperature`	`float`	The temperature to use for the generation.	`1.0`
`top_p`	`float`	The top p to use for the generation.	`0.95`

Returns:

Type	Description
`str`	The generated entity label substitute.

Source code in anonipy/anonymize/generators/llm_label_generator.py

def generate(
    self,
    entity: Entity,
    *args,
    add_entity_attrs: str = "",
    temperature: float = 1.0,
    top_p: float = 0.95,
    **kwargs,
) -> str:
    """Generate the substitute for the entity based on its attributes.

    A two-message chat (system + user) is built from the entity's label
    and text and handed to the language model.

    Examples:
        >>> from anonipy.anonymize.generators import LLMLabelGenerator
        >>> generator = LLMLabelGenerator()
        >>> generator.generate(entity)
        label

    Args:
        entity: The entity to generate the label from.
        add_entity_attrs: Additional entity attribute description to add to the generation.
        temperature: The temperature to use for the generation.
        top_p: The top p to use for the generation.

    Returns:
        The generated entity label substitute.

    """

    system_prompt = "You are a helpful AI assistant for generating replacements for text entities."
    user_prompt = f"What is a random {add_entity_attrs} {entity.label} replacement for {entity.text}? Respond only with the replacement."

    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return self._generate_response(chat, temperature, top_p)

`anonipy.anonymize.generators.MaskLabelGenerator`

Bases: GeneratorInterface

The class representing the mask label generator.

Examples:

>>> from anonipy.anonymize.generators import MaskLabelGenerator
>>> generator = MaskLabelGenerator(model_name, context_window=100, use_gpu=False)
>>> generator.generate(entity)

Attributes:

Name	Type	Description
`pipeline`	`Pipeline`	The transformers pipeline used to generate the label substitutes.
`context_window`	`int`	The context window size to use to generate the label substitutes.
`mask_token`	`str`	The mask token to use to replace the masked words.

Methods:

Name	Description
`generate`	Generate the substitute for the entity based on its location in the text.

Source code in anonipy/anonymize/generators/mask_label_generator.py

class MaskLabelGenerator(GeneratorInterface):
    """The class representing the mask label generator.

    Each whitespace-separated word of the entity is masked in turn and a
    fill-mask model proposes replacements for it from the surrounding text.

    Examples:
        >>> from anonipy.anonymize.generators import MaskLabelGenerator
        >>> generator = MaskLabelGenerator(model_name, context_window=100, use_gpu=False)
        >>> generator.generate(entity, text)


    Attributes:
        pipeline (Pipeline): The transformers pipeline used to generate the label substitutes.
        context_window (int): The context window size to use to generate the label substitutes.
        mask_token (str): The mask token to use to replace the masked words.

    Methods:
        generate(entity, text):
            Generate the substitute for the entity based on its location in the text.

    """

    def __init__(
        self,
        *args,
        model_name: str = "FacebookAI/xlm-roberta-large",
        use_gpu: bool = False,
        context_window: int = 100,
        **kwargs,
    ):
        """Initializes the mask label generator.

        Examples:
            >>> from anonipy.anonymize.generators import MaskLabelGenerator
            >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            model_name: The name of the masking model to use.
            use_gpu: Whether to use GPU/CUDA, if available.
            context_window: The context window size.
            **kwargs: Keyword arguments forwarded to the parent initializer.

        """

        super().__init__(*args, **kwargs)
        self.context_window = context_window

        # fall back to CPU (with a warning) when CUDA was requested but is unavailable
        if use_gpu and not torch.cuda.is_available():
            warnings.warn(
                "The use_gpu=True flag requires GPU/CUDA, but it is not available. Setting use_gpu=False."
            )
            use_gpu = False

        # prepare the fill-mask pipeline and store the mask token
        model, tokenizer, device = self._prepare_model_and_tokenizer(
            model_name, use_gpu
        )
        self.mask_token = tokenizer.mask_token
        self.pipeline = pipeline(
            "fill-mask", model=model, tokenizer=tokenizer, top_k=40, device=device
        )

    def generate(self, entity: Entity, text: str, *args, **kwargs) -> str:
        """Generate the substitute for the entity using the masking model.

        Examples:
            >>> from anonipy.anonymize.generators import MaskLabelGenerator
            >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)
            >>> generator.generate(entity, text)
            label

        Args:
            entity: The entity used to generate the substitute.
            text: The original text in which the entity is located; used to get the entity's context.

        Returns:
            The generated substitute text, or the string "None" when no viable substitute was found.

        """

        masks = self._create_masks(entity)
        input_texts = self._prepare_generate_inputs(masks, text)
        suggestions = self.pipeline(input_texts)
        return self._create_substitute(entity, masks, suggestions)

    # =================================
    # Private methods
    # =================================

    def _prepare_model_and_tokenizer(
        self, model_name: str, use_gpu: bool
    ) -> Tuple[AutoModelForMaskedLM, AutoTokenizer, torch.device]:
        """Prepares the model and tokenizer.

        Args:
            model_name: The name of the model to use.
            use_gpu: Whether to use GPU/CUDA, if available.

        Returns:
            The huggingface model.
            The huggingface tokenizer.
            The device to use.

        """

        # prepare the model
        device = torch.device(
            "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        )
        model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
        # prepare the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        return model, tokenizer, device

    def _create_masks(self, entity: Entity) -> List[dict]:
        """Creates the masks for the provided entity.

        The entity text is split on whitespace and one mask is produced per
        word, in which that word alone is replaced by the mask token.

        Args:
            entity: The entity to create the masks for.

        Returns:
            The list of masks attributes, including the true text, mask text, start index, and end index within the original text.

        """

        chunks = re.split(r"\s+", entity.text)
        return [
            {
                "true_text": chunk,
                "mask_text": " ".join(
                    chunks[:idx] + [self.mask_token] + chunks[idx + 1 :]
                ),
                "start_index": entity.start_index,
                "end_index": entity.end_index,
            }
            for idx, chunk in enumerate(chunks)
        ]

    def _get_context_text(self, text: str, start_index: int, end_index: int) -> str:
        """Get the context text.

        The slice spans `context_window` characters before `start_index` and
        after `end_index`, clipped to the bounds of the text.

        Args:
            text: The text to get the context from.
            start_index: The start index of the context window.
            end_index: The end index of the context window.

        Returns:
            The context window text.

        """

        min_index = max(0, start_index - self.context_window)
        max_index = min(end_index + self.context_window, len(text))
        return text[min_index:max_index]

    def _prepare_generate_inputs(self, masks: List[dict], text: str) -> List[str]:
        """Prepares the generate inputs.

        For every mask the entity span in the text is replaced by the mask
        text and the result is cut down to the context window.

        Args:
            masks: The list of masks attributes.
            text: The text to prepare the generate inputs for.

        Returns:
            The list of generate inputs.

        """
        return [
            self._get_context_text(
                text[: m["start_index"]] + m["mask_text"] + text[m["end_index"] :],
                m["start_index"],
                m["end_index"],
            )
            for m in masks
        ]

    def _create_substitute(
        self, entity: Entity, masks: List[dict], suggestions: List[dict]
    ) -> str:
        """Create a substitute for the entity.

        Up to three viable candidates are kept per masked word; one
        combination of them (one candidate per word, in the original word
        order) is then picked at random.

        Args:
            entity: The entity to create the substitute for.
            masks: The list of masks attributes.
            suggestions: The list of substitute suggestions.

        Returns:
            The created and selected substitute text, or the string "None" when no combination is available.

        """

        # With a single input the fill-mask pipeline may hand back a flat list
        # of candidate dicts instead of one list per input. Wrap it so that
        # every candidate of the only mask is considered, not just the first.
        if len(masks) == 1 and suggestions and isinstance(suggestions[0], dict):
            suggestions = [suggestions]

        substitute_chunks = []
        for mask, suggestion in zip(masks, suggestions):
            candidates = suggestion if isinstance(suggestion, list) else [suggestion]
            # drop the original word, tokens not matching the entity regex, and stopwords
            viable_tokens = [
                candidate["token_str"]
                for candidate in candidates
                if candidate["token_str"] != mask["true_text"]
                and re.match(entity.regex, candidate["token_str"])
                and candidate["token_str"] not in STOPWORDS
            ]
            substitute_chunks.append(viable_tokens[:3])

        # dict.fromkeys removes repeated words while keeping the word order;
        # a set would shuffle the words of a multi-word substitute
        combinations = [
            " ".join(dict.fromkeys(combination))
            for combination in itertools.product(*substitute_chunks)
        ]
        return random.choice(combinations) if combinations else "None"

`__init__(*args, model_name='FacebookAI/xlm-roberta-large', use_gpu=False, context_window=100, **kwargs)`

Initializes the mask label generator.

Examples:

>>> from anonipy.anonymize.generators import MaskLabelGenerator
>>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)

Parameters:

Name	Type	Description	Default
`model_name`	`str`	The name of the masking model to use.	`'FacebookAI/xlm-roberta-large'`
`use_gpu`	`bool`	Whether to use GPU/CUDA, if available.	`False`
`context_window`	`int`	The context window size.	`100`

Source code in anonipy/anonymize/generators/mask_label_generator.py

def __init__(
    self,
    *args,
    model_name: str = "FacebookAI/xlm-roberta-large",
    use_gpu: bool = False,
    context_window: int = 100,
    **kwargs,
):
    """Set up the fill-mask pipeline used to propose substitutes.

    Examples:
        >>> from anonipy.anonymize.generators import MaskLabelGenerator
        >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        model_name: The name of the masking model to use.
        use_gpu: Whether to use GPU/CUDA, if available.
        context_window: The context window size.
        **kwargs: Keyword arguments forwarded to the parent initializer.

    """

    super().__init__(*args, **kwargs)
    self.context_window = context_window

    # the GPU was requested but CUDA cannot be used -> fall back to CPU
    gpu_unavailable = use_gpu and not torch.cuda.is_available()
    if gpu_unavailable:
        warnings.warn(
            "The use_gpu=True flag requires GPU/CUDA, but it is not available. Setting use_gpu=False."
        )
        use_gpu = False

    # load the masked language model, keep its mask token, and build the pipeline
    mlm_model, mlm_tokenizer, target_device = self._prepare_model_and_tokenizer(
        model_name, use_gpu
    )
    self.mask_token = mlm_tokenizer.mask_token
    pipeline_kwargs = {
        "model": mlm_model,
        "tokenizer": mlm_tokenizer,
        "top_k": 40,
        "device": target_device,
    }
    self.pipeline = pipeline("fill-mask", **pipeline_kwargs)

`generate(entity, text, *args, **kwargs)`

Generate the substitute for the entity using the masking model.

Examples:

>>> from anonipy.anonymize.generators import MaskLabelGenerator
>>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)
>>> generator.generate(entity, text)
label

Parameters:

Name	Type	Description	Default
`entity`	`Entity`	The entity used to generate the substitute.	required
`text`	`str`	The original text in which the entity is located; used to get the entity's context.	required

Returns:

Type	Description
`str`	The generated substitute text.

Source code in anonipy/anonymize/generators/mask_label_generator.py

def generate(self, entity: Entity, text: str, *args, **kwargs) -> str:
    """Produce a substitute for the entity with the fill-mask pipeline.

    The entity is turned into masks, each mask is embedded in its context
    from the text, the pipeline proposes candidates, and a substitute is
    assembled from them.

    Examples:
        >>> from anonipy.anonymize.generators import MaskLabelGenerator
        >>> generator = MaskLabelGenerator(context_window=120, use_gpu=True)
        >>> generator.generate(entity, text)
        label

    Args:
        entity: The entity used to generate the substitute.
        text: The original text in which the entity is located; used to get the entity's context.

    Returns:
        The generated substitute text.

    """

    entity_masks = self._create_masks(entity)
    masked_contexts = self._prepare_generate_inputs(entity_masks, text)
    candidates = self.pipeline(masked_contexts)
    return self._create_substitute(entity, entity_masks, candidates)

`anonipy.anonymize.generators.NumberGenerator`

Bases: GeneratorInterface

The class representing the number generator.

Examples:

>>> from anonipy.anonymize.generators import NumberGenerator
>>> generator = NumberGenerator()
>>> generator.generate(entity)

Methods:

Name	Description
`generate`	Generates a substitute for the numeric entity.

Source code in anonipy/anonymize/generators/number_generator.py

class NumberGenerator(GeneratorInterface):
    """Generator that swaps every digit of an entity for a random digit.

    Examples:
        >>> from anonipy.anonymize.generators import NumberGenerator
        >>> generator = NumberGenerator()
        >>> generator.generate(entity)

    Methods:
        generate(entity):
            Generates a substitute for the numeric entity.

    """

    def __init__(self, *args, **kwargs):
        """Initializes the number generator.

        Examples:
            >>> from anonipy.anonymize.generators import NumberGenerator
            >>> generator = NumberGenerator()

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            **kwargs: Keyword arguments forwarded to the parent initializer.

        """

        super().__init__(*args, **kwargs)

    def generate(self, entity: Entity, *args, **kwargs) -> str:
        """Generates the substitute for the numeric entity.

        Every digit character of the entity text is replaced by a random
        digit; all other characters are copied unchanged, so the layout of
        the original value is preserved.

        Examples:
            >>> from anonipy.anonymize.generators import NumberGenerator
            >>> generator = NumberGenerator()
            >>> generator.generate(entity)
            "1234567890"

        Args:
            entity: The numeric entity to generate the numeric substitute.

        Returns:
            The generated numeric substitute.

        Raises:
            ValueError: If the entity type is not `integer`, `float`, `phone_number` or `custom`.

        """

        numeric_types = ("integer", "float", "phone_number")

        if entity.type == "custom":
            # custom entities are accepted, but the caller is told to double-check the result
            warnings.warn(
                "The entity type is `custom`. Make sure the generator is returning appropriate values."
            )
        elif entity.type not in numeric_types:
            raise ValueError(
                "The entity type must be `integer`, `float`, `phone_number` or `custom` to generate numbers."
            )

        substitute = []
        for char in entity.text:
            if char.isdigit():
                substitute.append(str(random.randint(0, 9)))
            else:
                substitute.append(char)
        return "".join(substitute)

`__init__(*args, **kwargs)`

Initializes the number generator.

Examples:

>>> from anonipy.anonymize.generators import NumberGenerator
>>> generator = NumberGenerator()

Source code in anonipy/anonymize/generators/number_generator.py

def __init__(self, *args, **kwargs):
    """Initializes the number generator.

    The generator keeps no state of its own; all arguments are handed to
    the parent initializer.

    Examples:
        >>> from anonipy.anonymize.generators import NumberGenerator
        >>> generator = NumberGenerator()

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        **kwargs: Keyword arguments forwarded to the parent initializer.

    """

    super().__init__(*args, **kwargs)

`generate(entity, *args, **kwargs)`

Generates the substitute for the numeric entity.

Examples:

>>> from anonipy.anonymize.generators import NumberGenerator
>>> generator = NumberGenerator()
>>> generator.generate(entity)
"1234567890"

Parameters:

Name	Type	Description	Default
`entity`	`Entity`	The numeric entity to generate the numeric substitute.	required

Returns:

Type	Description
`str`	The generated numeric substitute.

Raises:

Type	Description
`ValueError`	If the entity type is not `integer`, `float`, `phone_number` or `custom`.

Source code in anonipy/anonymize/generators/number_generator.py

def generate(self, entity: Entity, *args, **kwargs) -> str:
    """Generates the substitute for the numeric entity.

    Every digit character of the entity text is replaced by a random
    digit; all other characters are copied unchanged, so the layout of
    the original value is preserved.

    Examples:
        >>> from anonipy.anonymize.generators import NumberGenerator
        >>> generator = NumberGenerator()
        >>> generator.generate(entity)
        "1234567890"

    Args:
        entity: The numeric entity to generate the numeric substitute.

    Returns:
        The generated numeric substitute.

    Raises:
        ValueError: If the entity type is not `integer`, `float`, `phone_number` or `custom`.

    """

    numeric_types = ("integer", "float", "phone_number")

    if entity.type == "custom":
        # custom entities are accepted, but the caller is told to double-check the result
        warnings.warn(
            "The entity type is `custom`. Make sure the generator is returning appropriate values."
        )
    elif entity.type not in numeric_types:
        raise ValueError(
            "The entity type must be `integer`, `float`, `phone_number` or `custom` to generate numbers."
        )

    substitute = []
    for char in entity.text:
        if char.isdigit():
            substitute.append(str(random.randint(0, 9)))
        else:
            substitute.append(char)
    return "".join(substitute)

`anonipy.anonymize.generators.DateGenerator`

Bases: GeneratorInterface

The class representing the date generator.

Examples:

>>> from anonipy.anonymize.generators import DateGenerator
>>> generator = DateGenerator(lang="de")
>>> generator.generate(entity)

Attributes:

Name	Type	Description
`lang`	`(str, LANGUAGES)`	The language of the text.
`date_format`	`str`	The date format in which the date should be generated.
`day_sigma`	`int`	The range of the random date in days.

Methods:

Name	Description
`generate`	Generate the date substitute based on the input parameters.

Source code in anonipy/anonymize/generators/date_generator.py

class DateGenerator(GeneratorInterface):
    """The class representing the date generator.

    Examples:
        >>> from anonipy.anonymize.generators import DateGenerator
        >>> generator = DateGenerator(lang="de")
        >>> generator.generate(entity)

    Attributes:
        lang: The language of the text.
        date_format (str): The date format in which the date should be generated.
        day_sigma (int): The range of the random date in days.

    Methods:
        generate(entity, sub_variant):
            Generate the date substitute based on the input parameters.

    """

    def __init__(
        self,
        *args,
        lang: Union[str, LANGUAGES] = "en",
        date_format: str = "auto",
        day_sigma: int = 30,
        **kwargs,
    ):
        """Initializes the date generator.

        Examples:
            >>> from anonipy.anonymize.generators import DateGenerator
            >>> generator = DateGenerator()

        Args:
            *args: Positional arguments forwarded to the parent initializer.
            lang: The language of the text.
            date_format: The date format in which the date should be generated. More on date formats [see here](https://www.contensis.com/help-and-docs/guides/querying-your-content/zenql-search/date-formats).
            day_sigma: The range of the random date in days.
            **kwargs: Keyword arguments forwarded to the parent initializer.

        Raises:
            ValueError: If `lang` is neither a supported language string nor a `LANGUAGES` value.

        """

        super().__init__(*args, **kwargs)
        self.date_format = date_format
        self.day_sigma = day_sigma

        if isinstance(lang, str) and lang in LANGUAGES.supported_languages():
            self.lang = lang
        elif isinstance(lang, LANGUAGES):
            # NOTE(review): assumes the first element of a LANGUAGES value is
            # the language code - confirm against the LANGUAGES definition
            self.lang = lang[0]
        else:
            # ValueError is still caught by callers handling `Exception`
            raise ValueError(f"Unknown lang value: {lang}")

    def generate(
        self,
        entity: Entity,
        *args,
        sub_variant: DATE_TRANSFORM_VARIANTS = DATE_TRANSFORM_VARIANTS.RANDOM,
        **kwargs,
    ) -> str:
        """Generate the entity substitute based on the input parameters.

        The entity text is parsed into a date, transformed by the selected
        variant, and rendered back using the detected or configured format.

        Examples:
            >>> from anonipy.anonymize.generators import DateGenerator
            >>> generator = DateGenerator(lang="en")
            >>> generator.generate(entity)

        Args:
            entity: The entity to generate the date substitute from.
            sub_variant: The substitute function variant to use.

        Returns:
            The generated date substitute.

        Raises:
            ValueError: If the entity type is not `date` or `custom`, if
                `sub_variant` is not a valid variant, or if the entity text
                cannot be interpreted as a date.

        """

        if entity.type == "custom":
            # custom entities are accepted, but the caller is told to double-check the result
            warnings.warn(
                "The entity type is `custom`. Make sure the generator is returning appropriate values."
            )
        elif entity.type != "date":
            raise ValueError(
                "The entity type must be `date` or `custom` to generate dates."
            )

        if not DATE_TRANSFORM_VARIANTS.is_valid(sub_variant):
            raise ValueError(
                f"The sub_variant must be one of {', '.join(DATE_TRANSFORM_VARIANTS.values())} to generate dates."
            )

        # detect the date format, or parse with the configured one
        if self.date_format == "auto":
            entity_date, date_format = detect_datetime_format(entity.text, self.lang)
        else:
            entity_date = dateparser.parse(entity.text, languages=[self.lang])
            date_format = self.date_format

        # validate the input values: both the parsed date and a usable format are required
        if (
            entity_date is None
            or date_format is None
            or isinstance(date_format, ValueError)
        ):
            raise ValueError(f"Entity `{entity.text}` is not a valid date.")

        # generate the date substitute
        generate_date = DATE_VARIANTS_MAPPING[sub_variant](entity_date, self.day_sigma)
        return format_datetime(generate_date, format=date_format, locale=self.lang)

`__init__(*args, lang='en', date_format='auto', day_sigma=30, **kwargs)`

Initializes the date generator.

Examples:

>>> from anonipy.anonymize.generators import DateGenerator
>>> generator = DateGenerator()

Parameters:

Name	Type	Description	Default
`lang`	`Union[str, LANGUAGES]`	The language of the text.	`'en'`
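`date_format`	`str`	The date format in which the date should be generated. For more on date formats, [see here](https://www.contensis.com/help-and-docs/guides/querying-your-content/zenql-search/date-formats).	`'auto'`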
`day_sigma`	`int`	The range of the random date in days.	`30`

Source code in anonipy/anonymize/generators/date_generator.py

def __init__(
    self,
    *args,
    lang: Union[str, LANGUAGES] = "en",
    date_format: str = "auto",
    day_sigma: int = 30,
    **kwargs,
):
    """Initializes the date generator.

    Examples:
        >>> from anonipy.anonymize.generators import DateGenerator
        >>> generator = DateGenerator()

    Args:
        lang: The language of the text.
        date_format: The date format in which the date should be generated. More on date formats [see here](https://www.contensis.com/help-and-docs/guides/querying-your-content/zenql-search/date-formats).
        day_sigma: The range of the random date in days.

    Raises:
        ValueError: If `lang` is neither a supported language string nor a `LANGUAGES` instance.

    """

    super().__init__(*args, **kwargs)
    self.date_format = date_format
    self.day_sigma = day_sigma

    if isinstance(lang, str) and lang in LANGUAGES.supported_languages():
        self.lang = lang
    elif isinstance(lang, LANGUAGES):
        # NOTE(review): presumably the first element holds the language code;
        # confirm against the LANGUAGES definition.
        self.lang = lang[0]
    else:
        # ValueError (a subclass of Exception) matches the error style used by
        # `generate`, and existing `except Exception` handlers keep working.
        raise ValueError(f"Unknown lang value: {lang}")

`generate(entity, *args, sub_variant=DATE_TRANSFORM_VARIANTS.RANDOM, **kwargs)`

Generate the entity substitute based on the input parameters.

Parameters:

Name	Type	Description	Default
`entity`	`Entity`	The entity to generate the date substitute from.	required
`sub_variant`	`DATE_TRANSFORM_VARIANTS`	The substitute function variant to use.	`RANDOM`

Returns:

Type	Description
`str`	The generated date substitute.

Raises:

Type	Description
`ValueError`	If the entity type is not `date` or `custom`.

Source code in anonipy/anonymize/generators/date_generator.py

def generate(
    self,
    entity: Entity,
    *args,
    sub_variant: DATE_TRANSFORM_VARIANTS = DATE_TRANSFORM_VARIANTS.RANDOM,
    **kwargs,
) -> str:
    """Produce a substitute date string for the given entity.

    Args:
        entity: The entity to generate the date substitute from.
        sub_variant: The substitute function variant to use.

    Returns:
        The generated date substitute.

    Raises:
        ValueError: If the entity type is neither `date` nor `custom`, if
            `sub_variant` is not a valid variant, or if the entity text
            cannot be interpreted as a date.

    """

    # only `date` entities are accepted; `custom` is let through with a warning
    if entity.type == "custom":
        warnings.warn(
            "The entity type is `custom`. Make sure the generator is returning appropriate values."
        )
    elif entity.type != "date":
        raise ValueError("The entity type must be `date` to generate dates.")

    if not DATE_TRANSFORM_VARIANTS.is_valid(sub_variant):
        raise ValueError(
            f"The sub_variant must be one of {', '.join(DATE_TRANSFORM_VARIANTS.values())} to generate dates."
        )

    # parse the entity text; the output format is either configured or detected
    if self.date_format != "auto":
        entity_date = dateparser.parse(entity.text, languages=[self.lang])
        date_format = self.date_format
    else:
        entity_date, date_format = detect_datetime_format(entity.text, self.lang)

    # a missing date, a missing format, or a ValueError returned in place of
    # the format all mean the entity text could not be handled as a date
    is_invalid = (
        entity_date is None
        or date_format is None
        or isinstance(date_format, ValueError)
    )
    if is_invalid:
        raise ValueError(f"Entity `{entity.text}` is not a valid date.")

    # apply the selected transformation and render it in the chosen format
    transform = DATE_VARIANTS_MAPPING[sub_variant]
    substitute_date = transform(entity_date, self.day_sigma)
    return format_datetime(substitute_date, format=date_format, locale=self.lang)

`anonipy.anonymize.generators.GeneratorInterface`

The class representing the generator interface.

All generators should inherit from this class.

Methods:

Name	Description
`generate`	Generate a substitute for the entity.

Source code in anonipy/anonymize/generators/interface.py

class GeneratorInterface:
    """Base interface shared by all generators.

    Every concrete generator should inherit from this class.

    Methods:
        generate(entity):
            Generate a substitute for the entity.

    """

    def __init__(self, *args, **kwargs):
        """Accept any arguments and do nothing with them."""
        return None

    def generate(self, entity: Entity, *args, **kwargs):
        """Placeholder for substitute generation; does nothing and returns `None`."""
        return None

Generators Module

anonipy.anonymize.generators

anonipy.anonymize.generators.LLMLabelGenerator

__init__(*args, model_name='HuggingFaceTB/SmolLM2-1.7B-Instruct', use_gpu=False, use_quant=False, **kwargs)

generate(entity, *args, add_entity_attrs='', temperature=1.0, top_p=0.95, **kwargs)

anonipy.anonymize.generators.MaskLabelGenerator

__init__(*args, model_name='FacebookAI/xlm-roberta-large', use_gpu=False, context_window=100, **kwargs)

generate(entity, text, *args, **kwargs)

anonipy.anonymize.generators.NumberGenerator

__init__(*args, **kwargs)

generate(entity, *args, **kwargs)

anonipy.anonymize.generators.DateGenerator

__init__(*args, lang='en', date_format='auto', day_sigma=30, **kwargs)

generate(entity, *args, sub_variant=DATE_TRANSFORM_VARIANTS.RANDOM, **kwargs)

anonipy.anonymize.generators.GeneratorInterface

`anonipy.anonymize.generators`

`anonipy.anonymize.generators.LLMLabelGenerator`

`__init__(*args, model_name='HuggingFaceTB/SmolLM2-1.7B-Instruct', use_gpu=False, use_quant=False, **kwargs)`

`generate(entity, *args, add_entity_attrs='', temperature=1.0, top_p=0.95, **kwargs)`

`anonipy.anonymize.generators.MaskLabelGenerator`

`__init__(*args, model_name='FacebookAI/xlm-roberta-large', use_gpu=False, context_window=100, **kwargs)`

`generate(entity, text, *args, **kwargs)`

`anonipy.anonymize.generators.NumberGenerator`

`__init__(*args, **kwargs)`

`generate(entity, *args, **kwargs)`

`anonipy.anonymize.generators.DateGenerator`

`__init__(*args, lang='en', date_format='auto', day_sigma=30, **kwargs)`

`generate(entity, *args, sub_variant=DATE_TRANSFORM_VARIANTS.RANDOM, **kwargs)`

`anonipy.anonymize.generators.GeneratorInterface`