Skip to content

File System Module

anonipy.utils.file_system

The module containing the file_system utilities.

The file_system module provides a set of utilities for reading and writing files.

Functions:

Name Description
open_file

Opens a file and returns its content as a string.

write_file

Writes the text to a file.

open_json

Opens a JSON file and returns its content as a dictionary.

write_json

Writes data to a JSON file.

_remove_extra_spaces(text)

Remove extra spaces from text.

Parameters:

Name Type Description Default
text str

The text to remove extra spaces from.

required

Returns:

Type Description
str

The text with extra spaces removed.

Source code in anonipy/utils/file_system.py
def _remove_extra_spaces(text: str) -> str:
    """Collapse redundant whitespace in a piece of text.

    Leading and trailing whitespace is stripped, every run of space
    characters is reduced to a single space, and every run of two or more
    newlines is reduced to exactly two newlines.

    Args:
        text: The text to remove extra spaces from.

    Returns:
        The text with extra spaces removed.

    """

    trimmed = text.strip()
    # squeeze runs of spaces first, then cap consecutive newlines at two
    single_spaced = re.sub(" +", " ", trimmed)
    return re.sub("\n{2,}", "\n\n", single_spaced)

_remove_page_numbers(text)

Removes page numbers from text.

Parameters:

Name Type Description Default
text str

The text to remove page numbers from.

required

Returns:

Type Description
str

The text with page numbers removed.

Source code in anonipy/utils/file_system.py
def _remove_page_numbers(text: str) -> str:
    """Drop lines that look like bare page numbers.

    A line is discarded when it consists solely of digits, optionally
    surrounded by whitespace. Every remaining line is stripped of leading
    and trailing whitespace before the lines are joined back together.

    Args:
        text: The text to remove page numbers from.

    Returns:
        The text with page numbers removed.

    """

    # `match` anchors at the start of the line, so only lines made up
    # entirely of digits (plus surrounding whitespace) are matched.
    digits_only = re.compile(r"^\s*\d+\s*$|\s*\d+\s*$")

    kept_lines = []
    for raw_line in text.splitlines():
        if digits_only.match(raw_line):
            continue
        kept_lines.append(raw_line.strip())
    return "\n".join(kept_lines)

_extract_text_from_pdf(pdf_path)

Extracts text from a PDF file.

Parameters:

Name Type Description Default
pdf_path str

The path to the PDF file.

required

Returns:

Type Description
str

The text from the PDF file.

Source code in anonipy/utils/file_system.py
def _extract_text_from_pdf(pdf_path: str) -> str:
    """Read a PDF file and return its cleaned-up text.

    Each page is extracted in layout mode, passed through
    `_remove_page_numbers` and then `_remove_extra_spaces`; the cleaned
    pages are joined with a single newline.

    Args:
        pdf_path: The path to the PDF file.

    Returns:
        The text from the PDF file.

    """

    reader = PdfReader(pdf_path)

    # clean every page independently, in page order
    cleaned_pages = [
        _remove_extra_spaces(
            _remove_page_numbers(page.extract_text(extraction_mode="layout"))
        )
        for page in reader.pages
    ]
    return "\n".join(cleaned_pages)

_word_process_paragraph(p)

Get the text from a paragraph.

Parameters:

Name Type Description Default
p _Element

The paragraph element.

required

Returns:

Type Description
str

The text from the paragraph.

Source code in anonipy/utils/file_system.py
def _word_process_paragraph(p) -> str:
    """Get the text from a paragraph.

    Args:
        p (etree._Element): The paragraph element.

    Returns:
        The text from the paragraph, or an empty string when the element
        reports no text.

    """

    # An element's `text` can be None when it carries no text; normalise it
    # so the declared `str` return type holds and callers can safely feed
    # the result to `str.join`.
    return p.text or ""

_word_process_table(t)

Get the text from a table.

Parameters:

Name Type Description Default
t _Element

The table element.

required

Returns:

Type Description
str

The text from the table.

Source code in anonipy/utils/file_system.py
def _word_process_table(t) -> str:
    """Get the text from a table.

    Paragraph texts inside a cell are joined with a space, cells inside a
    row are joined with a space, and rows are joined with a newline.

    Args:
        t (etree._Element): The table element.

    Returns:
        The text from the table.

    """

    # NOTE(review): the ".//" paths search all descendants, so rows and
    # cells of a nested table are also visited from the outer table —
    # confirm this is intended.
    table_text = []
    for row in t.findall(".//w:tr", WORD_NAMESPACES):
        row_text = []
        for cell in row.findall(".//w:tc", WORD_NAMESPACES):
            # A paragraph element may report `None` as its text; fall back to
            # an empty string so `str.join` does not raise a TypeError.
            cell_text = [
                p.text or "" for p in cell.findall(".//w:p", WORD_NAMESPACES)
            ]
            row_text.append(" ".join(cell_text))
        table_text.append(" ".join(row_text))
    return "\n".join(table_text)

_extract_text_from_word(doc_path)

Extracts text from a Word file.

Parameters:

Name Type Description Default
doc_path str

The path to the Word file.

required

Returns:

Type Description
str

The text from the Word file.

Source code in anonipy/utils/file_system.py
def _extract_text_from_word(doc_path: str) -> str:
    """Read a Word file and return the text of its body.

    The direct children of the document body are visited in order.
    Elements whose tag ends with "p" are handled as paragraphs, elements
    whose tag ends with "tbl" are handled as tables, and all other
    elements are ignored. The collected pieces are joined with newlines.

    Args:
        doc_path: The path to the Word file.

    Returns:
        The text from the Word file.

    """

    body_children = Document(doc_path).element.body

    pieces = []
    for child in body_children:
        tag = child.tag
        if tag.endswith("p"):
            # paragraph element
            pieces.append(_word_process_paragraph(child))
            continue
        if tag.endswith("tbl"):
            # table element
            pieces.append(_word_process_table(child))
    return "\n".join(pieces)

open_file(file_path)

Opens a file and returns its content as a string.

Examples:

>>> from anonipy.utils import file_system
>>> file_system.open_file("path/to/file.txt")
"Hello, World!"

Parameters:

Name Type Description Default
file_path str

The path to the file.

required

Returns:

Type Description
str

The content of the file as a string.

Source code in anonipy/utils/file_system.py
def open_file(file_path: str) -> str:
    """Read a file and return its content as a string.

    The reader is chosen from the (case-insensitive) file extension:
    `.pdf` files go through the PDF extractor, `.doc`/`.docx` files through
    the Word extractor, and `.txt` files are read as UTF-8 text.

    Examples:
        >>> from anonipy.utils import file_system
        >>> file_system.open_file("path/to/file.txt")
        "Hello, World!"

    Args:
        file_path: The path to the file.

    Returns:
        The content of the file as a string.

    Raises:
        FileNotFoundError: If `file_path` is not an existing file.
        ValueError: If the file extension is not supported.

    """

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file does not exist: {file_path}")

    _, file_extension = os.path.splitext(file_path)
    # compare case-insensitively, but report the extension as given
    normalized_extension = file_extension.lower()

    if normalized_extension == ".pdf":
        return _extract_text_from_pdf(file_path)

    # NOTE(review): legacy `.doc` is routed to the same extractor as `.docx`;
    # confirm the underlying reader actually supports the binary format.
    if normalized_extension in [".doc", ".docx"]:
        return _extract_text_from_word(file_path)

    if normalized_extension != ".txt":
        raise ValueError(f"The file extension is not supported: {file_extension}")

    with open(file_path, "r", encoding="utf-8") as text_file:
        return text_file.read()

write_file(text, file_path, encode=True)

Writes the text to a file.

Examples:

>>> from anonipy.utils import file_system
>>> file_system.write_file("Hello, World!", "path/to/file.txt")

Parameters:

Name Type Description Default
text str

The text to write to the file.

required
file_path str

The path to the file.

required
encode Union[str, bool]

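The encoding to use. A string names the encoding explicitly; True selects UTF-8; False leaves the encoding unspecified, so the platform default is used.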

True

Raises:

Type Description
TypeError

If text or file_path is not a string, or if encode is not a string or a boolean.

FileNotFoundError

If the directory does not exist.

Source code in anonipy/utils/file_system.py
def write_file(text: str, file_path: str, encode: Union[str, bool] = True) -> None:
    """Writes the text to a file.

    Examples:
        >>> from anonipy.utils import file_system
        >>> file_system.write_file("Hello, World!", "path/to/file.txt")

    Args:
        text: The text to write to the file.
        file_path: The path to the file. A bare file name (no directory
            component) is written relative to the current working directory.
        encode: The encoding to use. A string names the encoding explicitly;
            `True` selects UTF-8; `False` leaves the encoding unspecified so
            the platform default is used.

    Raises:
        TypeError: If `text` or `file_path` is not a string, or if `encode`
            is not a string or a boolean.
        FileNotFoundError: If the directory part of `file_path` does not exist.

    """

    if not isinstance(text, str):
        raise TypeError("text must be a string")

    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string")

    directory = os.path.dirname(file_path)
    # `dirname` is "" for a bare file name; that refers to the current working
    # directory and must not be reported as a missing directory.
    if directory and not os.path.exists(directory):
        raise FileNotFoundError(f"The directory does not exist: {directory}")

    if not isinstance(encode, (str, bool)):
        raise TypeError("encode must be a string or a boolean")

    if isinstance(encode, str):
        encoding = encode
    else:
        # boolean flag: True -> UTF-8, False -> platform default
        encoding = "utf-8" if encode else None

    with open(file_path, "w", encoding=encoding) as f:
        f.write(text)

open_json(file_path)

Opens a JSON file and returns its content as a dictionary.

Examples:

>>> from anonipy.utils import file_system
>>> file_system.open_json("path/to/file.json")
{"hello": "world"}

Parameters:

Name Type Description Default
file_path str

The path to the JSON file.

required

Returns:

Type Description
dict

The content of the JSON file as a dictionary.

Source code in anonipy/utils/file_system.py
def open_json(file_path: str) -> dict:
    """Load a JSON file and return the parsed content.

    The file is read as UTF-8 text.

    Examples:
        >>> from anonipy.utils import file_system
        >>> file_system.open_json("path/to/file.json")
        {"hello": "world"}

    Args:
        file_path: The path to the JSON file.

    Returns:
        The content of the JSON file as a dictionary.

    Raises:
        FileNotFoundError: If `file_path` is not an existing file.

    """

    if os.path.isfile(file_path):
        with open(file_path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)

    raise FileNotFoundError(f"The file does not exist: {file_path}")

write_json(data, file_path)

Writes data to a JSON file.

Examples:

>>> from anonipy.utils import file_system
>>> file_system.write_json({"hello": "world"}, "path/to/file.json")

Parameters:

Name Type Description Default
data dict

The data to write to the JSON file.

required
file_path str

The path to the JSON file.

required
Source code in anonipy/utils/file_system.py
def write_json(data: dict, file_path: str) -> None:
    """Writes data to a JSON file.

    Missing parent directories of `file_path` are created. The file is
    written as UTF-8 with non-ASCII characters kept as-is and an indent of
    four spaces.

    Examples:
        >>> from anonipy.utils import file_system
        >>> file_system.write_json({"hello": "world"}, "path/to/file.json")

    Args:
        data: The data to write to the JSON file.
        file_path: The path to the JSON file. A bare file name (no directory
            component) is written relative to the current working directory.

    """

    directory = os.path.dirname(file_path)
    # `dirname` is "" for a bare file name; `os.makedirs("")` would raise, and
    # there is nothing to create in that case anyway.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)