! python --version
! which python

Python 3.12.3
/home/floriane/Documents/visual_code/.venvocc/bin/python

import re

def clean_text(text):
    """Re-join words that were split by a hyphen and drop soft hyphens.

    Args:
        text: Raw text to repair.

    Returns:
        The text with hyphen-plus-whitespace breaks between word characters
        removed, soft hyphens (U+00AD) removed, and any remaining
        hyphen-plus-line-break sequences removed.
    """
    # 1. Fix inline hyphens: "agnitio- nem" → "agnitionem".
    #    Lookarounds are used instead of capturing the neighbouring letters so
    #    the letters are not consumed by the match; with capturing groups, in
    #    a chain such as "a- b- c" the "b" is swallowed by the first match and
    #    the second hyphen is left behind.
    text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)

    # 2. Remove soft hyphens (invisible OCR hyphens)
    text = text.replace('\u00AD', '')

    # 3. Remove hyphen + newline that step 1 did not cover (e.g. when the
    #    hyphen is not surrounded by word characters on both sides)
    text = re.sub(r'-\s*\r?\n\s*', '', text)

    return text

import re
import string

def clean_punctuation(text):
    """Strip punctuation from *text*, keeping only the full stop.

    A colon is turned into a space; every other ASCII punctuation mark
    except '.', as well as the quotation marks '„' and '»', is deleted.
    """
    # Everything to delete outright: ASCII punctuation minus '.', plus extras.
    doomed = string.punctuation.replace(".", "") + "„»"

    # One translation table does both jobs; the explicit ':' entry overrides
    # its deletion so that colons become spaces instead.
    table = str.maketrans({mark: None for mark in doomed})
    table[ord(":")] = " "

    return text.translate(table)

def to_lowercase(text):
    """Lowercase *text*, turning every capital 'V' into 'u' on the way.

    Only the uppercase 'V' is rewritten; a lowercase 'v' already present in
    the input is left untouched.
    """
    # Swap V for U while it is still uppercase, then let lower() finish.
    return text.replace("V", "U").lower()

import unicodedata

def remove_accents(text):
    """Return *text* with combining marks (accents, diacritics) stripped.

    The text is decomposed with Unicode NFD so that an accented letter
    becomes a base letter followed by combining marks; the marks are then
    dropped. The result is not recomposed afterwards.
    """
    decomposed = unicodedata.normalize("NFD", text)
    # unicodedata.combining() returns 0 for characters that are not
    # combining marks; only those are kept.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_chars)

import re

def remove_greek(text):
    """Delete every character in the ranges U+0370–U+03FF and U+1F00–U+1FFF.

    These ranges hold Greek letters; surrounding characters (including
    whitespace) are left exactly as they are.
    """
    greek_run = re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]+')
    return greek_run.sub('', text)

import re

def remove_numbers(text):
    """Strip every run of digits from *text*.

    Uses the regex class ``\\d``, which for ``str`` patterns matches any
    Unicode decimal digit, not only ASCII 0-9.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

import pandas as pd
import re 
def ocr_failure(text, csv_path="Dictionnaries/recurrent_ocr_failure_new.csv"):
    """Apply the OCR-error corrections listed in a CSV file to *text*.

    Args:
        text: Text to correct.
        csv_path: CSV file with a 'mistake' column (regular expression to
            search for) and a 'correction' column (replacement string; may
            be empty to delete the match).

    Returns:
        The text after every (mistake, correction) pair has been applied
        with ``re.sub``, in file order.
    """
    # Read every cell as a plain string. With pandas' defaults an empty
    # 'correction' cell becomes NaN (a float), which makes re.sub raise
    # TypeError, and cells such as "NA" or "null" are treated as missing too.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    for pattern, repl in zip(df['mistake'], df['correction']):
        # An empty pattern matches between every character and would insert
        # the replacement everywhere; skip such rows.
        if not pattern:
            continue
        text = re.sub(pattern, repl, text)

    return text

import pandas as pd
import re 
def standardize_latin(text, csv_path="Dictionnaries/corpus_standardize.csv"):
    """Apply the spelling standardizations listed in a CSV file to *text*.

    Args:
        text: Text to standardize.
        csv_path: CSV file with an 'original' column (regular expression to
            search for) and a 'standardize' column (replacement string; may
            be empty to delete the match).

    Returns:
        The text after every (original, standardize) pair has been applied
        with ``re.sub``, in file order.
    """
    # Read every cell as a plain string. With pandas' defaults an empty
    # 'standardize' cell becomes NaN (a float), which makes re.sub raise
    # TypeError, and cells such as "NA" or "null" are treated as missing too.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    for pattern, repl in zip(df['original'], df['standardize']):
        # An empty pattern matches between every character and would insert
        # the replacement everywhere; skip such rows.
        if not pattern:
            continue
        text = re.sub(pattern, repl, text)

    return text

import pandas as pd
import re

def solved_abreviation(text, csv_path="Dictionnaries/abreviation_solved_new.csv"):
    """Expand the abbreviations listed in a CSV file within *text*.

    Args:
        text: Text in which to expand abbreviations.
        csv_path: CSV file with an 'abreviation' column (regular expression
            to search for) and a 'resolution' column (replacement string;
            may be empty to delete the match).

    Returns:
        The text after every (abreviation, resolution) pair has been applied
        with ``re.sub``, in file order.
    """
    # Read every cell as a plain string. With pandas' defaults an empty
    # 'resolution' cell becomes NaN (a float), which makes re.sub raise
    # TypeError, and cells such as "NA" or "null" are treated as missing too.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    for pattern, repl in zip(df['abreviation'], df['resolution']):
        # An empty pattern matches between every character and would insert
        # the replacement everywhere; skip such rows.
        if not pattern:
            continue
        text = re.sub(pattern, repl, text)

    return text

import pandas as pd
import re

def solved_biblicalab(text, csv_path="Dictionnaries/biblicalab_solved_new.csv"):
    """Expand the biblical abbreviations listed in a CSV file within *text*.

    Args:
        text: Text in which to expand abbreviations.
        csv_path: CSV file with an 'abreviation2' column (regular expression
            to search for) and a 'resolution2' column (replacement string;
            may be empty to delete the match).

    Returns:
        The text after every (abreviation2, resolution2) pair has been
        applied with ``re.sub``, in file order.
    """
    # Read every cell as a plain string. With pandas' defaults an empty
    # 'resolution2' cell becomes NaN (a float), which makes re.sub raise
    # TypeError, and cells such as "NA" or "null" are treated as missing too.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

    for pattern, repl in zip(df['abreviation2'], df['resolution2']):
        # An empty pattern matches between every character and would insert
        # the replacement everywhere; skip such rows.
        if not pattern:
            continue
        text = re.sub(pattern, repl, text)

    return text

def remove_dot_with_spaces(text):
    """Remove every isolated full stop, i.e. a '.' with a space on each side.

    Each " . " is replaced by a single space. A '.' that directly follows a
    word ("end. Next") is left alone.
    """
    # str.replace scans left to right without overlap, so in "a . . b" the
    # space shared by the two dots is consumed by the first match and the
    # second dot survives a single pass. Repeat until no isolated dot is
    # left; every pass shortens the string, so the loop terminates.
    while " . " in text:
        text = text.replace(" . ", " ")
    return text
    
# Example: each " . " (dot with a space on both sides) becomes one space.
s = "This is a test . With some dots . "
cleaned = remove_dot_with_spaces(s)
print(cleaned)  # Output: "This is a test With some dots " (trailing space kept)

This is a test With some dots

def remove_double_spaces(text):
    """
    Collapse every run of whitespace in the text into a single space.

    Because the text is split on any whitespace, tabs and line breaks are
    collapsed as well, and leading/trailing whitespace is dropped.
    """
    # split() with no argument discards empty pieces, so joining the words
    # back together leaves exactly one space between neighbours.
    words = text.split()
    single_spaced = ' '.join(words)
    return single_spaced

# Example usage: runs of spaces between words are reduced to one space.
text = "This  is   an example  with  double spaces."
cleaned_text = remove_double_spaces(text)
print(cleaned_text)  # prints: This is an example with double spaces.

This is an example with double spaces.

def add_newline_after_dot(text):
    """
    Put a line break after every period in the text.

    Each '.' together with any whitespace that follows it is rewritten as
    '.' plus a newline, so a text ending in '.' ends with a newline.
    """
    import re

    # A period and whatever whitespace (possibly none) trails it.
    dot_and_gap = re.compile(r"\.\s*")
    return dot_and_gap.sub(".\n", text)

# Example usage: each sentence ends up on its own line.
text = "This is a sentence. This is another sentence. And one more."
result = add_newline_after_dot(text)
print(result)

This is a sentence.
This is another sentence.
And one more.

def clean_all(text):
    """Run the complete cleaning pipeline over *text* and return the result.

    The steps run in a fixed order, each one receiving the previous step's
    output; a progress line is printed after every step completes.
    """
    # (step, progress message) pairs, in execution order.
    pipeline = (
        (clean_text, "clean_text: ok"),
        (clean_punctuation, "clean_punctuation_and_newlines: ok"),
        (to_lowercase, "to_lowercase: ok"),
        (remove_accents, "remove_accents: ok"),
        (remove_greek, "remove_greek: ok"),
        (remove_numbers, "remove_numbers: ok"),
        (ocr_failure, "ocr_failure_corrected : ok"),
        (standardize_latin, "standardize_latin: ok"),
        (solved_abreviation, "solved_abreviation: ok"),
        (solved_biblicalab, "solved biblcal abreviation : ok"),
        (remove_dot_with_spaces, "remove_dot_with_spaces: ok"),
        (remove_double_spaces, "remove_double_spaces:ok"),
        (add_newline_after_dot, "add_newline_after_dot: ok"),
    )

    for step, done_message in pipeline:
        text = step(text)
        print(done_message)

    return text

import glob
import os

# Collect every .txt file under test/ (recursively). Sorting makes the
# processing order, and therefore the log, reproducible across runs.
paths = sorted(glob.glob("test/**/*.txt", recursive=True))

for path in paths:
    base, ext = os.path.splitext(path)

    # Outputs are written next to their inputs as "<name>_cl.txt", so they
    # match the glob above on a re-run. Skip them; otherwise each run would
    # re-clean its own output and create "<name>_cl_cl.txt", and so on.
    # NOTE: a source file whose own name ends in "_cl" is skipped as well.
    if base.endswith("_cl"):
        continue

    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    cleaned = clean_all(text)

    # create new filename: file.txt → file_cl.txt
    new_path = base + "_cl" + ext

    with open(new_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    print(f"Saved cleaned file: {new_path}")

clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v1-2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v11-12_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v5-6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v6-7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v3-4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v9-10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Pellicanus/C_II_v13-14_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v9_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v1_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v11-12_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v5-6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v6-7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v3-4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_v13-14_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Unbekannt/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v1-2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v15epb_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v1_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v11-15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v3-4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v15ep_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v9-10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_v5-7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bullinger/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v13_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v14_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v1b_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v12_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v9_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v2b_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v6b_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v1_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v11_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v3_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Aretius/C_II_v7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v1-2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v11-12_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v5-6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v3-4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v8-10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v15ep_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v13-14_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Hyperius/C_II_v7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Calvin/C_II_v1-2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Calvin/C_II_v2-4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Calvin/C_II_v11-15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Calvin/C_II_v8-10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Calvin/C_II_v5-7_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lefevre/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v5_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v1_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v11_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_v8b_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Bugenhagen/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Cajetan/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v5_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v13_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v2_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v14_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v12_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v9_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v8_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v4_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v1_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v15_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v6_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v11_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v12b_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v3_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v10_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_cl.txt
clean_text: ok
clean_punctuation_and_newlines: ok
to_lowercase: ok
remove_accents: ok
remove_greek: ok
remove_numbers: ok
ocr_failure_corrected : ok
standardize_latin: ok
solved_abreviation: ok
solved biblcal abreviation : ok
remove_dot_with_spaces: ok
remove_double_spaces:ok
add_newline_after_dot: ok
Saved cleaned file: test/Lambertus/C_II_v7_cl.txt

import re
import glob
import os

def sort_key(filename):
    """
    Build the ordering key used to sort cleaned verse files.

    Only the base name of *filename* is inspected. The key is a 3-tuple:
    1. the number following '_v'
    2. the number following '-' when a range is given, otherwise the
       same value as the first element
    3. a rank for the optional suffix: 0 = none, 1 = 'ep', 2 = 'epb'

    Names without a '_v<number>' part get (-1, -1, 0), so they sort
    before every numbered file. Any other suffix after the number
    (for example a lone 'b') is not captured and ranks as 0.
    """
    name = os.path.basename(filename)
    found = re.search(r'_v(\d+)(?:-(\d+))?(epb|ep)?', name)
    if found is None:
        return (-1, -1, 0)

    first, last, suffix = found.groups()
    start = int(first)
    # A missing range end collapses the range onto its start.
    end = int(last) if last else start
    # Unknown or absent suffixes fall back to rank 0.
    ep_flag = {"ep": 1, "epb": 2}.get(suffix, 0)
    return (start, end, ep_flag)

# Merge the cleaned per-verse files of every author folder into one file.
# For each subfolder of 'test', all '*_cl.txt' files are concatenated in
# verse order into '<folder>_all_cl.txt', written to the current working
# directory.

# Get all subfolders under 'test'
folders = [f for f in glob.glob("test/*") if os.path.isdir(f)]

for folder in folders:
    # Find all txt files ending with '_cl.txt' in this folder
    files = glob.glob(os.path.join(folder, "*_cl.txt"))

    # glob returns files in arbitrary order and sort_key gives the same key
    # to names such as 'C_II_v8_cl.txt' and 'C_II_v8b_cl.txt'. The base name
    # is used as a tie-breaker so the merged order is deterministic
    # ('..._v8_cl.txt' sorts before '..._v8b_cl.txt').
    files_sorted = sorted(
        files,
        key=lambda path: (sort_key(path), os.path.basename(path)),
    )

    # Output file name is based on the folder
    folder_name = os.path.basename(folder)
    output_file = f"{folder_name}_all_cl.txt"

    with open(output_file, "w", encoding="utf-8") as outfile:
        for fname in files_sorted:
            with open(fname, "r", encoding="utf-8") as infile:
                outfile.write(infile.read())
                outfile.write("\n")  # newline between files

    print(f"Merged files in folder '{folder_name}' into '{output_file}'")

Merged files in folder 'Pellicanus' into 'Pellicanus_all_cl.txt'
Merged files in folder 'Unbekannt' into 'Unbekannt_all_cl.txt'
Merged files in folder 'Bullinger' into 'Bullinger_all_cl.txt'
Merged files in folder 'Aretius' into 'Aretius_all_cl.txt'
Merged files in folder 'Hyperius' into 'Hyperius_all_cl.txt'
Merged files in folder 'Calvin' into 'Calvin_all_cl.txt'
Merged files in folder 'Lefevre' into 'Lefevre_all_cl.txt'
Merged files in folder 'Bugenhagen' into 'Bugenhagen_all_cl.txt'
Merged files in folder 'Cajetan' into 'Cajetan_all_cl.txt'
Merged files in folder 'Lambertus' into 'Lambertus_all_cl.txt'

# Disabled snippet: everything below is wrapped in a string literal, so none
# of it is executed (the cell only evaluates to that string).
# If re-enabled, it would read 'Dictionnaries/recurrent_ocr_failure.csv' and
# write 'new_dict.csv' with every field quoted and a trailing space appended
# to the second column of each row.
# NOTE(review): the loop does not skip a header row, so a header's second
# field would also receive the trailing space - confirm before re-enabling.
"""""""""
import csv

# Input: old CSV
with open('Dictionnaries/recurrent_ocr_failure.csv', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    rows = [row for row in reader]

# Output: new CSV
with open('new_dict.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    for row in rows:
        pattern = row[0]
        replacement = row[1] + " "  # Add trailing space
        writer.writerow([pattern, replacement])

print("CSV updated to new format with trailing spaces!")

"""""""""

'\nimport csv\n\n# Input: old CSV\nwith open(\'Dictionnaries/recurrent_ocr_failure.csv\', newline=\'\', encoding=\'utf-8\') as infile:\n    reader = csv.reader(infile)\n    rows = [row for row in reader]\n\n# Output: new CSV\nwith open(\'new_dict.csv\', \'w\', newline=\'\', encoding=\'utf-8\') as outfile:\n    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)\n    for row in rows:\n        pattern = row[0]\n        replacement = row[1] + " "  # Add trailing space\n        writer.writerow([pattern, replacement])\n\nprint("CSV updated to new format with trailing spaces!")\n\n'

Data Cleaning and Preparation Script for Lemmatization¶

Cleaning TXT Files¶

Steps¶

Method¶

Tips¶

Verify your Python setup¶

Cleaning step¶

For other data processing¶