1 Introduction

Objective of the project. Description of the data. Method employed.

1.1 Bibliography

@manual{uqslc2025ladal,
  author = {Schweinberger, Martin},
  title = {The Language Technology and Data Analysis Laboratory (LADAL)},
  note = {https://ladal.edu.au},
  year = {2025},
  organization = {The University of Queensland, School of Languages and Cultures},
  address = {Brisbane},
  edition = {2025.04.01}
}

This workflow follows LADAL tutorial one: https://slcladal.netlify.app/textanalysis#1_Concordancing

1.2 System check

  1. Verify the R and Python paths:

Open R in the correct Python environment (to use it in visual_code):

#install.packages("reticulate")   # if not already installed
#library(reticulate)
#use_python("/home/floriane/Documents/visual_code/.venvocc/bin/python", required = TRUE)
#use_python("/home/floriane/Documents/visual_code/.venvocc/bin/python", required = TRUE)
#py_config()  # check Python version and path

Verification:

system("python --version", intern = TRUE)
## [1] "Python 3.12.3"
system("which python", intern = TRUE)
## [1] "/usr/bin/python"
system("which R", intern = TRUE) 
## [1] "/usr/bin/R"

2 Setup

We specify which folder we will be working on.

2.1 Load packages

We choose the server hosting the CRAN repository.

# Set the CRAN mirror to ETH Zurich
# (interactively: run chooseCRANmirror() and choose 63 for Switzerland)
options(repos = c(CRAN = "https://stat.ethz.ch/CRAN/"))
# install packages
install.packages("DT")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("knitr")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("kableExtra")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("quanteda")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("tidyverse")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("ggplot2")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("tm")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("tidytext")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("wordcloud2")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("scales")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("quanteda.textstats")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("quanteda.textplots")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("tidyr")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("cluster")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("class")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("NLP")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
#install.packages("openNLP")
#install.packages("openNLPdata")
install.packages("pacman")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("webshot2", repos = "https://cloud.r-project.org")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
install.packages("stringr")
## Installation du package dans '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (car 'lib' n'est pas spécifié)
# install klippy for copy-to-clipboard button in code chunks
remotes::install_github("rlesur/klippy")
## Skipping install of 'klippy' from a github remote, the SHA1 (378c247f) has not changed since last install.
##   Use `force = TRUE` to force installation
remotes::install_github("nx10/httpgd") # la version cran a été supprimle en 2025...
## Skipping install of 'httpgd' from a github remote, the SHA1 (dd6ed3a6) has not changed since last install.
##   Use `force = TRUE` to force installation
# set options
options(stringsAsFactors = F)
options(scipen = 999)
options(max.print=1000)
# load packages
library(DT)
library(tidyverse)
library(flextable)
library(quanteda)
library(tm)
library(tidytext)
library(wordcloud2)
library(ggplot2)
library(scales)
library(quanteda.textstats)
library(quanteda.textplots)
library(tidyr)
library(cluster)
library(class)
library(NLP)
#library(openNLP)
#library(openNLPdata)
library(pacman)
pacman::p_load_gh("trinker/entity")
## rJava       (NA -> 1.0-11 ) [CRAN]
## openNLPdata (NA -> 1.5.3-5) [CRAN]
## openNLP     (NA -> 0.2-7  ) [CRAN]
## Warning in i.p(...): installation of package 'rJava' had non-zero exit
## status
## Warning in i.p(...): installation of package 'openNLPdata' had non-zero
## exit status
## Warning in i.p(...): installation of package 'openNLP' had non-zero exit
## status
## ── R CMD build ─────────────────────────────────────────────────────────────────
## * checking for file ‘/tmp/RtmprZiivI/remotes3acda4d4eb2d5/trinker-entity-5549d30/DESCRIPTION’ ... OK
## * preparing ‘entity’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## * building ‘entity_0.1.0.tar.gz’
## Warning in i.p(...): installation of package
## '/tmp/RtmprZiivI/file3acda267aa131/entity_0.1.0.tar.gz' had non-zero exit
## status
## Warning in p_install_gh(package, dependencies, ...): The following may have incorrect capitalization specification:
## 
## entity
## Warning in pacman::p_load_gh("trinker/entity"): 
## Failed to install/load:
## trinker/entity
library(stringr)
library(webshot2)
library(httpgd)
# activate klippy for copy-to-clipboard button
klippy::klippy()

3 Concordancing

Make one list with all the chapters to compare.

3.1 Prepare and store the data

  • Create a named character vector where each element contains the full text of one chapter.
  • Store these elements in a vector called chapter_texts, using names formatted as
    Author_C_ChapterNumber to clearly organize chapters by author and chapter number.
library(tidyverse)
library(stringr)

# Path to folder that contains the chapter files
path <- "Chapter_II_cl"

# List all .txt files
files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)

# Read all files into a named vector chapter_text
chapter_texts <- files %>%
  set_names(basename(files) %>%    # use the file names as the basis for each element's name
  str_replace("^([^_]+)_.*?(C_[IVXLCDM]+).*\\.txt$", "\\1_\\2.txt")) %>%  # regex reconstituting the name of each lemmatized file
  map_chr(readr::read_file)         # read each file as one string
  • Verify the results:
    • List the names and the first 500 characters of the first five files.
# Get the first 5 chapters
first5 <- chapter_texts[1:5]

# Loop through each and print nicely
walk2(names(first5), first5, ~ {
  cat("=== File:", .x, "===\n")
  cat(str_sub(.y, 1, 500), "\n\n")  # show first 500 chars
})
## === File: Aretius_all_C_II_cl_lem.txt ===
## caput ii . d uo sum fidelis officium in publicus coetis audio sanus doctrina ex verbum deus et cum totus coetus prex ad dominus fundo . de prior iam actus sum caput eo primo . nunc ad alter descendo admoneo timotheus vt suus auctor sum in ephesinus ecclesia vt in prex seruo pietas congruo praesertim quum ecclesia adhuc omnis sub infidelis ago magistra . et quia in coetis publicus quoque oro mulier adhuc tamenfortus ethnicus mos parum decor se gero hic praescribo suus quidam decus qui is omnino s 
## 
## === File: Bugenhagen_all_C_II_cl_lem.txt ===
## caput ii . o ratio sum cor desiderium pro res ab deus impetro . hic desiderium si uerus sum numquam cesso donec ab deus qui desydero accipio hic sum qui christus dico oportet semper oro et non deficio . qui non desydero nihil oro ut magne multiloquis aer compleo qui uero desydero etiam ab uerbum quandoque non abstineo . hinc facio ut generalis uocabulum oratio uoco quicumque cum deus collocutio etiam cum laudo in psalmus et cantus cis spiritualis is sum qui facio incordus . nam ut numquam cesso  
## 
## === File: Bullinger_all_C_II_cl_lem.txt ===
## cap. ii . rum pelago sum ueritas canonicus fides purus et charitas sincer . in hic qui persto et purus simplicitas ad portus tendo salus saluus portus attingo felicitas aeternus porro qui flatus arrogantia et contentio uela do abreptique malus cupiditas fluctus ab uia rectus recedo et fides lucidus cynosura negligo impingo in perfidium scopulus atque intereo . quis uero sum satana trado expono sum corinthius caput finis hic traditio sum ut pudor probroquus corricpeo disco ab impius parumques chr 
## 
## === File: Cajetan_all_C_II_cl_lem.txt ===
## caput . ii . o bsecrum . pro . adhortor igitur primus omnis . tracto necessitas quare reliquor timotheus ephesus incipio prosequor propono . unde tamquam ad propono exsequor redio illatiue dico adhortor igitur . propono autem quod finis praecaepo sum dilectio de cor purus et propterea ab exsecutio dilectio erga omnis inchho . primus antem et communis dilectio actus erga omnis sum oratio . et ideo tracto ille parriculus dilectio horto ante omnis oro pro omnis . facio obsecratio . pro . preces. pe 
## 
## === File: Calvin_all_C_II_cl_lem.txt ===
## dhoreo igitur vt ante omnis facio deprecatio obsecratio interpellatio gratia actio pro omnis homo pro rex et omnis in eminentia constituo vt placidus et quiesco vitas dego cum omnis pietas et honestas .
## hic enim bonum et accipio coram saluator noster deus qui omnis homo vum saluus facio et ad agnitio veritas venio . adhortor igitur . hic pietas exercitium exerceo nos in sincer cultus deus atque timor foueo bonus conscientia de qui dico . quare non abo res illatiuus particula vto quia ex superior
  • Create a data frame and visualise it in R:
# Convert named vector into a tibble
chapters_df <- tibble(
  file = names(chapter_texts),    # filenames
  text = chapter_texts             # text content
)

# View the first few rows
head(chapters_df, 3)
## # A tibble: 3 × 2
##   file                text                                                      
##   <chr>               <chr>                                                     
## 1 Aretius_C_II.txt    "caput ii . d uo sum fidelis officium in publicus coetis …
## 2 Bugenhagen_C_II.txt "caput ii . o ratio sum cor desiderium pro res ab deus im…
## 3 Bullinger_C_II.txt  "cap. ii . rum pelago sum ueritas canonicus fides purus e…
  • Visualise as an interactive table in HTML:
library(DT)
library(stringr)
library(dplyr)

DT::datatable(
  chapters_df %>% 
    mutate(
      # Show first 150 characters, add ellipsis if truncated, tooltip contains full text
      text = paste0(
        str_sub(text, 1, 150),
        ifelse(str_length(text) > 150, "…", ""),
        "<span title='", text, "'></span>"
      )
    ),
  escape = FALSE,           # allow HTML for tooltip
  rownames = FALSE,         # hide row numbers
  options = list(
    pageLength = 10,        # number of rows per page
    scrollX = TRUE,         # allow horizontal scrolling
    searching = FALSE       # remove search box
  )
)

4 Tokenizing and Concordancing

Tokenizing creates a structured, searchable version of your text where each word is a token with a position, which can then be used for analyses like KWIC (Key Word In Context).

  • Tokenize the character vector (here chapter_texts) to create a new object chapter_tokens, where each element (i.e., word) has an explicit position.
  1. Tokenizing a character vector with quanteda
  • Splits each chapter in chapter_texts into tokens (words, punctuation, etc.) stored in chapter_tokens.
  • Each token has an implicit position according to its order in the chapter.
    • Example: chapter_tokens[[1]][3] → the 3rd word in the first chapter.
  2. Using KWIC (Key Word In Context)
  • You can search for a word in chapter_tokens.
  • kwic(chapter_tokens, "mulier") automatically finds all occurrences of the word and provides explicit positions (from, to) so you can extract the surrounding context (see the quick check below).
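As a quick check, here is a minimal sketch of both points above (assuming chapter_texts from section 3.1 is in memory):

library(quanteda)

# tokenize (the same step is performed in section 4.1 below)
chapter_tokens <- tokens(chapter_texts)

# implicit position: the 3rd token of the first chapter
chapter_tokens[[1]][3]

# explicit positions: the 'from'/'to' columns of the KWIC table
head(kwic(chapter_tokens, pattern = "mulier"), 3)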

4.1 Example with the term "mulier"

  • The kwic function allows you to choose the length of the context (number of words before and after the selected occurrence) using window = number of context tokens.
  • You can select a specific occurrence using pattern = "desired occurrence".
library(quanteda)
library(dplyr)


# tokenize your character vector
chapter_tokens <- tokens(chapter_texts)

# create KWIC
kwic_m <- kwic(chapter_tokens, 
               pattern = "mulier", 
               window = 6) %>%
  as.data.frame() %>%
  select(-to, -from, -pattern)
  • HTML visualization
library(quanteda)
library(dplyr)
library(DT)


# create KWIC and clean columns
kwic_df <- kwic_m %>%
  select(docname, pre, keyword, post)  # keep relevant columns

# Display as interactive table in HTML
DT::datatable(kwic_df,
              options = list(pageLength = 10, scrollX = TRUE))

4.2 Example: “mulier” and its declensions

  • To see all inflected forms of a word, you can use a regular expression (with valuetype = "regex").
    For example, pattern = "mulier.*" will match all tokens that start with "mulier".
# create kwic
kwic_ml <- kwic(chapter_tokens, 
                pattern = "mulier.*",
                window = 6,
                valuetype = "regex") %>%
  # convert into a data frame
  as.data.frame() %>%
  # remove superfluous columns
  dplyr::select(-to, -from, -pattern)
  • HTML visualization
  • Simply type the author’s name in the search bar to see all occurrences of a term associated with that name.
library(quanteda)
library(dplyr)
library(DT)


# create KWIC and clean columns
kwic_df <- kwic_ml %>%
  select(docname, pre, keyword, post)  # keep relevant columns

# Display as interactive table in HTML
DT::datatable(kwic_df,
              options = list(pageLength = 10, scrollX = TRUE))
View(kwic_ml)

5 Word frequency

To analyze word frequencies, the text must first be lemmatized.
The lemmatization script is executed in Python using the CLTK library; a notebook can be found here:
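For orientation only, a minimal sketch of that lemmatization step driven from R via reticulate; it assumes the cltk Python package and its Latin models are installed in the environment configured in section 1.2, and it is not taken from the original notebook:

library(reticulate)

# import the CLTK pipeline (assumes 'cltk' is installed in the active Python environment)
cltk <- import("cltk")
nlp  <- cltk$NLP(language = "lat")

# analyze a short Latin example and extract the lemmata
doc <- nlp$analyze(text = "mulieres in ecclesiis taceant")
doc$lemmata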

5.1 Prepare the data

Use the same script as in the Concordancing section, but change the folder path.
Since the same variable names are used, the previous values will be overwritten for the new dataset.

library(tidyverse)
library(stringr)

# Path to folder that contains the chapter files
path <- "Chapter_II_lem"

# List all .txt files
files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)

# Read all files into a named vector chapter_text
chapter_texts <- files %>%
  set_names(basename(files) %>%    # use the file names as the basis for each element's name
  str_replace("^([^_]+)_.*?(C_[IVXLCDM]+).*\\.txt$", "\\1_\\2.txt")) %>%  # regex reconstituting the name of each lemmatized file
  map_chr(readr::read_file)         # read each file as one string
view(chapter_texts)

5.2 List All Words

  • Clean the data
    Note: An initial cleaning has already been performed before lemmatization (see notebook).
    • Here, we complete the cleaning process by:
      • Converting all text to lowercase
      • Removing non-word characters
      • Removing punctuation
      • Removing extra whitespace, keeping only single spaces
  • Store all the words contained in chapter_texts in a vector chapter_words,
    where each element is a single word from all chapters.
chapter_words <- chapter_texts  %>%
  # convert everything to lower case
  tolower() %>%
  # remove non-word characters
  str_replace_all("[^[:alpha:][:space:]]+", "")  %>%
  tm::removePunctuation() %>%
  stringr::str_squish() %>%
  stringr::str_split(" ") %>% # Splits each string into individual words using space " " as the separator.
  unlist()

Verify that all the data have been processed:

head(chapter_words)
## Aretius_C_II.txt1 Aretius_C_II.txt2 Aretius_C_II.txt3 Aretius_C_II.txt4 
##           "caput"              "ii"             "duo"             "sum" 
## Aretius_C_II.txt5 Aretius_C_II.txt6 
##         "fidelis"        "officium"
tail(chapter_words, 20) 
## Unbekannt_C_II.txt4010 Unbekannt_C_II.txt4011 Unbekannt_C_II.txt4012 
##                "caueo"              "incumbo"                  "sed" 
## Unbekannt_C_II.txt4013 Unbekannt_C_II.txt4014 Unbekannt_C_II.txt4015 
##               "potius"              "inculco"               "studeo" 
## Unbekannt_C_II.txt4016 Unbekannt_C_II.txt4017 Unbekannt_C_II.txt4018 
##                "mater"               "filius"                  "qui" 
## Unbekannt_C_II.txt4019 Unbekannt_C_II.txt4020 Unbekannt_C_II.txt4021 
##                 "deus"                "gigno"                   "et" 
## Unbekannt_C_II.txt4022 Unbekannt_C_II.txt4023 Unbekannt_C_II.txt4024 
##             "proximus"                "fides"             "dilectio" 
## Unbekannt_C_II.txt4025 Unbekannt_C_II.txt4026 Unbekannt_C_II.txt4027 
##                "sanct"                 "immo"                  "iam" 
## Unbekannt_C_II.txt4028 Unbekannt_C_II.txt4029 
##            "pudicitia"            "sobrietas"
view(chapter_words)       # inspect the full vector in the viewer

5.2.1 Create a data frame with word frequencies

  • Use the table function: counts the number of occurrences of each unique word in chapter_words.

  • Returns a table where the names are the words and the values are their frequencies.

  • Convert the table to a data frame for easier manipulation and analysis.

  • Remove problematic entries before running the analysis, as this simplifies data cleaning. NB: use the frequency table created below to assess data cleanliness, then add any corrections that prove necessary.

# remove or replace the problematic words
chapter_words <- gsub("^vers\\.$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^vers$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^verf$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^cap$", "caput", chapter_words, ignore.case = TRUE)
# create table
wfreq <- chapter_words %>%
  table() %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  dplyr::rename(word = 1,
                frequency = 2)
  • HTML visualization: data frame
view(wfreq)

DT::datatable(wfreq,
              options = list(pageLength = 10, scrollX = TRUE))
  • Remove irrelevant words for thematic classification
    • To focus on meaningful words, filter out stopwords.
    • Use the stopword list provided in stopwords/latin_stop_word.csv to remove common or irrelevant words from your dataset.
  • Adding additional stopwords
    • If some words need to be removed that are not in the stopword list, you can:
      1. Add them directly to the CSV file.
      2. Or remove them programmatically, e.g. with gsub() (see the sketch after the code below).
stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)
# create table wo stopwords
wfreq_wostop <- wfreq %>%
  anti_join(stop_words, by = "word") %>%
  dplyr::filter(word != "")
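A small sketch of option 2; the added words are hypothetical examples, and it assumes the CSV has a single word column:

# Sketch: extend the stopword list in R instead of editing the CSV.
# "atque" and "enim" are hypothetical examples, not curated additions.
extra_stops <- data.frame(word = c("atque", "enim"))
stop_words  <- rbind(stop_words, extra_stops)

# re-run the filtering with the extended list
wfreq_wostop <- wfreq %>%
  anti_join(stop_words, by = "word") %>%
  dplyr::filter(word != "")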
  • HTML visualization: data frame
view(wfreq_wostop)

DT::datatable(wfreq_wostop,
              options = list(pageLength = 10, scrollX = TRUE))

5.3 Plot the Top 10 Most Frequent Words

  • Select the 10 most frequent non-stop words from the corpus.
library(ggplot2)
library(dplyr)
# Create the plot and assign it to an object
p <- wfreq_wostop %>%
  head(10) %>%
  ggplot(aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Top 10 Non-Stop Words by Frequency\nin 1 Timothy Chapter 2 Commentaries",
       x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, size = 12, hjust = 1))

# visualise the plot
print(p)

# Save the plot to a PNG file
ggsave("top10_words_corpus.png", plot = p, width = 8, height = 5, dpi = 300)

6 Word Frequencies for a Specific Text

  • Here: example of Lambert Daneau.
    Focus only on the chosen text and use the same code;
    just point the path at your file.
chapter_words_lam <- readLines("Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
  # convert everything to lower case
  tolower() %>%
  # remove non-word characters
  str_replace_all("[^[:alpha:][:space:]]+", "")  %>%
  tm::removePunctuation() %>%
  stringr::str_squish() %>%
  stringr::str_split(" ") %>% # Splits each string into individual words using space " " as the separator.
  unlist()
  • Remove words if needed
# remove or replace the problematic words
chapter_words_lam <- gsub("^vers\\.$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^vers$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^verf$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^cap$", "caput", chapter_words_lam, ignore.case = TRUE)
  • make a table
# create table
wfreq_lam <- chapter_words_lam %>%
  table() %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  dplyr::rename(word = 1,
                frequency = 2)
  • Remove the stop words
# create table wo stopwords
wfreq_wostop_lam <- wfreq_lam %>%
  anti_join(stop_words, by = "word") %>%
  dplyr::filter(word != "")
DT::datatable(wfreq_wostop_lam,
              options = list(pageLength = 10, scrollX = TRUE))

6.1 Plot the Top 10 Most Frequent Words

library(ggplot2)
library(dplyr)
# Create the plot and assign it to an object
p <- wfreq_wostop_lam %>%
  head(10) %>%
  ggplot(aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "#528852") +
  labs(title = "10 Most Frequent Non-Stop Words in\n1 Timothy Chapter 2 Commentary by Lambert Daneau",
       x = "", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, size = 12, hjust = 1))

# visualise the plot
print(p)

# Save the plot to a PNG file
ggsave("top10_words_lambertus.png", plot = p, width = 8, height = 5, dpi = 300)

6.2 Visualize as Word Cloud

  • for the corpus
library(wordcloud2)
library(htmlwidgets)
library(webshot2)

wc <- wordcloud2(wfreq_wostop[1:100,],
                 shape = "diamond",
                 color = scales::viridis_pal()(8)
)

# save the widget, then screenshot it to PNG
saveWidget(wc, "temp_wc.html", selfcontained = TRUE)
webshot("temp_wc.html", "wordcloud_corpus.png", vwidth = 800, vheight = 600)

  • for Lambertus
library(wordcloud2)
library(htmlwidgets)
library(webshot2)

wc <- wordcloud2(wfreq_wostop_lam[1:100,],
                 shape = "diamond",
                 color = scales::viridis_pal(option = "rocket")(8)
)

# save the widget, then screenshot it to PNG
saveWidget(wc, "temp_wc.html", selfcontained = TRUE)
webshot("temp_wc.html", "wordcloud_lambertus.png", vwidth = 800, vheight = 600)

6.3 Visualize as a Comparison Cloud

This prepares the texts for comparison word clouds by author: all lines of each file are combined into a single string.

  • for 2 authors
clean_text <- function(text) {
  # Patterns: optional whitespace before, word boundary after
  patterns <- c("\\s*vers\\.\\b", "\\s*vers\\b", "\\s*verf\\b", "\\s*cap\\b", "\\s*quanquam\\b")
  # keep a leading space in the replacements so the substituted word
  # is not glued onto the preceding one
  replacements <- c(" versus", " versus", " versus", " caput", " ")
  
  for (i in seq_along(patterns)) {
    text <- gsub(patterns[i], replacements[i], text, ignore.case = TRUE)
  }
  
  # remove extra spaces created by the replacements
  text <- gsub("\\s+", " ", text)
  text <- trimws(text)
  
  return(text)
}
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
 
#bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
 # paste0(collapse = " ")
#bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#cajetan  <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#calvin  <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
 #paste0(collapse = " ")
#hyperius  <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
# paste0(collapse = " ")
lambertus  <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
#lefevre  <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#pellican  <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
 # paste0(collapse = " ")
aretius <- clean_text(aretius)
lambertus <- clean_text(lambertus)
  1. Create the corpus
  • assign an author name to each document:
corp_dom <- quanteda::corpus(c(aretius, lambertus))  #calvin, hyperius, lambertus, pellican, unbekannt
attr(corp_dom, "docvars")$Author = c("Aretius", "Lambertus") #, "Calvin", "Hyperius", "Lambertus", "Pellicanus", "Unbekant"

Verify the data:

view(corp_dom)

Word cloud of 2 authors: creates a word cloud comparing the authors, showing the 60 most frequent words and scaling the largest word to size 6.

# optionally open the plot in an external window
#x11()       # on Linux

# Read your CSV
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)

# If your CSV has a column named "word"
latin_stop_words <- latin_stop_words$word  # character vector

# Then run your code
corp_dom %>%
  quanteda::tokens(remove_punct = TRUE) %>%
  quanteda::tokens_remove(latin_stop_words) %>%
  quanteda::dfm() %>%
  quanteda::dfm_trim(min_termfreq = 15, verbose = FALSE) %>%
  quanteda::dfm_group(groups = corp_dom$Author) %>%
  quanteda.textplots::textplot_wordcloud(
    comparison = TRUE, 
    max_words = 60,
    max_size = 6,
    color = RColorBrewer::brewer.pal(8, "Dark2")
  )

  • For 4 authors
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
#bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#cajetan  <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#calvin  <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
 #paste0(collapse = " ")
hyperius  <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
 paste0(collapse = " ")
lambertus  <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
#lefevre  <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#pellican  <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
#unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
  #paste0(collapse = " ")
  • Use the cleaning function
aretius <- clean_text(aretius)
bugenhagen <- clean_text(bugenhagen)
hyperius <- clean_text(hyperius)
lambertus <- clean_text(lambertus)
corp_dom_4 <- quanteda::corpus(c(aretius, bugenhagen,  hyperius, lambertus  )) #bullinger, #calvin, #pellican, #lefevre #cajetan #unbekannt
attr(corp_dom_4, "docvars")$Author = c("Aretius", "Bugenhagen","Hyperius", "Lambertus"  )  #"Bullinger" #"Cajetan", "Lefevre", #"Calvin", #"Unbekant", #"Pellicanus"

Comparison word clouds are useful for 2 authors and still acceptable for 3 authors.
With 4 or more, the visualization becomes cluttered, unclear, and no longer informative.

Here, with 4 authors, the result is already less informative.

library(quanteda)
library(quanteda.textplots)


# Read your CSV
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)

# If your CSV has a column named "word"
latin_stop_words <- latin_stop_words$word  # character vector

corp_dom_4 %>%
  quanteda::tokens(remove_punct = TRUE) %>%
  quanteda::tokens_remove(latin_stop_words) %>%
  quanteda::dfm() %>%
  quanteda::dfm_trim(min_termfreq = 20, verbose = FALSE) %>%
  quanteda::dfm_group(groups = corp_dom_4$Author) %>%
  quanteda.textplots::textplot_wordcloud(
    comparison = TRUE, 
    max_words = 60,
    max_size = 4,
    color = RColorBrewer::brewer.pal(8, "Set2")
  )

7 Frequency changes

  • Counting words per chapter, to compare the chapters.
  • Counting words across the whole book, to see the main themes.

7.1 Counting words by commentary

# extract number of words per chapter
Words <- chapter_texts %>%
  stringr::str_split(" ")  %>%
  lengths()

# inspect data
Words
##    Aretius_C_II.txt Bugenhagen_C_II.txt  Bullinger_C_II.txt    Cajetan_C_II.txt 
##                2971                1546                6292                1977 
##     Calvin_C_II.txt   Hyperius_C_II.txt  Lambertus_C_II.txt    Lefevre_C_II.txt 
##                5242                6466               12346                1878 
## Pellicanus_C_II.txt  Unbekannt_C_II.txt 
##                5448                4188
# extract number of matches per chapter
Matches <- chapter_texts %>%
  stringr::str_count("mulier")
# inspect the number of matches per chapter
Matches
##    Aretius_C_II.txt Bugenhagen_C_II.txt  Bullinger_C_II.txt    Cajetan_C_II.txt 
##                  21                  13                  25                  21 
##     Calvin_C_II.txt   Hyperius_C_II.txt  Lambertus_C_II.txt    Lefevre_C_II.txt 
##                  26                  64                  67                  30 
## Pellicanus_C_II.txt  Unbekannt_C_II.txt 
##                  22                  21

7.2 Create a table of results

tb <- data.frame(
  Commentaries = gsub("\\.txt$", "", names(chapter_texts)),  # remove .txt
  Matches, Words ) %>%
  dplyr::mutate(Frequency = round(Matches/Words*1000, 2))
View(tb)
DT::datatable(tb)
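As a worked example of the Frequency column: Aretius has 21 matches in 2,971 words, so round(21/2971 * 1000, 2) gives 7.07 occurrences of "mulier" per 1,000 words.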

7.3 Plot of term frequency comparison across commentaries

  • Based on the mulier matches

# create plot
ggplot(tb, aes(x = Commentaries, y = Frequency, group = 1)) + 
  geom_smooth(color = "purple") +
  geom_line(color = "darkgray") +         
  guides(color=guide_legend(override.aes=list(fill=NA))) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_y_continuous(name ="Relative Frequency (per 1,000 words)")+
  ggtitle("Term Frequency Comparison Across Commentaries")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

8 Dispersion plot

# tokenize the chapters
commentaries_corpus <- quanteda::tokens(chapter_texts)
# generate dispersion plots
quanteda.textplots::textplot_xray(kwic(commentaries_corpus, pattern = "mulier"),
              kwic(commentaries_corpus, pattern = "mediator"),
              kwic(commentaries_corpus, pattern = "silentio"),
              sort = TRUE)

library(quanteda)
library(quanteda.textplots)
library(dplyr)
library(ggplot2)

# 1. Create a corpus
commentaries_corpus <- corpus(chapter_texts)

# 2. Tokenize the corpus
commentaries_tokens <- tokens(commentaries_corpus, remove_punct = TRUE)

# 3. Generate KWIC objects on tokens
kwic_mulier <- kwic(commentaries_tokens, pattern = "mulier")
kwic_mediator <- kwic(commentaries_tokens, pattern = "mediator")
kwic_silentio <- kwic(commentaries_tokens, pattern = "silentio")

# 4. Combine KWIC results into a data frame
kwic_all <- bind_rows(
  as.data.frame(kwic_mulier) %>% mutate(term = "mulier"),
  as.data.frame(kwic_mediator) %>% mutate(term = "mediator"),
  as.data.frame(kwic_silentio) %>% mutate(term = "silentio")
)
  • Visualization with dots
ggplot(kwic_all, aes(x = from, y = docname, color = term)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c(
    "mulier" = "#52305C",
    "mediator" = "red",
    "silentio" = "#16ee0f"
  )) +
  labs(
    x = "Position in Commentary",
    y = "Commentary",
    title = "Term Dispersion per Commentary"
  ) +
  theme_minimal()

  • Visualization with rays

# Ensure docname is a factor with levels in the original order
kwic_all$docname <- factor(kwic_all$docname, levels = unique(kwic_all$docname))

# Create numeric positions for plotting
kwic_all$y_pos <- as.numeric(kwic_all$docname)

# Plot rays
ggplot(kwic_all, aes(x = from, xend = from, 
                     y = y_pos - 0.4, yend = y_pos + 0.4, 
                     color = term)) +
  geom_segment() +
    scale_color_manual(values = c(
    "mulier" = "#52305C",
    "mediator" = "red",
    "silentio" = "#16ee0f"
  )) +
  scale_y_continuous(breaks = 1:length(levels(kwic_all$docname)),
                     labels = levels(kwic_all$docname)) +
  labs(
    x = "Position in Commentary",
    y = "Commentary",
    title = "Term Dispersion per Commentary"
  ) +
  theme_minimal()

9 N-grams, Collocations, and Keyness

# create data frame
commentaries_bigrams <- data.frame( chapter_words[1:(length(chapter_words) - 1)], 
                       chapter_words[2:length(chapter_words)]) %>%
  dplyr::rename(Word1 = 1,
                Word2 = 2) %>%
  dplyr::mutate(Bigram = paste0(Word1, " ", Word2)) %>%
  dplyr::group_by(Bigram) %>%
  dplyr::summarise(Frequency = n()) %>%
  dplyr::arrange(-Frequency)
commentaries_bigrams
## # A tibble: 31,200 × 2
##    Bigram     Frequency
##    <chr>          <int>
##  1 sum et            94
##  2 pro omnis         91
##  3 non sum           88
##  4 ab deus           83
##  5 is sum            72
##  6 sum qui           70
##  7 is qui            67
##  8 omnis homo        63
##  9 sum in            62
## 10 deus et           60
## # ℹ 31,190 more rows
view(commentaries_bigrams)

10 Finding Collocations

commentaries_sentences <- chapter_texts %>%
  tolower() %>%
  paste0(collapse= " ") %>%
  stringr::str_split(fixed(".")) %>%
  unlist() %>%
  tm::removePunctuation() %>%
  stringr::str_squish()
view(commentaries_sentences)
tail(commentaries_sentences)
## [1] "sub dito quidem sum uir mulier et humiliter segero inter uir et in ecclesia agnoscatquo im becillitas et inconstantia ut se doceo munus in ecclesia non arrogeo uerus non ab uir contemno sum qui per generatio liber beatus facio deus munificentia et insignus munus illustro"                                                                          
## [2] "quis enim admirabilis in res natura genero quam inuterus femina non tantum corpus humanus sed et anima creo infundo qui generatio nutritio et educatio pariter et instructio offitium confero deus in mulier et in cunabulum pietas trado et pfectus committo mater ut et lac pasco"                                                                      
## [3] "so lidus cibus ab uir nutrio et in fides et sacer dogma perfectus audio et sequor"                                                                                                                                                                                                                                                                        
## [4] "curo igitur liber mater ut in is qui ab tenerus unguiculus ab se fides et dilectio deus imbibeo filius in is permaneo iugiter inculco fides et timor deus ut sanctus sum mos ab omnis contagio crimen se perpetuo caueo caro et sanguis non indulgens et deus mando qui is mater puer edoceo semper memoria cogito sanctus sum et permaneo corpus et spus"
## [5] "sobrietas in primus curo ut cibus et potus somnus otium lus lenocinium indulgentia sodalitus tu caueo incumbo sed potius inculco studeo mater filius qui deus gigno et proximus fides dilectio sanct immo iam pudicitia sobrietas"                                                                                                                        
## [6] ""
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)$word
# create a token object
commentaries_tokens <- tokens(commentaries_sentences, remove_punct = TRUE) %>%
  tokens_remove(latin_stop_words)
# extract collocations
commentaries_coll <- textstat_collocations(commentaries_tokens, size = 2, min_count = 15)
view(commentaries_coll)

11 Visualizing Collocation Networks

11.1 For all the commentaries

latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", 
                             stringsAsFactors = FALSE)$word

corpus_dfm <- commentaries_sentences %>%
  quanteda::tokens(remove_punct = TRUE) %>%      # 1. tokenize + remove punctuation
  quanteda::tokens_remove(latin_stop_words) %>%  # 2. remove stopwords
  quanteda::dfm() %>%                            # 3. create dfm
  quanteda::dfm_trim(min_termfreq = 10, 
                     verbose = FALSE)            # 4. trim dfm
head(corpus_dfm)
## Document-feature matrix of: 6 documents, 501 features (98.64% sparse) and 0 docvars.
##        features
## docs    caput fidelis officium publicus audio sanus doctrina verbum deus totus
##   text1     1       0        0        0     0     0        0      0    0     0
##   text2     0       1        1        1     1     1        1      1    1     1
##   text3     1       0        0        0     0     0        0      0    0     0
##   text4     0       0        0        0     0     0        0      0    0     0
##   text5     0       0        0        1     0     0        0      0    0     0
##   text6     1       0        0        0     0     0        0      0    0     0
## [ reached max_nfeat ... 491 more features ]
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "mulier"
# calculate co-occurrence statistics
coocs <- calculateCoocStatistics(coocTerm, corpus_dfm, measure="LOGLIK")
# inspect results
coocs[1:20]
##      silentio      permitto           uir         decet         doceo 
##      72.31360      69.81742      64.06423      57.92125      45.68653 
##     profiteor          adam         disco    auctoritas       habitus 
##      44.67287      41.21689      36.58135      35.71157      34.98062 
## praeuaricatio     subiectio        coetus        crinus       decipio 
##      29.06123      28.92389      26.69235      26.36221      26.08775 
##      castitas    praeceptum         maneo       amictus   verecundium 
##      23.32984      22.88483      22.59531      22.29790      22.17542
redux_dfm <- dfm_select(corpus_dfm, 
                        pattern = c(names(coocs)[1:20], "mulier"))
head(redux_dfm)
## Document-feature matrix of: 6 documents, 21 features (98.41% sparse) and 0 docvars.
##        features
## docs    coetus mulier auctoritas doceo maneo disco habitus amictus decet
##   text1      0      0          0     0     0     0       0       0     0
##   text2      1      0          0     0     0     0       0       0     0
##   text3      0      0          0     0     0     0       0       0     0
##   text4      0      0          0     0     0     0       0       0     0
##   text5      0      1          0     0     0     0       0       0     0
##   text6      0      0          0     0     0     0       0       0     0
##        features
## docs    silentio
##   text1        0
##   text2        0
##   text3        0
##   text4        0
##   text5        0
##   text6        0
## [ reached max_nfeat ... 11 more features ]
tag_fcm <- fcm(redux_dfm)
head(tag_fcm)
## Feature co-occurrence matrix of: 6 by 21 features.
##             features
## features     coetus mulier auctoritas doceo maneo disco habitus amictus decet
##   coetus          1     24          1    10     0     5       0       0     0
##   mulier          0     43         18    61    27    20      18      14    29
##   auctoritas      0      0          1    18     0     1       0       0     0
##   doceo           0      0          0    21     2     6       0       1     1
##   maneo           0      0          0     0     7     0       1       0     0
##   disco           0      0          0     0     0     0       1       0     0
##             features
## features     silentio
##   coetus            1
##   mulier           23
##   auctoritas        7
##   doceo            12
##   maneo             0
##   disco            12
## [ reached max_nfeat ... 11 more features ]
# generate network graph
textplot_network(tag_fcm, 
                 min_freq = 1, 
                 edge_alpha = 0.1, 
                 edge_size = 5,
                 edge_color = "purple",
                 vertex_labelsize = log(rowSums(tag_fcm))*2)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Unable to calculate text width/height (using zero)

11.1.1 For one commentary: here Lambertus

commentaries_sentences_lam <- lambertus %>%
  tolower() %>%
  paste0(collapse= " ") %>%
  stringr::str_split(fixed(".")) %>%
  unlist() %>%
  tm::removePunctuation() %>%
  stringr::str_squish()
view(commentaries_sentences_lam)
tail(commentaries_sentences_lam)
## [1] "ille caritas quoque gigno extra qui nihil ab mater ipse facio deus gratus possum quia si inuitus repugno et cogo potius officium facio qui ex caritas displiceo deus"           
## [2] "sanctificatio qui requiro ab mulier non sum tantum generalis omnis vitas actio ad deus voluntas conformatio"                                                                    
## [3] "thess"                                                                                                                                                                          
## [4] "versus sed magne sanct immo ia quidam corpus qui se vxor et mulier ab omnis turpitudo immunditia et lasciuia purus conseruo qui impudicitie oppono et alius nomen dico castitas"
## [5] "modestia luxui et sumptuo ille cultus de qui supra dico oppono"                                                                                                                 
## [6] ""
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)$word
corpus_lam_dfm <- commentaries_sentences_lam %>%
  quanteda::tokens(remove_punct = TRUE) %>%      # 1. tokenize + remove punctuation
  quanteda::tokens_remove(latin_stop_words) %>%  # 2. remove stopwords
  quanteda::dfm() %>%                            # 3. create dfm
  quanteda::dfm_trim(min_termfreq = 10, 
                     verbose = FALSE)            # 4. trim dfm
head(corpus_lam_dfm)
## Document-feature matrix of: 6 documents, 115 features (96.52% sparse) and 0 docvars.
##        features
## docs    caput facio prex gratia homo munus pars publicus genus deus
##   text1     1     0    0      0    0     0    0        0     0    0
##   text2     0     0    0      0    0     0    0        0     0    0
##   text3     0     1    1      1    1     0    0        0     0    0
##   text4     0     0    0      0    1     1    1        1     1    1
##   text5     0     0    0      0    0     1    1        0     0    0
##   text6     0     0    1      0    0     0    0        0     0    1
## [ reached max_nfeat ... 105 more features ]
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "mulier"
# calculate co-occurrence statistics (on the Lambertus dfm this time)
coocs <- calculateCoocStatistics(coocTerm, corpus_lam_dfm, measure="LOGLIK")
# inspect results
coocs[1:20]
redux_dfm <- dfm_select(corpus_lam_dfm, 
                        pattern = c(names(coocs)[1:20], "mulier"))
head(redux_dfm)
## Document-feature matrix of: 6 documents, 4 features (100.00% sparse) and 0 docvars.
##        features
## docs    coetus doceo praeceptum mulier
##   text1      0     0          0      0
##   text2      0     0          0      0
##   text3      0     0          0      0
##   text4      0     0          0      0
##   text5      0     0          0      0
##   text6      0     0          0      0
tag_lam_fcm <- fcm(redux_dfm)
head(tag_lam_fcm)
# generate network graph
textplot_network(tag_lam_fcm, 
                 min_freq = 1, 
                 edge_alpha = 0.1, 
                 edge_size = 5,
                 edge_color = "purple",
                 vertex_labelsize = log(rowSums(tag_lam_fcm))*2)

12 Test

  • Network visualization keeping track of the source corpus
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
cajetan  <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
calvin  <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
 paste0(collapse = " ")
hyperius  <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
 paste0(collapse = " ")
lambertus  <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
lefevre  <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
pellican  <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
aretius <- clean_text(aretius)
bugenhagen <- clean_text(bugenhagen)
bullinger <- clean_text(bullinger)
cajetan <- clean_text(cajetan)
calvin <- clean_text(calvin)
lefevre <- clean_text(lefevre)
pellican <- clean_text(pellican)
hyperius <- clean_text(hyperius)
lambertus <- clean_text(lambertus)
unbekannt <- clean_text(unbekannt)
library(quanteda)

# Combine into a named vector
texts <- c(
  Aretius = aretius,
  Bugenhagen = bugenhagen,
  Bullinger = bullinger,
  Cajetan = cajetan,
  Calvin = calvin,
  Hyperius = hyperius,
  Lambertus = lambertus,
  Lefevre = lefevre,
  Pellicanus = pellican,
  Unbekannt = unbekannt
)

# Create corpus
corpus_all <- corpus(texts)

The following script works:

library(quanteda)
library(dplyr)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:class':
## 
##     knn
## The following object is masked from 'package:quanteda.textplots':
## 
##     as.igraph
## The following object is masked from 'package:flextable':
## 
##     compose
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Tokenize
toks <- tokens(corpus_all, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower()

# Build FCMs for each document individually
fcm_list <- lapply(seq_along(corpus_all), function(i) {
  fcm(tokens(corpus_all[i], remove_punct = TRUE) %>% tokens_tolower())
})

names(fcm_list) <- names(corpus_all)

# Extract co-occurrences with "mulier"
cooc_list <- lapply(names(fcm_list), function(docname) {
  fcm_doc <- fcm_list[[docname]]
  
  # Skip if "mulier" is not present
  if (!"mulier" %in% featnames(fcm_doc)) return(NULL)
  
  # Convert FCM row to numeric vector with proper names
  coocs <- as.numeric(fcm_doc["mulier", ])
  names(coocs) <- featnames(fcm_doc)
  
  # Keep only positive co-occurrences, exclude "mulier" itself
  coocs <- coocs[coocs > 0 & names(coocs) != "mulier"]
  
  # Skip if nothing left
  if (length(coocs) == 0) return(NULL)
  
  # Build a clean dataframe
  data.frame(
    term = names(coocs),
    cooc = as.numeric(coocs),
    document = docname,
    stringsAsFactors = FALSE
  )
})

# Combine all into a single dataframe
cooc_df <- bind_rows(cooc_list)

# Check the result
head(cooc_df)
##          term cooc document
## 1 tamenfortus   21  Aretius
## 2    ethnicus   42  Aretius
## 3         mos   84  Aretius
## 4       parum   21  Aretius
## 5       decor   84  Aretius
## 6          se  189  Aretius
library(igraph)

# Create igraph object
g <- graph_from_data_frame(cooc_df, directed = FALSE)

# Assign a color to each document
doc_colors <- RColorBrewer::brewer.pal(n = length(unique(cooc_df$document)), name = "Set3")
names(doc_colors) <- unique(cooc_df$document)
V(g)$color <- doc_colors[cooc_df$document[match(V(g)$name, cooc_df$term)]]

# Plot the network
plot(g, vertex.label.cex = 0.8, vertex.size = 15, edge.width = 1,
     vertex.label.color = "black")
## Warning: vertex attribute color contains NAs. Replacing with default value 1
legend("topright", legend = names(doc_colors), col = doc_colors, pch = 19, pt.cex = 1.5)

library(igraph)
library(RColorBrewer)

# Create igraph object
g <- graph_from_data_frame(cooc_df, directed = FALSE)

# Assign colors by document
doc_colors <- RColorBrewer::brewer.pal(n = length(unique(cooc_df$document)), name = "Set3")
names(doc_colors) <- unique(cooc_df$document)

# Create a vector of colors for all vertices
V(g)$color <- ifelse(
  V(g)$name == "mulier",       # central node
  "gold",                       # color for 'mulier'
  doc_colors[ cooc_df$document[match(V(g)$name, cooc_df$term)] ] # other nodes
)

# Replace any NAs with a default color
V(g)$color[is.na(V(g)$color)] <- "lightgrey"

# Plot
plot(g, vertex.label.cex = 0.8, vertex.size = 15, edge.width = 1,
     vertex.label.color = "black")

# Add legend
legend("topright", legend = c("mulier", names(doc_colors)), 
       col = c("gold", doc_colors), pch = 19, pt.cex = 1.5)

13 Keyness

Comparative analysis by author

aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
cajetan  <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
calvin  <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
 paste0(collapse = " ")
hyperius  <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
 paste0(collapse = " ")
lambertus  <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
lefevre  <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
pellican  <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
  paste0(collapse = " ")
corp_dom <- quanteda::corpus(c(aretius, bugenhagen, bullinger,  hyperius, lambertus, pellican, calvin, lefevre, cajetan, unbekannt  )) 
attr(corp_dom, "docvars")$Author = c("Aretius", "Bugenhagen", "Bullinger", "Hyperius", "Lambertus", "Pellicanus", "Calvin", "Lefevre", "Cajetan", "Unbekannt")
dfm_authors <- corp_dom %>%
  quanteda::tokens(remove_punct = TRUE) %>%
   quanteda::tokens_remove(latin_stop_words) %>%
  quanteda::dfm() %>%
  quanteda::dfm_weight(scheme = "prop")
head (dfm_authors)
## Document-feature matrix of: 6 documents, 5,251 features (76.52% sparse) and 1 docvar.
##        features
## docs          caput      fidelis    officium     publicus       coetis
##   text1 0.005625000 0.0025000000 0.001250000 0.0031250000 0.0025000000
##   text2 0.003856041 0            0           0.0025706941 0           
##   text3 0.005853088 0.0008779631 0.001170618 0.0014632719 0.0005853088
##   text4 0.004418262 0.0005891016 0.002061856 0.0020618557 0           
##   text5 0.006852590 0.0006374502 0.003027888 0.0039840637 0           
##   text6 0.001288245 0.0035426731 0.001932367 0.0003220612 0           
##        features
## docs           audio        sanus    doctrina       verbum       deus
##   text1 0.0012500000 0.0031250000 0.003750000 0.0012500000 0.02375000
##   text2 0.0025706941 0            0.002570694 0            0.04241645
##   text3 0.0017559263 0.0020485806 0.003804507 0            0.02955809
##   text4 0.0008836524 0.0002945508 0.005007364 0.0005891016 0.02562592
##   text5 0.0006374502 0.0006374502 0.001752988 0.0035059761 0.02836653
##   text6 0.0022544283 0            0.001288245 0            0.02222222
## [ reached max_nfeat ... 5,241 more features ]
tail (dfm_authors)
## Document-feature matrix of: 6 documents, 5,251 features (78.00% sparse) and 1 docvar.
##         features
## docs           caput      fidelis    officium     publicus coetis        audio
##   text5  0.006852590 0.0006374502 0.003027888 0.0039840637      0 0.0006374502
##   text6  0.001288245 0.0035426731 0.001932367 0.0003220612      0 0.0022544283
##   text7  0.001812908 0.0010877447 0.003625816 0.0021754895      0 0.0003625816
##   text8  0.004179728 0            0.001044932 0.0083594566      0 0.0031347962
##   text9  0.001011122 0            0.006066734 0                 0 0.0010111223
##   text10 0.001273345 0.0021222411 0.001273345 0.0004244482      0 0.0025466893
##         features
## docs            sanus    doctrina      verbum       deus
##   text5  0.0006374502 0.001752988 0.003505976 0.02836653
##   text6  0            0.001288245 0           0.02222222
##   text7  0.0007251632 0.001450326 0.001812908 0.04242204
##   text8  0            0.003134796 0.002089864 0.02403344
##   text9  0            0.002022245 0           0.02325581
##   text10 0            0.002122241 0           0.03056027
## [ reached max_nfeat ... 5,241 more features ]
# Calculate relative frequency by author
freq_weight <- quanteda.textstats::textstat_frequency(dfm_authors, n = 10,
                                            groups = dfm_authors$Author)
view(freq_weight)
ggplot(freq_weight, aes(nrow(freq_weight):1, frequency)) +
     geom_point() +
     facet_wrap(~ group, scales = "free") +
     coord_flip() +
     scale_x_continuous(breaks = nrow(freq_weight):1,
                        labels = freq_weight$feature) +
     labs(x = NULL, y = "Relative frequency")
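The plot above compares relative frequencies; for keyness proper, here is a minimal sketch with quanteda.textstats (an assumed follow-up, run on raw counts rather than on the proportion-weighted dfm):

# Sketch: keyness of one author (target) against all the others (reference).
dfm_counts <- corp_dom %>%
  quanteda::tokens(remove_punct = TRUE) %>%
  quanteda::tokens_remove(latin_stop_words) %>%
  quanteda::dfm() %>%
  quanteda::dfm_group(groups = corp_dom$Author)   # keep raw counts
key <- quanteda.textstats::textstat_keyness(dfm_counts, target = "Lambertus")
head(key, 10)
quanteda.textplots::textplot_keyness(key)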