Objective of the network. Description of the data. Method employed.
@manual{uqslc2025ladal,
author = {Schweinberger, Martin},
title = {The Language Technology and Data Analysis Laboratory (LADAL)},
note = {https://ladal.edu.au},
year = {2025},
organization = {The University of Queensland, School of Languages and Cultures},
address = {Brisbane},
edition = {2025.04.01}
}
Tutorial one: https://slcladal.netlify.app/textanalysis#1_Concordancing
Open R in the right Python environment (for use in visual_code).
#install.packages("reticulate") # if not already installed
#library(reticulate)
#use_python("/home/floriane/Documents/visual_code/.venvocc/bin/python", required = TRUE)
#py_config() # check Python version and path
Verification:
## [1] "Python 3.12.3"
## [1] "/usr/bin/python"
## [1] "/usr/bin/R"
Specify the folder we are going to work in.
Choose the server hosting the CRAN mirror.
# Set CRAN mirror interactively to ETH Zurich (mirror 63)
# chooseCRANmirror() and choose 63 for Switzerland, e.g.
options(repos = c(CRAN = "https://stat.ethz.ch/CRAN/"))
## Installing packages into '/home/floriane/R/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
# install klippy for copy-to-clipboard button in code chunks
remotes::install_github("rlesur/klippy")
## Skipping install of 'klippy' from a github remote, the SHA1 (378c247f) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'httpgd' from a github remote, the SHA1 (dd6ed3a6) has not changed since last install.
## Use `force = TRUE` to force installation
# set options
options(stringsAsFactors = F)
options(scipen = 999)
options(max.print=1000)
# load packages
library(DT)
library(tidyverse)
library(flextable)
library(quanteda)
library(tm)
library(tidytext)
library(wordcloud2)
library(ggplot2)
library(scales)
library(quanteda.textstats)
library(quanteda.textplots)
library(tidyr)
library(cluster)
library(class)
library(NLP)
#library(openNLP)
#library(openNLPdata)
library(pacman)
pacman::p_load_gh("trinker/entity")
## rJava (NA -> 1.0-11 ) [CRAN]
## openNLPdata (NA -> 1.5.3-5) [CRAN]
## openNLP (NA -> 0.2-7 ) [CRAN]
## Warning in i.p(...): installation of package 'rJava' had non-zero exit status
## Warning in i.p(...): installation of package 'openNLPdata' had non-zero exit status
## Warning in i.p(...): installation of package 'openNLP' had non-zero exit status
## ── R CMD build ─────────────────────────────────────────────────────────────────
## * checking for file ‘/tmp/RtmprZiivI/remotes3acda4d4eb2d5/trinker-entity-5549d30/DESCRIPTION’ ... OK
## * preparing ‘entity’:
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## * building ‘entity_0.1.0.tar.gz’
## Warning in i.p(...): installation of package
## '/tmp/RtmprZiivI/file3acda267aa131/entity_0.1.0.tar.gz' had non-zero exit status
## Warning in p_install_gh(package, dependencies, ...): The following may have incorrect capitalization specification:
##
## entity
## Warning in pacman::p_load_gh("trinker/entity"):
## Failed to install/load:
## trinker/entity
Make one list with all the chapters to compare, stored in chapter_texts, using names formatted as Author_C_II.txt.
library(tidyverse)
library(stringr)
# Path to folder that contains the chapter files
path <- "Chapter_II_cl"
# List all .txt files
files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)
# Read all files into a named vector chapter_text
chapter_texts <- files %>%
set_names(basename(files) %>% # keep file names as base for names of each data set
str_replace("^([^_]+)_.*?(C_[IVXLCDM]+).*\\.txt$", "\\1_\\2.txt")) %>% # regex reconstituting the file name for the lemmatized files
map_chr(readr::read_file) # read each file as one string
# Get the first 5 chapters
first5 <- chapter_texts[1:5]
# Loop through each and print nicely
walk2(names(first5), first5, ~ {
cat("=== File:", .x, "===\n")
cat(str_sub(.y, 1, 500), "\n\n") # show first 500 chars
})
## === File: Aretius_all_C_II_cl_lem.txt ===
## caput ii . d uo sum fidelis officium in publicus coetis audio sanus doctrina ex verbum deus et cum totus coetus prex ad dominus fundo . de prior iam actus sum caput eo primo . nunc ad alter descendo admoneo timotheus vt suus auctor sum in ephesinus ecclesia vt in prex seruo pietas congruo praesertim quum ecclesia adhuc omnis sub infidelis ago magistra . et quia in coetis publicus quoque oro mulier adhuc tamenfortus ethnicus mos parum decor se gero hic praescribo suus quidam decus qui is omnino s
##
## === File: Bugenhagen_all_C_II_cl_lem.txt ===
## caput ii . o ratio sum cor desiderium pro res ab deus impetro . hic desiderium si uerus sum numquam cesso donec ab deus qui desydero accipio hic sum qui christus dico oportet semper oro et non deficio . qui non desydero nihil oro ut magne multiloquis aer compleo qui uero desydero etiam ab uerbum quandoque non abstineo . hinc facio ut generalis uocabulum oratio uoco quicumque cum deus collocutio etiam cum laudo in psalmus et cantus cis spiritualis is sum qui facio incordus . nam ut numquam cesso
##
## === File: Bullinger_all_C_II_cl_lem.txt ===
## cap. ii . rum pelago sum ueritas canonicus fides purus et charitas sincer . in hic qui persto et purus simplicitas ad portus tendo salus saluus portus attingo felicitas aeternus porro qui flatus arrogantia et contentio uela do abreptique malus cupiditas fluctus ab uia rectus recedo et fides lucidus cynosura negligo impingo in perfidium scopulus atque intereo . quis uero sum satana trado expono sum corinthius caput finis hic traditio sum ut pudor probroquus corricpeo disco ab impius parumques chr
##
## === File: Cajetan_all_C_II_cl_lem.txt ===
## caput . ii . o bsecrum . pro . adhortor igitur primus omnis . tracto necessitas quare reliquor timotheus ephesus incipio prosequor propono . unde tamquam ad propono exsequor redio illatiue dico adhortor igitur . propono autem quod finis praecaepo sum dilectio de cor purus et propterea ab exsecutio dilectio erga omnis inchho . primus antem et communis dilectio actus erga omnis sum oratio . et ideo tracto ille parriculus dilectio horto ante omnis oro pro omnis . facio obsecratio . pro . preces. pe
##
## === File: Calvin_all_C_II_cl_lem.txt ===
## dhoreo igitur vt ante omnis facio deprecatio obsecratio interpellatio gratia actio pro omnis homo pro rex et omnis in eminentia constituo vt placidus et quiesco vitas dego cum omnis pietas et honestas .
## hic enim bonum et accipio coram saluator noster deus qui omnis homo vum saluus facio et ad agnitio veritas venio . adhortor igitur . hic pietas exercitium exerceo nos in sincer cultus deus atque timor foueo bonus conscientia de qui dico . quare non abo res illatiuus particula vto quia ex superior
# Convert named vector into a tibble
chapters_df <- tibble(
file = names(chapter_texts), # filenames
text = chapter_texts # text content
)
# View the first few rows
head(chapters_df, 3)
## # A tibble: 3 × 2
## file text
## <chr> <chr>
## 1 Aretius_C_II.txt "caput ii . d uo sum fidelis officium in publicus coetis …
## 2 Bugenhagen_C_II.txt "caput ii . o ratio sum cor desiderium pro res ab deus im…
## 3 Bullinger_C_II.txt "cap. ii . rum pelago sum ueritas canonicus fides purus e…
library(DT)
library(stringr)
library(dplyr)
DT::datatable(
chapters_df %>%
mutate(
# Show first 150 characters, add ellipsis if truncated, tooltip contains full text
text = paste0(
str_sub(text, 1, 150),
ifelse(str_length(text) > 150, "…", ""),
"<span title='", text, "'></span>"
)
),
escape = FALSE, # allow HTML for tooltip
rownames = FALSE, # hide row numbers
options = list(
pageLength = 10, # number of rows per page
scrollX = TRUE, # allow horizontal scrolling
searching = FALSE # remove search box
)
)
Tokenizing creates a structured, searchable version of your text where each word is a token with a position, which can then be used for analyses like KWIC (Key Word In Context).
Tokenize the character vector chapter_texts with quanteda to create a new object chapter_tokens, where each element (i.e., word) has an explicit position: for example, chapter_tokens[[1]][3] is the 3rd word in the first chapter.
kwic(chapter_tokens, "mulier") automatically finds all occurrences of the word and provides explicit positions (from, to) so you can extract the surrounding context.
The kwic function lets you choose the length of the context (number of words before and after the selected occurrence) with window = number of context tokens, and the target word with pattern = "desired occurrence".
library(quanteda)
library(dplyr)
# tokenize your character vector
chapter_tokens <- tokens(chapter_texts)
# create KWIC
kwic_m <- kwic(chapter_tokens,
pattern = "mulier",
window = 6) %>%
as.data.frame() %>%
select(-to, -from, -pattern)
Add .* after the stem: pattern = "mulier.*" will match all words that start with "mulier".
# create kwic
kwic_ml <- kwic(chapter_tokens,
pattern = "mulier.*",
window = 6,
valuetype = "regex") %>%
# convert into a data frame
as.data.frame() %>%
# remove superfluous columns
dplyr::select(-to, -from, -pattern)
To analyze word frequencies, it is necessary to lemmatize the text.
The lemmatization script is executed in Python using
the CLTK library. A notebook can be found here:
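The notebook link is missing above. Purely as an untested sketch, such a pipeline could also be driven from R via reticulate, assuming CLTK 1.x; the sample text and all object names below are illustrative assumptions, not the project's actual script.
# hedged sketch: assumes the cltk package is installed in the active Python environment
library(reticulate)
cltk <- import("cltk")
nlp <- cltk$NLP(language = "lat") # CLTK's Latin pipeline
doc <- nlp$analyze(text = "mulieres in ecclesia taceant") # hypothetical sample input
doc$lemmata # lemmatized tokens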
Use the same script as in the Concordancing section, but change the
folder path.
Since the same variable names are used, the previous values will be
overwritten for the new dataset.
library(tidyverse)
library(stringr)
# Path to folder that contains the chapter files
path <- "Chapter_II_lem"
# List all .txt files
files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)
# Read all files into a named vector chapter_text
chapter_texts <- files %>%
set_names(basename(files) %>% # keep file names as base for names of each data set
str_replace("^([^_]+)_.*?(C_[IVXLCDM]+).*\\.txt$", "\\1_\\2.txt")) %>% # regex reconstituting the file name for the lemmatized files
map_chr(readr::read_file) # read each file as one string
Store the individual words of chapter_texts in a vector chapter_words:
chapter_words <- chapter_texts %>%
# convert everything to lower case
tolower() %>%
# remove non-word characters
str_replace_all("[^[:alpha:][:space:]]*", "") %>%
tm::removePunctuation() %>%
stringr::str_squish() %>%
stringr::str_split(" ") %>% # Splits each string into individual words using space " " as the separator.
unlist()
Verify that all the data are processed.
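The inspection code itself is not shown; presumably something like:
# sketch: inspect the start and end of the word vector
head(chapter_words)
tail(chapter_words, 20)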
## Aretius_C_II.txt1 Aretius_C_II.txt2 Aretius_C_II.txt3 Aretius_C_II.txt4
## "caput" "ii" "duo" "sum"
## Aretius_C_II.txt5 Aretius_C_II.txt6
## "fidelis" "officium"
## Unbekannt_C_II.txt4010 Unbekannt_C_II.txt4011 Unbekannt_C_II.txt4012
## "caueo" "incumbo" "sed"
## Unbekannt_C_II.txt4013 Unbekannt_C_II.txt4014 Unbekannt_C_II.txt4015
## "potius" "inculco" "studeo"
## Unbekannt_C_II.txt4016 Unbekannt_C_II.txt4017 Unbekannt_C_II.txt4018
## "mater" "filius" "qui"
## Unbekannt_C_II.txt4019 Unbekannt_C_II.txt4020 Unbekannt_C_II.txt4021
## "deus" "gigno" "et"
## Unbekannt_C_II.txt4022 Unbekannt_C_II.txt4023 Unbekannt_C_II.txt4024
## "proximus" "fides" "dilectio"
## Unbekannt_C_II.txt4025 Unbekannt_C_II.txt4026 Unbekannt_C_II.txt4027
## "sanct" "immo" "iam"
## Unbekannt_C_II.txt4028 Unbekannt_C_II.txt4029
## "pudicitia" "sobrietas"
Use the table function: it counts the number of
occurrences of each unique word in chapter_words.
Returns a table where the names are the words
and the values are their frequencies.
Convert the table to a data frame for easier manipulation and analysis.
Remove problematic entries before running the analysis, as this simplifies data cleaning. NB: use the frequency table created below to assess data cleanliness, and add any corrections that turn out to be necessary.
# remove or replace the problematic words
chapter_words <- gsub("^vers\\.$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^vers$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^verf$", "versus", chapter_words, ignore.case = TRUE)
chapter_words <- gsub("^cap$", "caput", chapter_words, ignore.case = TRUE)# create table
wfreq <- chapter_words %>%
table() %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
dplyr::rename(word = 1,
frequency = 2)
Use the stop word list (stopword_file.csv) to remove common or irrelevant words from your dataset; individual problem tokens can also be fixed with gsub().
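Note that the stop_words object used in the next chunk is not created in the code shown; a minimal sketch, assuming a CSV with a word column (the file path follows the later chunks):
# hypothetical loading step; path and column name are assumptions
stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)
# create table wo stopwords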
wfreq_wostop <- wfreq %>%
anti_join(stop_words, by = "word") %>%
dplyr::filter(word != "")library(ggplot2)
library(dplyr)
# starts the httpgd graphics device
# Create the plot and assign it to an object
p <- wfreq_wostop %>%
head(10) %>%
ggplot(aes(x = reorder(word, -frequency), y = frequency)) + # removed 'mean', not needed here
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Top 10 Non-Stop Words by Frequency\nin 1 Thimothy's Chapter 2 Commentarie",
x = "", y = "Frequency") +
theme(axis.text.x = element_text(angle = 45, size = 12, hjust = 1))
# visualise the plot
print(p)
chapter_words_lam <- readLines("Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
# convert everything to lower case
tolower() %>%
# remove non-word characters
str_replace_all("[^[:alpha:][:space:]]*", "") %>%
tm::removePunctuation() %>%
stringr::str_squish() %>%
stringr::str_split(" ") %>% # Splits each string into individual words using space " " as the separator.
unlist()
# remove or replace the problematic words
chapter_words_lam <- gsub("^vers\\.$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^vers$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^verf$", "versus", chapter_words_lam, ignore.case = TRUE)
chapter_words_lam <- gsub("^cap$", "caput", chapter_words_lam, ignore.case = TRUE)# create table
wfreq_lam <- chapter_words_lam %>%
table() %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
dplyr::rename(word = 1,
frequency = 2)
# create table wo stopwords
wfreq_wostop_lam <- wfreq_lam %>%
anti_join(stop_words, by = "word") %>%
dplyr::filter(word != "")library(ggplot2)
library(dplyr)
# starts the httpgd graphics device
# Create the plot and assign it to an object
p <- wfreq_wostop_lam %>%
head(10) %>%
ggplot(aes(x = reorder(word, -frequency), y = frequency)) + # removed 'mean', not needed here
geom_bar(stat = "identity", fill = "#528852") +
labs(title = "10 most frequent non-stop words in \nThimothy commentaries *on 1 chapter 2* from Lambert Danneau",
x = "", y = "Frequency") +
theme(axis.text.x = element_text(angle = 45, size = 12, hjust = 1))
# visualise the plot
print(p)
library(wordcloud2)
library(htmlwidgets)
library(webshot2)
wc <- wordcloud2(wfreq_wostop[1:100,],
shape = "diamond",
color = scales::viridis_pal()(8)
)
# save directly as PNG in one line
saveWidget(wc, "temp_wc.html", selfcontained = TRUE)
webshot("temp_wc.html", "wordcloud_corpus.png", vwidth = 800, vheight = 600)library(wordcloud2)
library(htmlwidgets)
library(webshot2)
wc <- wordcloud2(wfreq_wostop_lam[1:100,],
shape = "diamond",
color = scales::viridis_pal(option = "rocket")(8)
)
# save directly as PNG in one line
saveWidget(wc, "temp_wc.html", selfcontained = TRUE)
webshot("temp_wc.html", "wordcloud_lambertus.png", vwidth = 800, vheight = 600)This prepares the texts for comparison word clouds by author. - combines all lines into a single string.
clean_text <- function(text) {
# Patterns: optional whitespace before, word boundary after
patterns <- c("\\s*vers\\.\\b", "\\s*vers\\b", "\\s*verf\\b", "\\s*cap\\b","\\s*quanquam\\b")
replacements <- c("versus", "versus", "versus", "caput", " ")
for(i in seq_along(patterns)) {
text <- gsub(patterns[i], replacements[i], text, ignore.case = TRUE)
}
# Optional: remove extra spaces created by replacement
text <- gsub("\\s+", " ", text)
text <- trimws(text)
return(text)
}
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
#bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
# paste0(collapse = " ")
#bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#cajetan <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#calvin <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#hyperius <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
# paste0(collapse = " ")
lambertus <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
#lefevre <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#pellican <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
# paste0(collapse = " ")
corp_dom <- quanteda::corpus(c(aretius, lambertus)) #calvin, hyperius, lambertus, pellican, unbekannt
attr(corp_dom, "docvars")$Author = c("Aretius", "Lambertus") #, "Calvin", "Hyperius", "Lambertus", "Pellicanus", "Unbekannt"
Verify the data.
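The verification code is not shown; a quick check could be:
# sketch: basic sanity checks on the corpus
summary(corp_dom)
quanteda::docvars(corp_dom)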
Word cloud of 2 authors: creates a word cloud comparing the authors, showing the 60 most frequent words and scaling the largest word to size 6.
# open the plot in an external window if needed, e.g.:
#x11() # on Linux
# Read your CSV
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)
# If your CSV has a column named "word"
latin_stop_words <- latin_stop_words$word # character vector
# Then run your code
corp_dom %>%
quanteda::tokens(remove_punct = TRUE) %>%
quanteda::tokens_remove(latin_stop_words) %>%
quanteda::dfm() %>%
quanteda::dfm_trim(min_termfreq = 15, verbose = FALSE) %>%
quanteda::dfm_group(groups = corp_dom$Author) %>%
quanteda.textplots::textplot_wordcloud(
comparison = TRUE,
max_words = 60,
max_size = 6,
color = RColorBrewer::brewer.pal(8, "Dark2")
)
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
#bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#cajetan <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#calvin <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
hyperius <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
lambertus <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
#lefevre <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#pellican <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")
#unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
#paste0(collapse = " ")aretius <- clean_text(aretius)
bugenhagen <- clean_text(bugenhagen)
hyperius <- clean_text(hyperius)
lambertus <- clean_text(lambertus)
corp_dom_4 <- quanteda::corpus(c(aretius, bugenhagen, hyperius, lambertus)) #bullinger, #calvin, #pellican, #lefevre #cajetan #unbekannt
attr(corp_dom_4, "docvars")$Author = c("Aretius", "Bugenhagen", "Hyperius", "Lambertus") #"Bullinger", #"Cajetan", "Lefevre", #"Calvin", #"Unbekannt", #"Pellicanus"
Comparison word clouds are useful for 2 authors and still
acceptable for 3 authors.
With 4 or more, the visualization becomes cluttered, unclear,
and no longer informative.
Here, with 4 authors, the result is already less informative.
library(quanteda)
library(quanteda.textplots)
# Read your CSV
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)
# If your CSV has a column named "word"
latin_stop_words <- latin_stop_words$word # character vector
corp_dom_4 %>%
quanteda::tokens(remove_punct = TRUE) %>%
quanteda::tokens_remove(latin_stop_words) %>%
quanteda::dfm() %>%
quanteda::dfm_trim(min_termfreq = 20, verbose = FALSE) %>%
quanteda::dfm_group(groups = corp_dom_4$Author) %>%
quanteda.textplots::textplot_wordcloud(
comparison = TRUE,
max_words = 60,
max_size = 4,
color = RColorBrewer::brewer.pal(8, "Set2")
)
# extract number of words per chapter
Words <- chapter_texts %>%
stringr::str_split(" ") %>%
lengths()
# inspect data
Words
## Aretius_C_II.txt Bugenhagen_C_II.txt Bullinger_C_II.txt Cajetan_C_II.txt
## 2971 1546 6292 1977
## Calvin_C_II.txt Hyperius_C_II.txt Lambertus_C_II.txt Lefevre_C_II.txt
## 5242 6466 12346 1878
## Pellicanus_C_II.txt Unbekannt_C_II.txt
## 5448 4188
# extract number of matches per chapter
Matches <- chapter_texts %>%
stringr::str_count("mulier")
# inspect the number of matches per chapter
Matches
## Aretius_C_II.txt Bugenhagen_C_II.txt Bullinger_C_II.txt Cajetan_C_II.txt
## 21 13 25 21
## Calvin_C_II.txt Hyperius_C_II.txt Lambertus_C_II.txt Lefevre_C_II.txt
## 26 64 67 30
## Pellicanus_C_II.txt Unbekannt_C_II.txt
## 22 21
Plot the relative frequency of "mulier" matches (per 1,000 words) across the commentaries.
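The tb object plotted below is not constructed in the code shown; a minimal sketch, assuming relative frequency per 1,000 words (the name tb matches the plot call):
# sketch: combine the per-chapter counts into a plotting table
tb <- dplyr::tibble(
Commentaries = names(chapter_texts),
Frequency = Matches / Words * 1000
)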
# create plot
ggplot(tb, aes(x = Commentaries, y = Frequency, group = 1)) +
geom_smooth(color = "purple") +
geom_line(color = "darkgray") +
guides(color=guide_legend(override.aes=list(fill=NA))) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(name ="Relative Frequency (per 1,000 words)")+
ggtitle("Term Frequency Comparison Across Commentaries")## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# generate corpus from chapters
commentaries_corpus <- quanteda::tokens(chapter_texts)
# generate dispersion plots
quanteda.textplots::textplot_xray(kwic(commentaries_corpus, pattern = "mulier"),
kwic(commentaries_corpus, pattern = "mediator"),
kwic(commentaries_corpus, pattern = "silentio"),
sort = T)
library(quanteda)
library(quanteda.textplots)
library(dplyr)
library(ggplot2)
# 1. Create a corpus
commentaries_corpus <- corpus(chapter_texts)
# 2. Tokenize the corpus
commentaries_tokens <- tokens(commentaries_corpus, remove_punct = TRUE)
# 3. Generate KWIC objects on tokens
kwic_mulier <- kwic(commentaries_tokens, pattern = "mulier")
kwic_mediator <- kwic(commentaries_tokens, pattern = "mediator")
kwic_silentio <- kwic(commentaries_tokens, pattern = "silentio")
# 4. Combine KWIC results into a data frame
kwic_all <- bind_rows(
as.data.frame(kwic_mulier) %>% mutate(term = "mulier"),
as.data.frame(kwic_mediator) %>% mutate(term = "mediator"),
as.data.frame(kwic_silentio) %>% mutate(term = "silentio")
)
ggplot(kwic_all, aes(x = from, y = docname, color = term)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c(
"mulier" = "#52305C",
"mediator" = "red",
"silentio" = "#16ee0f"
)) +
labs(
x = "Position in Commentary",
y = "Commentary",
title = "Term Dispersion per Commentary"
) +
theme_minimal()
Visualization with rays (one vertical segment per match):
# Ensure docname is a factor with levels in the original order
kwic_all$docname <- factor(kwic_all$docname, levels = unique(kwic_all$docname))
# Create numeric positions for plotting
kwic_all$y_pos <- as.numeric(kwic_all$docname)
# Plot rays
ggplot(kwic_all, aes(x = from, xend = from,
y = y_pos - 0.4, yend = y_pos + 0.4,
color = term)) +
geom_segment() +
scale_color_manual(values = c(
"mulier" = "#52305C",
"mediator" = "red",
"silentio" = "#16ee0f"
)) +
scale_y_continuous(breaks = 1:length(levels(kwic_all$docname)),
labels = levels(kwic_all$docname)) +
labs(
x = "Position in Commentary",
y = "Commentary",
title = "Term Dispersion per Commentary"
) +
theme_minimal()
# create data frame
commentaries_bigrams <- data.frame(chapter_words[1:(length(chapter_words) - 1)],
chapter_words[2:length(chapter_words)]) %>%
dplyr::rename(Word1 = 1,
Word2 = 2) %>%
dplyr::mutate(Bigram = paste0(Word1, " ", Word2)) %>%
dplyr::group_by(Bigram) %>%
dplyr::summarise(Frequency = n()) %>%
dplyr::arrange(-Frequency)
## # A tibble: 31,200 × 2
## Bigram Frequency
## <chr> <int>
## 1 sum et 94
## 2 pro omnis 91
## 3 non sum 88
## 4 ab deus 83
## 5 is sum 72
## 6 sum qui 70
## 7 is qui 67
## 8 omnis homo 63
## 9 sum in 62
## 10 deus et 60
## # ℹ 31,190 more rows
commentaries_sentences <- chapter_texts %>%
tolower() %>%
paste0(collapse= " ") %>%
stringr::str_split(fixed(".")) %>%
unlist() %>%
tm::removePunctuation() %>%
stringr::str_squish()## [1] "sub dito quidem sum uir mulier et humiliter segero inter uir et in ecclesia agnoscatquo im becillitas et inconstantia ut se doceo munus in ecclesia non arrogeo uerus non ab uir contemno sum qui per generatio liber beatus facio deus munificentia et insignus munus illustro"
## [2] "quis enim admirabilis in res natura genero quam inuterus femina non tantum corpus humanus sed et anima creo infundo qui generatio nutritio et educatio pariter et instructio offitium confero deus in mulier et in cunabulum pietas trado et pfectus committo mater ut et lac pasco"
## [3] "so lidus cibus ab uir nutrio et in fides et sacer dogma perfectus audio et sequor"
## [4] "curo igitur liber mater ut in is qui ab tenerus unguiculus ab se fides et dilectio deus imbibeo filius in is permaneo iugiter inculco fides et timor deus ut sanctus sum mos ab omnis contagio crimen se perpetuo caueo caro et sanguis non indulgens et deus mando qui is mater puer edoceo semper memoria cogito sanctus sum et permaneo corpus et spus"
## [5] "sobrietas in primus curo ut cibus et potus somnus otium lus lenocinium indulgentia sodalitus tu caueo incumbo sed potius inculco studeo mater filius qui deus gigno et proximus fides dilectio sanct immo iam pudicitia sobrietas"
## [6] ""
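The sentences above were presumably printed by inspecting the first elements of the vector, e.g.:
# sketch: show the first sentences (not in the code shown)
head(commentaries_sentences)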
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv",
stringsAsFactors = FALSE)$word
corpus_dfm <- commentaries_sentences %>%
quanteda::tokens(remove_punct = TRUE) %>% # 1. tokenize + remove punctuation
quanteda::tokens_remove(latin_stop_words) %>% # 2. remove stopwords
quanteda::dfm() %>% # 3. create dfm
quanteda::dfm_trim(min_termfreq = 10,
verbose = FALSE) # 4. trim dfm
## Document-feature matrix of: 6 documents, 501 features (98.64% sparse) and 0 docvars.
## features
## docs caput fidelis officium publicus audio sanus doctrina verbum deus totus
## text1 1 0 0 0 0 0 0 0 0 0
## text2 0 1 1 1 1 1 1 1 1 1
## text3 1 0 0 0 0 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 0 0 0 1 0 0 0 0 0 0
## text6 1 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 491 more features ]
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "mulier"
# calculate co-occurrence statistics
coocs <- calculateCoocStatistics(coocTerm, corpus_dfm, measure="LOGLIK")
# inspect results
coocs[1:20]
## silentio permitto uir decet doceo
## 72.31360 69.81742 64.06423 57.92125 45.68653
## profiteor adam disco auctoritas habitus
## 44.67287 41.21689 36.58135 35.71157 34.98062
## praeuaricatio subiectio coetus crinus decipio
## 29.06123 28.92389 26.69235 26.36221 26.08775
## castitas praeceptum maneo amictus verecundium
## 23.32984 22.88483 22.59531 22.29790 22.17542
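The two outputs below (a reduced dfm, then a feature co-occurrence matrix) imply a step that is not shown; a minimal sketch following the LADAL tutorial pattern (redux_dfm is an assumed name; tag_fcm is the object used by the network plot further down):
# sketch: keep only the top co-occurring terms plus the node word
redux_dfm <- quanteda::dfm_select(corpus_dfm, pattern = c(names(coocs)[1:20], "mulier"))
# compute the feature co-occurrence matrix on the reduced dfm
tag_fcm <- quanteda::fcm(redux_dfm)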
## Document-feature matrix of: 6 documents, 21 features (98.41% sparse) and 0 docvars.
## features
## docs coetus mulier auctoritas doceo maneo disco habitus amictus decet
## text1 0 0 0 0 0 0 0 0 0
## text2 1 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0
## text5 0 1 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0
## features
## docs silentio
## text1 0
## text2 0
## text3 0
## text4 0
## text5 0
## text6 0
## [ reached max_nfeat ... 11 more features ]
## Feature co-occurrence matrix of: 6 by 21 features.
## features
## features coetus mulier auctoritas doceo maneo disco habitus amictus decet
## coetus 1 24 1 10 0 5 0 0 0
## mulier 0 43 18 61 27 20 18 14 29
## auctoritas 0 0 1 18 0 1 0 0 0
## doceo 0 0 0 21 2 6 0 1 1
## maneo 0 0 0 0 7 0 1 0 0
## disco 0 0 0 0 0 0 1 0 0
## features
## features silentio
## coetus 1
## mulier 23
## auctoritas 7
## doceo 12
## maneo 0
## disco 12
## [ reached max_nfeat ... 11 more features ]
# generate network graph
textplot_network(tag_fcm,
min_freq = 1,
edge_alpha = 0.1,
edge_size = 5,
edge_color = "purple",
vertex_labelsize = log(rowSums(tag_fcm))*2)## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Unable to calculate text width/height (using zero)
commentaries_sentences_lam <- lambertus %>%
tolower() %>%
paste0(collapse= " ") %>%
stringr::str_split(fixed(".")) %>%
unlist() %>%
tm::removePunctuation() %>%
stringr::str_squish()## [1] "ille caritas quoque gigno extra qui nihil ab mater ipse facio deus gratus possum quia si inuitus repugno et cogo potius officium facio qui ex caritas displiceo deus"
## [2] "sanctificatio qui requiro ab mulier non sum tantum generalis omnis vitas actio ad deus voluntas conformatio"
## [3] "thess"
## [4] "versus sed magne sanct immo ia quidam corpus qui se vxor et mulier ab omnis turpitudo immunditia et lasciuia purus conseruo qui impudicitie oppono et alius nomen dico castitas"
## [5] "modestia luxui et sumptuo ille cultus de qui supra dico oppono"
## [6] ""
latin_stop_words <- read.csv("stopwords/latin_stop_word.csv", stringsAsFactors = FALSE)$word
corpus_lam_dfm <- commentaries_sentences_lam %>%
quanteda::tokens(remove_punct = TRUE) %>% # 1. tokenize + remove punctuation
quanteda::tokens_remove(latin_stop_words) %>% # 2. remove stopwords
quanteda::dfm() %>% # 3. create dfm
quanteda::dfm_trim(min_termfreq = 10,
verbose = FALSE) # 4. trim dfm
## Document-feature matrix of: 6 documents, 115 features (96.52% sparse) and 0 docvars.
## features
## docs caput facio prex gratia homo munus pars publicus genus deus
## text1 1 0 0 0 0 0 0 0 0 0
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 1 1 1 1 0 0 0 0 0
## text4 0 0 0 0 1 1 1 1 1 1
## text5 0 0 0 0 0 1 1 0 0 0
## text6 0 0 1 0 0 0 0 0 0 1
## [ reached max_nfeat ... 105 more features ]
# load function for co-occurrence calculation
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "mulier"
# calculate co-occurrence statistics
coocs <- calculateCoocStatistics(coocTerm, corpus_dfm, measure="LOGLIK") # NB: this reuses corpus_dfm from the combined corpus; for the Lambertus-only analysis, corpus_lam_dfm is presumably intended
# inspect results
coocs[1:20]
## silentio permitto uir decet doceo
## 72.31360 69.81742 64.06423 57.92125 45.68653
## profiteor adam disco auctoritas habitus
## 44.67287 41.21689 36.58135 35.71157 34.98062
## praeuaricatio subiectio coetus crinus decipio
## 29.06123 28.92389 26.69235 26.36221 26.08775
## castitas praeceptum maneo amictus verecundium
## 23.32984 22.88483 22.59531 22.29790 22.17542
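As above, the step that builds the reduced dfm and the tag_lam_fcm used in the network plot below is not shown; the same sketch under the same assumptions:
redux_lam_dfm <- quanteda::dfm_select(corpus_lam_dfm, pattern = c(names(coocs)[1:20], "mulier"))
tag_lam_fcm <- quanteda::fcm(redux_lam_dfm)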
## Document-feature matrix of: 6 documents, 4 features (100.00% sparse) and 0 docvars.
## features
## docs coetus doceo praeceptum mulier
## text1 0 0 0 0
## text2 0 0 0 0
## text3 0 0 0 0
## text4 0 0 0 0
## text5 0 0 0 0
## text6 0 0 0 0
# generate network graph
textplot_network(tag_lam_fcm,
min_freq = 1,
edge_alpha = 0.1,
edge_size = 5,
edge_color = "purple",
vertex_labelsize = log(rowSums(tag_lam_fcm))*2)
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
cajetan <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
calvin <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
hyperius <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
lambertus <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
lefevre <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
pellican <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")aretius <- clean_text(aretius)
bugenhagen <- clean_text(bugenhagen)
bullinger <- clean_text(bullinger)
cajetan <- clean_text(cajetan)
calvin <- clean_text(calvin)
lefevre <- clean_text(lefevre)
pellican <- clean_text(pellican)
hyperius <- clean_text(hyperius)
lambertus <- clean_text(lambertus)
unbekannt <- clean_text(unbekannt)
library(quanteda)
# Combine into a named vector
texts <- c(
Aretius = aretius,
Bugenhagen = bugenhagen,
Bullinger = bullinger,
Cajetan = cajetan,
Calvin = calvin,
Hyperius = hyperius,
Lambertus = lambertus,
Lefevre = lefevre,
Pellicanus = pellican,
Unbekannt = unbekannt
)
# Create corpus
corpus_all <- corpus(texts)
The following script works as intended.
##
## Attaching package: 'igraph'
## The following object is masked from 'package:class':
##
## knn
## The following object is masked from 'package:quanteda.textplots':
##
## as.igraph
## The following object is masked from 'package:flextable':
##
## compose
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Tokenize
toks <- tokens(corpus_all, remove_punct = TRUE, remove_numbers = TRUE) %>%
tokens_tolower()
# Build FCMs for each document individually
fcm_list <- lapply(seq_along(corpus_all), function(i) {
fcm(tokens(corpus_all[i], remove_punct = TRUE) %>% tokens_tolower())
})
names(fcm_list) <- names(corpus_all)
# Extract co-occurrences with "mulier"
cooc_list <- lapply(names(fcm_list), function(docname) {
fcm_doc <- fcm_list[[docname]]
# Skip if "mulier" is not present
if (!"mulier" %in% featnames(fcm_doc)) return(NULL)
# Convert FCM row to numeric vector with proper names
coocs <- as.numeric(fcm_doc["mulier", ])
names(coocs) <- featnames(fcm_doc)
# Keep only positive co-occurrences, exclude "mulier" itself
coocs <- coocs[coocs > 0 & names(coocs) != "mulier"]
# Skip if nothing left
if (length(coocs) == 0) return(NULL)
# Build a clean dataframe
data.frame(
term = names(coocs),
cooc = as.numeric(coocs),
document = docname,
stringsAsFactors = FALSE
)
})
# Combine all into a single dataframe
cooc_df <- bind_rows(cooc_list)
# Check the result
head(cooc_df)
## term cooc document
## 1 tamenfortus 21 Aretius
## 2 ethnicus 42 Aretius
## 3 mos 84 Aretius
## 4 parum 21 Aretius
## 5 decor 84 Aretius
## 6 se 189 Aretius
library(igraph)
# Create igraph object
g <- graph_from_data_frame(cooc_df, directed = FALSE)
# Assign a color to each document
doc_colors <- RColorBrewer::brewer.pal(n = length(unique(cooc_df$document)), name = "Set3")
names(doc_colors) <- unique(cooc_df$document)
V(g)$color <- doc_colors[cooc_df$document[match(V(g)$name, cooc_df$term)]]
# Plot the network
plot(g, vertex.label.cex = 0.8, vertex.size = 15, edge.width = 1,
vertex.label.color = "black")## Warning: vertex attribute color contains NAs. Replacing with default value 1
library(igraph)
library(RColorBrewer)
# Create igraph object
g <- graph_from_data_frame(cooc_df, directed = FALSE)
# Assign colors by document
doc_colors <- RColorBrewer::brewer.pal(n = length(unique(cooc_df$document)), name = "Set3")
names(doc_colors) <- unique(cooc_df$document)
# Create a vector of colors for all vertices
V(g)$color <- ifelse(
V(g)$name == "mulier", # central node
"gold", # color for 'mulier'
doc_colors[ cooc_df$document[match(V(g)$name, cooc_df$term)] ] # other nodes
)
# Replace any NAs with a default color
V(g)$color[is.na(V(g)$color)] <- "lightgrey"
# Plot
plot(g, vertex.label.cex = 0.8, vertex.size = 15, edge.width = 1,
vertex.label.color = "black")
# Add legend
legend("topright", legend = c("mulier", names(doc_colors)),
col = c("gold", doc_colors), pch = 19, pt.cex = 1.5)analyse comparée par auteurs
aretius <- readLines("./Chapter_II_lem/Aretius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
bugenhagen <- readLines("./Chapter_II_lem/Bugenhagen_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
bullinger <- readLines("./Chapter_II_lem/Bullinger_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
cajetan <- readLines("./Chapter_II_lem/Cajetan_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
calvin <- readLines("./Chapter_II_lem/Calvin_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
hyperius <- readLines("./Chapter_II_lem/Hyperius_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
lambertus <- readLines("./Chapter_II_lem/Lambertus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
lefevre <- readLines("./Chapter_II_lem/Lefevre_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
pellican <- readLines("./Chapter_II_lem/Pellicanus_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
unbekannt <- readLines("./Chapter_II_lem/Unbekannt_all_C_II_cl_lem.txt") %>%
paste0(collapse = " ")
corp_dom <- quanteda::corpus(c(aretius, bugenhagen, bullinger, hyperius, lambertus, pellican, calvin, lefevre, cajetan, unbekannt))
attr(corp_dom, "docvars")$Author = c("Aretius", "Bugenhagen", "Bullinger", "Hyperius", "Lambertus", "Pellicanus", "Calvin", "Lefevre", "Cajetan", "Unbekannt")
dfm_authors <- corp_dom %>%
quanteda::tokens(remove_punct = TRUE) %>%
quanteda::tokens_remove(latin_stop_words) %>%
quanteda::dfm() %>%
quanteda::dfm_weight(scheme = "prop")
## Document-feature matrix of: 6 documents, 5,251 features (76.52% sparse) and 1 docvar.
## features
## docs caput fidelis officium publicus coetis
## text1 0.005625000 0.0025000000 0.001250000 0.0031250000 0.0025000000
## text2 0.003856041 0 0 0.0025706941 0
## text3 0.005853088 0.0008779631 0.001170618 0.0014632719 0.0005853088
## text4 0.004418262 0.0005891016 0.002061856 0.0020618557 0
## text5 0.006852590 0.0006374502 0.003027888 0.0039840637 0
## text6 0.001288245 0.0035426731 0.001932367 0.0003220612 0
## features
## docs audio sanus doctrina verbum deus
## text1 0.0012500000 0.0031250000 0.003750000 0.0012500000 0.02375000
## text2 0.0025706941 0 0.002570694 0 0.04241645
## text3 0.0017559263 0.0020485806 0.003804507 0 0.02955809
## text4 0.0008836524 0.0002945508 0.005007364 0.0005891016 0.02562592
## text5 0.0006374502 0.0006374502 0.001752988 0.0035059761 0.02836653
## text6 0.0022544283 0 0.001288245 0 0.02222222
## [ reached max_nfeat ... 5,241 more features ]
## Document-feature matrix of: 6 documents, 5,251 features (78.00% sparse) and 1 docvar.
## features
## docs caput fidelis officium publicus coetis audio
## text5 0.006852590 0.0006374502 0.003027888 0.0039840637 0 0.0006374502
## text6 0.001288245 0.0035426731 0.001932367 0.0003220612 0 0.0022544283
## text7 0.001812908 0.0010877447 0.003625816 0.0021754895 0 0.0003625816
## text8 0.004179728 0 0.001044932 0.0083594566 0 0.0031347962
## text9 0.001011122 0 0.006066734 0 0 0.0010111223
## text10 0.001273345 0.0021222411 0.001273345 0.0004244482 0 0.0025466893
## features
## docs sanus doctrina verbum deus
## text5 0.0006374502 0.001752988 0.003505976 0.02836653
## text6 0 0.001288245 0 0.02222222
## text7 0.0007251632 0.001450326 0.001812908 0.04242204
## text8 0 0.003134796 0.002089864 0.02403344
## text9 0 0.002022245 0 0.02325581
## text10 0 0.002122241 0 0.03056027
## [ reached max_nfeat ... 5,241 more features ]
# Calculate relative frequency by author
freq_weight <- quanteda.textstats::textstat_frequency(dfm_authors, n = 10,
groups = dfm_authors$Author)
ggplot(freq_weight, aes(nrow(freq_weight):1, frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight):1,
labels = freq_weight$feature) +
labs(x = NULL, y = "Relative frequency")