Transparency in nursing home services, Online appendix
This online appendix contains contains robustness checks and model comparisons for the topic model in the original article.

Transparency in nursing home services, Online appendix
Alfredo Hernandez Sanchez and Ixchel Perez Duran
12/10/2021
Residencial Home Tweet Corpus
The corpus consists of 3042 documents (tweets) from 3 actors. The tweets were collected between 2013-11-14 and 2021-05-24. The actors considered were the following:
- ceap_ps
- FEDdependencia
- AESTE_oficial
The co-variates in the corpus are status_id, screen_name, text, date, after_covid, covid_months. Only tweets issued 12 months before and after the start of the state of emergency (2020-03-14) were considered.
## # A tibble: 10 x 3
## screen_name text date
## <chr> <chr> <date>
## 1 AESTE_oficial "no te pierdas en minutos a @jcuberoherr secretar~ 2020-03-27
## 2 cea_ps "porque en la atencion profesional no hay distancia~ 2020-11-05
## 3 AESTE_oficial "fundacion memora y el hospital universitario de la~ 2019-08-02
## 4 AESTE_oficial "creemos necesario optimizar los sistemas de barema~ 2019-05-30
## 5 cea_ps "quieres colaborar en nuestro cuestionario de vacun~ 2021-02-08
## 6 AESTE_oficial "debemos plantearnos la creacion de centros donde s~ 2020-11-06
## 7 cea_ps "el #miercoles cita en #pamplona con la gestion de ~ 2019-10-06
## 8 AESTE_oficial "afortunadamente parece ser que ya contamos con el ~ 2020-04-01
## 9 AESTE_oficial "hemos solicitado la realizacion de un analisis de ~ 2020-09-22
## 10 AESTE_oficial "el grupo eulen ha contratado a mas de mujeres vic~ 2019-11-25
Topic Models
Several parameters of k were considered, as well as various models of tokenization and trimming. The latter include uni-grams, bi-grams and corpus/document level term trimming. We settle on Model 3, bi-grams and \(k=4\).
Selecting K
dfmat1 <- my_corpus %>%
mutate(date=ymd(date),
after_covid=ifelse(after_covid==1,"After Covid", "Before Covid")) %>%
filter(date %within% interval(ymd(190314),ymd(210314))) %>%
mutate(text=str_remove_all(text, "@[:alnum:]+"),
text=str_remove_all(text, "#[:alnum:]+")) %>%
unnest_tokens(word,text) %>%
anti_join(esp_stopwords) %>%
filter(nchar(word)>=2) %>%
count(status_id, word, sort = TRUE) %>%
rename(document=status_id,
term=word,
count=n) %>%
cast_dfm(document, term, count) %>%
dfm_trim(max_docfreq = 0.05, min_termfreq = 2, docfreq_type = "prop")
my_docvars1 <- my_corpus %>%
rename(document=status_id,
actor=screen_name) %>%
mutate(document=as.character(document)) %>%
select(-text) %>%
filter(document %in% rownames(dfmat1))
Spanish stopwords and tokens with a length < 2 were removed, alongside numbers and punctuation.
- Residuals are minimized at 10
- Held-out likelihood is maximized at 2
- In both, there is an inflection point at 4 topics.
- At \(k=4\), Semantic Coherence and Exclusivity are mostly in the upper right quadrant.
Model I (Unigrams / Token-Document Frequency)
#Topic Model 1
if(cache==T){
mystm <- readRDS("mystm.rds")
} else {
mystm <- stm(documents = dfmat1,
K = 4,
data = my_docvars1,
init.type = "Spectral",
verbose = F
)
saveRDS(mystm, "mystm.rds")
}
#Content (beta)
td_beta_stm <- tidy(mystm)
#Prevalence (gamma)
stm_prevalence <- tidy(mystm, matrix = "gamma",
document_names = rownames(dfmat1)
) %>%
inner_join(my_docvars1) %>%
group_by(date, topic) %>%
summarise(topic_av=mean(gamma, na.rm=T))
Model II (Unigrams/Token Frequency)
tokens <- my_corpus %>%
mutate(date=ymd(date),
after_covid=ifelse(after_covid==1,"After Covid", "Before Covid")) %>%
filter(date %within% interval(ymd(190314),ymd(210314))) %>%
mutate(text=str_remove_all(text, "@[:alnum:]+"),
text=str_remove_all(text, "#[:alnum:]+")) %>%
unnest_tokens(word,text) %>%
anti_join(esp_stopwords) %>%
filter(nchar(word)>=2) %>%
count(status_id, word, sort = TRUE) %>%
rename(document=status_id,
term=word,
count=n)
## Joining, by = "word"
#Text Cleaning
pruned_terms <- tokens %>%
group_by(term) %>%
summarise(n=n()) %>%
mutate(
prune=ifelse(n>=round(quantile(n, probs = .99), digits = 0),1,0),
prune=ifelse(n<=round(quantile(n, probs = .01), digits = 0),1,prune),
) %>%
filter(prune==1) %>%
select(term)
dfmat2 <- tokens %>%
anti_join(pruned_terms) %>%
cast_dfm(document, term, count)
## Joining, by = "term"
my_docvars2 <- my_corpus %>%
rename(document=status_id,
actor=screen_name) %>%
mutate(document=as.character(document)) %>%
select(-text) %>%
filter(document %in% rownames(dfmat2))
#Topic Model 1
if(cache==T){
mystm2 <- readRDS("mystm2.rds")
} else {
mystm2 <- stm(documents = dfmat2,
K = 4,
prevalence = ~after_covid+actor,
data = my_docvars2,
init.type = "Spectral",
verbose = F
)
saveRDS(mystm2, "mystm2.rds")
}
#Content (beta)
td_beta_stm2 <- tidy(mystm2)
#Prevalence (gamma)
stm_prevalence2 <- tidy(mystm2, matrix = "gamma",
document_names = rownames(dfmat2)
) %>%
inner_join(my_docvars2) %>%
group_by(date, topic) %>%
summarise(topic_av=mean(gamma, na.rm=T))
Model III (Bigrams/Token Frequency)
#BiGrams
tokens_bigrams <- my_corpus %>%
mutate(date=ymd(date),
after_covid=ifelse(after_covid==1,"After Covid", "Before Covid")) %>%
filter(date %within% interval(ymd(190314),ymd(210314))) %>%
mutate(text=str_remove_all(text, "@[:alnum:]+"),
text=str_remove_all(text, "#[:alnum:]+")) %>%
unnest_tokens(word,text,token = "ngrams", n = 2) %>%
separate(word, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% esp_stopwords$word) %>%
filter(!word2 %in% esp_stopwords$word) %>%
filter(nchar(word1)>=2) %>%
filter(nchar(word2)>=2) %>%
unite(word, word1, word2, sep = "-") %>%
count(status_id, word, sort = TRUE) %>%
rename(document=status_id,
term=word,
count=n)
pruned_terms_b <- tokens_bigrams %>%
group_by(term) %>%
summarise(n=n()) %>%
mutate(
prune=ifelse(n>=round(quantile(n, probs = .99), digits = 0),1,0),
prune=ifelse(n<=round(quantile(n, probs = .01), digits = 0),1,prune),
) %>%
filter(prune==1) %>%
select(term)
dfmat_bigrams <- tokens_bigrams %>%
anti_join(pruned_terms_b) %>%
cast_dfm(document, term, count)
my_docvars_b <- my_corpus %>%
rename(document=status_id,
actor=screen_name) %>%
mutate(document=as.character(document)) %>%
select(-text) %>%
filter(document %in% rownames(dfmat_bigrams))
mystm_b <- stm(documents = dfmat_bigrams,
K = 4,
prevalence = ~after_covid+actor,
data = my_docvars_b,
init.type = "Spectral",
verbose = F
)
#Content (beta)
td_beta_stm_b <- tidy(mystm_b)
#Prevalence (gamma)
stm_prevalence_b <- tidy(mystm_b, matrix = "gamma",
document_names = rownames(dfmat_bigrams)
) %>%
inner_join(my_docvars_b) %>%
group_by(date, topic) %>%
summarise(topic_av=mean(gamma, na.rm=T))