This lab journal is created for the paper
Tolsma, J., Hofstra, B., and Mulders, A.M. (2025). How COVID-19 Exacerbated Gender Inequalities in Dutch Academia. Scientometrics.
In this file we show how we went from the raw data sets scraped from NARCIS to the working samples on which all analyses are based. Note that the raw data files are not shared publicly; please email jochem.tolsma@ru.nl if you are interested in working with these raw data files. The working sample(s) can be downloaded here.
rm(list = ls())
# install packages if necessary and load them
fpackage.check <- function(packages) {
lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
}
# save an object to ./data/processed/ with a date stamp appended to the file name
fsave <- function(x, file = NULL, location = "./data/processed/") {
ifelse(!dir.exists("data"), dir.create("data"), FALSE)
ifelse(!dir.exists("data/processed"), dir.create("data/processed"), FALSE)
if (is.null(file))
file = deparse(substitute(x))
datename <- substr(gsub("[:-]", "", Sys.time()), 1, 8)
totalname <- paste(location, file, "_", datename, ".rda", sep = "")
save(x, file = totalname) #need to fix if file is reloaded as input name, not as x.
}
# load an .rda file and return the object it contains
fload <- function(filename) {
load(filename)
get(ls()[ls() != "filename"])
}
# display a data frame as a scrollable html table
fshowdf <- function(x, ...) {
knitr::kable(x, digits = 2, "html", ...) %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width = "100%", height = "300px")
}
colorize <- function(x, color) {sprintf("<span style='color: %s;'>%s</span>", color, x) }
packages = c("ggplot2", "tidyverse", "RColorBrewer", "dplyr", "stringdist", "stringi", "future.apply")
fpackage.check(packages)
We scraped publications via NARCIS. These raw data sets are not shared publicly, but please email jochem.tolsma@ru.nl if you want to work with these raw data.
# publications raw datafiles
pubs1 <- fload(file = "./data/pubs_meta_df.rda")
pubs2 <- fload(file = "./data/pubs_meta_df_2020_2021.rda")
pubs3 <- fload(file = "./data/pubs_meta_df_2020_2021_adden.rda")
pubs4 <- fload(file = "./data/pubs_meta_df_2022_adden.rda")
# each publication dataset contains different variables; select the same columns (by position) so they align
pubs1 <- pubs1[, c(1:4, 7, 231, 13)]
pubs2 <- pubs2[, c(1:3, 5, 9, 136, 16)]
pubs3 <- pubs3[, c(1:3, 5, 9, 141, 16)]
pubs4 <- pubs4[, c(1:4, 6, 113, 12)]
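# sanity check (a sketch, assuming the selected columns carry identical names in all four
# objects): guard against misalignment of the position-based selections above
stopifnot(identical(colnames(pubs1), colnames(pubs2)), identical(colnames(pubs1), colnames(pubs3)),
identical(colnames(pubs1), colnames(pubs4)))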
# combine
pubs12 <- rbind.data.frame(pubs1, pubs2)
pubs123 <- rbind.data.frame(pubs12, pubs3)
pubs <- rbind.data.frame(pubs123, pubs4)
Selection of publications
table(pubs$Type, useNA = "always")
pubs$Type <- as.factor(pubs$Type)
pubs <- pubs[(pubs$Type == "Artikel") | (pubs$Type == "Boek") | (pubs$Type == "Boekdeel") | (pubs$Type ==
"Conference Paper") | (pubs$Type == "Conference Proceedings") | (pubs$Type == "Conferentiebijdrage") |
(pubs$Type == "Patent") | (pubs$Type == "Rapport") | (pubs$Type == "Dataset") | (pubs$Type == "Review"),
] #845256
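For reference, the same selection can be written more compactly with %in% (a sketch; keep_types and pubs_alt are only illustrative names and pubs_alt is not used later). Note that, unlike the logical comparisons above, %in% immediately drops rows with a missing Type.
# equivalent, more compact selection of publication types (sketch)
keep_types <- c("Artikel", "Boek", "Boekdeel", "Conference Paper", "Conference Proceedings", "Conferentiebijdrage",
"Patent", "Rapport", "Dataset", "Review")
pubs_alt <- pubs[pubs$Type %in% keep_types, ]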
Removing duplicates
pubs %>%
filter(!is.na(Type)) %>%
mutate(Titel_ori = Titel, Titel = tolower(Titel), Titel = str_sub(Titel, 1, 40), Titel = str_replace_all(Titel,
"\\s", ""), Titel = str_replace_all(Titel, intToUtf8(8217), intToUtf8(39)), Titel = str_replace_all(Titel,
" :", ":"), Titel = str_replace_all(Titel, " \\(", "\\("), Titel = str_replace_all(Titel, "\\( ",
"\\("), Titel = str_replace_all(Titel, "\\) ", "\\)"), Titel = str_replace_all(Titel, " \\)",
"\\)")) %>%
group_by(person_id) %>%
distinct(Titel, .keep_all = TRUE) %>%
ungroup -> df_pubs
Removing pubs without pubyear
colnames(df_pubs) <- make.names(colnames(df_pubs), unique = TRUE)
# derive publication year from variable Data.issued
df_pubs %>%
mutate(pub_year = as.character(Date.issued), pub_year = substr(pub_year, 1, 4), pub_year = as.numeric(pub_year)) ->
df_pubs
df_pubs %>%
filter(!is.na(pub_year)) -> df_pubs #697695
We set the 16 publications listed as published in 2023 to 2022.
df_pubs %>%
mutate(pub_year = ifelse(pub_year > 2022, 2022, pub_year)) -> df_pubs
length(unique(df_pubs$Titel)) #478761
length(unique(df_pubs$person_id)) #27285
Saving the data
fsave(df_pubs, file = "df_covid_pubs", location = "./data/processed/")
We use three processed datasets.

phds_variables.rda
: dataset of Dutch PhDs between 1990 and 2021. This data set has been prepared by Anne Maaike Mulders, see (Mulders, Hofstra, and Tolsma 2024).

url_to_unique_id.rda
: used to match the different data sets scraped from NARCIS, which use different id variables.

We also load our publication dataset (df_covid_pubs) constructed above.
df_phd <- fload(file = "./data/processed/phds_variables.rda")
df_pubs <- fload(file = "./data/processed/20240701df_covid_pubs.rda")
df_id <- fload(file = "./data/url_to_unique_id.rda")
How many unique PhDs do we start with?
length(unique(df_phd$id))
table(df_phd$start_pub)
Add the diss_url variable to the df_phd dataframe.
df_id$id <- paste0("i", df_id$id)
df_phd2 <- left_join(df_phd, df_id, by = c("id")) #we created some duplicates
Add the PhD information to the publications.
df_pubs2 <- left_join(df_pubs, df_phd2, by = c(person_id = "person_id1")) #we created some duplicates
df_pubs2 <- df_pubs2 %>%
filter(!is.na(id)) %>%
filter(dupid == 0) #627728
So we lost approximately 70,000 publications, probably from scholars who did publish but do not have a PhD profile.
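As a quick check of this interpretation, we can count the publications whose person_id does not match any PhD profile (a minimal sketch using the objects created above):
# sketch: publications whose author id does not occur in the PhD data
sum(!(df_pubs$person_id %in% df_phd2$person_id1))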
We remove duplicates once more, this time using the DOI.
df_pubs2 %>%
group_by(person_id) %>%
mutate(duplicateDOI = (duplicated(DOI) & !is.na(DOI))) %>%
filter(!(duplicateDOI)) -> df_pubs2 #624143
We are only interested in publications that appeared after the PhD was obtained.
df_pubs2 <- df_pubs2 %>%
filter(phd_year < pub_year) #465283
We add three variables to the publication data: firstpub (year of first publication), lastpub (year of last publication), and covidpub (the number of publications in 2017-2019, the three years before COVID-19):
df_pubs2 %>%
mutate(covidpub = as.numeric(pub_year > 2016 & pub_year < 2020)) %>%
group_by(id) %>%
dplyr::summarize(firstpub = min(pub_year, na.rm = T), lastpub = max(pub_year, na.rm = T), covidpub = sum(covidpub)) ->
flpubs
df_pubs2 %>%
left_join(flpubs) -> df_pubs2
How many PhDs do we now have publications for?
length(unique(df_pubs2$id)) #19368
We only select PhDs who published at least once in the three years before COVID-19 (2017-2019).
df_pubs2 %>%
filter(covidpub > 0) -> df_pubs2
length(unique(df_pubs2$id)) #10001
And we only select PhDs whose publication career spans at least three years.
df_pubs2 %>%
filter(lastpub > firstpub + 1) -> df_pubs2
length(unique(df_pubs2$id)) #8303
Select only papers published after 2014.
df_pubs_topic <- df_pubs2 %>%
filter(pub_year > 2014)
# fsave(df_pubs_topic, 'df_pubs_topic')
Based on these publications we determine the field in which the authors are active via OpenAlex. See here.
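The OpenAlex lookup itself is documented in the linked file. Purely as an illustration, below is a minimal sketch of how a single DOI could be resolved to a field via the OpenAlex API; it assumes the jsonlite package (not loaded above) and is not necessarily the exact procedure we used.
# hypothetical sketch: resolve one DOI to an OpenAlex field label
get_field <- function(doi) {
    res <- tryCatch(jsonlite::fromJSON(paste0("https://api.openalex.org/works/doi:", doi)),
        error = function(e) NULL)
    if (is.null(res)) return(NA_character_)
    res$primary_topic$field$display_name  # field of the work's primary topic
}
# e.g.: get_field(df_pubs_topic$DOI[1])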
# first make an empty dataset
pub_year <- c(1990:2022)
npubs_zero <- rep(0, length(pub_year))
id <- unique(df_pubs2$id)
nid <- length(id)
pub_year <- rep(pub_year, nid)
npubs_zero <- rep(npubs_zero, nid)
id <- rep(id, each = length(c(1990:2022)))
df <- data.frame(id, pub_year)
df %>%
arrange(id, pub_year) -> df
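# design note (a sketch): the same person-period grid can be built in one step with
# tidyr::expand_grid(), which is available via the tidyverse loaded above
df_grid <- tidyr::expand_grid(id = unique(df_pubs2$id), pub_year = 1990:2022)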
data_ppf <- left_join(df, pubs3, by = c("id")) %>%
filter(pub_year >= firstpub) %>%
left_join(pubs2, by = c("id", "pub_year")) %>%
mutate(npubs = replace_na(npubs, 0), npubs_a = replace_na(npubs_a, 0), npubs_first = replace_na(npubs_first,
0), npubs_last = replace_na(npubs_last, 0), npubs_sole = replace_na(npubs_sole, 0), npubs_first_a = replace_na(npubs_first_a,
0), npubs_last_a = replace_na(npubs_last_a, 0), npubs_sole_a = replace_na(npubs_sole_a, 0)) %>%
rename_with(~gsub(".x", "", .x, fixed = TRUE)) %>%
dplyr::select(c("id", "pub_year", "phd_year", "gender", "field", "ethnicity", "uni", "npubs", "npubs_a",
"npubs_first", "npubs_last", "npubs_sole", "npubs_first_a", "npubs_last_a", "npubs_sole_a", "npubs_t",
"npubs_a_t", "npubs_first_t", "npubs_last_t", "npubs_sole_t", "npubs_first_a_t", "npubs_last_a_t",
"npubs_sole_a_t"))
fsave(data_ppf, "df_ppf") #the person period file dataset for the analysis
fsave(df_pubs2, "df_pubs") #publications included in the ppf (with all info on phds attached as well)
We pick the first because an inspection of NARCIS seems to indicate that the lead author is at times additionally mentioned as last author.
Copyright © 2024- Jochem Tolsma