This lab journal is created for the paper
Tolsma, J., Hofstra, B. and Mulders, AM (2025). How COVID-19
Exacerbated Gender Inequalities in Dutch Academia.
Scientometrics.
In this file we show how we went from the raw data sets scraped from
NARCIS to the working samples on which all analyses are based. Note that
the raw data files are not shared, but please email jochem.tolsma@ru.nl
if you are interested in working with these raw data files. The working
sample(s) can be downloaded here.
Custom functions
rm(list = ls())
# Attach every package named in `packages`, installing any that are missing.
#
# packages: character vector of package names.
# Returns the list produced by lapply(): one element per package (NULL when
# the package could be attached straight away).
fpackage.check <- function(packages) {
  attach_or_install <- function(pkg) {
    available <- require(pkg, character.only = TRUE)
    if (!available) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
  lapply(packages, FUN = attach_or_install)
}
# Save an object to `<location><file>_<YYYYMMDD>.rda`.
#
# x:        object to save.
# file:     base name of the output file; defaults to the expression passed
#           as `x` (e.g. fsave(df_pubs) uses "df_pubs").
# location: directory prefix, pasted verbatim in front of `file`, so it
#           should end in a path separator. Created when it does not exist.
#
# The object is stored inside the .rda under the name `x`, not under its
# original name. Returns the path written to, invisibly.
fsave <- function(x, file = NULL, location = "./data/processed/") {
  if (is.null(file)) {
    file <- deparse(substitute(x))
  }
  # Create the requested directory itself (not a hard-coded data/processed),
  # so a non-default `location` works too. Trailing separators are stripped
  # because directory checks do not accept them on Windows.
  target_dir <- sub("[/\\\\]+$", "", location)
  if (nzchar(target_dir) && !dir.exists(target_dir)) {
    dir.create(target_dir, recursive = TRUE)
  }
  datename <- format(Sys.time(), "%Y%m%d")
  totalname <- paste0(location, file, "_", datename, ".rda")
  save(x, file = totalname)
  invisible(totalname)
}
# Load an .rda file that holds exactly one object and return that object,
# whatever name it was saved under.
#
# filename: path to the .rda file. (Calls written as fload(file = ...) still
#           reach this argument through R's partial argument matching.)
# Stops with an error when the file does not contain exactly one object.
fload <- function(filename) {
  # Load into a scratch environment so that a stored object called
  # "filename" cannot overwrite the argument.
  holder <- new.env(parent = emptyenv())
  loaded <- load(filename, envir = holder)
  if (length(loaded) != 1L) {
    stop("fload() expects exactly one object in '", filename, "' but found ",
      length(loaded), ".",
      call. = FALSE
    )
  }
  get(loaded, envir = holder, inherits = FALSE)
}
# Render a data frame as a striped, hoverable HTML table inside a scroll box
# (full width, 300px high). Extra arguments are forwarded to knitr::kable().
fshowdf <- function(x, ...) {
  html_table <- knitr::kable(x, digits = 2, "html", ...)
  styled_table <- kableExtra::kable_styling(
    html_table,
    bootstrap_options = c("striped", "hover")
  )
  kableExtra::scroll_box(styled_table, width = "100%", height = "300px")
}
# Wrap `x` in an HTML <span> whose inline style sets the text colour.
colorize <- function(x, color) {
  span_template <- "<span style='color: %s;'>%s</span>"
  sprintf(span_template, color, x)
}
Packages
# Packages used in this script; fpackage.check() attaches them and installs
# any that are missing.
packages <- c(
  "ggplot2", "tidyverse", "RColorBrewer", "dplyr",
  "stringdist", "stringi", "future.apply"
)
fpackage.check(packages)
Publications (part
1)
Scraped publications via NARCIS. These raw data sets are not shared
publicly but please email jochem.tolsma@ru.nl if you want to work with these raw
data.
# publications raw datafiles
# `filename` is fload()'s real argument name; `file =` only reached it through
# R's partial argument matching.
pubs1 <- fload(filename = "./data/pubs_meta_df.rda")
pubs2 <- fload(filename = "./data/pubs_meta_df_2020_2021.rda")
pubs3 <- fload(filename = "./data/pubs_meta_df_2020_2021_adden.rda")
pubs4 <- fload(filename = "./data/pubs_meta_df_2022_adden.rda")
# Different variables in each publication dataset, align this.
# NOTE(review): the seven columns are picked by position, which silently
# selects something else if a raw file's column order changes -- confirm the
# positions against the column names of each raw file.
pubs1 <- pubs1[, c(1:4, 7, 231, 13)]
pubs2 <- pubs2[, c(1:3, 5, 9, 136, 16)]
pubs3 <- pubs3[, c(1:3, 5, 9, 141, 16)]
pubs4 <- pubs4[, c(1:4, 6, 113, 12)]
# combine (stack the four aligned data sets)
pubs12 <- rbind.data.frame(pubs1, pubs2)
pubs123 <- rbind.data.frame(pubs12, pubs3)
pubs <- rbind.data.frame(pubs123, pubs4)
Selection of publications
table(pubs$Type, useNA = "always")
pubs$Type <- as.factor(pubs$Type)
# Publication types that are kept.
keep_types <- c(
  "Artikel", "Boek", "Boekdeel", "Conference Paper", "Conference Proceedings",
  "Conferentiebijdrage", "Patent", "Rapport", "Dataset", "Review"
)
# %in% never yields NA, so rows with a missing Type are dropped here instead
# of being turned into all-NA rows, which is what indexing with `==` did.
# The row count noted for the original `==` version was 845256.
pubs <- pubs[pubs$Type %in% keep_types, ]
Removing duplicates
# Turn a title into a crude matching key: lower case, first 40 characters,
# all whitespace removed, typographic apostrophe (U+2019) replaced by "'".
# NOTE(review): the last four spacing fixes run after all whitespace has been
# removed, so they presumably never match -- confirm before dropping them.
normalise_title <- function(title) {
  key <- tolower(title)
  key <- str_sub(key, 1, 40)
  key <- str_replace_all(key, "\\s", "")
  key <- str_replace_all(key, intToUtf8(8217), intToUtf8(39))
  key <- str_replace_all(key, " :", ":")
  key <- str_replace_all(key, " \\(", "\\(")
  key <- str_replace_all(key, "\\( ", "\\(")
  key <- str_replace_all(key, "\\) ", "\\)")
  key <- str_replace_all(key, " \\)", "\\)")
  key
}

# Keep one row per person and title key; the untouched title stays available
# in Titel_ori.
df_pubs <- pubs %>%
  filter(!is.na(Type)) %>%
  mutate(Titel_ori = Titel, Titel = normalise_title(Titel)) %>%
  group_by(person_id) %>%
  distinct(Titel, .keep_all = TRUE) %>%
  ungroup()
Removing pubs without pubyear
colnames(df_pubs) <- make.names(colnames(df_pubs), unique = TRUE)
# Publication year = first four characters of Date.issued, as a number.
# Rows for which that does not give a number are dropped.
df_pubs <- df_pubs %>%
  mutate(pub_year = as.numeric(substr(as.character(Date.issued), 1, 4))) %>%
  filter(!is.na(pub_year)) # 697695
Set pubs published in 2023 (16) to 2022
# Publication years later than 2022 are recoded to 2022.
last_year <- 2022
df_pubs <- mutate(df_pubs, pub_year = ifelse(pub_year > last_year, last_year, pub_year))
length(unique(df_pubs$Titel)) # 478761
length(unique(df_pubs$person_id)) # 27285
Saving the data
# Writes ./data/processed/df_covid_pubs_<YYYYMMDD>.rda (fsave appends the date).
fsave(df_pubs, file = "df_covid_pubs", location = "./data/processed/")
PhDs
We use three processed datasets.
phds_variables.rda: dataset of Dutch PhDs between
1990-2021
This data set has been prepared by Anne Maaike Mulders, see (Mulders, Hofstra, and Tolsma 2024).
url_to_unique_id.rda: used to match different scraped
info from NARCIS and where different id variables are used.
Also load our publication dataset.
# `filename` is fload()'s real argument name; `file =` only reached it through
# R's partial argument matching.
df_phd <- fload(filename = "./data/processed/phds_variables.rda")
# NOTE(review): fsave() writes "<file>_<date>.rda" (df_covid_pubs_<date>.rda),
# while this path has the date in front -- confirm it is the intended file.
df_pubs <- fload(filename = "./data/processed/20240701df_covid_pubs.rda")
df_id <- fload(filename = "./data/url_to_unique_id.rda")
how many unique phds do we start with
# Number of distinct PhD ids, and the distribution of start_pub.
length(unique(df_phd$id))
table(df_phd$start_pub)
add diss_url variable to df_phd dataframe
# Prefix the ids in df_id with "i" (presumably the id format used in df_phd --
# TODO confirm), then attach the df_id columns to the PhD data.
# Not idempotent: running the first line twice prefixes the ids twice.
df_id$id <- paste0("i", df_id$id)
df_phd2 <- left_join(df_phd, df_id, by = c("id")) #we created some duplicates
add df_phd to publications
# Attach the PhD information to each publication: person_id in the
# publications is matched to person_id1 in df_phd2.
df_pubs2 <- left_join(df_pubs, df_phd2, by = c(person_id = "person_id1")) #we created some duplicates
Publications (part
2)
Delete publications
without phd info
# Keep publications that were matched to a PhD record (id present) and whose
# dupid flag equals 0.
df_pubs2 <- filter(df_pubs2, !is.na(id), dupid == 0) # 6277728
So we lost approximately 70,000 publications, probably of scholars who
did publish but do not have a profile.
Removing
duplicates
This time use the DOI.
# Within each person, flag every repeat of an already seen (non-missing) DOI
# and drop the flagged rows. Rows without a DOI are never flagged.
# ungroup() is added so that df_pubs2 does not stay grouped by person_id for
# everything that follows.
df_pubs2 <- df_pubs2 %>%
  group_by(person_id) %>%
  mutate(duplicateDOI = duplicated(DOI) & !is.na(DOI)) %>%
  ungroup() %>%
  filter(!duplicateDOI) # 624143
We are only interested in publications after the Phd.
# Keep publications that appeared strictly after the PhD year.
df_pubs2 <- filter(df_pubs2, pub_year > phd_year) # 465283
We add three variables to the publications data:
- first year of publication after their PhD
- last year of publication after their PhD
- how many publications written 1-3 years prior to covid
# Per PhD (id): first and last publication year, and the number of
# publications with a publication year of 2017, 2018 or 2019.
flpubs <- df_pubs2 %>%
  mutate(covidpub = as.numeric(pub_year > 2016 & pub_year < 2020)) %>%
  group_by(id) %>%
  dplyr::summarize(
    firstpub = min(pub_year, na.rm = TRUE),
    lastpub = max(pub_year, na.rm = TRUE),
    covidpub = sum(covidpub)
  )
# Natural join: no `by` is given, so every column name shared by both tables
# is used as key (dplyr reports which ones).
# NOTE(review): presumably that is only `id` -- confirm, then state it.
df_pubs2 <- left_join(df_pubs2, flpubs)
Remove publications
and PhDs, given inclusion criteria
How many phds do we now have publications for:
# Number of distinct PhDs that still have at least one publication row.
length(unique(df_pubs2$id)) #19368
Only select Phds who published at least one publication in the three
years before COVID-19.
# Keep PhDs with at least one publication in 2017-2019 (covidpub > 0).
df_pubs2 <- filter(df_pubs2, covidpub > 0)
length(unique(df_pubs2$id)) # 10001
And only select Phds who have a publication career of at least three
years.
# Keep PhDs whose last publication year is at least two years after their
# first one, i.e. a career spanning three calendar years or more.
df_pubs2 <- filter(df_pubs2, lastpub - firstpub > 1)
length(unique(df_pubs2$id)) # 8303
Intermediate save to
construct our research domain variable
select only papers published after 2014
# Publications from 2015 onwards.
df_pubs_topic <- filter(df_pubs2, pub_year > 2014)
# fsave(df_pubs_topic, 'df_pubs_topic')
Based on these pubs we determine the field in which the authors are
active via OpenAlex. See here.
Author position of PhD
in publication
Prepare author-list
of each publication
CAUTION: We use multisession.
plan(multisession, workers = 10) ## Parallelize

# Surname particles ("van de" etc.) that are stripped. They are applied in
# exactly this order, each removing only its first occurrence per name
# (sub, not gsub). "op den " is listed twice, as in the original sequence.
name_particles <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ", "el- ",
  "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ", "van ",
  "van 't ", "van de ", "van der ", "van den ", "von der ", "op den ", "ul "
)

# make a list for each publication of authors, cleaned a bit
author_lists <- str_split(df_pubs2$Auteur, ";")
# lowercase
author_lists <- future_lapply(author_lists, tolower)
# remove diacritics
author_lists <- future_lapply(author_lists, stri_trans_general, id = "latin-ascii")
# remove first names: anything in parentheses, and everything after a comma
author_lists <- future_lapply(author_lists, sub, pattern = "\\(.*\\)", replacement = "")
author_lists <- future_lapply(author_lists, sub, pattern = ",.*", replacement = "")
# remove van de etc.
for (particle in name_particles) {
  author_lists <- future_lapply(author_lists, sub, pattern = particle, replacement = "")
}
# remove white spaces
author_lists <- future_lapply(author_lists, str_squish)
# remove initials: everything up to and including the last period
author_lists <- future_lapply(author_lists, gsub, pattern = ".*\\.", replacement = "")
# double names separated by a dash: everything from the first dash on is cut,
# so the FIRST part is kept (original note: maybe the last should be kept)
author_lists <- future_lapply(author_lists, sub, pattern = "\\-.*", replacement = "")
# remove white spaces again
author_lists <- future_lapply(author_lists, str_squish)
# double names separated by a space: keep the first part
pubsautors <- future_lapply(author_lists, gsub, pattern = " .*$", replacement = "")
Author position
We determine the position of the PhD in the author-list of each
publication.
We add two variables to the publication dataset:
aut_max: the number of authors of each
publication
aut_pot: the position of the PhD in the author list of the
publication
Flow plan:
- Step -1: check to see if we have a name.
- Step 0: check to see if it is a sole-authored paper.
- Step 1: check to see if we have one or more exact matches.
- 1.1: one exact match, determine position,
done.
- 1.2: two or more exact matches, pick first, done.
- Step 2: check whether last name consists of one letter.
- 2.1: Yes, set position to
NA, done
- 2.2: No
- Step 3: Determine distance between last name and the names on the
publication.
- 3.1: No minimal distance observed, set position to
NA,
done
- 3.2: One minimum, set position, done.
- Step 4: when multiple minima, check to see if it is the same last
name.
- 4.1: Yes, pick first (arbitrary), done
- 4.2: No, set position to
NA, done
# Determine, for every publication row, the number of authors (aut_max), the
# position of the PhD in the author list (aut_pot) and a code describing how
# that position was found (code).

# Row indices for which the matching raised a warning (the leading NA is the
# initial placeholder).
warnings <- NA
n_pubs <- length(df_pubs2$lastname)
aut_pot <- aut_max <- code <- rep(NA, n_pubs)
table(is.na(df_pubs2$Auteur), useNA = "always")

# loop over each publication
for (i in seq_len(n_pubs)) {
  tryCatch({
    # step -1: no author string at all
    if (is.na(df_pubs2$Auteur[i])) {
      code[i] <- "(-1) no name" # probably have to set to 0 later
      next
    }
    authors_i <- pubsautors[[i]]
    phd_name <- df_pubs2$lastname[i]
    aut_max[i] <- length(authors_i) # number of authors
    # step 0: sole-authored publication
    if (aut_max[i] == 1) {
      aut_pot[i] <- 1
      code[i] <- "(0) publication with 1 author"
      next
    }
    # step 1: exact matches. which() gives the positions of the TRUE values;
    # the earlier which.min() on the logical vector returned the position of
    # the first FALSE, i.e. of the first author that is NOT the PhD.
    exact_hits <- which(authors_i %in% phd_name)
    if (length(exact_hits) > 0) {
      aut_pot[i] <- exact_hits[1]
      if (length(exact_hits) == 1) {
        code[i] <- "(1.1) one exact match"
      } else {
        code[i] <- "(1.2) >1 exact matches, position of 1st"
      }
      next
    }
    # step 2: a one-character last name is not matched approximately
    if (nchar(phd_name) == 1) {
      aut_pot[i] <- NA
      code[i] <- "(2) lastname just 1 character, set to NA"
      next
    }
    # step 3: sum of three string distances between the PhD's last name and
    # every author name
    naut1 <- stringdist(phd_name, authors_i, method = "jaccard")
    naut2 <- stringdist(phd_name, authors_i, method = "lv")
    naut3 <- stringdist(phd_name, authors_i, method = "jw", p = 0) # this one helps with double last names 'tolsma berkhof'
    naut <- naut1 + naut2 + naut3
    # step 3.1 (is.na() is also TRUE for NaN)
    if (is.na(min(naut))) {
      aut_pot[i] <- NA
      code[i] <- "(3.1) no minimum via matching, set to NA"
      next
    }
    closest <- which(naut == min(naut))
    # step 3.2
    if (length(closest) == 1) {
      aut_pot[i] <- closest
      code[i] <- "(3.2) 1 minimum via matching"
      next
    }
    # step 4: several minima; only the first two are compared
    if (authors_i[closest[1]] == authors_i[closest[2]]) {
      aut_pot[i] <- closest[1] # take the first
      code[i] <- "(4.1) >1 minima after matching of same lastname, position of 1st"
    } else {
      aut_pot[i] <- NA
      code[i] <- "(4.2) >1 minima after matching of different lastnames, set to NA"
    }
  }, error = function(e) e, warning = function(w) {
    # A warning aborts the rest of this iteration; errors are swallowed
    # silently. In both cases aut_pot[i] and code[i] keep whatever was set
    # before the condition was raised.
    print(i)
    # `<<-` so the index ends up in the `warnings` vector defined above; a
    # plain `<-` only changed a copy local to this handler.
    warnings <<- append(warnings, i)
  })
}
table(code, useNA = "always")
table(aut_pot, useNA = "always")
table(aut_max, useNA = "always")
df_pubs2$aut_pot <- aut_pot
df_pubs2$aut_max <- aut_max
df_pubs2$aut_code <- code
So we can play a little later by filtering on specific codes. or by
recoding the NAs of author positions via the code variable. For example,
setting all publications without author names as first author
publications. etc.
person period file
pubs per year per
author
I count the:
- total number of publications
- all first authored publications (inclusive sole authored
publications)
- all last authored publications (exclusive sole authored
publications)
- all sole authored publications
- all first authored articles (inclusive sole authored articles)
- all last authored articles (exclusive sole authored articles)
- all sole authored articles
# Indicator (0/1, NA when the position is unknown) per publication:
# article, first-authored, last-authored (excluding sole-authored),
# sole-authored, and the same three restricted to articles.
pubs <- df_pubs2 %>%
  mutate(
    pub_a = as.numeric(Type == "Artikel"),
    pub_first = as.numeric(aut_pot == 1),
    pub_last = as.numeric(aut_pot == aut_max & aut_pot != 1),
    pub_sole = as.numeric(aut_max == 1),
    pub_first_a = as.numeric(aut_pot == 1 & Type == "Artikel"),
    pub_last_a = as.numeric(aut_pot == aut_max & aut_pot != 1 & Type == "Artikel"),
    pub_sole_a = as.numeric(aut_max == 1 & Type == "Artikel")
  )

# aggregate by id and pub_year. this is what we want in the person period file.
# The grouping key `id` is no longer re-summarised (it is already part of the
# result), and the result is returned ungrouped.
pubs2 <- pubs %>%
  group_by(id, pub_year) %>%
  summarize(
    npubs = n(),
    npubs_a = sum(pub_a, na.rm = TRUE),
    firstpub = first(firstpub),
    lastpub = first(lastpub),
    npubs_first = sum(pub_first, na.rm = TRUE),
    npubs_last = sum(pub_last, na.rm = TRUE),
    npubs_sole = sum(pub_sole, na.rm = TRUE),
    npubs_first_a = sum(pub_first_a, na.rm = TRUE),
    npubs_last_a = sum(pub_last_a, na.rm = TRUE),
    npubs_sole_a = sum(pub_sole_a, na.rm = TRUE),
    phd_year = first(phd_year),
    gender = first(gender_2),
    field = first(field),
    ethnicity = first(ethnicity3),
    uni = first(uni),
    .groups = "drop"
  )

# we aggregate also by id. this will help to find us the total publications
# over the career, but also to fill in the empty rows of the person period
# file later on.
pubs3 <- pubs %>%
  group_by(id) %>%
  summarize(
    npubs_t = n(),
    npubs_a_t = sum(pub_a, na.rm = TRUE),
    firstpub = first(firstpub),
    lastpub = first(lastpub),
    npubs_first_t = sum(pub_first, na.rm = TRUE),
    npubs_last_t = sum(pub_last, na.rm = TRUE),
    npubs_sole_t = sum(pub_sole, na.rm = TRUE),
    npubs_first_a_t = sum(pub_first_a, na.rm = TRUE),
    npubs_last_a_t = sum(pub_last_a, na.rm = TRUE),
    npubs_sole_a_t = sum(pub_sole_a, na.rm = TRUE),
    phd_year = first(phd_year),
    gender = first(gender_2),
    field = first(field),
    ethnicity = first(ethnicity3),
    uni = first(uni),
    .groups = "drop"
  )
empty ppf
# first make an empty dataset
pub_year <- c(1990:2022)
npubs_zero <- rep(0, length(pub_year))
id <- unique(df_pubs2$id)
nid <- length(id)
pub_year <- rep(pub_year, nid)
npubs_zero <- rep(npubs_zero, nid)
id <- rep(id, each = length(c(1990:2022)))
df <- data.frame(id, pub_year)
df %>%
arrange(id, pub_year) -> df
fill the ppf
# Yearly publication counts; NA after the join means "no publications in
# that year" and is set to 0.
count_cols <- c(
  "npubs", "npubs_a", "npubs_first", "npubs_last", "npubs_sole",
  "npubs_first_a", "npubs_last_a", "npubs_sole_a"
)
# Career totals per PhD, taken from pubs3.
total_cols <- c(
  "npubs_t", "npubs_a_t", "npubs_first_t", "npubs_last_t", "npubs_sole_t",
  "npubs_first_a_t", "npubs_last_a_t", "npubs_sole_a_t"
)

data_ppf <- df %>%
  # career-level information for every id/year row
  left_join(pubs3, by = "id") %>%
  # years before the first publication are dropped
  filter(pub_year >= firstpub) %>%
  # yearly counts where available
  left_join(pubs2, by = c("id", "pub_year")) %>%
  mutate(across(all_of(count_cols), ~ replace_na(.x, 0))) %>%
  # columns present in both pubs3 and pubs2 got .x/.y suffixes; strip ".x" so
  # the pubs3 versions carry the plain names
  rename_with(~ gsub(".x", "", .x, fixed = TRUE)) %>%
  dplyr::select(all_of(c(
    "id", "pub_year", "phd_year", "gender", "field", "ethnicity", "uni",
    count_cols, total_cols
  )))
saving
# Both files go to fsave's default location, with the date appended to the name.
fsave(data_ppf, "df_ppf") #the person period file dataset for the analysis
fsave(df_pubs2, "df_pubs") #publications included in the ppf (with all info on phds attached as well)
Mulders, Anne Maaike, Bas Hofstra, and Jochem Tolsma. 2024.
“A
Matter of Time? Gender and Ethnic Inequality in the Academic Publishing
Careers of Dutch PhDs.” Quantitative Science Studies,
May, 1–29.
https://doi.org/10.1162/qss_a_00306.
---
title: "Datawrangling"
bibliography: references.bib
---

<!---please be aware that caching large objects is problematic, hence cache.lazy=FALSE and you may need to turn cache=FALSE for chuncks in which you load large datasets --->

```{r, globalsettings, echo=FALSE, warning=FALSE}

library(knitr)
#library(rgl)
# Default chunk options for the whole document. Note eval = FALSE: a chunk is
# only run when knitting if it sets eval = TRUE itself.
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, eval = FALSE, message = FALSE,comment = "#>", cache=TRUE, results='hold', class.source=c("test"), class.output=c("test2"), cache.lazy = FALSE)
options(width = 100)
rgl::setupKnitr()

# Set the chunk options tab.cap.pre and tab.cap.sep to empty strings.
knitr::opts_chunk$set(tab.cap.pre = "", tab.cap.sep = "")

# Wrap `x` in an HTML <span> whose inline style sets the text colour.
colorize <- function(x, color) {
  span_template <- "<span style='color: %s;'>%s</span>"
  sprintf(span_template, color, x)
}

```

```{r klippy, echo=FALSE, include=TRUE, eval=TRUE}
# Call klippy with its position argument set to top right. The commented
# lines below show how to install klippy and two alternative calls.
klippy::klippy(position = c('top', 'right'))
# install.packages("remotes")
#remotes::install_github("rlesur/klippy")
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
```

------------------------------------------------------------------------

This lab journal is created for the paper 

**Tolsma, J., Hofstra, B. and Mulders, AM (2025). How COVID-19 Exacerbated Gender Inequalities in Dutch Academia. *Scientometrics*.**

In this file we show how we went from the raw data sets scraped from NARCIS to the working samples on which all analyses are based. Note that the raw data files are not shared, but please email jochem.tolsma@ru.nl if you are interested in working with these raw data files. The working sample(s) can be downloaded [here](https://coviddutchacademia.netlify.app/analysis).

------------------------------------------------------------------------

# Custom functions

```{r, results='hide', eval = TRUE}
rm(list = ls())

# Attach every package named in `packages`, installing any that are missing.
#
# packages: character vector of package names.
# Returns the list produced by lapply(): one element per package (NULL when
# the package could be attached straight away).
fpackage.check <- function(packages) {
  attach_or_install <- function(pkg) {
    available <- require(pkg, character.only = TRUE)
    if (!available) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
  lapply(packages, FUN = attach_or_install)
}

# Save an object to `<location><file>_<YYYYMMDD>.rda`.
#
# x:        object to save.
# file:     base name of the output file; defaults to the expression passed
#           as `x` (e.g. fsave(df_pubs) uses "df_pubs").
# location: directory prefix, pasted verbatim in front of `file`, so it
#           should end in a path separator. Created when it does not exist.
#
# The object is stored inside the .rda under the name `x`, not under its
# original name. Returns the path written to, invisibly.
fsave <- function(x, file = NULL, location = "./data/processed/") {
  if (is.null(file)) {
    file <- deparse(substitute(x))
  }
  # Create the requested directory itself (not a hard-coded data/processed),
  # so a non-default `location` works too. Trailing separators are stripped
  # because directory checks do not accept them on Windows.
  target_dir <- sub("[/\\\\]+$", "", location)
  if (nzchar(target_dir) && !dir.exists(target_dir)) {
    dir.create(target_dir, recursive = TRUE)
  }
  datename <- format(Sys.time(), "%Y%m%d")
  totalname <- paste0(location, file, "_", datename, ".rda")
  save(x, file = totalname)
  invisible(totalname)
}

# Load an .rda file that holds exactly one object and return that object,
# whatever name it was saved under.
#
# filename: path to the .rda file. (Calls written as fload(file = ...) still
#           reach this argument through R's partial argument matching.)
# Stops with an error when the file does not contain exactly one object.
fload <- function(filename) {
  # Load into a scratch environment so that a stored object called
  # "filename" cannot overwrite the argument.
  holder <- new.env(parent = emptyenv())
  loaded <- load(filename, envir = holder)
  if (length(loaded) != 1L) {
    stop("fload() expects exactly one object in '", filename, "' but found ",
      length(loaded), ".",
      call. = FALSE
    )
  }
  get(loaded, envir = holder, inherits = FALSE)
}

# Render a data frame as a striped, hoverable HTML table inside a scroll box
# (full width, 300px high). Extra arguments are forwarded to knitr::kable().
fshowdf <- function(x, ...) {
  html_table <- knitr::kable(x, digits = 2, "html", ...)
  styled_table <- kableExtra::kable_styling(
    html_table,
    bootstrap_options = c("striped", "hover")
  )
  kableExtra::scroll_box(styled_table, width = "100%", height = "300px")
}

# Wrap `x` in an HTML <span> whose inline style sets the text colour.
colorize <- function(x, color) {
  span_template <- "<span style='color: %s;'>%s</span>"
  sprintf(span_template, color, x)
}
```

------------------------------------------------------------------------

# Packages

```{r, results='hide', eval = TRUE}
# Packages used in this script; fpackage.check() attaches them and installs
# any that are missing.
packages <- c(
  "ggplot2", "tidyverse", "RColorBrewer", "dplyr",
  "stringdist", "stringi", "future.apply"
)

fpackage.check(packages)
```

------------------------------------------------------------------------

# Publications (part 1)

Scraped publications via NARCIS. These raw data sets are not shared publicly but please email jochem.tolsma@ru.nl if you want to work with these raw data. 

```{r}
# publications raw datafiles
# `filename` is fload()'s real argument name; `file =` only reached it through
# R's partial argument matching.
pubs1 <- fload(filename = "./data/pubs_meta_df.rda")
pubs2 <- fload(filename = "./data/pubs_meta_df_2020_2021.rda")
pubs3 <- fload(filename = "./data/pubs_meta_df_2020_2021_adden.rda")
pubs4 <- fload(filename = "./data/pubs_meta_df_2022_adden.rda")
```

```{r data}
# Different variables in each publication dataset, align this
# Seven columns are taken from each raw data set, by position.
# NOTE(review): positional selection silently picks other columns if a raw
# file's column order changes -- confirm against the column names.
 pubs1 <- pubs1[,c(1:4,7,231,13)]
pubs2 <- pubs2[,c(1:3,5,9,136, 16)]
 pubs3 <- pubs3[,c(1:3,5,9,141,16)]
 pubs4 <- pubs4[,c(1:4,6,113,12)]

# combine
# Stack the four aligned data sets into one data frame, `pubs`.
pubs12 <- rbind.data.frame(pubs1, pubs2)
pubs123 <- rbind.data.frame(pubs12, pubs3)
pubs <- rbind.data.frame(pubs123, pubs4)
```

Selection of publications

```{r }
table(pubs$Type, useNA = "always")
pubs$Type <- as.factor(pubs$Type)
# Publication types that are kept.
keep_types <- c(
  "Artikel", "Boek", "Boekdeel", "Conference Paper", "Conference Proceedings",
  "Conferentiebijdrage", "Patent", "Rapport", "Dataset", "Review"
)
# %in% never yields NA, so rows with a missing Type are dropped here instead
# of being turned into all-NA rows, which is what indexing with `==` did.
# The row count noted for the original `==` version was 845256.
pubs <- pubs[pubs$Type %in% keep_types, ]
```

Removing duplicates

```{r}
# Turn a title into a crude matching key: lower case, first 40 characters,
# all whitespace removed, typographic apostrophe (U+2019) replaced by "'".
# NOTE(review): the last four spacing fixes run after all whitespace has been
# removed, so they presumably never match -- confirm before dropping them.
normalise_title <- function(title) {
  key <- tolower(title)
  key <- str_sub(key, 1, 40)
  key <- str_replace_all(key, "\\s", "")
  key <- str_replace_all(key, intToUtf8(8217), intToUtf8(39))
  key <- str_replace_all(key, " :", ":")
  key <- str_replace_all(key, " \\(", "\\(")
  key <- str_replace_all(key, "\\( ", "\\(")
  key <- str_replace_all(key, "\\) ", "\\)")
  key <- str_replace_all(key, " \\)", "\\)")
  key
}

# Keep one row per person and title key; the untouched title stays available
# in Titel_ori.
df_pubs <- pubs %>%
  filter(!is.na(Type)) %>%
  mutate(Titel_ori = Titel, Titel = normalise_title(Titel)) %>%
  group_by(person_id) %>%
  distinct(Titel, .keep_all = TRUE) %>%
  ungroup()
```

Removing pubs without pubyear

```{r}
colnames(df_pubs) <- make.names(colnames(df_pubs), unique = TRUE)

# Publication year = first four characters of Date.issued, as a number.
# Rows for which that does not give a number are dropped.
df_pubs <- df_pubs %>%
  mutate(pub_year = as.numeric(substr(as.character(Date.issued), 1, 4))) %>%
  filter(!is.na(pub_year)) # 697695

```

Set pubs published in 2023 (16) to 2022

```{r}
# Publication years later than 2022 are recoded to 2022.
last_year <- 2022
df_pubs <- mutate(df_pubs, pub_year = ifelse(pub_year > last_year, last_year, pub_year))
```

```{r}
# Number of distinct title keys and of distinct persons in df_pubs.
length(unique(df_pubs$Titel)) #478761
length(unique(df_pubs$person_id)) #27285

```

Saving the data

```{r, eval=FALSE}
# Writes ./data/processed/df_covid_pubs_<YYYYMMDD>.rda (fsave appends the date).
fsave(df_pubs, file = "df_covid_pubs", location = "./data/processed/")
```

------------------------------------------------------------------------

# PhDs

We use three processed datasets.

-   `phds_variables.rda`: dataset of Dutch PhDs between 1990-2021

This data set has been prepared by Anne Maaike Mulders, see [@mulders2024]. 

-   `url_to_unique_id.rda`: used to match different scraped info from NARCIS and where different id variables are used.

Also load our publication dataset.

```{r }
# `filename` is fload()'s real argument name; `file =` only reached it through
# R's partial argument matching.
df_phd <- fload(filename = "./data/processed/phds_variables.rda")

# NOTE(review): fsave() writes "<file>_<date>.rda" (df_covid_pubs_<date>.rda),
# while this path has the date in front -- confirm it is the intended file.
df_pubs <- fload(filename = "./data/processed/20240701df_covid_pubs.rda")

df_id <- fload(filename = "./data/url_to_unique_id.rda")
```

how many unique phds do we start with

```{r}
# Number of distinct PhD ids, and the distribution of start_pub.
length(unique(df_phd$id))
table(df_phd$start_pub)
```

add diss_url variable to df_phd dataframe

```{r}
# Prefix the ids in df_id with "i" (presumably the id format used in df_phd --
# TODO confirm), then attach the df_id columns to the PhD data.
# Not idempotent: running the first line twice prefixes the ids twice.
df_id$id <- paste0("i", df_id$id)
df_phd2 <- left_join(df_phd, df_id, by=c("id")) #we created some duplicates
```

add df_phd to publications

```{r}
# Attach the PhD information to each publication: person_id in the
# publications is matched to person_id1 in df_phd2.
df_pubs2 <- left_join(df_pubs, df_phd2, by=c("person_id" = "person_id1")) #we created some duplicates
```

------------------------------------------------------------------------

# Publications (part 2)

## Delete publications without phd info

```{r}
# Keep publications that were matched to a PhD record (id present) and whose
# dupid flag equals 0.
df_pubs2 <- filter(df_pubs2, !is.na(id), dupid == 0) # 6277728
```

So we lost approximately 70,000 publications, probably of scholars who did publish but do not have a profile.

## Removing duplicates

This time use the DOI.

```{r}
# Within each person, flag every repeat of an already seen (non-missing) DOI
# and drop the flagged rows. Rows without a DOI are never flagged.
# ungroup() is added so that df_pubs2 does not stay grouped by person_id for
# everything that follows.
df_pubs2 <- df_pubs2 %>%
  group_by(person_id) %>%
  mutate(duplicateDOI = duplicated(DOI) & !is.na(DOI)) %>%
  ungroup() %>%
  filter(!duplicateDOI) # 624143

```

We are only interested in publications after the Phd.

```{r}
# Keep publications that appeared strictly after the PhD year.
df_pubs2 <- filter(df_pubs2, pub_year > phd_year) # 465283
```

We add three variables to the publications data:

-   first year of publication after their PhD\
-   last year of publication after their PhD\
-   how many publications written 1-3 years prior to covid

```{r}
# Per PhD (id): first and last publication year, and the number of
# publications with a publication year of 2017, 2018 or 2019.
flpubs <- df_pubs2 %>%
  mutate(covidpub = as.numeric(pub_year > 2016 & pub_year < 2020)) %>%
  group_by(id) %>%
  dplyr::summarize(
    firstpub = min(pub_year, na.rm = TRUE),
    lastpub = max(pub_year, na.rm = TRUE),
    covidpub = sum(covidpub)
  )

# Natural join: no `by` is given, so every column name shared by both tables
# is used as key (dplyr reports which ones).
# NOTE(review): presumably that is only `id` -- confirm, then state it.
df_pubs2 <- left_join(df_pubs2, flpubs)
```

## Remove publications and PhDs, given inclusion criteria

How many phds do we now have publications for:

```{r}
# Number of distinct PhDs that still have at least one publication row.
length(unique(df_pubs2$id)) #19368
```

Only select Phds who published at least one publication in the three years before COVID-19.

```{r}
# Keep PhDs with at least one publication in 2017-2019 (covidpub > 0).
df_pubs2 <- filter(df_pubs2, covidpub > 0)
length(unique(df_pubs2$id)) # 10001
```

And only select Phds who have a publication career of at least three years.

```{r}
# Keep PhDs whose last publication year is at least two years after their
# first one, i.e. a career spanning three calendar years or more.
df_pubs2 <- filter(df_pubs2, lastpub - firstpub > 1)
length(unique(df_pubs2$id)) # 8303
```

## Intermediate save to construct our research domain variable

select only papers published after 2014

```{r}
# Publications from 2015 onwards.
df_pubs_topic <- filter(df_pubs2, pub_year > 2014)
#fsave(df_pubs_topic, "df_pubs_topic")
```

Based on these pubs we determine the field in which the authors are active via OpenAlex. See [here](https://coviddutchacademia.netlify.app/oa_topics_fetch).

------------------------------------------------------------------------

# Author position of PhD in publication

## Prepare author-list of each publication

`r colorize("CAUTION: We use multisession.", "red")`

```{r}
plan(multisession, workers = 10) ## Parallelize

# Surname particles ("van de" etc.) that are stripped. They are applied in
# exactly this order, each removing only its first occurrence per name
# (sub, not gsub). "op den " is listed twice, as in the original sequence.
name_particles <- c(
  "'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ", "el- ",
  "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ", "van ",
  "van 't ", "van de ", "van der ", "van den ", "von der ", "op den ", "ul "
)

# make a list for each publication of authors, cleaned a bit
author_lists <- str_split(df_pubs2$Auteur, ";")
# lowercase
author_lists <- future_lapply(author_lists, tolower)
# remove diacritics
author_lists <- future_lapply(author_lists, stri_trans_general, id = "latin-ascii")
# remove first names: anything in parentheses, and everything after a comma
author_lists <- future_lapply(author_lists, sub, pattern = "\\(.*\\)", replacement = "")
author_lists <- future_lapply(author_lists, sub, pattern = ",.*", replacement = "")
# remove van de etc.
for (particle in name_particles) {
  author_lists <- future_lapply(author_lists, sub, pattern = particle, replacement = "")
}
# remove white spaces
author_lists <- future_lapply(author_lists, str_squish)
# remove initials: everything up to and including the last period
author_lists <- future_lapply(author_lists, gsub, pattern = ".*\\.", replacement = "")
# double names separated by a dash: everything from the first dash on is cut,
# so the FIRST part is kept (original note: maybe the last should be kept)
author_lists <- future_lapply(author_lists, sub, pattern = "\\-.*", replacement = "")
# remove white spaces again
author_lists <- future_lapply(author_lists, str_squish)
# double names separated by a space: keep the first part
pubsautors <- future_lapply(author_lists, gsub, pattern = " .*$", replacement = "")


```

## Author position

<!---would be great if we look together at the flow plan and the code--->

We determine the position of the PhD in the author-list of each publication.

We add two variables to the publication dataset:

-   `aut_max`: the number of authors of each publication\
-   `aut_pot`: the position of the PhD in the author list of the publication

**Flow plan:**

-   Step -1: check to see if we have a name.
    -   -1.1: No, **done**\
-   Step 0: check to see if it is a sole-authored paper.
    -   0.1: Yes, **done**\
    -   0.2: No\
-   Step 1: check to see if we have one or more exact matches.
    -   1.1: one exact match, determine position, **done**.\
    -   1.2: two or more exact matches, pick first, **done**.[^1]\
-   Step 2: check whether last name consists of one letter.
    -   2.1: Yes, set position to `NA`, **done**\
    -   2.2: No\
-   Step 3: Determine distance between last name and the names on the publication.
    -   3.1: No minimal distance observed, set position to `NA`, **done**\
    -   3.2: One minimum, set position, **done**.\
-   Step 4: when multiple minima, check to see if it is the same last name.
    -   4.1: Yes, pick first (arbitrary), **done**\
    -   4.2: No, set position to `NA`, **done**

[^1]: We pick the first because an inspection of Narcis seems to indicate that the lead author is at times additionally mentioned as last author.

```{r}
# Determine the position of one PhD in the author list of one publication,
# following steps 0-4 of the flow plan.
#
# lastname: last name of the PhD (length-one character vector).
# authors:  last names found on the publication, in author order.
#
# Returns a list with
#   pos:  position of the PhD in `authors`, or NA when it cannot be determined
#   code: label of the flow-plan step that produced `pos`
fauthor_position <- function(lastname, authors) {
  # step 0: sole-authored publication
  if (length(authors) == 1) {
    return(list(pos = 1, code = "(0) publication with 1 author"))
  }

  # step 1: exact match(es). which() lists the TRUE positions, so exact[1] is
  # the first matching author; which.min() on a logical vector would return
  # the first NON-matching author instead.
  exact <- which(authors %in% lastname)
  if (length(exact) == 1) {
    return(list(pos = exact, code = "(1.1) one exact match"))
  }
  if (length(exact) > 1) {
    return(list(pos = exact[1], code = "(1.2) >1 exact matches, position of 1st"))
  }

  # step 2: a one-character last name is too short for fuzzy matching
  if (nchar(lastname) == 1) {
    return(list(pos = NA, code = "(2) lastname just 1 character, set to NA"))
  }

  # step 3: sum of three string distances between the last name and each author
  naut1 <- stringdist(lastname, authors, method = "jaccard")
  naut2 <- stringdist(lastname, authors, method = "lv")
  # this one helps with double last names "tolsma berkhof"
  naut3 <- stringdist(lastname, authors, method = "jw", p = 0)
  naut <- naut1 + naut2 + naut3
  nearest <- min(naut)

  # step 3.1 (is.na() is also TRUE for NaN)
  if (is.na(nearest)) {
    return(list(pos = NA, code = "(3.1) no minimum via matching, set to NA"))
  }

  # step 3.2
  minima <- which(naut == nearest)
  if (length(minima) == 1) {
    return(list(pos = minima, code = "(3.2) 1 minimum via matching"))
  }

  # step 4: several equally close names. Only accept when ALL of them are the
  # same last name (not just the first two); then take the first occurrence.
  if (length(unique(authors[minima])) == 1) {
    return(list(
      pos = minima[1],
      code = "(4.1) >1 minima after matching of same lastname, position of 1st"
    ))
  }
  list(
    pos = NA,
    code = "(4.2) >1 minima after matching of different lastnames, set to NA"
  )
}

# Row numbers of publications whose matching raised a warning (the leading NA
# is the initial placeholder).
warnings <- NA
aut_pot <- aut_max <- code <- rep(NA, length(df_pubs2$lastname))

table(is.na(df_pubs2$Auteur), useNA = "always")

#loop over each publication
for (i in seq_along(df_pubs2$lastname)) {
  #step -1: no author string available for this publication
  if (is.na(df_pubs2$Auteur[i])) {
    code[i] <- "(-1) no name" #probably have to set to 0 later
    next
  }
  aut_max[i] <- length(pubsautors[[i]]) #number of authors

  # A warning or error leaves aut_pot[i] and code[i] at NA for this row.
  res <- tryCatch(
    fauthor_position(df_pubs2$lastname[i], pubsautors[[i]]),
    error = function(e) {
      # report instead of silently dropping the publication
      message("publication ", i, ": ", conditionMessage(e))
      NULL
    },
    warning = function(w) {
      print(i)
      w
    }
  )

  if (inherits(res, "warning")) {
    # recorded here, in the loop's own scope, so the index is actually kept
    warnings <- append(warnings, i)
    next
  }
  if (is.null(res)) {
    next
  }

  aut_pot[i] <- res$pos
  code[i] <- res$code
}

table(code, useNA = "always")
table(aut_pot, useNA = "always")
table(aut_max, useNA = "always")

df_pubs2$aut_pot <- aut_pot
df_pubs2$aut_max <- aut_max
df_pubs2$aut_code <- code



```

This allows us to experiment later by filtering on specific codes, or by recoding the `NA`s of the author position via the code variable; for example, by treating all publications without author names as first-author publications.

------------------------------------------------------------------------

# person period file

## pubs per year per author

Per author and publication year (and over the full career), I count:

-   the total number of publications\
-   all first-authored publications (including sole-authored publications)
-   all last-authored publications (excluding sole-authored publications)\
-   all sole-authored publications\
-   all first-authored articles (including sole-authored articles)
-   all last-authored articles (excluding sole-authored articles)\
-   all sole-authored articles

```{r, echo = FALSE}
# Frequency tables (missing values shown as their own category) of the gender
# and ethnicity variables in df_pubs2. `gender_2` and `ethnicity3` are the
# versions carried into the aggregated files further down.
table(df_pubs2$gender, useNA="always")
table(df_pubs2$gender_2, useNA="always")
table(df_pubs2$ethnicity, useNA="always")
table(df_pubs2$ethnicity2, useNA="always")
table(df_pubs2$ethnicity3, useNA="always")
```

```{r}
# Publication-level 0/1 indicators. They are NA where the author position or
# the number of authors is unknown; those NAs are ignored in the sums below.
pubs <- df_pubs2 %>%
  mutate(
    pub_a = as.numeric(Type == "Artikel"),
    pub_first = as.numeric(aut_pot == 1),
    # last author only counts when the PhD is not also the first author
    pub_last = as.numeric(aut_pot == aut_max & aut_pot != 1),
    pub_sole = as.numeric(aut_max == 1),
    pub_first_a = as.numeric(aut_pot == 1 & Type == "Artikel"),
    pub_last_a = as.numeric(aut_pot == aut_max & aut_pot != 1 & Type == "Artikel"),
    pub_sole_a = as.numeric(aut_max == 1 & Type == "Artikel")
  )

# Collapse a grouped publication data set to one row per group: publication
# counts plus the (first observed) PhD characteristics. The result is
# ungrouped; the grouping columns themselves are kept automatically.
fcount_pubs <- function(grouped_pubs) {
  grouped_pubs %>%
    summarize(
      npubs = n(),
      npubs_a = sum(pub_a, na.rm = TRUE),
      firstpub = first(firstpub),
      lastpub = first(lastpub),
      npubs_first = sum(pub_first, na.rm = TRUE),
      npubs_last = sum(pub_last, na.rm = TRUE),
      npubs_sole = sum(pub_sole, na.rm = TRUE),
      npubs_first_a = sum(pub_first_a, na.rm = TRUE),
      npubs_last_a = sum(pub_last_a, na.rm = TRUE),
      npubs_sole_a = sum(pub_sole_a, na.rm = TRUE),
      phd_year = first(phd_year),
      gender = first(gender_2),
      field = first(field),
      ethnicity = first(ethnicity3),
      uni = first(uni),
      .groups = "drop"
    )
}

#aggregate by id and pub_year. this is what we want in the person period file.
pubs2 <- pubs %>%
  group_by(id, pub_year) %>%
  fcount_pubs()

#we aggregate also by id. this will help to find us the total publications over the career. but also to fill in the empty rows of the df_ppp later on.
# Career totals get the suffix "_t" so they can sit next to the yearly counts.
pubs3 <- pubs %>%
  group_by(id) %>%
  fcount_pubs() %>%
  rename_with(~ paste0(.x, "_t"), starts_with("npubs"))


```

## empty ppf

```{r}
#first make an empty dataset

# Skeleton of the person-period file: one row for every PhD (id) in every
# calendar year of the observation window, sorted by id and year.
years <- 1990:2022
ids <- unique(df_pubs2$id)

df <- data.frame(
  id = rep(ids, each = length(years)),
  pub_year = rep(years, times = length(ids))
)
df <- arrange(df, id, pub_year)
```

## fill the ppf

```{r}

# Yearly count columns taken from pubs2. Person-years without any publication
# have no matching row in pubs2, so their counts are set to 0 after the join.
year_counts <- c(
  "npubs", "npubs_a", "npubs_first", "npubs_last", "npubs_sole",
  "npubs_first_a", "npubs_last_a", "npubs_sole_a"
)

data_ppf <- df %>%
  # PhD characteristics and career totals (one row per id)
  left_join(pubs3, by = "id") %>%
  # the observation window of a PhD starts in the year of the first publication
  filter(pub_year >= firstpub) %>%
  # Join only the yearly counts: the PhD characteristics already come from
  # pubs3, so no duplicated ".x"/".y" columns are created.
  left_join(
    dplyr::select(ungroup(pubs2), all_of(c("id", "pub_year", year_counts))),
    by = c("id", "pub_year")
  ) %>%
  mutate(across(all_of(year_counts), ~ replace_na(.x, 0))) %>%
  dplyr::select(all_of(c(
    "id", "pub_year", "phd_year", "gender", "field", "ethnicity", "uni",
    year_counts,
    paste0(year_counts, "_t")
  )))

```

## saving

```{r}
# Write both working samples with fsave(), which by default saves to
# ./data/processed/<name>_<yyyymmdd>.rda.
# NOTE(review): fsave() stores the object under the name `x` inside the .rda
# file, so reload with fload() rather than a bare load().
fsave(data_ppf, "df_ppf") #the person period file dataset for the analysis
fsave(df_pubs2, "df_pubs") #publications included in the ppf (with all info on phds attached as well)
```

------------------------------------------------------------------------

Copyright © 2024- Jochem Tolsma