• 1 Custom functions
  • 2 Packages
  • 3 Publications (part 1)
  • 4 PhDs
  • 5 Publications (part 2)
    • 5.1 Delete publications without phd info
    • 5.2 Removing duplicates
    • 5.3 Remove publications and PhDs, given inclusion criteria
    • 5.4 Intermediate save to construct our research domain variable
  • 6 Author position of PhD in publication
    • 6.1 Prepare author-list of each publication
    • 6.2 Author position
  • 7 person period file
    • 7.1 pubs per year per author
    • 7.2 empty ppf
    • 7.3 fill the ppf
    • 7.4 saving

This lab journal is created for the paper

Tolsma, J., Hofstra, B. and Mulders, AM (2025). How COVID-19 Exacerbated Gender Inequalities in Dutch Academia. Scientometrics.

In this file we show how we went from the raw data sets scraped from NARCIS to the working samples on which all analyses are based. Note that the raw data files are not shared, but please email us if you are interested in working with these raw data files. The working sample(s) can be downloaded here.


1 Custom functions

rm(list = ls())

# Attach every package named in `packages`; a package that cannot be attached
# is installed (with its dependencies) and then attached.
# Returns the list produced by lapply(), one element per package.
fpackage.check <- function(packages) {
    attach_or_install <- function(pkg) {
        is_attached <- require(pkg, character.only = TRUE)
        if (!is_attached) {
            install.packages(pkg, dependencies = TRUE)
            library(pkg, character.only = TRUE)
        }
    }
    lapply(packages, FUN = attach_or_install)
}

# Save `x` as "<location><file>_<YYYYMMDD>.rda".
#
# x:        object to store. It is written under the name `x`, so a plain
#           load() of the file restores a variable called `x`, not the
#           original variable name.
# file:     base name of the output file; defaults to the expression that was
#           passed as `x`.
# location: output directory, including the trailing slash. It is created
#           (together with missing parent directories) when it does not exist.
# Returns the path of the written file, invisibly.
fsave <- function(x, file = NULL, location = "./data/processed/") {
    # Create the directory that is actually written to instead of always
    # creating the hard-coded data/processed.
    out_dir <- sub("[/\\\\]+$", "", location)
    if (nzchar(out_dir) && !dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }
    if (is.null(file)) {
        file <- deparse(substitute(x))
    }
    datename <- format(Sys.time(), "%Y%m%d")
    totalname <- paste0(location, file, "_", datename, ".rda")
    save(x, file = totalname)
    invisible(totalname)
}

# Load an .rda file and return the object stored in it.
#
# filename: path to a file written with save() (for example by fsave()).
# Returns the stored object. When the file holds several objects the first one
# is returned and a warning is raised; an empty file is an error.
fload <- function(filename) {
    # Load into a private environment: a stored object that happens to be
    # called `filename` can then neither overwrite the argument nor be
    # filtered out by name.
    store <- new.env(parent = emptyenv())
    loaded <- load(filename, envir = store)
    if (length(loaded) == 0L) {
        stop("no objects found in ", filename, call. = FALSE)
    }
    if (length(loaded) > 1L) {
        warning(filename, " contains ", length(loaded), " objects; returning '",
            loaded[1L], "'", call. = FALSE)
    }
    get(loaded[1L], envir = store, inherits = FALSE)
}

# Render `x` as an HTML table (numbers rounded to 2 digits) with striped/hover
# styling inside a scroll box of full width and 300px height.
# `...` is forwarded to knitr::kable().
fshowdf <- function(x, ...) {
    html_table <- knitr::kable(x, digits = 2, "html", ...)
    styled_table <- kableExtra::kable_styling(html_table, bootstrap_options = c("striped", "hover"))
    kableExtra::scroll_box(styled_table, width = "100%", height = "300px")
}

# Wrap `x` in an HTML <span> that sets the text colour to `color`.
colorize <- function(x, color) {
    span_template <- "<span style='color: %s;'>%s</span>"
    sprintf(span_template, color, x)
}

2 Packages

# Packages used in this script; attached (and installed when missing) below.
packages <- c(
    "ggplot2", "tidyverse", "RColorBrewer", "dplyr",
    "stringdist", "stringi", "future.apply"
)

fpackage.check(packages)

3 Publications (part 1)

Scraped publications via NARCIS. These raw data sets are not shared publicly but please email if you want to work with these raw data.

# Raw publication files (one per scraping round); the argument of fload() is
# spelled out in full rather than relying on partial matching of `file`.
pubs1 <- fload(filename = "./data/pubs_meta_df.rda")
pubs2 <- fload(filename = "./data/pubs_meta_df_2020_2021.rda")
pubs3 <- fload(filename = "./data/pubs_meta_df_2020_2021_adden.rda")
pubs4 <- fload(filename = "./data/pubs_meta_df_2022_adden.rda")

# The files have different layouts: keep seven columns per file, selected by
# position, so that the four data sets line up.
pubs1 <- pubs1[, c(1:4, 7, 231, 13)]
pubs2 <- pubs2[, c(1:3, 5, 9, 136, 16)]
pubs3 <- pubs3[, c(1:3, 5, 9, 141, 16)]
pubs4 <- pubs4[, c(1:4, 6, 113, 12)]

# Stack the files one after another.
pubs12 <- rbind.data.frame(pubs1, pubs2)
pubs123 <- rbind.data.frame(pubs12, pubs3)
pubs <- rbind.data.frame(pubs123, pubs4)

Selection of publications

# Distribution of publication types in the raw data, missing values included.
table(pubs$Type, useNA = "always")
pubs$Type <- as.factor(pubs$Type)

# Publication types that are kept.
keep_types <- c("Artikel", "Boek", "Boekdeel", "Conference Paper", "Conference Proceedings",
    "Conferentiebijdrage", "Patent", "Rapport", "Dataset", "Review")

# %in% never returns NA, so rows with a missing Type are dropped here. The
# former chain of `==` comparisons turned such rows into all-NA rows.
# The authors recorded 845256 rows with the `==` chain.
pubs <- pubs[pubs$Type %in% keep_types, ]

Removing duplicates

# Drop rows without a publication type, build a normalised title key and keep
# one row per person and title key. The untouched title is kept in Titel_ori.
df_pubs <- pubs %>%
    filter(!is.na(Type)) %>%
    mutate(
        Titel_ori = Titel,
        Titel = Titel %>%
            tolower() %>%
            str_sub(1, 40) %>%
            str_replace_all("\\s", "") %>%
            # typographic apostrophe (U+2019) -> plain apostrophe (U+0027)
            str_replace_all(intToUtf8(8217), intToUtf8(39)) %>%
            # NOTE(review): all whitespace has been removed above, so the
            # space-containing patterns below are not expected to match
            str_replace_all(" :", ":") %>%
            str_replace_all(" \\(", "\\(") %>%
            str_replace_all("\\( ", "\\(") %>%
            str_replace_all("\\) ", "\\)") %>%
            str_replace_all(" \\)", "\\)")
    ) %>%
    group_by(person_id) %>%
    distinct(Titel, .keep_all = TRUE) %>%
    ungroup()

Removing pubs without pubyear

# Make the column names syntactically valid R names (the code below refers to
# the column `Date.issued`).
colnames(df_pubs) <- make.names(colnames(df_pubs), unique = TRUE)

# Publication year = first four characters of Date.issued, as a number.
df_pubs <- df_pubs %>%
    mutate(pub_year = as.numeric(substr(as.character(Date.issued), 1, 4)))

# Drop publications for which no year could be derived.
df_pubs <- df_pubs %>%
    filter(!is.na(pub_year))  #697695

Set pubs published in 2023 (16) to 2022

# Cap the publication year at 2022 (missing years were removed before).
df_pubs <- df_pubs %>%
    mutate(pub_year = pmin(pub_year, 2022))
length(unique(df_pubs$Titel))  #478761
length(unique(df_pubs$person_id))  #27285

Saving the data

fsave(df_pubs, file = "df_covid_pubs", location = "./data/processed/")

4 PhDs

We use three processed datasets.

  • phds_variables.rda: dataset of Dutch PhDs between 1990-2021

This data set has been prepared by Anne Maaike Mulders, see (Mulders, Hofstra, and Tolsma 2024).

  • url_to_unique_id.rda: used to match different scraped info from NARCIS and where different id variables are used.

Also load our publication dataset.

# PhD data set (see the text above for its provenance).
df_phd <- fload(filename = "./data/processed/phds_variables.rda")

# NOTE(review): fsave() in this file writes "<file>_<date>.rda", which gives
# "df_covid_pubs_<date>.rda"; the name loaded here puts the date first.
# Confirm that the file exists under this name.
df_pubs <- fload(filename = "./data/processed/20240701df_covid_pubs.rda")

# Lookup table used to match the different id variables.
df_id <- fload(filename = "./data/url_to_unique_id.rda")

how many unique phds do we start with

length(unique(df_phd$id))
table(df_phd$start_pub)

add diss_url variable to df_phd dataframe

# Prefix the ids in df_id with "i" before joining on `id` (presumably to match
# the id format used in df_phd -- confirm).
df_id$id <- paste0("i", df_id$id)
df_phd2 <- df_phd %>%
    left_join(df_id, by = "id")  #we created some duplicates

add df_phd to publications

# Attach the PhD information to every publication of the matching person.
df_pubs2 <- df_pubs %>%
    left_join(df_phd2, by = c(person_id = "person_id1"))  #we created some duplicates

5 Publications (part 2)

5.1 Delete publications without phd info

# Keep publications that matched a PhD (non-missing id) and have dupid == 0
# (presumably the "not a duplicate" flag -- confirm).
# Count recorded by the authors: 6277728 (presumably a typo for 627728, given
# the 697695 rows before and the ~70,000 rows reported lost -- confirm).
df_pubs2 <- df_pubs2 %>%
    filter(!is.na(id), dupid == 0)

So we lost approximately 70,000 publications, probably of scholars who did publish but do not have a profile.

5.2 Removing duplicates

This time use the DOI.

# Within each person, flag every repeated occurrence of a non-missing DOI and
# drop the flagged rows; rows without a DOI are always kept.
# The grouping is removed afterwards so that later steps (and the saved data)
# do not silently stay grouped by person_id.
df_pubs2 <- df_pubs2 %>%
    group_by(person_id) %>%
    mutate(duplicateDOI = duplicated(DOI) & !is.na(DOI)) %>%
    filter(!duplicateDOI) %>%
    ungroup()  #624143

We are only interested in publications after the Phd.

# Keep publications from years strictly after the PhD year.
df_pubs2 <- filter(df_pubs2, pub_year > phd_year)  #465283

We add three variables to the publications data:

  • first year of publication after their PhD
  • last year of publication after their PhD
  • how many publications written 1-3 years prior to covid
# Per PhD (id): first and last publication year and the number of
# publications in 2017-2019.
flpubs <- df_pubs2 %>%
    mutate(covidpub = as.numeric(pub_year > 2016 & pub_year < 2020)) %>%
    group_by(id) %>%
    dplyr::summarize(
        firstpub = min(pub_year, na.rm = TRUE),
        lastpub = max(pub_year, na.rm = TRUE),
        covidpub = sum(covidpub)
    )

# Add the three summaries to every publication row (joined on the columns the
# two tables have in common).
df_pubs2 <- left_join(df_pubs2, flpubs)

5.3 Remove publications and PhDs, given inclusion criteria

How many phds do we now have publications for:

length(unique(df_pubs2$id))  #19368

Only select Phds who published at least one publication in the three years before COVID-19.

# Keep PhDs with at least one publication in 2017-2019.
df_pubs2 <- filter(df_pubs2, covidpub > 0)
length(unique(df_pubs2$id))  #10001

And only select Phds who have a publication career of at least three years.

# Keep PhDs whose last publication year is more than one year after the first.
df_pubs2 <- filter(df_pubs2, lastpub > firstpub + 1)
length(unique(df_pubs2$id))  #8303

5.4 Intermediate save to construct our research domain variable

select only papers published after 2014

# Publications from 2015 onwards, used to construct the research-domain variable.
df_pubs_topic <- filter(df_pubs2, pub_year > 2014)
# fsave(df_pubs_topic, 'df_pubs_topic')

Based on these pubs we determine the field in which the authors are active via OpenAlex. See here.


6 Author position of PhD in publication

6.1 Prepare author-list of each publication

CAUTION: We use multisession.

plan(multisession, workers = 10)  ## Parallelize

# Clean the author names of ONE publication (a character vector with one
# element per author). The steps and their order are those of the original
# chain of separate passes; they are composed here so that the list of
# publications is traversed only once.
fclean_authors <- function(authors) {
    # name prefixes ("van de" etc.), removed one after another in this order;
    # each entry is used as a regular expression and only its first match
    # within a name is removed
    prefixes <- c("'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ", "el- ",
        "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ", "van ", "van 't ",
        "van de ", "van der ", "van den ", "von der ", "op den ", "ul ")

    # lowercase
    authors <- tolower(authors)
    # remove diacritics
    authors <- stringi::stri_trans_general(authors, id = "latin-ascii")
    # remove first names: anything in parentheses and everything after a comma
    authors <- sub("\\(.*\\)", "", authors)
    authors <- sub(",.*", "", authors)
    for (prefix in prefixes) {
        authors <- sub(prefix, "", authors)
    }
    # remove white spaces
    authors <- stringr::str_squish(authors)
    # remove initials: everything up to and including the last period
    authors <- gsub(".*\\.", "", authors)
    # double names separated by a dash: keep the part before the first dash
    # (the authors noted that keeping the last part may be preferable)
    authors <- sub("\\-.*", "", authors)
    # remove white spaces again
    authors <- stringr::str_squish(authors)
    # double names separated by a space: keep the first part
    gsub(" .*$", "", authors)
}

# one cleaned author vector per publication
pubsautors <- future_lapply(str_split(df_pubs2$Auteur, ";"), fclean_authors)

6.2 Author position

We determine the position of the PhD in the author-list of each publication.

We add two variables to the publication dataset pubs:

  • aut_max: the number of authors of each publication
  • aut_pos: the position of the PhD in the publication

Flow plan:

  • step -1: check to see if we have a name.
    • 1.1: No, done
  • Stap 0: check to see if it is a sole-authored paper.
    • 0.1: Yes, done
    • 0.2: No
  • Stap 1: check to see if we have one or more exact matches.
    • 1.1: one exact match, determine position, done.
    • 1.2: two or more exact matches, pick first, done.1
  • Stap 2: check whether last name consists of one letter.
    • 2.1: Yes, set position to NA, done
    • 2.2 No
  • Stap 3: Determine distance between last name and the names on the publication.
    • 3.1: No minimal distance observed, set position to NA, done
    • 3.2: One minimum, set position, done.
  • Stap 4: when multiple minima, check to see if it is the same last name.
    • 4.1: Yes, pick first (arbitrary), done
    • 4.2: No, set position to NA, done
# Determine, for every publication row, the position of the PhD in the author
# list.
#
# Inputs (created earlier in the script):
#   df_pubs2$Auteur    raw author string of the publication (NA when missing)
#   df_pubs2$lastname  last name that is looked up in the author list
#   pubsautors[[i]]    cleaned author names of publication i
# Results, one element per publication:
#   aut_max   number of authors
#   aut_pot   position of the name among the authors (NA when undetermined)
#   code      label of the rule that decided aut_pot
# Bookkeeping (the first element of both vectors is the initial NA):
#   warnings  indices of iterations that were aborted by a warning
#   errors    indices of iterations that were aborted by an error
warnings <- NA
errors <- NA
aut_pot <- aut_max <- code <- rep(NA, length(df_pubs2$lastname))

table(is.na(df_pubs2$Auteur), useNA = "always")

# loop over each publication
for (i in seq_along(df_pubs2$lastname)) {
    # An assignment inside a handler function only changes a variable local
    # to that handler. The handlers therefore just report which kind of
    # condition occurred; the index is recorded after tryCatch() returns.
    # Iterations that leave through `next` never reach that bookkeeping.
    condition <- tryCatch({
        # step -1
        if (is.na(df_pubs2$Auteur[i])) {
            code[i] <- "(-1) no name"  #probably have to set to 0 later
            next
        }
        aut_max[i] <- length(pubsautors[[i]])  #number of authors
        # step 0:
        if (aut_max[i] == 1) {
            aut_pot[i] <- 1
            code[i] <- "(0) publication with 1 author"
            next
        }
        # step 1:
        exact <- pubsautors[[i]] %in% df_pubs2$lastname[i]
        if (any(exact)) {
            # Position of the first exact match. which.min() must not be used
            # here: on a logical vector it returns the first FALSE, i.e. the
            # first author who does NOT match.
            aut_pot[i] <- which(exact)[1]
            if (sum(exact) == 1) {
                code[i] <- "(1.1) one exact match"
            } else {
                code[i] <- "(1.2) >1 exact matches, position of 1st"
            }
            next
        }
        # step 2:
        if (nchar(df_pubs2$lastname[i]) == 1) {
            aut_pot[i] <- NA
            code[i] <- "(2) lastname just 1 character, set to NA"
            next
        }
        # step 3: sum of three string distances between the name and every author
        naut1 <- stringdist(df_pubs2$lastname[i], pubsautors[[i]], method = "jaccard")
        naut2 <- stringdist(df_pubs2$lastname[i], pubsautors[[i]], method = "lv")
        naut3 <- stringdist(df_pubs2$lastname[i], pubsautors[[i]], method = "jw", p = 0)  #this one helps with double last names 'tolsma berkhof'
        naut <- naut1 + naut2 + naut3
        # step 3.1
        if (is.na(min(naut)) || is.nan(min(naut))) {
            aut_pot[i] <- NA
            code[i] <- "(3.1) no minimum via matching, set to NA"
            next
        }
        # positions of the author(s) at minimal distance
        closest <- which(naut == min(naut))
        # step 3.2
        if (length(closest) == 1) {
            aut_pot[i] <- closest
            code[i] <- "(3.2) 1 minimum via matching"
            next
        }
        # step 4.1: several minima; accept only when the first two are the same name
        if (pubsautors[[i]][closest[1]] == pubsautors[[i]][closest[2]]) {
            aut_pot[i] <- closest[1]  #take the first, use this one to run everything.
            code[i] <- "(4.1) >1 minima after matching of same lastname, position of 1st"
        } else {
            aut_pot[i] <- NA
            code[i] <- "(4.2) >1 minima after matching of different lastnames, set to NA"
        }
        NULL
    }, error = function(e) "error", warning = function(w) "warning")

    if (identical(condition, "warning")) {
        print(i)
        warnings <- append(warnings, i)
    } else if (identical(condition, "error")) {
        # errors used to be discarded silently; keep the index for inspection
        errors <- append(errors, i)
    }
}

# Frequencies of the decision rule, the author position and the number of
# authors (missing values included).
table(code, useNA = "always")
table(aut_pot, useNA = "always")
table(aut_max, useNA = "always")

# Store the results in the publication data as aut_pot, aut_max and aut_code.
df_pubs2$aut_pot <- aut_pot
df_pubs2$aut_max <- aut_max
df_pubs2$aut_code <- code

So we can play a little later by filtering on specific codes. or by recoding the NAs of author positions via the code variable. For example, setting all publications without author names as first author publications. etc.


7 person period file

7.1 pubs per year per author

I count the:

  • total number of publications
  • all first authored publications (inclusive sole authored publications)
  • all last authored publications (exclusive sole authored publications)
  • all sole authored publications
  • all first authored articles (inclusive sole authored articles)
  • all last authored articles (exclusive sole authored articles)
  • all sole authored articles
# 0/1 indicators per publication: article, first-authored, last-authored
# (only when the PhD is not also first), sole-authored, and the same three
# position indicators restricted to articles.
pubs <- df_pubs2 %>%
    mutate(
        pub_a = as.numeric(Type == "Artikel"),
        pub_first = as.numeric(aut_pot == 1),
        pub_last = as.numeric(aut_pot == aut_max & aut_pot != 1),
        pub_sole = as.numeric(aut_max == 1),
        pub_first_a = as.numeric(aut_pot == 1 & Type == "Artikel"),
        pub_last_a = as.numeric(aut_pot == aut_max & aut_pot != 1 & Type == "Artikel"),
        pub_sole_a = as.numeric(aut_max == 1 & Type == "Artikel")
    )


# Counts per PhD (id) and publication year: the cells of the person period file.
pubs2 <- pubs %>%
    group_by(id, pub_year) %>%
    summarize(
        npubs = n(),
        npubs_a = sum(pub_a, na.rm = TRUE),
        firstpub = first(firstpub),
        lastpub = first(lastpub),
        npubs_first = sum(pub_first, na.rm = TRUE),
        npubs_last = sum(pub_last, na.rm = TRUE),
        npubs_sole = sum(pub_sole, na.rm = TRUE),
        npubs_first_a = sum(pub_first_a, na.rm = TRUE),
        npubs_last_a = sum(pub_last_a, na.rm = TRUE),
        npubs_sole_a = sum(pub_sole_a, na.rm = TRUE),
        phd_year = first(phd_year),
        gender = first(gender_2),
        field = first(field),
        ethnicity = first(ethnicity3),
        uni = first(uni),
        id = first(id)
    )

# Counts per PhD (id) over all years (suffix _t). These totals are also used
# to fill the years without publications in the person period file.
pubs3 <- pubs %>%
    group_by(id) %>%
    summarize(
        npubs_t = n(),
        npubs_a_t = sum(pub_a, na.rm = TRUE),
        firstpub = first(firstpub),
        lastpub = first(lastpub),
        npubs_first_t = sum(pub_first, na.rm = TRUE),
        npubs_last_t = sum(pub_last, na.rm = TRUE),
        npubs_sole_t = sum(pub_sole, na.rm = TRUE),
        npubs_first_a_t = sum(pub_first_a, na.rm = TRUE),
        npubs_last_a_t = sum(pub_last_a, na.rm = TRUE),
        npubs_sole_a_t = sum(pub_sole_a, na.rm = TRUE),
        phd_year = first(phd_year),
        gender = first(gender_2),
        field = first(field),
        ethnicity = first(ethnicity3),
        uni = first(uni),
        id = first(id)
    )

7.2 empty ppf

# Skeleton of the person period file: one row for every PhD (id) and every
# year from 1990 to 2022.

nid <- length(unique(df_pubs2$id))
pub_year <- rep(1990:2022, times = nid)
npubs_zero <- rep(0, times = length(pub_year))
id <- rep(unique(df_pubs2$id), each = length(1990:2022))

df <- arrange(data.frame(id, pub_year), id, pub_year)

7.3 fill the ppf

# Fill the skeleton: add the career totals per PhD, drop the years before the
# first publication, add the yearly counts and set the yearly counts of years
# without publications to 0. Columns that occur in both joined tables get the
# suffix ".x" for the career-level copy; that suffix is stripped again.
data_ppf <- df %>%
    left_join(pubs3, by = "id") %>%
    filter(pub_year >= firstpub) %>%
    left_join(pubs2, by = c("id", "pub_year")) %>%
    mutate(across(
        c(npubs, npubs_a, npubs_first, npubs_last, npubs_sole, npubs_first_a, npubs_last_a, npubs_sole_a),
        ~replace_na(.x, 0)
    )) %>%
    rename_with(~gsub(".x", "", .x, fixed = TRUE)) %>%
    dplyr::select(c(
        "id", "pub_year", "phd_year", "gender", "field", "ethnicity", "uni",
        "npubs", "npubs_a", "npubs_first", "npubs_last", "npubs_sole",
        "npubs_first_a", "npubs_last_a", "npubs_sole_a",
        "npubs_t", "npubs_a_t", "npubs_first_t", "npubs_last_t", "npubs_sole_t",
        "npubs_first_a_t", "npubs_last_a_t", "npubs_sole_a_t"
    ))

7.4 saving

fsave(data_ppf, "df_ppf")  #the person period file dataset for the analysis
fsave(df_pubs2, "df_pubs")  #publications included in the ppf (with all info on phds attached as well)

Mulders, Anne Maaike, Bas Hofstra, and Jochem Tolsma. 2024. “A Matter of Time? Gender and Ethnic Inequality in the Academic Publishing Careers of Dutch PhDs.” Quantitative Science Studies, May, 1–29. https://doi.org/10.1162/qss_a_00306.

  1. We pick the first because an inspection of Narcis seems to indicate that the lead author is at times additionally mentioned as last author.↩︎

---
title: "Datawrangling"
bibliography: references.bib
---

<!---please be aware that caching large objects is problematic, hence cache.lazy=FALSE, and you may need to set cache=FALSE for chunks in which you load large datasets --->

```{r, globalsettings, echo=FALSE, warning=FALSE}

library(knitr)
# rgl is not attached; its knitr hook is set up through the namespace below
#library(rgl)
# Defaults for all chunks: code is tidied (width cutoff 100), warnings and
# messages are hidden, results are held until the end of a chunk, chunks are
# cached without lazy loading, and chunks are NOT evaluated unless a chunk
# sets eval = TRUE itself.
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, eval = FALSE, message = FALSE,comment = "#>", cache=TRUE, results='hold', class.source=c("test"), class.output=c("test2"), cache.lazy = FALSE)
# console output width used when printing
options(width = 100)
rgl::setupKnitr()

# empty table-caption prefix and separator
knitr::opts_chunk$set(tab.cap.pre = "", tab.cap.sep = "")

# Wrap `x` in an HTML <span> that sets the text colour to `color`.
colorize <- function(x, color) {
    span_template <- "<span style='color: %s;'>%s</span>"
    sprintf(span_template, color, x)
}

```

```{r klippy, echo=FALSE, include=TRUE, eval=TRUE}
klippy::klippy(position = c('top', 'right'))
# install.packages("remotes")
#remotes::install_github("rlesur/klippy")
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
```

------------------------------------------------------------------------

This lab journal is created for the paper 

**Tolsma, J., Hofstra, B. and Mulders, AM (2025). How COVID-19 Exacerbated Gender Inequalities in Dutch Academia. *Scientometrics*.**

In this file we show how we went from the raw data sets scraped from NARCIS to the working samples on which all analyses are based. Note that the raw data files are not shared, but please email jochem.tolsma@ru.nl if you are interested in working with these raw data files. The working sample(s) can be downloaded [here](https://coviddutchacademia.netlify.app/analysis).

------------------------------------------------------------------------

# Custom functions

```{r, results='hide', eval = TRUE}
rm(list = ls())

# Attach every package named in `packages`; a package that cannot be attached
# is installed (with its dependencies) and then attached.
# Returns the list produced by lapply(), one element per package.
fpackage.check <- function(packages) {
    attach_or_install <- function(pkg) {
        is_attached <- require(pkg, character.only = TRUE)
        if (!is_attached) {
            install.packages(pkg, dependencies = TRUE)
            library(pkg, character.only = TRUE)
        }
    }
    lapply(packages, FUN = attach_or_install)
}

# Save `x` as "<location><file>_<YYYYMMDD>.rda".
#
# x:        object to store. It is written under the name `x`, so a plain
#           load() of the file restores a variable called `x`, not the
#           original variable name.
# file:     base name of the output file; defaults to the expression that was
#           passed as `x`.
# location: output directory, including the trailing slash. It is created
#           (together with missing parent directories) when it does not exist.
# Returns the path of the written file, invisibly.
fsave <- function(x, file = NULL, location = "./data/processed/") {
    # Create the directory that is actually written to instead of always
    # creating the hard-coded data/processed.
    out_dir <- sub("[/\\\\]+$", "", location)
    if (nzchar(out_dir) && !dir.exists(out_dir)) {
        dir.create(out_dir, recursive = TRUE)
    }
    if (is.null(file)) {
        file <- deparse(substitute(x))
    }
    datename <- format(Sys.time(), "%Y%m%d")
    totalname <- paste0(location, file, "_", datename, ".rda")
    save(x, file = totalname)
    invisible(totalname)
}

# Load an .rda file and return the object stored in it.
#
# filename: path to a file written with save() (for example by fsave()).
# Returns the stored object. When the file holds several objects the first one
# is returned and a warning is raised; an empty file is an error.
fload <- function(filename) {
    # Load into a private environment: a stored object that happens to be
    # called `filename` can then neither overwrite the argument nor be
    # filtered out by name.
    store <- new.env(parent = emptyenv())
    loaded <- load(filename, envir = store)
    if (length(loaded) == 0L) {
        stop("no objects found in ", filename, call. = FALSE)
    }
    if (length(loaded) > 1L) {
        warning(filename, " contains ", length(loaded), " objects; returning '",
            loaded[1L], "'", call. = FALSE)
    }
    get(loaded[1L], envir = store, inherits = FALSE)
}

# Render `x` as an HTML table (numbers rounded to 2 digits) with striped/hover
# styling inside a scroll box of full width and 300px height.
# `...` is forwarded to knitr::kable().
fshowdf <- function(x, ...) {
    html_table <- knitr::kable(x, digits = 2, "html", ...)
    styled_table <- kableExtra::kable_styling(html_table, bootstrap_options = c("striped", "hover"))
    kableExtra::scroll_box(styled_table, width = "100%", height = "300px")
}

# Wrap `x` in an HTML <span> that sets the text colour to `color`.
colorize <- function(x, color) {
    span_template <- "<span style='color: %s;'>%s</span>"
    sprintf(span_template, color, x)
}
```

------------------------------------------------------------------------

# Packages

```{r, results='hide', eval = TRUE}
# Packages used in this script; attached (and installed when missing) below.
packages <- c(
  "ggplot2", "tidyverse", "RColorBrewer", "dplyr",
  "stringdist", "stringi", "future.apply"
)

fpackage.check(packages)
```

------------------------------------------------------------------------

# Publications (part 1)

Scraped publications via NARCIS. These raw data sets are not shared publicly but please email jochem.tolsma@ru.nl if you want to work with these raw data. 

```{r}
# Raw publication files (one per scraping round); the argument of fload() is
# spelled out in full rather than relying on partial matching of `file`.
pubs1 <- fload(filename = "./data/pubs_meta_df.rda")
pubs2 <- fload(filename = "./data/pubs_meta_df_2020_2021.rda")
pubs3 <- fload(filename = "./data/pubs_meta_df_2020_2021_adden.rda")
pubs4 <- fload(filename = "./data/pubs_meta_df_2022_adden.rda")
```

```{r data}
# The files have different layouts: keep seven columns per file, selected by
# position, so that the four data sets line up.
pubs1 <- pubs1[, c(1:4, 7, 231, 13)]
pubs2 <- pubs2[, c(1:3, 5, 9, 136, 16)]
pubs3 <- pubs3[, c(1:3, 5, 9, 141, 16)]
pubs4 <- pubs4[, c(1:4, 6, 113, 12)]

# Stack the files one after another.
pubs12 <- rbind.data.frame(pubs1, pubs2)
pubs123 <- rbind.data.frame(pubs12, pubs3)
pubs <- rbind.data.frame(pubs123, pubs4)
```

Selection of publications

```{r }
# Distribution of publication types in the raw data, missing values included.
table(pubs$Type, useNA = "always")
pubs$Type <- as.factor(pubs$Type)

# Publication types that are kept.
keep_types <- c("Artikel", "Boek", "Boekdeel", "Conference Paper", "Conference Proceedings",
  "Conferentiebijdrage", "Patent", "Rapport", "Dataset", "Review")

# %in% never returns NA, so rows with a missing Type are dropped here. The
# former chain of `==` comparisons turned such rows into all-NA rows.
# The authors recorded 845256 rows with the `==` chain.
pubs <- pubs[pubs$Type %in% keep_types, ]
```

Removing duplicates

```{r}
# Drop rows without a publication type, build a normalised title key and keep
# one row per person and title key. The untouched title is kept in Titel_ori.
df_pubs <- pubs %>%
  filter(!is.na(Type)) %>%
  mutate(
    Titel_ori = Titel,
    Titel = Titel %>%
      tolower() %>%
      str_sub(1, 40) %>%
      str_replace_all("\\s", "") %>%
      # typographic apostrophe (U+2019) -> plain apostrophe (U+0027)
      str_replace_all(intToUtf8(8217), intToUtf8(39)) %>%
      # NOTE(review): all whitespace has been removed above, so the
      # space-containing patterns below are not expected to match
      str_replace_all(" :", ":") %>%
      str_replace_all(" \\(", "\\(") %>%
      str_replace_all("\\( ", "\\(") %>%
      str_replace_all("\\) ", "\\)") %>%
      str_replace_all(" \\)", "\\)")
  ) %>%
  group_by(person_id) %>%
  distinct(Titel, .keep_all = TRUE) %>%
  ungroup()
```

Removing pubs without pubyear

```{r}
# Make the column names syntactically valid R names (the code below refers to
# the column `Date.issued`).
colnames(df_pubs) <- make.names(colnames(df_pubs), unique = TRUE)

# Publication year = first four characters of Date.issued, as a number.
df_pubs <- df_pubs %>%
  mutate(pub_year = as.numeric(substr(as.character(Date.issued), 1, 4)))

# Drop publications for which no year could be derived.
df_pubs <- df_pubs %>%
  filter(!is.na(pub_year)) #697695

```

Set pubs published in 2023 (16) to 2022

```{r}
# Cap the publication year at 2022 (missing years were removed before).
df_pubs <- df_pubs %>%
  mutate(pub_year = pmin(pub_year, 2022))
```

```{r}
length(unique(df_pubs$Titel)) #478761
length(unique(df_pubs$person_id)) #27285

```

Saving the data

```{r, eval=FALSE}
fsave(df_pubs, file = "df_covid_pubs", location = "./data/processed/")
```

------------------------------------------------------------------------

# PhDs

We use three processed datasets.

-   `phds_variables.rda`: dataset of Dutch PhDs between 1990-2021

This data set has been prepared by Anne Maaike Mulders, see [@mulders2024]. 

-   `url_to_unique_id.rda`: used to match different scraped info from NARCIS and where different id variables are used.

Also load our publication dataset.

```{r }
# PhD data set (see the text above for its provenance).
df_phd <- fload(filename = "./data/processed/phds_variables.rda")

# NOTE(review): fsave() in this file writes "<file>_<date>.rda", which gives
# "df_covid_pubs_<date>.rda"; the name loaded here puts the date first.
# Confirm that the file exists under this name.
df_pubs <- fload(filename = "./data/processed/20240701df_covid_pubs.rda")

# Lookup table used to match the different id variables.
df_id <- fload(filename = "./data/url_to_unique_id.rda")
```

how many unique phds do we start with

```{r}
length(unique(df_phd$id))
table(df_phd$start_pub)
```

add diss_url variable to df_phd dataframe

```{r}
# Prefix the ids in df_id with "i" before joining on `id` (presumably to match
# the id format used in df_phd -- confirm).
df_id$id <- paste0("i", df_id$id)
df_phd2 <- df_phd %>%
  left_join(df_id, by = "id") #we created some duplicates
```

add df_phd to publications

```{r}
# Attach the PhD information to every publication of the matching person.
df_pubs2 <- df_pubs %>%
  left_join(df_phd2, by = c("person_id" = "person_id1")) #we created some duplicates
```

------------------------------------------------------------------------

# Publications (part 2)

## Delete publications without phd info

```{r}
# Keep publications that matched a PhD (non-missing id) and have dupid == 0
# (presumably the "not a duplicate" flag -- confirm).
# Count recorded by the authors: 6277728 (presumably a typo for 627728, given
# the 697695 rows before and the ~70,000 rows reported lost -- confirm).
df_pubs2 <- df_pubs2 %>%
  filter(!is.na(id), dupid == 0)
```

So we lost approximately 70,000 publications, probably of scholars who did publish but do not have a profile.

## Removing duplicates

This time use the DOI.

```{r}
# Within each person, flag every repeated occurrence of a non-missing DOI and
# drop the flagged rows; rows without a DOI are always kept.
# The grouping is removed afterwards so that later steps (and the saved data)
# do not silently stay grouped by person_id.
df_pubs2 <- df_pubs2 %>%
  group_by(person_id) %>%
  mutate(duplicateDOI = duplicated(DOI) & !is.na(DOI)) %>%
  filter(!duplicateDOI) %>%
  ungroup() #624143

```

We are only interested in publications after the Phd.

```{r}
# Keep publications from years strictly after the PhD year.
df_pubs2 <- filter(df_pubs2, pub_year > phd_year) #465283
```

We add three variables to the publications data:

-   first year of publication after their PhD\
-   last year of publication after their PhD\
-   how many publications written 1-3 years prior to covid

```{r}
# Per PhD (id): first and last publication year and the number of
# publications in 2017-2019.
flpubs <- df_pubs2 %>%
  mutate(covidpub = as.numeric(pub_year > 2016 & pub_year < 2020)) %>%
  group_by(id) %>%
  dplyr::summarize(
    firstpub = min(pub_year, na.rm = TRUE),
    lastpub = max(pub_year, na.rm = TRUE),
    covidpub = sum(covidpub)
  )

# Add the three summaries to every publication row (joined on the columns the
# two tables have in common).
df_pubs2 <- left_join(df_pubs2, flpubs)
```

## Remove publications and PhDs, given inclusion criteria

How many phds do we now have publications for:

```{r}
length(unique(df_pubs2$id)) #19368
```

Only select Phds who published at least one publication in the three years before COVID-19.

```{r}
# Keep PhDs with at least one publication in 2017-2019.
df_pubs2 <- filter(df_pubs2, covidpub > 0)
length(unique(df_pubs2$id)) #10001
```

And only select Phds who have a publication career of at least three years.

```{r}
# Keep PhDs whose last publication year is more than one year after the first.
df_pubs2 <- filter(df_pubs2, lastpub > firstpub + 1)
length(unique(df_pubs2$id)) #8303
```

## Intermediate save to construct our research domain variable

select only papers published after 2014

```{r}
# Publications from 2015 onwards, used to construct the research-domain variable.
df_pubs_topic <- filter(df_pubs2, pub_year > 2014)
#fsave(df_pubs_topic, "df_pubs_topic")
```

Based on these pubs we determine the field in which the authors are active via OpenAlex. See [here](https://coviddutchacademia.netlify.app/oa_topics_fetch).

------------------------------------------------------------------------

# Author position of PhD in publication

## Prepare author-list of each publication

`r colorize("CAUTION: We use multisession.", "red")`

```{r}
plan(multisession, workers = 10) ## Parallelize

# Clean the author names of ONE publication (a character vector with one
# element per author). The steps and their order are those of the original
# chain of separate passes; they are composed here so that the list of
# publications is traversed only once.
fclean_authors <- function(authors) {
  # name prefixes ("van de" etc.), removed one after another in this order;
  # each entry is used as a regular expression and only its first match
  # within a name is removed
  prefixes <- c("'t ", "d' ", "de ", "de la ", "den ", "del ", "der ", "des ", "el ", "el- ",
    "in 't ", "la ", "le ", "les ", "op den ", "ten ", "ter ", "tes ", "van ", "van 't ",
    "van de ", "van der ", "van den ", "von der ", "op den ", "ul ")

  # lowercase
  authors <- tolower(authors)
  # remove diacritics
  authors <- stringi::stri_trans_general(authors, id = "latin-ascii")
  # remove first names: anything in parentheses and everything after a comma
  authors <- sub("\\(.*\\)", "", authors)
  authors <- sub(",.*", "", authors)
  for (prefix in prefixes) {
    authors <- sub(prefix, "", authors)
  }
  # remove white spaces
  authors <- stringr::str_squish(authors)
  # remove initials: everything up to and including the last period
  authors <- gsub(".*\\.", "", authors)
  # double names separated by a dash: keep the part before the first dash
  # (the authors noted that keeping the last part may be preferable)
  authors <- sub("\\-.*", "", authors)
  # remove white spaces again
  authors <- stringr::str_squish(authors)
  # double names separated by a space: keep the first part
  gsub(" .*$", "", authors)
}

# one cleaned author vector per publication
pubsautors <- future_lapply(str_split(df_pubs2$Auteur, ";"), fclean_authors)


```

## Author position

<!---would be great if we look together at the flow plan and the code--->

We determine the position of the PhD in the author-list of each publication.

We add two variables to the publication dataset `pubs`:

-   `aut_max`: the number of authors of each publication\
-   `aut_pos`: the position of the PhD in the publication

**Flow plan:**

-   step -1: check to see if we have a name.
    -   1.1: No, **done**\
-   Stap 0: check to see if it is a sole-authored paper.
    -   0.1: Yes, **done**\
    -   0.2: No\
-   Stap 1: check to see if we have one or more exact matches.
    -   1.1: one exact match, determine position, **done**.\
    -   1.2: two or more exact matches, pick first, **done**.[^1]\
-   Stap 2: check whether last name consists of one letter.
    -   2.1: Yes, set position to `NA`, done\
    -   2.2 No\
-   Stap 3: Determine distance between last name and the names on the publication.
    -   3.1: No minimal distance observed, set position to `NA`, **done**\
    -   3.2: One minimum, set position, **done**.\
-   Stap 4: when multiple minima, check to see if it is the same last name.
    -   4.1: Yes, pick first (arbitrary), **done**\
    -   4.2: No, set position to `NA`, **done**

[^1]: We pick the first because an inspection of Narcis seems to indicate that the lead author is at times additionally mentioned as last author.

```{r}
# Determine, for every publication in `df_pubs2`:
#   - `aut_max`: the number of authors on the publication,
#   - `aut_pot`: the position of the PhD in the author list,
#   - `code`   : the step of the flow plan that produced `aut_pot`.
# Reads `df_pubs2$lastname`, `df_pubs2$Auteur` and `pubsautors`; element i of
# `pubsautors` holds the author names of publication i.

# Row indices for which matching raised a warning (leading NA is a placeholder).
warnings <- NA
aut_pot <- aut_max <- code <- rep(NA, length(df_pubs2$lastname))

table(is.na(df_pubs2$Auteur), useNA = "always")

# loop over each publication
for (i in seq_along(df_pubs2$lastname)) {
  tryCatch({
    # step -1: no author string available for this publication
    if (is.na(df_pubs2$Auteur[i])) {
      code[i] <- "(-1) no name" #probably have to set to 0 later
      next
    }
    authors_i <- pubsautors[[i]]
    lastname_i <- df_pubs2$lastname[i]
    aut_max[i] <- length(authors_i) # number of authors

    # step 0: sole-authored publication, the PhD is the only author
    if (aut_max[i] == 1) {
      aut_pot[i] <- 1
      code[i] <- "(0) publication with 1 author"
      next
    }

    # step 1: exact match(es) of the last name in the author list.
    # `which()` returns the positions of the TRUE values, so the first element
    # is the first exact match (`which.min()` on a logical vector would return
    # the first FALSE instead).
    exact_hits <- which(authors_i %in% lastname_i)
    if (length(exact_hits) > 0) {
      aut_pot[i] <- exact_hits[1]
      if (length(exact_hits) == 1) {
        code[i] <- "(1.1) one exact match"
      } else {
        code[i] <- "(1.2) >1 exact matches, position of 1st"
      }
      next
    }

    # step 2: a one-character last name is too short for fuzzy matching
    if (nchar(lastname_i) == 1) {
      aut_pot[i] <- NA
      code[i] <- "(2) lastname just 1 character, set to NA"
      next
    }

    # step 3: fuzzy matching, summing three string distances per author
    naut1 <- stringdist(lastname_i, authors_i, method = "jaccard")
    naut2 <- stringdist(lastname_i, authors_i, method = "lv")
    # this one helps with double last names "tolsma berkhof"
    naut3 <- stringdist(lastname_i, authors_i, method = "jw", p = 0)
    naut <- naut1 + naut2 + naut3
    # min() of an empty vector raises a warning, which is caught below
    naut_min <- min(naut)

    # step 3.1: no usable minimum (is.na() is also TRUE for NaN)
    if (is.na(naut_min)) {
      aut_pot[i] <- NA
      code[i] <- "(3.1) no minimum via matching, set to NA"
      next
    }

    # step 3.2: exactly one closest author
    closest <- which(naut == naut_min)
    if (length(closest) == 1) {
      aut_pot[i] <- closest
      code[i] <- "(3.2) 1 minimum via matching"
      next
    }

    # step 4: several equally close authors.
    # NOTE(review): only the first two tied names are compared, as in the
    # original implementation; confirm whether all tied names should be equal.
    if (authors_i[closest[1]] == authors_i[closest[2]]) {
      aut_pot[i] <- closest[1] # take the first, use this one to run everything.
      code[i] <- "(4.1) >1 minima after matching of same lastname, position of 1st"
    } else {
      aut_pot[i] <- NA
      code[i] <- "(4.2) >1 minima after matching of different lastnames, set to NA"
    }
  }, error = function(e) {
    # best effort: the publication keeps NA values, but the failure is reported
    message("publication ", i, ": ", conditionMessage(e))
    e
  }, warning = function(w) {
    print(i)
    # `<<-` is required: a plain `<-` would only modify a copy that is local
    # to this handler, so the index would never be recorded.
    warnings <<- append(warnings, i)
    w
  })
}

table(code, useNA = "always")
table(aut_pot, useNA = "always")
table(aut_max, useNA = "always")

df_pubs2$aut_pot <- aut_pot
df_pubs2$aut_max <- aut_max
df_pubs2$aut_code <- code



```

Later on, we can thus filter on specific codes, or recode the `NA`s of the author positions via the code variable; for example, by treating all publications without author names as first-authored publications.

------------------------------------------------------------------------

# person period file

## pubs per year per author

For each PhD and each publication year, I count:

-   the total number of publications\
-   all first-authored publications (including sole-authored publications)
-   all last-authored publications (excluding sole-authored publications)\
-   all sole-authored publications\
-   all first-authored articles (including sole-authored articles)
-   all last-authored articles (excluding sole-authored articles)\
-   all sole-authored articles

```{r, echo = FALSE}
# Frequency tables of the gender and ethnicity variables in `df_pubs2`;
# useNA = "always" adds a count of missing values to every table.
table(df_pubs2$gender, useNA="always")
table(df_pubs2$gender_2, useNA="always")
table(df_pubs2$ethnicity, useNA="always")
table(df_pubs2$ethnicity2, useNA="always")
table(df_pubs2$ethnicity3, useNA="always")
```

```{r}
# Add 0/1 indicators to every publication:
#   pub_a     : publication is an article (Type "Artikel")
#   pub_first : PhD is first author (includes sole-authored work)
#   pub_last  : PhD is last author, but not first (excludes sole-authored work)
#   pub_sole  : publication has a single author
#   *_a       : the same indicators restricted to articles
# Indicators are NA where the comparison cannot be decided because
# `aut_pot` / `aut_max` is missing.
pubs <- df_pubs2 %>%
  mutate(
    pub_a = as.numeric(Type == "Artikel"),
    pub_first = as.numeric(aut_pot == 1),
    pub_last = as.numeric(aut_pot == aut_max & aut_pot != 1),
    pub_sole = as.numeric(aut_max == 1),
    pub_first_a = as.numeric(aut_pot == 1 & Type == "Artikel"),
    pub_last_a = as.numeric(
      aut_pot == aut_max & aut_pot != 1 & Type == "Artikel"
    ),
    pub_sole_a = as.numeric(aut_max == 1 & Type == "Artikel")
  )


# Aggregate by id and pub_year: one row per PhD per publication year. This is
# what we want in the person period file.
# Counts use na.rm = TRUE, so publications with an undetermined author
# position do not add to the first/last/sole counts. The remaining columns
# take the first value observed within the group.
# `id` and `pub_year` are carried through as grouping keys, so `id` is not
# summarised again, and `.groups = "drop"` returns an ungrouped result instead
# of one that is still grouped by `id`.
pubs2 <- pubs %>%
  group_by(id, pub_year) %>%
  summarise(
    npubs = n(),
    npubs_a = sum(pub_a, na.rm = TRUE),
    firstpub = first(firstpub),
    lastpub = first(lastpub),
    npubs_first = sum(pub_first, na.rm = TRUE),
    npubs_last = sum(pub_last, na.rm = TRUE),
    npubs_sole = sum(pub_sole, na.rm = TRUE),
    npubs_first_a = sum(pub_first_a, na.rm = TRUE),
    npubs_last_a = sum(pub_last_a, na.rm = TRUE),
    npubs_sole_a = sum(pub_sole_a, na.rm = TRUE),
    phd_year = first(phd_year),
    gender = first(gender_2),
    field = first(field),
    ethnicity = first(ethnicity3),
    uni = first(uni),
    .groups = "drop"
  )

# We also aggregate by id only: one row per PhD with the totals over all
# publications (suffix `_t`). This gives the total publications over the
# career, and it is used to fill in the PhD-level information on the empty
# rows of the person period file later on.
# `id` is carried through as the grouping key, so it is not summarised again;
# `.groups = "drop"` makes explicit that the result is ungrouped.
pubs3 <- pubs %>%
  group_by(id) %>%
  summarise(
    npubs_t = n(),
    npubs_a_t = sum(pub_a, na.rm = TRUE),
    firstpub = first(firstpub),
    lastpub = first(lastpub),
    npubs_first_t = sum(pub_first, na.rm = TRUE),
    npubs_last_t = sum(pub_last, na.rm = TRUE),
    npubs_sole_t = sum(pub_sole, na.rm = TRUE),
    npubs_first_a_t = sum(pub_first_a, na.rm = TRUE),
    npubs_last_a_t = sum(pub_last_a, na.rm = TRUE),
    npubs_sole_a_t = sum(pub_sole_a, na.rm = TRUE),
    phd_year = first(phd_year),
    gender = first(gender_2),
    field = first(field),
    ethnicity = first(ethnicity3),
    uni = first(uni),
    .groups = "drop"
  )


```

## empty ppf

```{r}
# First make an empty person-period dataset: one row for every combination of
# PhD (`id`) and calendar year, sorted by id and year.

# The year range is defined once so both columns are always built from the
# same range.
ppf_years <- 1990:2022
ppf_ids <- unique(df_pubs2$id)

# Each id is repeated once per year and the year range is repeated once per
# id. Helper vectors get their own names so that no global objects called `id`
# or `pub_year` are left behind, which could otherwise be picked up by later
# dplyr expressions in place of the columns.
df <- data.frame(
  id = rep(ppf_ids, each = length(ppf_years)),
  pub_year = rep(ppf_years, times = length(ppf_ids))
) %>%
  arrange(id, pub_year)
```

## fill the ppf

```{r}

# Fill the person period file.
# 1. Attach the career totals and PhD-level information (`pubs3`) to every
#    id-year row of the empty grid.
# 2. Keep only the years from the first publication onwards.
# 3. Attach the yearly counts (`pubs2`); years without publications get NA,
#    which is then replaced by 0.
# 4. Columns present in both `pubs3` and `pubs2` receive the suffixes
#    `.x` / `.y`; stripping `.x` keeps the `pubs3` version under the plain name.

# yearly count columns that must be 0 instead of NA
ppf_count_cols <- c(
  "npubs", "npubs_a", "npubs_first", "npubs_last", "npubs_sole",
  "npubs_first_a", "npubs_last_a", "npubs_sole_a"
)

# columns (and their order) of the final person period file
ppf_keep_cols <- c(
  "id", "pub_year", "phd_year", "gender", "field", "ethnicity", "uni",
  ppf_count_cols,
  "npubs_t", "npubs_a_t", "npubs_first_t", "npubs_last_t", "npubs_sole_t",
  "npubs_first_a_t", "npubs_last_a_t", "npubs_sole_a_t"
)

data_ppf <- df %>%
  left_join(pubs3, by = "id") %>%
  filter(pub_year >= firstpub) %>%
  left_join(pubs2, by = c("id", "pub_year")) %>%
  mutate(across(all_of(ppf_count_cols), function(counts) replace_na(counts, 0))) %>%
  rename_with(function(nm) gsub(".x", "", nm, fixed = TRUE)) %>%
  dplyr::select(all_of(ppf_keep_cols))

```

## saving

```{r}
# Store both working samples with the project's `fsave()` helper.
# the person period file dataset for the analysis
fsave(x = data_ppf, file = "df_ppf")
# publications included in the ppf (with all info on phds attached as well)
fsave(x = df_pubs2, file = "df_pubs")
```

------------------------------------------------------------------------



Copyright © 2024- Jochem Tolsma