# Deduplicate faculty publications across Web of Science and Scopus exports.
#
# Inputs (CLI): --wos (tab-delimited WOS export), --scopus (Scopus CSV),
# --blacklist (one journal name per line), --faculty (CSV with LastName,
# FirstName, MiddleInitial, Department), --output (destination CSV).
#
# warn = -1 silences warnings for quiet batch runs; stringsAsFactors = FALSE
# keeps read.csv() columns as character. (The original also passed
# warn.conflicts/quietly to options(), but those are library() arguments,
# not options, and were never read — dropped.)
options(stringsAsFactors = FALSE, verbose = FALSE, warn = -1)

library(optparse, quietly = TRUE)

option_list <- list(
  make_option(c("-w", "--wos"), action = "store",
              help = "Path to file containing Web of Science records."),
  make_option(c("-s", "--scopus"), action = "store",
              help = "Path to file containing Scopus records."),
  make_option(c("-b", "--blacklist"), action = "store",
              help = "Path to file containing list of blacklisted journals."),
  make_option(c("-f", "--faculty"), action = "store",
              help = "Path to file containing faculty names and departments."),
  make_option(c("-o", "--output"), action = "store",
              help = "File to save output.")
)
opt <- parse_args(OptionParser(option_list = option_list))

# Load libraries ------------------------------------------------------------
suppressMessages({
  library(tidyr)
  library(dplyr)
  library(stringr)
  library(digest)
})

# Read inputs ---------------------------------------------------------------
w_records <- read.csv(opt$wos, sep = "\t")  # WOS exports are tab-delimited
s_records <- read.csv(opt$scopus)
f_list    <- read.csv(opt$faculty)
b_list    <- readLines(opt$blacklist)

# Normalize text for matching: uppercase, punctuation -> space, collapse
# runs of spaces, trim. Used on titles, journals, and middle initials so
# that both sources hash/join on the same canonical form.
sanitize <- function(str) {
  str %>%
    toupper() %>%
    gsub(pattern = "[[:punct:]]", replace = " ") %>%
    gsub(pattern = " {1,}", replace = " ") %>%
    trimws()
}

# Process Web of Science records --------------------------------------------
# WOS author fields look like "LAST, FIRST M; LAST, FIRST ...".
wos <- w_records %>%
  select(AU, TI, PY, SO, PT) %>%
  rename(Authors = AU, Title = TI, Year = PY, Journal = SO, Type = PT) %>%
  # stable per-row record id
  mutate(record_id = paste("WOS", seq_len(n()), sep = "-")) %>%
  # flag records with missing titles or authors (NA-safe)
  mutate(missing_info = (is.na(Title) | Title == "" |
                           is.na(Authors) | Authors == "")) %>%
  # recode WOS publication-type codes (J = journal article, B/S = book/series)
  mutate(Type = case_when(Type == "J" ~ "Article",
                          Type == "B" ~ "Book/Chapter",
                          Type == "S" ~ "Book/Chapter",
                          TRUE ~ "NA")) %>%
  # BUG FIX: the original allowed-set c('Article','Chapter','Book') never
  # contains 'Book/Chapter', so every WOS book/chapter record was flagged
  # incorrect and silently dropped downstream. Only the 'NA' fallback is
  # an invalid type here.
  mutate(incorrect_type = !(Type %in% c("Article", "Book/Chapter"))) %>%
  mutate(Authors = toupper(Authors)) %>%
  # canonicalize title and journal strings for cross-source matching
  mutate(Title = sanitize(Title)) %>%
  mutate(Journal = sanitize(Journal)) %>%
  # one output row per author on each record
  mutate(Authors = str_split(Authors, ";")) %>%
  unnest(Authors) %>%
  mutate(Authors = str_trim(Authors),
         # keep only letters, commas, spaces, and hyphens in author names
         Authors = gsub("[^A-Z, \\-]", "", Authors, perl = TRUE)) %>%
  # parse "LAST, FIRST M" into name components
  mutate(LastName = gsub("^([A-Z \\-]+), *([A-Z\\-]+) *([A-Z]?).*", "\\1",
                         Authors, perl = TRUE),
         FirstName = gsub("^([A-Z \\-]+), *([A-Z\\-]+) *([A-Z]?).*", "\\2",
                          Authors, perl = TRUE),
         MiddleInitial = gsub("^([A-Z \\-]+), *([A-Z\\-]+) *([A-Z]?).*", "\\3",
                              Authors, perl = TRUE),
         MiddleInitial = str_sub(MiddleInitial, 1, 1),
         FirstInitial = str_sub(FirstName, 1, 1),
         # character count of the first name; 1 means initials only
         FirstNameLen = str_length(FirstName)) %>%
  # provenance: lower preference wins during final deduplication
  mutate(preference = 1) %>%
  mutate(source = "WOS")

# Process Scopus records -----------------------------------------------------
# Scopus gives initials only, e.g. "Smith J.A., Jones B.".
scopus <- s_records %>%
  # BUG FIX: restrict to the same five source columns as the WOS pipeline;
  # otherwise any extra column in the Scopus export makes the later
  # rbind(wos, scopus) fail on mismatched columns.
  select(Authors, Title, Year, Journal, Type) %>%
  mutate(record_id = paste("SCOPUS", seq_len(n()), sep = "-")) %>%
  # flag records with missing authors and titles (NA-safe)
  mutate(missing_info = (is.na(Title) | Title == "" |
                           is.na(Authors) | Authors == "")) %>%
  # recode Scopus publication types into the canonical set
  mutate(Type = case_when(Type == "Article" ~ "Article",
                          Type == "Book Chapter" ~ "Chapter",
                          Type == "Review" ~ "Article",
                          Type == "Article in Press" ~ "Article",
                          Type == "Book" ~ "Book",
                          TRUE ~ "NA")) %>%
  mutate(incorrect_type = !(Type %in% c("Article", "Chapter", "Book"))) %>%
  # reformat the author string: drop spaces, then turn the comma between
  # one author's trailing initial and the next surname into ';' so authors
  # can be split apart
  mutate(Authors = str_replace_all(Authors, " ", ""),
         Authors = gsub("([A-Z]),([A-Z])", "\\1;\\2", Authors, perl = TRUE),
         Authors = toupper(Authors),
         Authors = str_split(Authors, ";")) %>%
  # canonicalize title and journal strings for cross-source matching
  mutate(Title = sanitize(Title)) %>%
  mutate(Journal = sanitize(Journal)) %>%
  # one output row per author on each record
  unnest(Authors) %>%
  # keep only letters, commas, spaces, and hyphens in author names
  mutate(Authors = gsub("[^A-Z, \\-]", "", Authors, perl = TRUE)) %>%
  # parse "LAST,FM" into components; FirstName is just the first initial
  mutate(LastName = gsub("^([A-Z \\-]+),([A-Z])([A-Z]?)", "\\1",
                         Authors, perl = TRUE),
         FirstInitial = gsub("^([A-Z \\-]+),([A-Z])([A-Z]?)", "\\2",
                             Authors, perl = TRUE),
         FirstInitial = str_sub(FirstInitial, 1, 1),
         FirstName = FirstInitial,
         MiddleInitial = gsub("^([A-Z \\-]+),([A-Z])([A-Z]?)", "\\3",
                              Authors, perl = TRUE),
         MiddleInitial = str_sub(MiddleInitial, 1, 1),
         FirstNameLen = 1) %>%
  # provenance: Scopus is less preferred than WOS (initials only)
  mutate(preference = 2) %>%
  mutate(source = "SCOPUS")

# Combine WOS and Scopus records into a single data set ----------------------
combined_records <- rbind(wos, scopus)

# If a title appears in both sources, keep the WOS record — it carries full
# first names rather than bare initials.
scopus_dupes <- combined_records %>%
  select(Title, source) %>%
  unique() %>%
  group_by(Title) %>%
  mutate(cnt = n_distinct(source)) %>%
  ungroup() %>%
  filter(source == "SCOPUS", cnt >= 2)

# drop the Scopus copies of titles that also appear in WOS
combined_records <- combined_records %>%
  filter(!((Title %in% scopus_dupes$Title) & source == "SCOPUS"))

# Normalize the faculty roster ------------------------------------------------
faculty <- f_list %>%
  mutate(LastName = str_trim(toupper(LastName)),
         FirstName = str_trim(toupper(FirstName)),
         Department = toupper(Department),
         MiddleInitial = substr(sanitize(MiddleInitial), 1, 1),
         FirstInitial = substr(FirstName, 1, 1))

# Join records to faculty on last name only to generate candidate matches,
# then compute name-agreement flags (.x = record side, .y = faculty side).
candidate_matches <- combined_records %>%
  left_join(faculty, by = "LastName") %>%
  mutate(FirstNameMatch = (FirstName.x == FirstName.y),
         FirstInitialMatch = (FirstInitial.x == FirstInitial.y),
         MiddleInitialMatch = (MiddleInitial.x == MiddleInitial.y),
         OnlyFirstInitAvailable = (FirstNameLen == 1))

# A positive match needs full-first-name agreement (or first-initial
# agreement when the record only carries an initial) AND middle-initial
# agreement or a missing middle initial on either side.
positive_matches <- candidate_matches %>%
  mutate(Match = case_when(
    FirstNameMatch &
      (MiddleInitialMatch | MiddleInitial.x == "" | MiddleInitial.y == "") ~ TRUE,
    FirstInitialMatch & (FirstNameLen == 1) &
      (MiddleInitialMatch | MiddleInitial.x == "" | MiddleInitial.y == "") ~ TRUE,
    TRUE ~ FALSE)) %>%
  rename(FirstName = FirstName.y, MiddleInitial = MiddleInitial.y) %>%
  select(Department, Authors, LastName, FirstName, MiddleInitial, Title,
         Journal, Year, Type, preference, missing_info, incorrect_type,
         source, record_id, Match) %>%
  # flag blacklisted journals
  mutate(blacklisted = (Journal %in% b_list))

# record ids that matched at least one faculty member
matched_ids <- positive_matches %>%
  filter(Match) %>%
  select(record_id) %>%
  unique()

# Keep clean, matched records and deduplicate ---------------------------------
deduped_records <- positive_matches %>%
  filter(record_id %in% matched_ids$record_id) %>%
  filter(Match) %>%
  filter(!missing_info) %>%
  filter(!incorrect_type) %>%
  filter(!blacklisted) %>%
  # publication id: SHA1 over author name + canonical title (rowwise so
  # each row hashes its own values)
  ungroup() %>%
  rowwise() %>%
  mutate(pub_id = sha1(c(FirstName, LastName, Title))) %>%
  # keep one record per publication per department/year, preferring the
  # lower preference value (1 = WOS, full author names).
  # BUG FIX: top_n(1, preference) kept the HIGHEST preference (Scopus),
  # contradicting the stated intent; slice_min keeps the WOS record and,
  # like top_n, retains ties.
  group_by(Department, Year, pub_id) %>%
  slice_min(preference, n = 1, with_ties = TRUE) %>%
  ungroup()

# Write deduplicated records to file ------------------------------------------
outfile <- deduped_records %>%
  select(Year, Department, Authors, FirstName, MiddleInitial, LastName,
         Title, Journal, Type, pub_id) %>%
  unique()
write.csv(outfile, file = opt$output, row.names = FALSE)