at main 242 lines 9.5 kB view raw
1# install.packages("tidyverse") 2library(tibble) # tibble (comes from tidyverse) 3# install.packages("word2vec") 4library(word2vec) # read.wordvectors 5# install.packages("reticulate") 6library(reticulate) # reticulate::py_eval 7# install.packages("tidytext") 8library(tidytext) # data("stop_words") 9# install.packages("dplyr") 10library(dplyr) # anti_join 11# install.packages("SnowballC") 12library(SnowballC) # wordStem 13# install.packages("gglot2") 14library(ggplot2) # ggplot 15 16# This will load the data set needed for stop words 17data("stop_words") 18 19# w2v : (String, Embedding) -> Vector 20# This function, given a string and an embedding will return the vector 21# associated to that word. In the case where the word cannot be found in the 22# embedding, the vector returned is the zero vector. Additionally, the word is 23# cleaned before processing it. 24w2v <- function(str, emb) { 25 clean_str <- txt_clean_word2vec(str) 26 if (any(clean_str == rownames(emb))) emb[clean_str,] else numeric(ncol(emb)) 27} 28 29# w2v_v : ([String], Embedding) -> Matrix 30# This function, given a list of strings and an embedding will return the 31# vectors associated to each word as a matrix. Each column of the Matrix 32# returned corresponds to the vectors associated to that word. Because of this, 33# you can access corresponding vectors like: 34# > apple <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"apple"] 35# > grape <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"grape"] 36w2v_v <- function(words, emb) Vectorize(function(str) w2v(str, emb))(words) 37 38# sim : (String, String, Embedding) -> Matrix 39# sim : (String, String, Embedding, String) -> Matrix 40# sim : ([String], [String], Embedding) -> Matrix 41# sim : ([String], [String], Embedding, String) -> Matrix 42# This function, given two strings or lists of strings and an embedding, will 43# return the semantic similarity between those two words or list of words as a 44# Floating-point Matrix where each entry is between -1.0 and 1.0. This semantic 45# similarity is just the cosine of the angle between the two vectors. 46sim <- function(w1, w2, emb, type = "cosine") word2vec_similarity(w2v(w1, emb), w2v(w2, emb), type = type) 47 48# When Python writes a pandas.DataFrame into a CSV, it will run each cell in the 49# DataFrame through repr and write that into the CSV. This means that when R 50# reads the csv, R will interpret each cell in the CSV as a string, instead of 51# the correct type that is supposed to actually be there. 52py2r <- function(df, rows) { 53 for (row in rows) { 54 # In this case we use reticulate::py_eval to convert a python string 55 # into the R representation. 56 df[[row]] <- lapply(df[[row]], reticulate::py_eval) 57 } 58 return(df) 59} 60 61# rep : (String, Integer) -> [String] 62# This function, given a string and an integer, will return a list containing 63# the string repeated however many times the integer says it should. 64rep <- function(str, count) { 65 out <- character(length = count) 66 for (i in 1:count) { 67 out[i] <- str 68 } 69 out 70} 71 72# These are the tags that are used to categorize the data. 73lgbt_tags <- c("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") |> as.character() 74race_ethnicity_tags <- c("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") |> as.character() 75women_tags <- c("woman","women","girl","feminine","femeninity","ms","mrs") |> as.character() 76men_tags <- c("man", "men", "boy", "male", "masculine", "masculinity", "mr") |> as.character() 77disabilities_tags <- c("disabilities","disabled","disability","handicap","handicapped","neurodivergent") |> as.character() 78 79# This variable holds all of the tags. Additionally, tag_categories holds all 80# the tags together with their categories. 81tags <- c(lgbt_tags, race_ethnicity_tags, women_tags, men_tags, disabilities_tags) 82tag_categories <- c( 83 rep("lgbt", length(lgbt_tags)), 84 rep("race/ethnicity", length(race_ethnicity_tags)), 85 rep("women", length(women_tags)), 86 rep("men", length(men_tags)), 87 rep("disabilities", length(disabilities_tags)) 88) 89 90# 91similarity_values <- numeric(length = length(tags)) 92word_category <- function(word, tag_vectors, emb, threshold = 0.3) { 93 for (i in seq_along(tags)) { 94 similarity_values[i] <- word2vec_similarity(w2v(word, emb), tag_vectors[,i], type = "cosine") 95 } 96 97 similarities <- data.frame( 98 sim = similarity_values, 99 tag = tags, 100 tag_category = tag_categories 101 ) 102 103 similarities <- similarities[similarities |> complete.cases(),] 104 m <- max(similarities $ sim, na.rm = TRUE) 105 if (m > threshold) similarities[m == similarities,] |> head(1) else data.frame(sim = NA, tag = NA, tag_category = NA) 106} 107 108memo <- new.env(hash = TRUE, parent = emptyenv()) 109word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) { 110 if (is.null(memo[[x]])) { 111 memo[[x]] <- word_category(x, tag_vectors, emb, threshold) 112 } 113 return(memo[[x]]) 114} 115 116word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) { 117 res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb))(words) |> t() 118 sim <- numeric(length = length(words)) 119 tags <- character(length = length(words)) 120 tag_categories <- character(length = length(words)) 121 for (i in 1:nrow(res)) { 122 sim[i] <- res[,"sim"][[i]] 123 tags[i] <- res[,"tag"][[i]] 124 tag_categories[i] <- res[,"tag_category"][[i]] 125 } 126 data.frame( 127 sim = sim, 128 tag = tags, 129 tag_category = tag_categories 130 ) 131} 132 133# Read data from CSV and convert it into its R representation 134data <- read.csv("data/variables.csv") |> py2r(rows = 4:9) 135ams_selection <- data[,"title"] == "American Mathematical Society" 136data[ams_selection,"title"] <- "AMS" 137cbms_selection <- data[,"title"] == "Conference Board of the Mathematical Sciences 2021 Survey" 138data[cbms_selection,"title"] <- "CBMS" 139ipeds_selection <- data[,"title"] == "Integrated Postsecondary Education Data System (IPEDS) Institution Lookup" 140data[ipeds_selection,"title"] <- "IPEDS" 141selection <- ams_selection | cbms_selection | ipeds_selection 142 143data <- data[selection,] 144 145clean_text <- function(raw_text) { 146 tmp <- tibble( 147 line = seq_along(raw_text), 148 text = raw_text 149 ) |> unnest_tokens(word, text) 150 tmp[!grepl("\\d", tmp $ word),] |> anti_join(stop_words) 151} 152 153# Word stems analysis 154 155for (row_idx in 1:nrow(data)) { 156 title <- data[row_idx,"title"] 157 stem_txt <- clean_text(data[row_idx,"variables"][[1]]) |> 158 mutate(word_stem = wordStem(word)) 159 160 stem_count <- stem_txt |> 161 inner_join(count(stem_txt, word_stem)) |> 162 filter(n > 5) |> 163 distinct(word_stem, .keep_all = TRUE) 164 165 save(stem_count, file = paste("data/stem_", title, ".Rda", sep = "")) 166} 167 168 169# Word semantics analysis 170 171word_semantic_analysis <- function(emb, data, model_name, threshold = 0.3) { 172 tag_vectors <- w2v_v(tags, emb) 173 for (row_idx in 1:nrow(data)) { 174 title <- data[row_idx,"title"] 175 clean_txt <- clean_text(data[row_idx, "variables"][[1]]) 176 177 memo <- new.env(hash = TRUE, parent = emptyenv()) 178 word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) { 179 if (is.null(memo[[x]])) { 180 memo[[x]] <- word_category(x, tag_vectors, emb, threshold) 181 } 182 return(memo[[x]]) 183 } 184 185 word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) { 186 res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb, threshold))(words) |> t() 187 sim <- numeric(length = length(words)) 188 tags <- character(length = length(words)) 189 tag_categories <- character(length = length(words)) 190 for (i in 1:nrow(res)) { 191 sim[i] <- res[,"sim"][[i]] 192 tags[i] <- res[,"tag"][[i]] 193 tag_categories[i] <- res[,"tag_category"][[i]] 194 } 195 data.frame( 196 sim = sim, 197 tag = tags, 198 tag_category = tag_categories 199 ) 200 } 201 202 word_categories <- word_category_v(clean_txt $ word, tag_vectors, emb, threshold) 203 sema_txt <- clean_txt |> 204 mutate(tag = word_categories $ tag, word_category = word_categories $ tag_category) 205 206 sema_count <- sema_txt |> 207 inner_join(count(sema_txt, tag)) |> 208 distinct(tag, .keep_all = TRUE) 209 210 save(sema_count, file = paste("data/", model_name, "_", threshold, "_", title, ".Rda", sep = "")) 211 } 212} 213 214# emb (short for embedding) is a matrix with 3,000,000 rows and 300 columns. 215# Each row represents a point in 300-dimensional space. Since this is a 216# two-dimensional matrix, you can access a specific coordinate using: 217# > emb["your_word_here",number_of_coordinate_here] 218# However, what we want to do is associate an english word with a point in 300 219# dimensions. The way in which we will be using emb is: 220# > grape <- emb["apple",] 221# > apple <- emb["grape",] 222# After this, we can calculate how "close" grape and apple are semantically: 223# > word2vec_similarity(grape, apple, type = "cosine") 224# This returns a number between 0.0 and 1.0, where 0.0 represents completely 225# different words and 1.0 represents the same word. 226# For more information on how this works check out: 227# - https://code.google.com/archive/p/word2vec/ 228 229read.wordvectors("google_vecs.bin", type = "bin") |> 230 word_semantic_analysis(data = data, model_name = "google_news", threshold = 0.185) 231 232read.wordvectors("glove.6B.300d.txt", type = "txt") |> 233 word_semantic_analysis(data = data, model_name = "glove_300d", threshold = 0.134) 234 235read.wordvectors("glove.6B.200d.txt", type = "txt") |> 236 word_semantic_analysis(data = data, model_name = "glove_200d", threshold = 0.164) 237 238read.wordvectors("glove.6B.100d.txt", type = "txt") |> 239 word_semantic_analysis(data = data, model_name = "glove_100d", threshold = 0.219) 240 241read.wordvectors("glove.6B.50d.txt", type = "txt") |> 242 word_semantic_analysis(data = data, model_name = "glove_50d", threshold = 0.273)