Source code for wefe.metrics.RND

"""Relative Norm Distance (RND) metric implementation."""
from typing import Any, Callable, Dict, List, Tuple, Union

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel


class RND(BaseMetric):
    """Relative Norm Distance (RND).

    It measures the relative strength of association of a set of neutral
    words with respect to two groups.

    References
    ----------
    | [1]: Nikhil Garg, Londa Schiebinger, Dan Jurafsky, and James Zou.
    | Word embeddings quantify 100 years of gender and ethnic stereotypes.
    | Proceedings of the National Academy of Sciences, 115(16):E3635–E3644, 2018.
    | [2]: https://github.com/nikhgarg/EmbeddingDynamicStereotypes
    """

    metric_template = (2, 1)
    metric_name = "Relative Norm Distance"
    metric_short_name = "RND"

    def __calc_distance(
        self,
        vec1: np.ndarray,
        vec2: np.ndarray,
        distance_type: str = "norm",
    ) -> float:
        if distance_type == "norm":
            return np.linalg.norm(np.subtract(vec1, vec2))
        elif distance_type == "cos":
            # note: this branch returns the cosine *similarity* between the
            # vectors, equivalent to
            # np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
            c = cosine_similarity([vec1], [vec2]).flatten()
            return c[0]
        else:
            raise ValueError(
                'distance_type can be either "norm" or "cos", '
                "got: {} ".format(distance_type)
            )

    def __calc_rnd(
        self,
        target_0: np.ndarray,
        target_1: np.ndarray,
        attribute: np.ndarray,
        attribute_words: list,
        distance_type: str,
    ) -> Tuple[float, Dict[str, float]]:
        # calculate the average embedding of each target set.
        target_0_avg_vector = np.average(target_0, axis=0)
        target_1_avg_vector = np.average(target_1, axis=0)

        sum_of_distances = 0.0
        distance_by_words = {}

        for attribute_word_index, attribute_embedding in enumerate(attribute):
            # distance of the attribute word to the first target average,
            # minus its distance to the second target average.
            current_distance = self.__calc_distance(
                attribute_embedding,
                target_0_avg_vector,
                distance_type=distance_type,
            ) - self.__calc_distance(
                attribute_embedding,
                target_1_avg_vector,
                distance_type=distance_type,
            )

            # add the distance of the neutral (attribute) word to the
            # accumulated distances.
            sum_of_distances += current_distance

            # store the distance of the neutral (attribute) word in the
            # distances-by-word mapping.
            distance_by_words[attribute_words[attribute_word_index]] = current_distance

        sorted_distances_by_word = {
            k: v
            for k, v in sorted(distance_by_words.items(), key=lambda item: item[1])
        }

        # calculate the average of the distances and return.
        mean_distance = sum_of_distances / len(distance_by_words)
        return mean_distance, sorted_distances_by_word
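
    # With distance_type="norm", __calc_rnd computes, for each attribute
    # embedding a, the signed difference
    #     d(a) = ||a - avg(target_0)|| - ||a - avg(target_1)||
    # and returns its mean over the attribute set together with the per-word
    # values. Negative values indicate that a word lies closer to the first
    # target set; positive values, closer to the second.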
    def run_query(
        self,
        query: Query,
        model: WordEmbeddingModel,
        distance: str = "norm",
        lost_vocabulary_threshold: float = 0.2,
        preprocessors: List[Dict[str, Union[str, bool, Callable]]] = [{}],
        strategy: str = "first",
        normalize: bool = False,
        warn_not_found_words: bool = False,
        *args: Any,
        **kwargs: Any
    ) -> Dict[str, Any]:
        """Calculate the RND metric over the provided parameters.

        Parameters
        ----------
        query : Query
            A Query object that contains the target and attribute sets to
            be tested.

        model : WordEmbeddingModel
            A word embedding model.

        distance : str, optional
            Specifies which type of distance will be calculated. It could be:
            {norm, cos}, by default 'norm'.

        lost_vocabulary_threshold : float, optional
            Specifies the proportion of words of any query set that is allowed
            to be lost when transforming the words into embeddings. If any set
            loses more words than this proportion, the result of the query is
            np.nan, by default 0.2.

        preprocessors : List[Dict[str, Union[str, bool, Callable]]]
            A list with preprocessor options.

            A ``preprocessor`` is a dictionary that specifies what processing(s)
            are performed on each word before it is looked up in the model
            vocabulary. For example, the ``preprocessor``
            ``{'lowercase': True, 'strip_accents': True}`` allows you to
            lowercase and remove the accents from each word before searching
            for them in the model vocabulary. Note that an empty dictionary
            ``{}`` indicates that no preprocessing is done.

            The possible options for a preprocessor are:

            * ``lowercase``: ``bool``. Indicates that the words are transformed
              to lowercase.
            * ``uppercase``: ``bool``. Indicates that the words are transformed
              to uppercase.
            * ``titlecase``: ``bool``. Indicates that the words are transformed
              to titlecase.
            * ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies
              that the accents of the words are eliminated. The stripping type
              can be specified. True uses 'unicode' by default.
            * ``preprocessor``: ``Callable``. It receives a function that
              operates on each word. In the case of specifying a function, it
              overrides the default preprocessor (i.e., the previous options
              stop working).

            A list of preprocessor options allows you to search for several
            variants of the words in the model. For example, the preprocessors
            ``[{}, {"lowercase": True, "strip_accents": True}]`` first search
            for the original words in the model vocabulary. For the words not
            found, ``{"lowercase": True, "strip_accents": True}`` is applied
            and the transformed words are searched in the model vocabulary.

        strategy : str, optional
            The strategy indicates how it will use the preprocessed words:
            'first' will include only the first transformed word found. 'all'
            will include all transformed words found, by default "first".

        normalize : bool, optional
            True indicates that embeddings will be normalized, by default False.

        warn_not_found_words : bool, optional
            Specifies if the function will warn (in the logger) the words that
            were not found in the model's vocabulary, by default False.

        Returns
        -------
        Dict[str, Any]
            A dictionary with the query name, the resulting score of the
            metric, and a dictionary with the distances of each attribute word
            with respect to the target set means.

        Examples
        --------
        >>> from wefe.metrics import RND
        >>> from wefe.query import Query
        >>> from wefe.utils import load_test_model
        >>>
        >>> # define the query
        >>> query = Query(
        ...     target_sets=[
        ...         ["female", "woman", "girl", "sister", "she", "her", "hers",
        ...          "daughter"],
        ...         ["male", "man", "boy", "brother", "he", "him", "his", "son"],
        ...     ],
        ...     attribute_sets=[
        ...         [
        ...             "home", "parents", "children", "family", "cousins", "marriage",
        ...             "wedding", "relatives",
        ...         ],
        ...     ],
        ...     target_sets_names=["Female terms", "Male Terms"],
        ...     attribute_sets_names=["Family"],
        ... )
        >>>
        >>> # load the model (in this case, the test model included in wefe)
        >>> model = load_test_model()
        >>>
        >>> # instance the metric and run the query
        >>> RND().run_query(query, model) # doctest: +SKIP
        {'query_name': 'Female terms and Male Terms wrt Family',
         'result': 0.030381828546524048,
         'rnd': 0.030381828546524048,
         'distances_by_word': {'wedding': -0.1056304,
                               'marriage': -0.10163283,
                               'children': -0.068374634,
                               'parents': 0.00097084045,
                               'relatives': 0.0483346,
                               'family': 0.12408042,
                               'cousins': 0.17195654,
                               'home': 0.1733501}}
        >>>
        >>> # if you want the embeddings to be normalized before calculating the metrics
        >>> # use the normalize parameter as True before executing the query.
        >>> RND().run_query(query, model, normalize=True) # doctest: +SKIP
        {'query_name': 'Female terms and Male Terms wrt Family',
         'result': -0.006278775632381439,
         'rnd': -0.006278775632381439,
         'distances_by_word': {'children': -0.05244279,
                               'wedding': -0.04642248,
                               'marriage': -0.04268837,
                               'parents': -0.022358716,
                               'relatives': 0.005497098,
                               'family': 0.023389697,
                               'home': 0.04009247,
                               'cousins': 0.044702888}}
        >>>
        >>> # if you want to use cosine distance instead of euclidean norm
        >>> # use the distance parameter as 'cos' before executing the query.
        >>> RND().run_query(query, model, normalize=True, distance='cos') # doctest: +SKIP
        {'query_name': 'Female terms and Male Terms wrt Family',
         'result': 0.03643466345965862,
         'rnd': 0.03643466345965862,
         'distances_by_word': {'cousins': -0.035989374,
                               'home': -0.026971221,
                               'family': -0.009296179,
                               'relatives': 0.015690982,
                               'parents': 0.051281124,
                               'children': 0.09255883,
                               'marriage': 0.09959312,
                               'wedding': 0.104610026}}
        """
        # check the types of the provided arguments (only the defaults).
        self._check_input(query, model, locals())

        # transform query word sets into embeddings
        embeddings = get_embeddings_from_query(
            model=model,
            query=query,
            lost_vocabulary_threshold=lost_vocabulary_threshold,
            preprocessors=preprocessors,
            strategy=strategy,
            normalize=normalize,
            warn_not_found_words=warn_not_found_words,
        )

        # if any set lost more words than the allowed threshold,
        # return the default value (nan).
        if embeddings is None:
            return {
                "query_name": query.query_name,
                "result": np.nan,
                "rnd": np.nan,
                "distances_by_word": {},
            }

        # get the target and attribute sets transformed into embeddings.
        target_sets, attribute_sets = embeddings

        # get only the embeddings of the sets.
        target_embeddings = list(target_sets.values())
        attribute_embeddings = list(attribute_sets.values())

        target_0_embeddings = np.array(list(target_embeddings[0].values()))
        target_1_embeddings = np.array(list(target_embeddings[1].values()))
        attribute_0_embeddings = np.array(list(attribute_embeddings[0].values()))

        # get a list with the transformed attribute words.
        attribute_0_words = list(attribute_embeddings[0].keys())

        rnd, distances_by_word = self.__calc_rnd(
            target_0_embeddings,
            target_1_embeddings,
            attribute_0_embeddings,
            attribute_0_words,
            distance,
        )

        return {
            "query_name": query.query_name,
            "result": rnd,
            "rnd": rnd,
            "distances_by_word": distances_by_word,
        }
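

# A minimal numeric sketch (hypothetical toy embeddings, not part of the WEFE
# API) of the quantity RND computes when distance="norm":
#
#     d(a) = ||a - avg(T0)|| - ||a - avg(T1)||,   RND = mean of d over a in A.
#
if __name__ == "__main__":
    # hypothetical 2-dimensional embeddings for two target sets (T0, T1)
    # and one attribute set (A).
    T0 = np.array([[1.0, 0.0], [0.8, 0.2]])
    T1 = np.array([[0.0, 1.0], [0.2, 0.8]])
    A = np.array([[0.9, 0.1], [0.5, 0.5]])

    # per-word signed distance differences: negative values lean toward T0,
    # positive values toward T1.
    d = np.linalg.norm(A - T0.mean(axis=0), axis=1) - np.linalg.norm(
        A - T1.mean(axis=0), axis=1
    )
    print(d.mean())  # the RND score for this toy query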