Source code for wefe.query

"""Module that implements the Query object."""
from itertools import combinations
from typing import Any, Dict, List, Union

import numpy as np


class Query:
    """A container for attribute and target word sets."""

    def __init__(
        self,
        target_sets: List[Any],
        attribute_sets: List[Any],
        target_sets_names: Union[List[str], None] = None,
        attribute_sets_names: Union[List[str], None] = None,
    ) -> None:
        """Initialize the container, optionally naming each word set.

        Parameters
        ----------
        target_sets : Union[np.ndarray, list]
            Array or list that contains the target word sets.
        attribute_sets : Union[np.ndarray, list]
            Array or list that contains the attribute word sets.
        target_sets_names : Union[np.ndarray, list], optional
            Array or list that contains the target sets names, by default None.
            When None, names of the form ``"Target set <i>"`` are generated.
        attribute_sets_names : Union[np.ndarray, list], optional
            Array or list that contains the attribute sets names, by default
            None. When None, names of the form ``"Attribute set <i>"`` are
            generated.

        Attributes
        ----------
        target_sets : list
            Array or list with the lists of target words.
        attribute_sets : list
            Array or list with the lists of attribute words.
        template : tuple
            A tuple that contains the template: the cardinality of the target
            and attribute sets respectively.
        target_sets_names : list
            Array or list with the names of the target sets.
        attribute_sets_names : list
            Array or list with the names of the attribute sets.
        query_name : str
            A string that contains the auto-generated name of the query.

        Raises
        ------
        TypeError
            if target_sets is not a list or np.ndarray instance.
        TypeError
            if attribute_sets is not a list or np.ndarray instance.
        Exception
            if the length of target_sets is 0.
        TypeError
            if some element of target_sets is not an array or list.
        TypeError
            if some element of some target set is not a string.
        TypeError
            if some element of attribute_sets is not an array or list.
        TypeError
            if some element of some attribute set is not a string.
        ValueError
            if target_sets_names (or attribute_sets_names) is given and its
            length differs from the number of target (or attribute) sets.

        Examples
        --------
        Construct a Query with 2 sets of target words and one set of
        attribute words.

        >>> male_terms = ['male', 'man', 'boy']
        >>> female_terms = ['female', 'woman', 'girl']
        >>> science_terms = ['science','technology','physics']
        >>> query = Query([male_terms, female_terms], [science_terms],
        ...               ['Male terms', 'Female terms'], ['Science terms'])
        >>> query.target_sets
        [['male', 'man', 'boy'], ['female', 'woman', 'girl']]
        >>> query.attribute_sets
        [['science', 'technology', 'physics']]
        >>> query.query_name
        'Male terms and Female terms wrt Science terms'
        """
        # check input type
        if not isinstance(target_sets, (list, np.ndarray)):
            raise TypeError(
                "target_sets must be a numpy array or list. Given: {}".format(
                    type(target_sets)
                )
            )
        if not isinstance(attribute_sets, (list, np.ndarray)):
            raise TypeError(
                "attribute_sets must be a numpy array or list. Given: {}".format(
                    type(attribute_sets)
                )
            )

        # check input array sizes (attribute_sets is allowed to be empty)
        if len(target_sets) == 0:
            raise Exception(
                "target_sets must have at least one array or list of words. "
                "given: {}".format(target_sets)
            )

        # check all the words that the target and attribute sets contain.
        self._check_word_sets(target_sets, "target")
        self._check_word_sets(attribute_sets, "attribute")

        # set target and attributes sets to this instance (stored by
        # reference, not copied).
        self.target_sets = target_sets
        self.attribute_sets = attribute_sets

        # set the template/cardinality (t, a) of the sets
        self.template = (len(target_sets), len(attribute_sets))

        # set target and attribute sets names.
        self.target_sets_names = self._resolve_set_names(
            target_sets_names, self.template[0], "target"
        )
        self.attribute_sets_names = self._resolve_set_names(
            attribute_sets_names, self.template[1], "attribute"
        )

        self.query_name = self._get_query_name()

    @staticmethod
    def _check_word_sets(word_sets: Any, kind: str) -> None:
        """Validate that every set is a list/array that only contains strings.

        Parameters
        ----------
        word_sets : Union[np.ndarray, list]
            The collection of word sets to validate.
        kind : str
            Either "target" or "attribute"; only used in the error messages.

        Raises
        ------
        TypeError
            if some set is not a list or np.ndarray, or if some word of some
            set is not a string.
        """
        for set_idx, word_set in enumerate(word_sets):
            if not isinstance(word_set, (np.ndarray, list)):
                raise TypeError(
                    "Each {} set must be a list or an array of strings. "
                    "Given: {} at position {}".format(kind, type(word_set), set_idx)
                )
            for word_idx, word in enumerate(word_set):
                if not isinstance(word, str):
                    raise TypeError(
                        "All elements in {} set {} must be strings. "
                        "Given: {} at position {}".format(
                            kind, set_idx, type(word), word_idx
                        )
                    )

    @staticmethod
    def _resolve_set_names(names: Any, n_sets: int, kind: str) -> Any:
        """Return the given set names or generate default ones.

        Parameters
        ----------
        names : Union[np.ndarray, list, None]
            The user provided names, or None to generate default names.
        n_sets : int
            The number of word sets that must be named.
        kind : str
            Either "target" or "attribute"; used to build the default names
            and the error message.

        Returns
        -------
        Union[np.ndarray, list]
            ``names`` itself when provided, otherwise a new list of default
            names ("Target set 0", "Target set 1", ...).

        Raises
        ------
        ValueError
            if names is provided and its length differs from n_sets.
        """
        if names is None:
            label = kind.capitalize()
            return ["{} set {}".format(label, i) for i in range(n_sets)]

        if len(names) != n_sets:
            raise ValueError(
                f"{kind}_sets (len={n_sets}) does not have the same "
                f"number of elements as {kind}_sets_names "
                f"(len={len(names)})"
            )
        return names

    @staticmethod
    def _same_word_sets(word_sets: Any, other_word_sets: Any) -> bool:
        """Check whether two collections of word sets hold the same words.

        Each set is converted to a list before being compared: comparing
        np.ndarray sets directly with ``!=`` is element-wise and cannot be
        used as a condition (it raises ValueError for more than one element).
        """
        if len(word_sets) != len(other_word_sets):
            return False
        return all(
            list(word_set) == list(other_word_set)
            for word_set, other_word_set in zip(word_sets, other_word_sets)
        )

    def __eq__(self, other: Any) -> bool:
        """Check if some object is equal to this query.

        Steps:

        - Check other type.
        - Compare template.
        - Check if the number of target sets in both queries is equal.
        - Check if the number of attribute sets in both queries is equal.
        - Check if every target set is the same in both queries.
        - Check if every attribute set is the same in both queries.
        - Check if the names of the target sets are equal in both queries.
        - Check if the names of the attribute sets are equal in both queries.

        Word sets are compared element by element, so a set stored as a list
        and a set stored as a np.ndarray with the same words are equal.

        Parameters
        ----------
        other : Any
            The object to compare.

        Returns
        -------
        bool
            True if other is the same query, False in any other case.
        """
        if not isinstance(other, Query):
            return False

        if self.template != other.template:
            return False

        if not self._same_word_sets(self.target_sets, other.target_sets):
            return False

        if not self._same_word_sets(self.attribute_sets, other.attribute_sets):
            return False

        # list(...) also normalizes names that were provided as np.ndarray.
        if list(self.target_sets_names) != list(other.target_sets_names):
            return False

        if list(self.attribute_sets_names) != list(other.attribute_sets_names):
            return False

        return True

    def __repr__(self) -> str:
        """Generate a repr that shows the name, target and attributes of the query.

        Returns
        -------
        str
            The generated representation.
        """
        try:
            repr_ = (
                "<Query: "
                + self.query_name
                + "\n- Target sets: "
                + repr(self.target_sets)
                + "\n- Attribute sets:"
                + repr(self.attribute_sets)
                + ">"
            )
            return repr_
        except AttributeError:
            # it can happen if some of the attributes (query_name, target_sets
            # or attribute_sets) are not defined.
            return "<Query with wrong __repr__>"

    def dict(self) -> Dict[str, Any]:
        """Generate a dictionary from the Query data.

        This includes the target and attribute sets, as well as their names,
        the query name generated from them and the query template.

        Returns
        -------
        Dict[str, Any]
            The dictionary generated with the query data.
        """
        return {
            "target_sets": self.target_sets,
            "attribute_sets": self.attribute_sets,
            "target_sets_names": self.target_sets_names,
            "attribute_sets_names": self.attribute_sets_names,
            "query_name": self.query_name,
            "template": self.template,
        }

    def get_subqueries(self, new_template: tuple) -> List["Query"]:
        """Generate the subqueries from this query using the given template.

        A subquery is built for every combination of ``new_template[0]``
        target sets with every combination of ``new_template[1]`` attribute
        sets. The names of the selected sets are carried to each subquery.

        Parameters
        ----------
        new_template : tuple
            The (target, attribute) cardinalities of the subqueries.

        Returns
        -------
        List[Query]
            The subqueries, ordered by target combination first and by
            attribute combination second.

        Raises
        ------
        TypeError
            if some of the cardinalities of new_template is not an int.
        Exception
            if some of the cardinalities of new_template is greater than the
            corresponding cardinality of this query.
        """
        if not isinstance(new_template[0], int):
            raise TypeError(
                "The new target cardinality (new_template[0]) must be int. "
                "Given: {}".format(new_template[0])
            )
        if not isinstance(new_template[1], int):
            raise TypeError(
                "The new attribute cardinality (new_template[1]) must be int. "
                "Given: {}".format(new_template[1])
            )

        if new_template[0] > self.template[0]:
            raise Exception(
                "The new target cardinality (new_template[0]) must be equal or"
                " less than the original target set cardinality. Given: {}".format(
                    new_template[0]
                )
            )
        if new_template[1] > self.template[1]:
            raise Exception(
                "The new attribute cardinality (new_template[1]) must be equal"
                " or less than the original attribute set cardinality. "
                "Given: {}".format(new_template[1])
            )

        # materialized because the attribute combinations are iterated once
        # per target combination.
        target_combinations = list(
            combinations(range(self.template[0]), new_template[0])
        )
        attribute_combinations = list(
            combinations(range(self.template[1]), new_template[1])
        )

        return [
            Query(
                [self.target_sets[idx] for idx in target_idxs],
                [self.attribute_sets[idx] for idx in attribute_idxs],
                [self.target_sets_names[idx] for idx in target_idxs],
                [self.attribute_sets_names[idx] for idx in attribute_idxs],
            )
            for target_idxs in target_combinations
            for attribute_idxs in attribute_combinations
        ]

    @staticmethod
    def _join_names(names: Any) -> str:
        """Join set names as "a", "a and b" or "a, b and c".

        Every name is converted with ``str`` before being joined.
        """
        names = [str(name) for name in names]
        if len(names) == 1:
            return names[0]
        return ", ".join(names[:-1]) + " and " + names[-1]

    def _get_query_name(self) -> str:
        """Generate the query name from the name of its target and attribute sets.

        Returns
        -------
        str
            The name of the query: "<targets> wrt <attributes>", or only
            "<targets>" when the query has no attribute sets.
        """
        target = self._join_names(self.target_sets_names)

        if len(self.attribute_sets_names) == 0:
            return target

        return target + " wrt " + self._join_names(self.attribute_sets_names)