# Source code for wefe.query

import logging
import numpy as np
from itertools import combinations
from typing import Any, List, Union


class Query:
    """A container for attribute and target word sets."""

    # Defining __eq__ already removes the inherited hash; stating it
    # explicitly documents that queries are mutable and unhashable.
    __hash__ = None

    def __init__(self,
                 target_sets: List[Any],
                 attribute_sets: List[Any],
                 target_sets_names: Union[List[str], None] = None,
                 attribute_sets_names: Union[List[str], None] = None):
        """Initializes the container. It could include a name for each
        word set.

        Parameters
        ----------
        target_sets : Union[np.ndarray, list]
            Array or list that contains the target word sets.
        attribute_sets : Union[np.ndarray, list]
            Array or list that contains the attribute word sets.
        target_sets_names : Union[np.ndarray, list], optional
            Array or list that contains the target sets names,
            by default None. When omitted, or when its length differs from
            the number of target sets, default names of the form
            ``"Target set <i>"`` are used (a warning is logged in the
            mismatch case).
        attribute_sets_names : Union[np.ndarray, list], optional
            Array or list that contains the attribute sets names,
            by default None. When omitted, or when its length differs from
            the number of attribute sets, default names of the form
            ``"Attribute set <i>"`` are used (a warning is logged in the
            mismatch case).

        Attributes
        ----------
        target_sets : list
            Array or list with the lists of target words.
        attribute_sets : list
            Array or list with the lists of attribute words.
        template : tuple
            A tuple that contains the template: the cardinality of the target
            and attribute sets respectively.
        target_sets_names : list
            Array or list with the names of target sets.
        attribute_sets_names : list
            Array or list with the names of attribute sets.
        query_name : str
            A string that contains the auto-generated name of the query.

        Raises
        ------
        TypeError
            if target_sets is not a list or np.ndarray instance.
        TypeError
            if attribute_sets is not a list or np.ndarray instance.
        Exception
            if the length of target_sets is 0.
        TypeError
            if some element of target_sets is not an array or list.
        TypeError
            if some element of some target set is not a string.
        TypeError
            if some element of attribute_sets is not an array or list.
        TypeError
            if some element of some attribute set is not a string.

        Examples
        --------
        Construct a Query with 2 sets of target words and one set of
        attribute words.

        >>> male_terms = ['male', 'man', 'boy']
        >>> female_terms = ['female', 'woman', 'girl']
        >>> science_terms = ['science','technology','physics']
        >>> query = Query([male_terms, female_terms], [science_terms],
        ...               ['Male terms', 'Female terms'], ['Science terms'])
        >>> query.target_sets
        [['male', 'man', 'boy'], ['female', 'woman', 'girl']]
        >>> query.attribute_sets
        [['science', 'technology', 'physics']]
        >>> query.query_name
        'Male terms and Female terms wrt Science terms'
        """

        # Both container types are checked before any content check so the
        # order in which errors surface stays: targets, then attributes.
        for label, word_sets in (('target_sets', target_sets),
                                 ('attribute_sets', attribute_sets)):
            if not isinstance(word_sets, (list, np.ndarray)):
                raise TypeError(
                    "{} must be a numpy array or list. Given: {}".format(
                        label, type(word_sets)))

        # len() is used instead of truthiness because the truth value of a
        # multi-element numpy array is ambiguous.
        if len(target_sets) == 0:
            raise Exception('target_sets must have at least one array or list of words. '
                            'given: {}'.format(target_sets))

        # check every word of every set (targets first, then attributes).
        self._validate_word_sets(target_sets, 'target')
        self._validate_word_sets(attribute_sets, 'attribute')

        # set target and attributes sets to this instance.
        self.target_sets = target_sets
        self.attribute_sets = attribute_sets

        # set the template/cardinality (t, a) of the sets
        n_targets, n_attributes = len(target_sets), len(attribute_sets)
        self.template = (n_targets, n_attributes)

        # set target sets names, falling back to generated ones.
        default_target_names = [
            "Target set {}".format(i) for i in range(n_targets)
        ]
        if target_sets_names is None:
            self.target_sets_names = default_target_names
        elif len(target_sets_names) != n_targets:
            logging.warning(
                'target_sets_names does not have the same elements ({}) as'
                ' target_sets ({}). Setting default names'.format(
                    len(target_sets_names), n_targets))
            self.target_sets_names = default_target_names
        else:
            self.target_sets_names = target_sets_names

        # set attribute sets names, falling back to generated ones.
        default_attribute_names = [
            "Attribute set {}".format(i) for i in range(n_attributes)
        ]
        if attribute_sets_names is None:
            self.attribute_sets_names = default_attribute_names
        elif len(attribute_sets_names) != n_attributes:
            logging.warning(
                'attribute_sets_names does not have the same elements '
                ' ({}) as attribute_sets ({}). Setting default names'.format(
                    len(attribute_sets_names), n_attributes))
            self.attribute_sets_names = default_attribute_names
        else:
            self.attribute_sets_names = attribute_sets_names

        self.query_name = self._generate_query_name()

    @staticmethod
    def _validate_word_sets(word_sets, kind: str) -> None:
        """Check that every set is a list/array whose elements are strings.

        Parameters
        ----------
        word_sets : Union[np.ndarray, list]
            The collection of word sets to check.
        kind : str
            Either ``'target'`` or ``'attribute'``; only used to build the
            error messages.

        Raises
        ------
        TypeError
            if some word set is not a list or np.ndarray, or if some word
            is not a string.
        """
        for set_idx, word_set in enumerate(word_sets):
            if not isinstance(word_set, (np.ndarray, list)):
                raise TypeError(
                    'Each {} set must be a list or an array of strings. '
                    'Given: {} at postion {}'.format(
                        kind, type(word_set), set_idx))
            for word_idx, word in enumerate(word_set):
                if not isinstance(word, str):
                    raise TypeError(
                        'All elements in {} set {} must be strings. '
                        'Given: {} at position {}'.format(
                            kind, set_idx, type(word), word_idx))

    @staticmethod
    def _join_names(names) -> str:
        """Join names as ``'a'``, ``'a and b'`` or ``'a, b and c'``.

        Every name goes through ``str`` so that non-string names are
        handled the same way regardless of how many names there are.
        """
        labels = [str(name) for name in names]
        if len(labels) <= 2:
            return ' and '.join(labels)
        return ', '.join(labels[:-1]) + ' and ' + labels[-1]

    def __eq__(self, other):
        """Return True when ``other`` is a Query with the same template,
        the same word sets (in the same order) and the same set names.

        Any object that is not a Query compares unequal.
        """
        if not isinstance(other, Query):
            return False

        if self.template[0] != other.template[0]:
            return False
        if self.template[1] != other.template[1]:
            return False

        # Word sets may be numpy arrays; comparing arrays with != yields an
        # array whose truth value is ambiguous, so compare them as lists.
        for own_sets, other_sets in ((self.target_sets, other.target_sets),
                                     (self.attribute_sets,
                                      other.attribute_sets)):
            for own_set, other_set in zip(own_sets, other_sets):
                if list(own_set) != list(other_set):
                    return False

        for own_names, other_names in ((self.target_sets_names,
                                        other.target_sets_names),
                                       (self.attribute_sets_names,
                                        other.attribute_sets_names)):
            for own_name, other_name in zip(own_names, other_names):
                if own_name != other_name:
                    return False
        return True

    def get_subqueries(self, new_template: tuple) -> list:
        """Generate the subqueries from this query using the given template.

        A subquery is built for every combination of ``new_template[0]``
        target sets paired with every combination of ``new_template[1]``
        attribute sets; set names follow their sets.

        Parameters
        ----------
        new_template : tuple
            The cardinality ``(targets, attributes)`` of the subqueries.

        Returns
        -------
        list
            The subqueries, ordered by target combination first and
            attribute combination second.

        Raises
        ------
        TypeError
            if any of the two cardinalities is not an int.
        Exception
            if any of the two cardinalities is greater than the
            corresponding cardinality of this query.
        """
        new_target_card, new_attribute_card = new_template[0], new_template[1]

        if not isinstance(new_target_card, int):
            raise TypeError('The new target cardinality (new_template[0]) must be int. '
                            'Given: {}'.format(new_target_card))
        if not isinstance(new_attribute_card, int):
            raise TypeError(
                'The new attribute cardinality (new_template[1]) must be int. '
                'Given: {}'.format(new_attribute_card))

        if new_target_card > self.template[0]:
            raise Exception(
                'The new target cardinality (new_template[0]) must be equal or'
                ' less than the original target set cardinality. Given: {}'.format(
                    new_target_card))
        if new_attribute_card > self.template[1]:
            raise Exception(
                'The new attribute cardinality (new_template[1]) must be equal'
                ' or less than the original attribute set cardinality. '
                'Given: {}'.format(new_attribute_card))

        # Index combinations are materialized because the attribute ones are
        # traversed once per target combination.
        target_picks = list(
            combinations(range(self.template[0]), new_target_card))
        attribute_picks = list(
            combinations(range(self.template[1]), new_attribute_card))

        return [
            Query([self.target_sets[idx] for idx in target_pick],
                  [self.attribute_sets[idx] for idx in attribute_pick],
                  [self.target_sets_names[idx] for idx in target_pick],
                  [self.attribute_sets_names[idx] for idx in attribute_pick])
            for target_pick in target_picks
            for attribute_pick in attribute_picks
        ]

    def _generate_query_name(self) -> str:
        """Generates the query name from the name of its target and
        attribute sets.

        Returns
        -------
        str
            The name of the query: the joined target names, followed by
            ``' wrt '`` and the joined attribute names when there is at
            least one attribute set.
        """
        target = self._join_names(self.target_sets_names)

        if len(self.attribute_sets_names) == 0:
            return target

        return target + ' wrt ' + self._join_names(self.attribute_sets_names)