Source code for wefe.query

"""Module that implements the Query object."""

from itertools import combinations
from typing import Any

import numpy as np



[docs]
class Query:
    """A container for attribute and target word sets."""


[docs]
    def __init__(
        self,
        target_sets: list[Any],
        attribute_sets: list[Any],
        target_sets_names: list[str] | None = None,
        attribute_sets_names: list[str] | None = None,
    ) -> None:
        """Initializes the container. It could include a name for each word set.

        Parameters
        ----------
        target_sets : Union[np.ndarray, list]
            Array or list that contains the target word sets.
        attribute_sets : Union[np.ndarray, Iterable]
            Array or list that contains the attribute word sets.
        target_sets_names : Union[np.ndarray, Iterable], optional
            Array or list that contains the word sets names, by default None
        attribute_sets_names : Union[np.ndarray, Iterable], optional
            Array or list that contains the attribute sets names,
            by default None

        Attributes
        ----------
        target_sets : list
            Array or list with the lists of target words.
        attribute_sets : list
            Array or list with the lists of target words.
        template : tuple
            A tuple that contains the template: the cardinality of the target
            and attribute sets respectively.
        target_sets_names : list
            Array or list with the names of target sets.
        attribute_sets_names : list
            Array or list with the lists of target words.
        query_name : str
            A string that contains the auto-generated name of the query.

        Raises
        ------
        TypeError
            if target_sets are not an iterable or np.ndarray instance.
        TypeError
            if attribute_sets are not an iterable or np.ndarray instance.
        Exception
            if the length of target_sets is 0.
        TypeError
            if some element of target_sets is not an array or list.
        TypeError
            if some element of some target set is not an string.
        TypeError
            if some element of attribute_sets is not an array or list.
        TypeError
            if some element of some attribute set is not an string.

        Examples
        --------
        Construct a Query with 2 sets of target words and one set of
        attribute words.

        >>> male_terms = ['male', 'man', 'boy']
        >>> female_terms = ['female', 'woman', 'girl']
        >>> science_terms = ['science','technology','physics']
        >>> query = Query([male_terms, female_terms], [science_terms],
        ...               ['Male terms', 'Female terms'], ['Science terms'])
        >>> query.target_sets
        [['male', 'man', 'boy'], ['female', 'woman', 'girl']]
        >>> query.attribute_sets
        [['science', 'technology', 'physics']]
        >>> query.query_name
        'Male terms and Female terms wrt Science terms'

        """
        # check input type
        if not isinstance(target_sets, list | np.ndarray):
            raise TypeError(
                f"target_sets must be a numpy array or list. Given: {type(target_sets)}"
            )

        if not isinstance(attribute_sets, list | np.ndarray):
            raise TypeError(
                f"attribute_sets must be a numpy array or list. "
                f"Given: {type(attribute_sets)}"
            )

        # check input array sizes
        if len(target_sets) == 0:
            raise Exception(
                "target_sets must have at least one array or list of words. "
                f"given: {target_sets}"
            )

        # check all words that target sets contains.
        for idx, target_set in enumerate(target_sets):
            if not isinstance(target_set, np.ndarray | list):
                raise TypeError(
                    "Each target set must be a list or an array of strings. "
                    f"Given: {type(target_set)} at postion {idx}"
                )
            for word_idx, word in enumerate(target_set):
                if not isinstance(word, str):
                    raise TypeError(
                        f"All elements in target set {idx} must be strings. "
                        f"Given: {type(word)} at position {word_idx}"
                    )

        # check all words that attribute sets contains.
        for idx, attribute_set in enumerate(attribute_sets):
            if not isinstance(attribute_set, np.ndarray | list):
                raise TypeError(
                    "Each attribute set must be a list or an array of strings."
                    f" Given: {type(attribute_set)} at postion {idx}"
                )
            for word_idx, word in enumerate(attribute_set):
                if not isinstance(word, str):
                    raise TypeError(
                        f"All elements in attribute set {idx} must be strings. "
                        f"Given: {type(word)} at position {word_idx}"
                    )

        # set target and attributes sets to this instance.
        self.target_sets = target_sets
        self.attribute_sets = attribute_sets

        # set the template/cardinality (t, a) of the sets
        self.template = (len(target_sets), len(attribute_sets))

        # set target sets names.
        if target_sets_names is None:
            self.target_sets_names = [
                f"Target set {i}" for i in range(self.template[0])
            ]
        else:
            if len(target_sets_names) != self.template[0]:
                raise ValueError(
                    f"target_sets (len={self.template[0]}) does not have the same "
                    "number of elements as target_sets_names "
                    f"(len={len(target_sets_names)})"
                )
            self.target_sets_names = target_sets_names

        # set attribute and attribute sets names.
        if attribute_sets_names is None:
            self.attribute_sets_names = [
                f"Attribute set {i}" for i in range(self.template[1])
            ]
        else:
            if len(attribute_sets_names) != self.template[1]:
                raise ValueError(
                    f"attribute_sets (len={self.template[1]}) does not have the same "
                    "number of elements as attribute_sets_names "
                    f"(len={len(attribute_sets_names)})"
                )
            self.attribute_sets_names = attribute_sets_names

        self.query_name = self._get_query_name()


    def __eq__(self, other: Any) -> bool:
        """Check if some object is equal to this query.

        Steps:
        - Check other type.
        - Compare template.
        - Check if the number of target sets in both queries is equal.
        - Check if the number of attribute sets in both queries is equal.
        - Check if every target set is the same in both queries.
        - Check if every attribute set is the same in both queries.
        - Check if the names of the target sets are equal in both queries.
        - Check if the names of the attribute sets are equal in both queries.

        Parameters
        ----------
        other : Any
            The object to compare.

        Returns
        -------
        bool
            True if other is the same query, False in any other case.

        """
        if not isinstance(other, Query):
            return False

        if self.template[0] != other.template[0]:
            return False
        if self.template[1] != other.template[1]:
            return False

        if len(self.target_sets) != len(other.target_sets):
            return False
        if len(self.attribute_sets) != len(other.attribute_sets):
            return False

        for target_set, other_target_set in zip(
            self.target_sets,
            other.target_sets,
            strict=False,
        ):
            if target_set != other_target_set:
                return False

        for attribute_set, other_attribute_set in zip(
            self.attribute_sets,
            other.attribute_sets,
            strict=False,
        ):
            if attribute_set != other_attribute_set:
                return False

        for names, other_names in zip(
            self.target_sets_names,
            other.target_sets_names,
            strict=False,
        ):
            if names != other_names:
                return False

        for names, other_names in zip(
            self.attribute_sets_names,
            other.attribute_sets_names,
            strict=False,
        ):
            if names != other_names:
                return False
        return True

    def __repr__(self) -> str:
        """Generates a repr that shows the name, target and attributes of the query.

        Returns
        -------
        str
            The generated representation.

        """
        try:
            repr_ = (
                "<Query: "
                + self.query_name
                + "\n- Target sets: "
                + repr(self.target_sets)
                + "\n- Attribute sets:"
                + repr(self.attribute_sets)
                + ">"
            )
            return repr_
        except AttributeError:
            # it can happen if some of the attributes (query_name, target_sets
            # or attribute_sets) are not defined.
            return "<Query with wrong __repr__>"


[docs]
    def dict(self) -> dict[str, Any]:
        """Generate a dictionary from the Query data.

        This includes the target and attribute sets, as well as their names,
        the query name generated from them and the query template.

        Returns
        -------
        Dict[str, Any]
            The dictionary generated with the query data.

        """
        return {
            "target_sets": self.target_sets,
            "attribute_sets": self.attribute_sets,
            "target_sets_names": self.target_sets_names,
            "attribute_sets_names": self.attribute_sets_names,
            "query_name": self.query_name,
            "template": self.template,
        }



[docs]
    def get_subqueries(self, new_template: tuple) -> list:
        """Generate the subqueries from this query using the given template."""
        if not isinstance(new_template[0], int):
            raise TypeError(
                "The new target cardinality (new_template[0]) must be int. "
                f"Given: {new_template[0]}"
            )
        if not isinstance(new_template[1], int):
            raise TypeError(
                "The new attribute cardinality (new_template[1]) must be int. "
                f"Given: {new_template[1]}"
            )

        if new_template[0] > self.template[0]:
            raise Exception(
                "The new target cardinality (new_template[0]) must be equal or "
                f"less than the original target set cardinality. "
                f"Given: {new_template[0]}"
            )
        if new_template[1] > self.template[1]:
            raise Exception(
                "The new attribute cardinality (new_template[1]) must be equal"
                " or less than the original attribute set cardinality. "
                f"Given: {new_template[1]}"
            )

        target_combinations = list(
            combinations(range(self.template[0]), new_template[0])
        )
        attribute_combinations = list(
            combinations(range(self.template[1]), new_template[1])
        )

        target_subsets = [
            [self.target_sets[idx] for idx in combination]
            for combination in target_combinations
        ]
        target_subsets_names = [
            [self.target_sets_names[idx] for idx in combination]
            for combination in target_combinations
        ]
        attribute_subsets = [
            [self.attribute_sets[idx] for idx in combination]
            for combination in attribute_combinations
        ]
        attribute_subsets_names = [
            [self.attribute_sets_names[idx] for idx in combination]
            for combination in attribute_combinations
        ]

        subqueries = [
            [
                Query(
                    target_subset,
                    attribute_subset,
                    target_subset_name,
                    attribute_subset_name,
                )
                for attribute_subset, attribute_subset_name in zip(
                    attribute_subsets, attribute_subsets_names, strict=False
                )
            ]
            for target_subset, target_subset_name in zip(
                target_subsets, target_subsets_names, strict=False
            )
        ]

        return np.array(subqueries).flatten().tolist()


    def _get_query_name(self) -> str:
        """Generate the query name from the name of its target and attribute sets.

        Returns
        -------
        str
            The name of the query.

        """
        target_sets_names = self.target_sets_names
        attribute_sets_names = self.attribute_sets_names

        if len(target_sets_names) == 1:
            target = target_sets_names[0]
        elif len(target_sets_names) == 2:
            target = target_sets_names[0] + " and " + target_sets_names[1]
        else:
            target = (
                ", ".join([str(x) for x in target_sets_names[0:-1]])
                + " and "
                + target_sets_names[-1]
            )

        if len(attribute_sets_names) == 0:
            return target

        if len(attribute_sets_names) == 1:
            attribute = attribute_sets_names[0]
        elif len(attribute_sets_names) == 2:
            attribute = attribute_sets_names[0] + " and " + attribute_sets_names[1]
        else:
            attribute = (
                ", ".join([str(x) for x in attribute_sets_names[0:-1]])
                + " and "
                + attribute_sets_names[-1]
            )

        return target + " wrt " + attribute