# Source code for brainscopypaste.mine

"""Mine substitutions with various mining models.

This module defines several classes and mixins to mine substitutions in the
MemeTracker dataset with a series of different models.

:class:`Time`, :class:`Source`, :class:`Past` and :class:`Durl` together define
how a substitution :class:`Model` behaves. :class:`Interval` is a utility class
used internally in :class:`Model`. The :class:`ClusterMinerMixin` mixin builds
on this definition of a substitution model to provide
:meth:`ClusterMinerMixin.substitutions` which iterates over all valid
substitutions in a :class:`~.db.Cluster`. Finally,
:func:`mine_substitutions_with_model` brings :class:`ClusterMinerMixin` and
:class:`SubstitutionValidatorMixin` (which checks for spam substitutions)
together to mine for all substitutions in the dataset for a given
:class:`Model`.

"""


from enum import Enum, unique
from datetime import timedelta, datetime
import logging

import click
from progressbar import ProgressBar
import numpy as np
from nltk.corpus import wordnet

from brainscopypaste.conf import settings
from brainscopypaste.utils import (is_int, is_same_ending_us_uk_spelling,
                                   stopwords, levenshtein, subhamming,
                                   session_scope, memoized)


logger = logging.getLogger(__name__)


def mine_substitutions_with_model(model, limit=None):
    """Mine and store every substitution in the dataset that fits `model`.

    Walks through all filtered clusters of the MemeTracker dataset, asks each
    cluster for the substitutions `model` considers valid, and keeps in the
    database those that also pass :meth:`SubstitutionValidatorMixin.validate`.

    The MemeTracker dataset must have been loaded and filtered beforehand, or
    an exception will be raised (see :ref:`usage` or :mod:`.cli` for more
    about that).

    Each candidate substitution is committed or rolled back as soon as it has
    been examined, a progress bar is shown while clusters are processed, and
    the number of substitutions seen and kept is printed to stdout once
    mining is over.

    Parameters
    ----------
    model : :class:`Model`
        The substitution model to use for mining.
    limit : int, optional
        If not `None` (default), mining will stop after `limit` clusters have
        been examined.

    Raises
    ------
    Exception
        If no filtered clusters are found in the database, or if there already
        are some substitutions from model `model` in the database.

    """

    # Imported here rather than at module level, as in the original code.
    from brainscopypaste.db import Cluster, Substitution

    logger.info('Mining clusters for substitutions')
    if limit is None:
        limit_note = ''
    else:
        logger.info('Mining is limited to %s clusters', limit)
        limit_note = ' (limit={})'.format(limit)
    click.echo('Mining clusters for substitutions with {}{}...'
               .format(model, limit_note))

    # Refuse to mine twice with the same model.
    with session_scope() as session:
        already_mined = (session.query(Substitution)
                         .filter(Substitution.model == model)
                         .count())
        if already_mined != 0:
            raise Exception(('The database already contains substitutions '
                             'mined with this model ({} - {} substitutions). '
                             'You should drop these before doing anything '
                             'else.'.format(model, already_mined)))

    # Make sure filtering has been done, then collect the ids to work on.
    with session_scope() as session:
        filtered_count = (session.query(Cluster)
                          .filter(Cluster.filtered.is_(True))
                          .count())
        if filtered_count == 0:
            raise Exception('Found no filtered clusters, aborting.')

        id_query = session.query(Cluster.id)\
            .filter(Cluster.filtered.is_(True))
        if limit is not None:
            id_query = id_query.limit(limit)
        cluster_ids = [cluster_id for (cluster_id,) in id_query]

    logger.info('Got %s clusters to mine', len(cluster_ids))

    # Mine, one session per cluster.
    seen = 0
    kept = 0
    for cluster_id in ProgressBar()(cluster_ids):
        model.drop_caches()
        with session_scope() as session:
            cluster = session.query(Cluster).get(cluster_id)
            for substitution in cluster.substitutions(model):
                seen += 1
                if not substitution.validate():
                    logger.debug('Dropping substitution from cluster #%s',
                                 cluster.sid)
                    session.rollback()
                    continue

                logger.debug('Found valid substitution in cluster #%s',
                             cluster.sid)
                kept += 1
                session.commit()

    # Sanity check: what is stored must match what we counted as kept.
    with session_scope() as session:
        stored = (session.query(Substitution)
                  .filter(Substitution.model == model)
                  .count())
        assert stored == kept

    click.secho('OK', fg='green', bold=True)
    logger.info('Seen %s candidate substitutions, kept %s', seen, kept)
    click.echo('Seen {} candidate substitutions, kept {}.'.format(seen, kept))
@unique
class Time(Enum):
    """Kind of time used to position occurrence bins."""

    #: Continuous time: a bin slides with the destination occurrence; it ends
    #: at that occurrence and starts :attr:`Model.bin_span` earlier.
    continuous = 1

    #: Discrete time: bins are aligned on midnight; a bin ends at or before
    #: the destination occurrence and starts :attr:`Model.bin_span` earlier.
    discrete = 2
@unique
class Source(Enum):
    """Kind of quotes that may act as the source of a substitution."""

    #: Any quote can be the source of a substitution.
    all = 1

    #: Majority rule: a quote can be a source only if it is (one of) the most
    #: frequent quotes in the considered past bin. Several quotes of a single
    #: bin may share that maximal frequency.
    majority = 2
@unique
class Past(Enum):
    """How far back a substitution is allowed to look for its source."""

    #: Everything counts as the past: a source can sit in any bin preceding
    #: the destination occurrence (an interval that can end at midnight
    #: before the destination occurrence when using :attr:`Time.discrete`).
    all = 1

    #: Only the last bin counts as the past: a source must sit in the bin
    #: preceding the destination occurrence (which can end at midnight before
    #: the destination occurrence when using :attr:`Time.discrete`).
    last_bin = 2
@unique
class Durl(Enum):
    """Kind of quotes that may act as the destination of a substitution."""

    #: Any quote can be the destination of a substitution.
    all = 1

    #: Excluded past rule: a quote can be a destination only if it does not
    #: appear in what :class:`Time` and :class:`Past` define as "the past".
    exclude_past = 2
class Interval:
    r"""Time interval defined by `start` and `end`
    :class:`~datetime.datetime`\ s.

    The interval is half-open: `start` belongs to it, `end` does not.

    Parameters
    ----------
    start : :class:`datetime.datetime`
        The interval's start (or left) bound.
    end : :class:`datetime.datetime`
        The interval's end (or right) bound.

    Raises
    ------
    AssertionError
        If `start` is strictly after `end` in time.

    Examples
    --------
    Test if a :class:`~datetime.datetime` is in an interval:

    >>> from datetime import datetime
    >>> itv = Interval(datetime(2016, 7, 5, 12, 15, 5),
    ...                datetime(2016, 7, 9, 13, 30, 0))
    >>> datetime(2016, 7, 8) in itv
    True
    >>> datetime(2016, 8, 1) in itv
    False

    """

    def __init__(self, start, end):
        # Raised explicitly (rather than through an `assert` statement) so
        # the check survives `python -O`, while callers still see the same
        # exception type as before.
        if not start <= end:
            raise AssertionError(
                'Interval start ({}) is after its end ({})'.format(start, end))
        self.start = start
        self.end = end

    def __contains__(self, other):
        """Test if `other` is in this :class:`Interval` (`end` excluded)."""
        return self.start <= other < self.end

    def __key(self):
        """Unique identifier for this interval, used to compute e.g. equality
        between two :class:`Interval` instances."""
        return (self.start, self.end)

    def __eq__(self, other):
        """Determine if two instances represent the same interval (underlies
        e.g. ``itv1 == itv2``).

        Comparing with something that is not an :class:`Interval` yields
        `NotImplemented`, so ``itv == 5`` evaluates to `False` instead of
        raising :exc:`AttributeError`.
        """
        if not isinstance(other, Interval):
            return NotImplemented
        return self.__key() == other.__key()

    def __hash__(self):
        """Hash for this interval (makes this class hashable, so usable e.g.
        as dict keys)."""
        return hash(self.__key())

    def __repr__(self):
        """String representation of this interval."""
        return 'Interval(start={0.start}, end={0.end})'.format(self)
class Model:
    """Substitution mining model.

    A mining model is defined by the combination of one parameter for each of
    :class:`Time`, :class:`Source`, :class:`Past`, :class:`Durl`, and a maximum
    hamming distance between source string (or substring) and destination
    string. This class represents such a model. It defines a couple of utility
    functions used in :class:`ClusterMinerMixin` (:meth:`find_start` and
    :meth:`past_surls`), and a :meth:`validate` method which determines if a
    given substitution conforms to the model. Other methods, prefixed with an
    underscore, are utilities for the methods cited above.

    Parameters
    ----------
    time : :class:`Time`
        Type of time defining how occurrence bins of the model are positioned.
    source : :class:`Source`
        Type of quotes that the model accepts as substitution sources.
    past : :class:`Past`
        How far back does the model look for substitution sources.
    durl : :class:`Durl`
        Type of quotes that the model accepts as substitution destinations.
    max_distance : int
        Maximum number of substitutions between a source string (or substring)
        and a destination string that the model will detect.

    Raises
    ------
    AssertionError
        If `time`, `source`, `past` or `durl` is not a member of its
        respective enum, or if `max_distance` is not strictly positive or is
        more than half of :data:`~.settings.MT_FILTER_MIN_TOKENS`.

    """

    #: Span of occurrence bins the model makes.
    bin_span = timedelta(days=1)

    def __init__(self, time, source, past, durl, max_distance):
        # `isinstance` is used instead of `value in EnumClass`: the latter
        # raises TypeError for non-members before Python 3.12, and from 3.12
        # on accepts raw values (e.g. `1`), which the identity tests in
        # `_past()` and the validation tables below would not recognise.
        assert isinstance(time, Time)
        self.time = time
        assert isinstance(source, Source)
        self.source = source
        assert isinstance(past, Past)
        self.past = past
        assert isinstance(durl, Durl)
        self.durl = durl
        assert 0 < max_distance <= settings.MT_FILTER_MIN_TOKENS // 2
        self.max_distance = max_distance

        #: dict associating a :class:`Source` to its validation method.
        self._source_validation_table = {
            Source.all: self._ok,
            Source.majority: self._validate_source_majority
        }

        #: dict associating a :class:`Durl` to its validation method.
        self._durl_validation_table = {
            Durl.all: self._ok,
            Durl.exclude_past: self._validate_durl_exclude_past
        }

    def __repr__(self):
        """String representation of this model."""
        return ('Model(time={0.time}, source={0.source}, past={0.past}, '
                'durl={0.durl}, max_distance={0.max_distance})').format(self)

    @memoized
    def validate(self, source, durl):
        """Test if potential substitutions from `source` quote to `durl`
        destination url are valid for this model.

        This method is :func:`~.utils.memoized` for performance.

        Parameters
        ----------
        source : :class:`~.db.Quote`
            Candidate source quote for substitutions; the substitutions can be
            from a substring of `source.string`.
        durl : :class:`~.db.Url`
            Candidate destination url for the substitutions.

        Returns
        -------
        bool
            `True` if the proposed source and destination url are considered
            valid by this model, `False` otherwise.

        """

        # The checks short-circuit in this order: distance first, then the
        # presence of `source` in the past, then the source and durl rules.
        return (self._validate_distance(source, durl) and
                self._validate_base(source, durl) and
                self._validate_source(source, durl) and
                self._validate_durl(source, durl))

    def _validate_distance(self, source, durl):
        """Check that `source` and `durl` differ by no more than
        `self.max_distance` (and by at least one item)."""
        return 0 < self._distance_start(source, durl)[0] <= self.max_distance

    def _validate_base(self, source, durl):
        """Check that `source` has at least one occurrence in what this model
        considers to be the past before `durl`."""
        past = self._past(source.cluster, durl)
        # Built-in `any` stops at the first hit and returns a plain `bool`.
        return any(url.timestamp in past for url in source.urls)

    def _validate_source(self, source, durl):
        """Check that `source` is an acceptable substitution source for this
        model.

        This method proxies to the proper validation method, depending on the
        value of `self.source`.

        """
        return self._source_validation_table[self.source](source, durl)

    def _validate_durl(self, source, durl):
        """Check that `durl` is an acceptable substitution destination
        occurrence for this model.

        This method proxies to the proper validation method, depending on the
        value of `self.durl`.

        """
        return self._durl_validation_table[self.durl](source, durl)

    def _ok(self, *args, **kwargs):
        """Dummy method used when a validation should always pass."""
        return True

    def _validate_source_majority(self, source, durl):
        """Check that `source` verifies the majority rule."""
        # Source must be a majority quote in `past`.
        past_quote_ids = np.array([surl.quote.id for surl in
                                   self.past_surls(source.cluster, durl)])
        if source.id not in past_quote_ids:
            return False

        # From here on `past_quote_ids` holds at least `source.id`, so the
        # counts are never empty.
        counts = dict(zip(*np.unique(past_quote_ids, return_counts=True)))
        return counts[source.id] == max(counts.values())

    def _validate_durl_exclude_past(self, source, durl):
        """Check that `durl` verifies the excluded past rule."""
        # Durl.quote must not be in `past`.
        past_quotes = [surl.quote for surl in
                       self.past_surls(source.cluster, durl)]
        return durl.quote not in past_quotes

    def _distance_start(self, source, durl):
        """Get a `(distance, start)` tuple indicating the minimal distance
        between `source` and `durl`, and the position of `source`'s substring
        that achieves that minimum.

        This is in fact an alias for what the model considers to be valid
        transformations and how to define them, but provides proper
        encapsulation of concerns.

        """

        # We allow for substrings.
        # Note here that there can be a difference in lemmas without
        # there being a difference in tokens, because of fluctuations
        # in lemmatization. This is caught later on in the validation
        # of substitutions (see SubstitutionValidatorMixin.validate()),
        # instead of making this function more complicated.
        return subhamming(source.lemmas, durl.quote.lemmas)

    def find_start(self, source, durl):
        """Get the position of the substring of `source` that achieves minimal
        distance to `durl`."""
        return self._distance_start(source, durl)[1]

    @memoized
    def past_surls(self, cluster, durl):
        r"""Get the list of all :class:`~.db.Url`\ s that are in what this
        model considers to be the past before `durl`.

        This method is :func:`~.utils.memoized` for performance.

        """
        past = self._past(cluster, durl)
        return [url for url in cluster.urls if url.timestamp in past]

    @memoized
    def _past(self, cluster, durl):
        """Get an :class:`Interval` representing what this model considers to
        be the past before `durl`.

        See :class:`Time` and :class:`Past` to understand what this interval
        looks like. This method is :func:`~.utils.memoized` for performance.

        """

        cluster_start = min(url.timestamp for url in cluster.urls)
        # The bins are aligned to midnight, so get the midnight
        # before cluster start.
        cluster_bin_start = datetime(year=cluster_start.year,
                                     month=cluster_start.month,
                                     day=cluster_start.day)

        # Check our known `time` types.
        assert self.time in [Time.continuous, Time.discrete]
        if self.time is Time.continuous:
            # Time is continuous: the past ends right at the destination.
            end = durl.timestamp
        else:
            # Time is discrete: the past ends at the last bin boundary at or
            # before the destination, but never before the cluster starts.
            previous_bin_count = ((durl.timestamp - cluster_bin_start) //
                                  self.bin_span)
            end = max(cluster_start,
                      cluster_bin_start + previous_bin_count * self.bin_span)

        # Check our known `past` types.
        assert self.past in [Past.all, Past.last_bin]
        if self.past is Past.all:
            # The past is everything until the start of the cluster.
            start = cluster_start
        else:
            # The past is only the last bin (clipped to the cluster start).
            start = max(cluster_start, end - self.bin_span)

        return Interval(start, end)

    def drop_caches(self):
        """Drop the caches of all :func:`~.utils.memoized` methods of the
        class."""
        self.validate.drop_cache()
        self.past_surls.drop_cache()
        self._past.drop_cache()

    def __key(self):
        """Unique identifier for this model, used to compute e.g. equality
        between two :class:`Model` instances."""
        return (self.time, self.source, self.past, self.durl,
                self.max_distance)

    def __eq__(self, other):
        """Determine if two instances represent the same model (underlies e.g.
        ``model1 == model2``)"""
        # `_Model__key` is the name-mangled form of `__key`; objects lacking
        # it are never equal to a model.
        return hasattr(other, '_Model__key') and self.__key() == other.__key()

    def __hash__(self):
        """Hash for this model (makes this class hashable, so usable e.g. as
        dict keys)."""
        return hash(self.__key())
class ClusterMinerMixin:
    r"""Mixin for :class:`~.db.Cluster`\ s that provides substitution mining
    functionality.

    This mixin defines the :meth:`substitutions` method (based on the private
    :meth:`_substitutions` method) that iterates through all valid
    substitutions for a given :class:`Model`.

    """

    def substitutions(self, model):
        """Iterate through all substitutions in this cluster considered valid
        by `model`.

        Multiple occurrences of a sentence at the same url (url "frequency")
        are ignored, so as not to artificially inflate results.

        Parameters
        ----------
        model : :class:`Model`
            Model for which to mine substitutions in this cluster.

        Yields
        ------
        substitution : :class:`~.db.Substitution`
            All the substitutions in this cluster considered valid by `model`.
            When `model` allows for multiple substitutions between a quote and
            a destination url, each substitution is yielded individually. Any
            substitution yielded is attached to this cluster, so if you use
            this in a :func:`~.utils.session_scope` substitutions will be saved
            automatically unless you explicitly rollback the session.

        """

        # Iterate through candidate substitutions.
        for durl in self.urls:
            # A set, so that a quote occurring several times in the past is
            # only considered once as a source.
            past_quotes_set = {surl.quote
                               for surl in model.past_surls(self, durl)}
            # Don't test against ourselves.
            past_quotes_set.discard(durl.quote)

            for source in past_quotes_set:
                # Source can't be shorter than destination
                if len(source.lemmas) < len(durl.quote.lemmas):
                    continue

                # Check distance, source and durl validity.
                if model.validate(source, durl):
                    logger.debug('Found candidate substitution(s) between '
                                 'quote #%s and durl #%s/%s', source.sid,
                                 durl.quote.sid, durl.occurrence)
                    yield from self._substitutions(source, durl, model)

    @classmethod
    def _substitutions(cls, source, durl, model):
        """Iterate through all substitutions from `source` to `durl`
        considered valid by `model`.

        This method yields all the substitutions between `source` and `durl`
        when `model` allows for multiple substitutions.

        Parameters
        ----------
        source : :class:`~.db.Quote`
            Source for the substitutions.
        durl : :class:`~.db.Url`
            Destination url for the substitutions.
        model : :class:`Model`
            Model that validates the substitutions between `source` and `durl`.

        """

        # Imported here rather than at module level, as in the original code.
        from brainscopypaste.db import Substitution

        # Align the destination with the best-matching slice of the source.
        start = model.find_start(source, durl)
        dlemmas = durl.quote.lemmas
        slemmas = source.lemmas[start:start + len(dlemmas)]

        # Every index where the aligned lemmas differ is one substitution.
        positions = [index for index, (slemma, dlemma)
                     in enumerate(zip(slemmas, dlemmas)) if slemma != dlemma]
        assert 0 < len(positions) <= model.max_distance

        for position in positions:
            # `start` is cast because `find_start` may hand back a
            # non-builtin integer type.
            yield Substitution(source=source, destination=durl.quote,
                               occurrence=durl.occurrence, start=int(start),
                               position=position, model=model)
@memoized
def _get_wordnet_words():
    """Build the set of all words WordNet knows about.

    The set gathers the lower-cased lemma names of every synonym set in
    WordNet.

    """

    known_words = set()
    for synset in wordnet.all_synsets():
        known_words.update(name.lower() for name in synset.lemma_names())
    return known_words
class SubstitutionValidatorMixin:
    """Mixin for :class:`~.db.Substitution` that adds validation
    functionality.

    A non-negligible part of the substitutions found by
    :class:`ClusterMinerMixin` are spam or changes we're not interested in:
    minor spelling changes, abbreviations, changes of articles, symptoms of a
    deleted word that appear as substitutions, etc. This class defines the
    :meth:`validate` method, which tests for all these cases and returns
    whether or not the substitution is worth keeping.

    """

    def validate(self):
        """Check whether or not this substitution is worth keeping.

        Returns
        -------
        bool
            `False` as soon as one of the spam patterns below matches, `True`
            if none does.

        """

        src_word, dst_word = self.tokens
        src_lemma, dst_lemma = self.lemmas
        src_tokens, dst_tokens = self.source.tokens, self.destination.tokens
        src_lemmas, dst_lemmas = self.source.lemmas, self.destination.lemmas

        # Only real-word lemmas.
        known_words = _get_wordnet_words()
        if src_lemma not in known_words or dst_lemma not in known_words:
            return False

        # '21st'/'twenty-first', etc.
        if (is_int(src_word[0]) or is_int(dst_word[0]) or
                is_int(src_lemma[0]) or is_int(dst_lemma[0])):
            return False

        # 'sen'/'senator', 'gov'/'governor', 'nov'/'november', etc.
        if (src_word == dst_word[:3] or dst_word == src_word[:3] or
                src_lemma == dst_lemma[:3] or dst_lemma == src_lemma[:3]):
            return False

        # 'programme'/'program', etc.
        if (src_word[:-2] == dst_word or dst_word[:-2] == src_word or
                src_lemma[:-2] == dst_lemma or dst_lemma[:-2] == src_lemma):
            return False

        # 'centre'/'center', etc.
        if is_same_ending_us_uk_spelling(src_word, dst_word):
            return False
        if is_same_ending_us_uk_spelling(src_lemma, dst_lemma):
            return False

        # stopwords
        if (src_word in stopwords or dst_word in stopwords or
                src_lemma in stopwords or dst_lemma in stopwords):
            return False

        # Other minor spelling changes, also catching cases where tokens are
        # not different but lemmas are (because of lemmatization
        # fluctuations).
        if levenshtein(src_word, dst_word) <= 1:
            return False
        if levenshtein(src_lemma, dst_lemma) <= 1:
            return False

        # Index of the substituted word in the source and in the destination,
        # and index of the last word of each.
        src_at = self.start + self.position
        dst_at = self.position
        src_last = len(src_tokens) - 1
        dst_last = len(dst_tokens) - 1

        # Word deletion ('high school' -> 'school')
        if src_at > 0:
            if (dst_word == src_tokens[src_at - 1] or
                    dst_lemma == src_lemmas[src_at - 1]):
                return False
        if src_at < src_last:
            if (dst_word == src_tokens[src_at + 1] or
                    dst_lemma == src_lemmas[src_at + 1]):
                return False

        # Word insertion ('school' -> 'high school')
        if dst_at > 0:
            if (src_word == dst_tokens[dst_at - 1] or
                    src_lemma == dst_lemmas[dst_at - 1]):
                return False
        if dst_at < dst_last:
            if (src_word == dst_tokens[dst_at + 1] or
                    src_lemma == dst_lemmas[dst_at + 1]):
                return False

        # Two words deletion ('supply of energy' -> 'supply')
        if src_at > 1:
            if (dst_word == src_tokens[src_at - 2] or
                    dst_lemma == src_lemmas[src_at - 2]):
                return False
        if src_at < src_last - 1:
            if (dst_word == src_tokens[src_at + 2] or
                    dst_lemma == src_lemmas[src_at + 2]):
                return False

        # Words stuck together ('policy maker' -> 'policymaker'
        # or 'policy-maker')
        if src_at > 0:
            if (dst_word == src_tokens[src_at - 1] + src_word or
                    dst_word == src_tokens[src_at - 1] + '-' + src_word or
                    dst_lemma == src_lemmas[src_at - 1] + src_lemma or
                    dst_lemma == src_lemmas[src_at - 1] + '-' + src_lemma):
                return False
        if src_at < src_last:
            if (dst_word == src_word + src_tokens[src_at + 1] or
                    dst_word == src_word + '-' + src_tokens[src_at + 1] or
                    dst_lemma == src_lemma + src_lemmas[src_at + 1] or
                    dst_lemma == src_lemma + '-' + src_lemmas[src_at + 1]):
                return False

        # Words separated ('policymaker' or 'policy-maker' -> 'policy maker')
        if dst_at > 0:
            if (src_word == dst_tokens[dst_at - 1] + dst_word or
                    src_word == dst_tokens[dst_at - 1] + '-' + dst_word or
                    src_lemma == dst_lemmas[dst_at - 1] + dst_lemma or
                    src_lemma == dst_lemmas[dst_at - 1] + '-' + dst_lemma):
                return False
        if dst_at < dst_last:
            if (src_word == dst_word + dst_tokens[dst_at + 1] or
                    src_word == dst_word + '-' + dst_tokens[dst_at + 1] or
                    src_lemma == dst_lemma + dst_lemmas[dst_at + 1] or
                    src_lemma == dst_lemma + '-' + dst_lemmas[dst_at + 1]):
                return False

        # We need 2 extra checks compared to the words-stuck-together
        # situation, to detect the second substitution appearing because of
        # word separation. Indeed in this case, contrary to
        # words-stuck-together, we can't rely on word shifts always being
        # present, since the destination can be cut shorter. In other words,
        # in the following case:
        # (1) i'll come anytime there
        # (2) i'll come any time
        # these checks let us exclude 'there' -> 'time' as a substitution (in
        # the words-stuck-together case, the word 'there' would be present in
        # both sentences, shifted).
        if dst_at > 0:
            if (src_tokens[src_at - 1] ==
                    dst_tokens[dst_at - 1] + dst_word or
                    src_tokens[src_at - 1] ==
                    dst_tokens[dst_at - 1] + '-' + dst_word or
                    src_lemmas[src_at - 1] ==
                    dst_lemmas[dst_at - 1] + dst_lemma or
                    src_lemmas[src_at - 1] ==
                    dst_lemmas[dst_at - 1] + '-' + dst_lemma):
                return False
        if dst_at < dst_last:
            if (src_tokens[src_at + 1] ==
                    dst_word + dst_tokens[dst_at + 1] or
                    src_tokens[src_at + 1] ==
                    dst_word + '-' + dst_tokens[dst_at + 1] or
                    src_lemmas[src_at + 1] ==
                    dst_lemma + dst_lemmas[dst_at + 1] or
                    src_lemmas[src_at + 1] ==
                    dst_lemma + '-' + dst_lemmas[dst_at + 1]):
                return False

        return True