"""Mine substitutions with various mining models.
This module defines several classes and mixins to mine substitutions in the
MemeTracker dataset with a series of different models.
:class:`Time`, :class:`Source`, :class:`Past` and :class:`Durl` together define
how a substitution :class:`Model` behaves. :class:`Interval` is a utility class
used internally in :class:`Model`. The :class:`ClusterMinerMixin` mixin builds
on this definition of a substitution model to provide
:meth:`ClusterMinerMixin.substitutions` which iterates over all valid
substitutions in a :class:`~.db.Cluster`. Finally,
:func:`mine_substitutions_with_model` brings :class:`ClusterMinerMixin` and
:class:`SubstitutionValidatorMixin` (which checks for spam substitutions)
together to mine for all substitutions in the dataset for a given
:class:`Model`.
"""
from enum import Enum, unique
from datetime import timedelta, datetime
import logging
import click
from progressbar import ProgressBar
import numpy as np
from nltk.corpus import wordnet
from brainscopypaste.conf import settings
from brainscopypaste.utils import (is_int, is_same_ending_us_uk_spelling,
stopwords, levenshtein, subhamming,
session_scope, memoized)
logger = logging.getLogger(__name__)
[docs]def mine_substitutions_with_model(model, limit=None):
"""Mine all substitutions in the MemeTracker dataset conforming to `model`.
Iterates through the whole MemeTracker dataset to find all substitutions
that are considered valid by `model`, and save the results to the database.
The MemeTracker dataset must have been loaded and filtered previously, or
an excetion will be raised (see :ref:`usage` or :mod:`.cli` for more about
that). Mined substitutions are saved each time the function moves to a new
cluster, and progress is printed to stdout. The number of substitutions
seen and the number of substitutions kept (i.e. validated by
:meth:`SubstitutionValidatorMixin.validate`) are also printed to stdout.
Parameters
----------
model : :class:`Model`
The substitution model to use for mining.
limit : int, optional
If not `None` (default), mining will stop after `limit` clusters have
been examined.
Raises
------
Exception
If no filtered clusters are found in the database, or if there already
are some substitutions from model `model` in the database.
"""
from brainscopypaste.db import Cluster, Substitution
logger.info('Mining clusters for substitutions')
if limit is not None:
logger.info('Mining is limited to %s clusters', limit)
click.echo('Mining clusters for substitutions with {}{}...'
.format(model, '' if limit is None
else ' (limit={})'.format(limit)))
# Check we haven't already mined substitutions with this model.
with session_scope() as session:
substitution_count = session.query(Substitution)\
.filter(Substitution.model == model).count()
if substitution_count != 0:
raise Exception(('The database already contains substitutions '
'mined with this model ({} - {} substitutions). '
'You should drop these before doing anything '
'else.'.format(model, substitution_count)))
# Check clusters have been filtered.
with session_scope() as session:
if session.query(Cluster)\
.filter(Cluster.filtered.is_(True)).count() == 0:
raise Exception('Found no filtered clusters, aborting.')
query = session.query(Cluster.id).filter(Cluster.filtered.is_(True))
if limit is not None:
query = query.limit(limit)
cluster_ids = [id for (id,) in query]
logger.info('Got %s clusters to mine', len(cluster_ids))
# Mine.
seen = 0
kept = 0
for cluster_id in ProgressBar()(cluster_ids):
model.drop_caches()
with session_scope() as session:
cluster = session.query(Cluster).get(cluster_id)
for substitution in cluster.substitutions(model):
seen += 1
if substitution.validate():
logger.debug('Found valid substitution in cluster #%s',
cluster.sid)
kept += 1
session.commit()
else:
logger.debug('Dropping substitution from cluster #%s',
cluster.sid)
session.rollback()
# Sanity check. This session business is tricky.
with session_scope() as session:
assert session.query(Substitution)\
.filter(Substitution.model == model).count() == kept
click.secho('OK', fg='green', bold=True)
logger.info('Seen %s candidate substitutions, kept %s', seen, kept)
click.echo('Seen {} candidate substitutions, kept {}.'.format(seen, kept))
@unique
[docs]class Time(Enum):
"""Type of time that determines the positioning of occurrence bins."""
#: Continuous time: bins are sliding, end at the destination occurrence,
#: and start :attr:`Model.bin_span` before that.
continuous = 1
#: Discrete time: bins are aligned at midnight, end at or before the
#: destination occurrence, and start :attr:`Model.bin_span` before that.
discrete = 2
@unique
[docs]class Source(Enum):
"""Type of quotes accepted as substitution sources."""
#: All quotes are potential sources for substitutions.
all = 1
#: Majority rule: only quotes that are the most frequent in the considered
#: past bin can be the source of substitutions (note that several quotes in
#: a single bin can have the same maximal frequency).
majority = 2
@unique
[docs]class Past(Enum):
"""How far back in the past can a substitution find its source."""
#: The past is everything: substitution sources can be in any bin preceding
#: the destination occurrence (which is an interval that can end at
#: midnight before the destination occurrence when using
#: :attr:`Time.discrete`).
all = 1
#: The past is the last bin: substitution sources must be in the bin
#: preceding the destination occurrence (which can end at midnight before
#: the destination occurrence when using :attr:`Time.discrete`).
last_bin = 2
@unique
[docs]class Durl(Enum):
"""Type of quotes accepted as substitution destinations."""
#: All quotes are potential destinations for substitutions.
all = 1
#: Excluded past rule: only quotes that do not appear in what :class:`Time`
#: and :class:`Past` define as "the past" can be the destination of a
#: substitution.
exclude_past = 2
[docs]class Interval:
"""Time interval defined by `start` and `end`
:class:`~datetime.datetime`\ s.
Parameters
----------
start : :class:datetime.datetime
The interval's start (or left) bound.
end : :class:datetime.datetime
The interval's end (or right) bound.
Raises
------
Exception
If `start` is strictly after `end` in time.
Examples
--------
Test if a :class:`~datetime.datetime` is in an interval:
>>> from datetime import datetime
>>> itv = Interval(datetime(2016, 7, 5, 12, 15, 5),
... datetime(2016, 7, 9, 13, 30, 0))
>>> datetime(2016, 7, 8) in itv
True
>>> datetime(2016, 8, 1) in itv
False
"""
def __init__(self, start, end):
assert start <= end
self.start = start
self.end = end
def __contains__(self, other):
"""Test if `other` is in this :class:`Interval`."""
return self.start <= other < self.end
def __key(self):
"""Unique identifier for this interval, used to compute e.g. equality
between two :class:`Interval` instances."""
return (self.start, self.end)
def __eq__(self, other):
"""Determine if two instances represent the same interval (underlies
e.g. ``itv1 == itv2``)"""
return self.__key() == other.__key()
def __hash__(self):
"""Hash for this interval (makes this class hashable, so usable e.g. as
dict keys)."""
return hash(self.__key())
def __repr__(self):
"""String representation of this interval."""
return 'Interval(start={0.start}, end={0.end})'.format(self)
[docs]class Model:
"""Substitution mining model.
A mining model is defined by the combination of one parameter for each of
:class:`Time`, :class:`Source`, :class:`Past`, :class:`Durl`, and a maximum
hamming distance between source string (or substring) and destination
string. This class represents such a model. It defines a couple of utility
functions used in :class:`ClusterMinerMixin` (:meth:`find_start` and
:meth:`past_surls`), and a :meth:`validate` method which determines if a
given substitution conforms to the model. Other methods, prefixed with an
underscore, are utilities for the methods cited above.
Parameters
----------
time : :class:`Time`
Type of time defining how occurrence bins of the model are positioned.
source : :class:`Source`
Type of quotes that the model accepts as substitution sources.
past : :class:`Past`
How far back does the model look for substitution sources.
durl : :class:`Durl`
Type of quotes that the model accepts as substitution destinations.
max_distance : int
Maximum number of substitutions between a source string (or substring)
and a destination string that the model will detect.
Raises
------
Exception
If `max_distance` is more than half of
:data:`~.settings.MT_FILTER_MIN_TOKENS`.
"""
#: Span of occurrence bins the model makes.
bin_span = timedelta(days=1)
def __init__(self, time, source, past, durl, max_distance):
assert time in Time
self.time = time
assert source in Source
self.source = source
assert past in Past
self.past = past
assert durl in Durl
self.durl = durl
assert 0 < max_distance <= settings.MT_FILTER_MIN_TOKENS // 2
self.max_distance = max_distance
#: dict associating a :class:`Source` to its validation method.
self._source_validation_table = {
Source.all: self._ok,
Source.majority: self._validate_source_majority
}
#: dict associating a :class:`Durl` to its validation method.
self._durl_validation_table = {
Durl.all: self._ok,
Durl.exclude_past: self._validate_durl_exclude_past
}
def __repr__(self):
"""String representation of this model."""
return ('Model(time={0.time}, source={0.source}, past={0.past}, '
'durl={0.durl}, max_distance={0.max_distance})').format(self)
@memoized
[docs] def validate(self, source, durl):
"""Test if potential substitutions from `source` quote to `durl`
destination url are valid for this model.
This method is :func:`~.utils.memoized` for performance.
Parameters
----------
source : :class:`~.db.Quote`
Candidate source quote for substitutions; the substitutions can be
from a substring of `source.string`.
durl : :class:`~.db.Url`
Candidate destination url for the substitutions.
Returns
-------
bool
`True` if the proposed source and destination url are considered
valid by this model, `False` otherwise.
"""
return (self._validate_distance(source, durl) and
self._validate_base(source, durl) and
self._validate_source(source, durl) and
self._validate_durl(source, durl))
[docs] def _validate_distance(self, source, durl):
"""Check that `source` and `durl` differ by no more than
`self.max_distance`."""
return 0 < self._distance_start(source, durl)[0] <= self.max_distance
[docs] def _validate_base(self, source, durl):
"""Check that `source` has at least one occurrence in what this model
considers to be the past before `durl`."""
past = self._past(source.cluster, durl)
return np.any([url.timestamp in past for url in source.urls])
[docs] def _validate_source(self, source, durl):
"""Check that `source` is an acceptable substitution source for this
model.
This method proxies to the proper validation method, depending on the
value of `self.source`.
"""
return self._source_validation_table[self.source](source, durl)
[docs] def _validate_durl(self, source, durl):
"""Check that `durl` is an acceptable substitution destination
occurrence for this model.
This method proxies to the proper validation method, depending on the
value of `self.durl`.
"""
return self._durl_validation_table[self.durl](source, durl)
[docs] def _ok(self, *args, **kwargs):
"""Dummy method used when a validation should always pass."""
return True
[docs] def _validate_source_majority(self, source, durl):
"""Check that `source` verifies the majority rule."""
# Source must be a majority quote in `past`.
past_quote_ids = np.array([surl.quote.id for surl in
self.past_surls(source.cluster, durl)])
if source.id not in past_quote_ids:
return False
counts = dict((i, c) for (i, c) in
zip(*np.unique(past_quote_ids, return_counts=True)))
if len(counts) == 0:
return False
return counts[source.id] == max(counts.values())
[docs] def _validate_durl_exclude_past(self, source, durl):
"""Check that `durl` verifies the excluded past rule."""
# Durl.quote must not be in `past`.
past_quotes = [surl.quote for surl in
self.past_surls(source.cluster, durl)]
return durl.quote not in past_quotes
[docs] def _distance_start(self, source, durl):
"""Get a `(distance, start)` tuple indicating the minimal distance
between `source` and `durl`, and the position of `source`'s substring
that achieves that minimum.
This is in fact an alias for what the model considers to be valid
transformations and how to define them, but provides proper
encapsulation of concerns.
"""
# We allow for substrings.
# Note here that there can be a difference in lemmas without
# there being a difference in tokens, because of fluctuations
# in lemmatization. This is caught later on in the validation
# of substitutions (see SubstitutionValidatorMixin.validate()),
# instead of making this function more complicated.
return subhamming(source.lemmas, durl.quote.lemmas)
[docs] def find_start(self, source, durl):
"""Get the position of the substring of `source` that achieves minimal
distance to `durl`."""
return self._distance_start(source, durl)[1]
@memoized
[docs] def past_surls(self, cluster, durl):
"""Get the list of all :class:`~.db.Url`\ s that are in what this model
considers to be the past before `durl`.
This method is :func:`~.utils.memoized` for performance.
"""
past = self._past(cluster, durl)
return list(filter(lambda url: url.timestamp in past, cluster.urls))
@memoized
[docs] def _past(self, cluster, durl):
"""Get an :class:`Interval` representing what this model considers to
be the past before `durl`.
See :class:`Time` and :class:`Past` to understand what this interval
looks like. This method is :func:`~.utils.memoized` for performance.
"""
cluster_start = min([url.timestamp for url in cluster.urls])
# The bins are aligned to midnight, so get the midnight
# before cluster start.
cluster_bin_start = datetime(year=cluster_start.year,
month=cluster_start.month,
day=cluster_start.day)
# Check our known `time` types.
assert self.time in [Time.continuous, Time.discrete]
if self.time is Time.continuous:
# Time is continuous.
end = durl.timestamp
else:
# Time is discrete.
previous_bin_count = (durl.timestamp -
cluster_bin_start) // self.bin_span
end = max(cluster_start,
cluster_bin_start + previous_bin_count * self.bin_span)
# Check our known `past` types.
assert self.past in [Past.all, Past.last_bin]
if self.past is Past.all:
# The past is everything until the start of the cluster.
start = cluster_start
else:
# The past is only the last bin.
start = max(cluster_start, end - self.bin_span)
return Interval(start, end)
[docs] def drop_caches(self):
"""Drop the caches of all :func:`~.utils.memoized` methods of the
class."""
self.validate.drop_cache()
self.past_surls.drop_cache()
self._past.drop_cache()
def __key(self):
"""Unique identifier for this model, used to compute e.g. equality
between two :class:`Model` instances."""
return (self.time, self.source, self.past, self.durl,
self.max_distance)
def __eq__(self, other):
"""Determine if two instances represent the same model (underlies
e.g. ``model1 == model2``)"""
return hasattr(other, '_Model__key') and self.__key() == other.__key()
def __hash__(self):
"""Hash for this model (makes this class hashable, so usable e.g. as
dict keys)."""
return hash(self.__key())
[docs]class ClusterMinerMixin:
"""Mixin for :class:`~.db.Cluster`\ s that provides substitution mining
functionality.
This mixin defines the :meth:`substitutions` method (based on the private
:meth:`_substitutions` method) that iterates through all valid
substitutions for a given :class:`Model`.
"""
[docs] def substitutions(self, model):
"""Iterate through all substitutions in this cluster considered valid
by `model`.
Multiple occurrences of a sentence at the same url (url "frequency")
are ignored, so as not to artificially inflate results.
Parameters
----------
model : :class:`Model`
Model for which to mine substitutions in this cluster.
Yields
------
substitution : :class:`~.db.Substitution`
All the substitutions in this cluster considered valid by `model`.
When `model` allows for multiple substitutions between a quote and
a destination url, each substitution is yielded individually. Any
substitution yielded is attached to this cluster, so if you use
this in a :func:`~.utils.session_scope` substitutions will be saved
automatically unless you explicitly rollback the session.
"""
# Iterate through candidate substitutions.
for durl in self.urls:
past_quotes_set = set([surl.quote for surl in
model.past_surls(self, durl)])
# Don't test against ourselves.
past_quotes_set.discard(durl.quote)
for source in past_quotes_set:
# Source can't be shorter than destination
if len(source.lemmas) < len(durl.quote.lemmas):
continue
# Check distance, source and durl validity.
if model.validate(source, durl):
logger.debug('Found candidate substitution(s) between '
'quote #%s and durl #%s/%s', source.sid,
durl.quote.sid, durl.occurrence)
for substitution in self._substitutions(source, durl,
model):
yield substitution
@classmethod
[docs] def _substitutions(cls, source, durl, model):
"""Iterate through all substitutions from `source` to `durl` considered
valid by `model`.
This method yields all the substitutions between `source` and `durl`
when `model` allows for multiple substitutions.
Parameters
----------
source : :class:`~.db.Quote`
Source for the substitutions.
durl : :class:`~.db.Url`
Destination url for the substitutions.
model : :class:`Model`
Model that validates the substitutions between `source` and `durl`.
"""
from brainscopypaste.db import Substitution
start = model.find_start(source, durl)
dlemmas = durl.quote.lemmas
slemmas = source.lemmas[start:start + len(dlemmas)]
positions = np.where([c1 != c2
for (c1, c2) in zip(slemmas, dlemmas)])[0]
assert 0 < len(positions) <= model.max_distance
for position in positions:
yield Substitution(source=source, destination=durl.quote,
occurrence=durl.occurrence, start=int(start),
position=int(position), model=model)
@memoized
[docs]def _get_wordnet_words():
"""Get the set of all words known by WordNet.
This is the set of all lemma names for all synonym sets in WordNet.
"""
return set(word.lower()
for synset in wordnet.all_synsets()
for word in synset.lemma_names())
[docs]class SubstitutionValidatorMixin:
"""Mixin for :class:`~.db.Substitution` that adds validation functionality.
A non-negligible part of the substitutions found by
:class:`ClusterMinerMixin` are spam or changes we're not interested in:
minor spelling changes, abbreviations, changes of articles, symptoms of a
deleted word that appear as substitutions, etc. This class defines the
:meth:`validate` method, which tests for all these cases and returns
whether or not the substitution is worth keeping.
"""
[docs] def validate(self):
"""Check whether or not this substitution is worth keeping."""
token1, token2 = self.tokens
lem1, lem2 = self.lemmas
tokens1, tokens2 = self.source.tokens, self.destination.tokens
lemmas1, lemmas2 = self.source.lemmas, self.destination.lemmas
# Only real-word lemmas.
wordnet_words = _get_wordnet_words()
if lem1 not in wordnet_words or lem2 not in wordnet_words:
return False
# '21st'/'twenty-first', etc.
if (is_int(token1[0]) or is_int(token2[0]) or
is_int(lem1[0]) or is_int(lem2[0])):
return False
# 'sen'/'senator', 'gov'/'governor', 'nov'/'november', etc.
if (token1 == token2[:3] or token2 == token1[:3] or
lem1 == lem2[:3] or lem2 == lem1[:3]):
return False
# 'programme'/'program', etc.
if (token1[:-2] == token2 or token2[:-2] == token1 or
lem1[:-2] == lem2 or lem2[:-2] == lem1):
return False
# 'centre'/'center', etc.
if is_same_ending_us_uk_spelling(token1, token2):
return False
if is_same_ending_us_uk_spelling(lem1, lem2):
return False
# stopwords
if (token1 in stopwords or token2 in stopwords or
lem1 in stopwords or lem2 in stopwords):
return False
# Other minor spelling changes, also catching cases where tokens are
# not different but lemmas are (because of lemmatization fluctuations).
if levenshtein(token1, token2) <= 1:
return False
if levenshtein(lem1, lem2) <= 1:
return False
# Word deletion ('high school' -> 'school')
if (self.start + self.position > 0 and
(token2 == tokens1[self.start + self.position - 1] or
lem2 == lemmas1[self.start + self.position - 1])):
return False
if (self.start + self.position < len(tokens1) - 1 and
(token2 == tokens1[self.start + self.position + 1] or
lem2 == lemmas1[self.start + self.position + 1])):
return False
# Word insertion ('school' -> 'high school')
if (self.position > 0 and
(token1 == tokens2[self.position - 1] or
lem1 == lemmas2[self.position - 1])):
return False
if (self.position < len(tokens2) - 1 and
(token1 == tokens2[self.position + 1] or
lem1 == lemmas2[self.position + 1])):
return False
# Two words deletion ('supply of energy' -> 'supply')
if (self.start + self.position > 1 and
(token2 == tokens1[self.start + self.position - 2] or
lem2 == lemmas1[self.start + self.position - 2])):
return False
if (self.start + self.position < len(tokens1) - 2 and
(token2 == tokens1[self.start + self.position + 2] or
lem2 == lemmas1[self.start + self.position + 2])):
return False
# Words stuck together ('policy maker' -> 'policymaker'
# or 'policy-maker')
if (self.start + self.position > 0 and
(token2 == tokens1[self.start + self.position - 1] + token1 or
token2 == tokens1[self.start + self.position - 1] +
'-' + token1 or
lem2 == lemmas1[self.start + self.position - 1] + lem1 or
lem2 == lemmas1[self.start + self.position - 1] + '-' + lem1)):
return False
if (self.start + self.position < len(tokens1) - 1 and
(token2 == token1 + tokens1[self.start + self.position + 1] or
token2 == token1 + '-' +
tokens1[self.start + self.position + 1] or
lem2 == lem1 + lemmas1[self.start + self.position + 1] or
lem2 == lem1 + '-' + lemmas1[self.start + self.position + 1])):
return False
# Words separated ('policymaker' or 'policy-maker' -> 'policy maker')
if (self.position > 0 and
(token1 == tokens2[self.position - 1] + token2 or
token1 == tokens2[self.position - 1] + '-' + token2 or
lem1 == lemmas2[self.position - 1] + lem2 or
lem1 == lemmas2[self.position - 1] + '-' + lem2)):
return False
if (self.position < len(tokens2) - 1 and
(token1 == token2 + tokens2[self.position + 1] or
token1 == token2 + '-' + tokens2[self.position + 1] or
lem1 == lem2 + lemmas2[self.position + 1] or
lem1 == lem2 + '-' + lemmas2[self.position + 1])):
return False
# We need 2 extra checks compare to the words-stuck-together situation,
# to detect teh second substitution appearing because of word
# separation. Indeed in this case, contrary to words-stuck-together, we
# can't rely on word shifts always being present, since the destination
# can be cut shorter. In other words, in the following case:
# (1) i'll come anytime there
# (2) i'll come any time
# these checks let us exclude 'there' -> 'time' as a substitution (in
# the words-stuck-together case, the word 'there' would be present in
# both sentences, shifted).
if (self.position > 0 and
(tokens1[self.start + self.position - 1] ==
tokens2[self.position - 1] + token2 or
tokens1[self.start + self.position - 1] ==
tokens2[self.position - 1] + '-' + token2 or
lemmas1[self.start + self.position - 1] ==
lemmas2[self.position - 1] + lem2 or
lemmas1[self.start + self.position - 1] ==
lemmas2[self.position - 1] + '-' + lem2)):
return False
if (self.position < len(tokens2) - 1 and
(tokens1[self.start + self.position + 1] ==
token2 + tokens2[self.position + 1] or
tokens1[self.start + self.position + 1] ==
token2 + '-' + tokens2[self.position + 1] or
lemmas1[self.start + self.position + 1] ==
lem2 + lemmas2[self.position + 1] or
lemmas1[self.start + self.position + 1] ==
lem2 + '-' + lemmas2[self.position + 1])):
return False
return True