Source code for brainscopypaste.utils

"""Miscellaneous utilities.

"""


import logging
import pickle
from contextlib import contextmanager
from itertools import zip_longest
import os

import numpy as np
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from sqlalchemy import create_engine
from decorator import decorate


logger = logging.getLogger(__name__)


class Namespace:
    """Expose the entries of a dict as attributes of an object.

    Parameters
    ----------
    init_dict : dict
        Mapping whose keys become attribute names and whose values become
        the corresponding attribute values.

    """

    def __init__(self, init_dict):
        # Copy every key/value pair straight into the instance namespace.
        vars(self).update(init_dict)
def grouper(iterable, n, fillvalue=None):
    """Yield successive `n`-tuples from `iterable`, padding the last one.

    When `iterable` runs out in the middle of a tuple, the remaining
    positions of that tuple are set to `fillvalue`.

    See :func:`grouper_adaptive` for a variant that does not pad the last
    slice.

    """

    # The *same* iterator is repeated `n` times, so each output tuple
    # consumes `n` consecutive items from it.
    shared = iter(iterable)
    slots = (shared,) * n
    return zip_longest(*slots, fillvalue=fillvalue)
def grouper_adaptive(iterable, n):
    """Iterate over `n`-wide slices of `iterable`, ending the last slice once
    `iterable` is empty.

    Each slice is itself a generator drawing from a single shared iterator,
    so a slice must be fully consumed before the next one is requested.

    When the length of `iterable` is an exact multiple of `n` (including
    zero), one trailing empty slice is produced, because exhaustion is only
    noticed when a slice tries to read past the end.

    See :func:`grouper` for a version of this that fills the last slice with
    a value of your choosing.

    Parameters
    ----------
    iterable : iterable
        The items to slice.
    n : int
        Maximum number of items per slice; must be at least 1.

    Raises
    ------
    ValueError
        If `n` is smaller than 1 (raised when iteration starts).

    """

    if n < 1:
        # With n < 1 a slice never reads the source, so exhaustion would
        # never be detected and empty slices would be yielded forever.
        raise ValueError('n must be at least 1')

    it = iter(iterable)
    keepgoing = True

    def block():
        nonlocal keepgoing
        for _ in range(n):
            try:
                yield next(it)
            except StopIteration:
                # Source exhausted: end this slice and stop the outer loop.
                keepgoing = False
                return

    while keepgoing:
        yield block()
class cache:
    """Turn a method into a lazily computed, cached instance attribute.

    Use this as a decorator on a method taking only `self`. The first time
    the attribute is read on an instance, the method is called and its
    result is stored on the instance under the same name; later reads get
    the stored value without calling the method again.

    Taken from the `Python Cookbook (Denis Otkidach)
    <http://stackoverflow.com/users/168352/denis-otkidach>`_.

    """

    def __init__(self, method, name=None):
        # Keep the undecorated function and the attribute name to cache under.
        self.method = method
        if name:
            self.name = name
        else:
            self.name = method.__name__
        self.__doc__ = method.__doc__

    def __get__(self, inst, cls):
        # `inst` is the object the attribute is read from, or None when the
        # attribute is read from the class itself (e.g. `Foo.bar`).
        if inst is None:
            return self

        # Compute the value once, then store it on the instance under
        # `self.name` so that this descriptor is not consulted again for it.
        value = self.method(inst)
        setattr(inst, self.name, value)
        return value
def _memoize(func, *args, **kwargs): # frozenset is used to ensure hashability if kwargs: key = args, frozenset(kwargs.items()) else: key = args # Attribute added by memoized cache = func.cache if key not in cache: cache[key] = func(*args, **kwargs) return cache[key]
def memoized(f):
    """Decorate a function so that its return values are cached per arguments.

    The first call with a given set of arguments evaluates `f` and stores the
    result; later calls with the same arguments return the stored value
    without reevaluating. Results are stored in a dict keyed on the call
    arguments (see :func:`_memoize`), so arguments must be hashable.

    A `drop_cache()` function is attached to `f` to discard everything cached
    so far.

    """

    def drop_cache():
        logger.debug('Dropping cache for %s', f)
        f.cache = {}

    # Both attributes live on the undecorated function, which is the object
    # `_memoize` receives and reads `cache` from.
    f.cache = {}
    f.drop_cache = drop_cache
    return decorate(f, _memoize)
def mpl_palette(n_colors, variation='Set2'):  # or variation='colorblind'
    """Get any seaborn palette as a usable matplotlib colormap.

    Returns a pair: the blended palette as a colormap (`as_cmap=True`), and
    the same blend as seaborn returns it without `as_cmap`.

    """

    # Imported here so seaborn is only needed when this function is called.
    import seaborn as sb

    base = sb.color_palette(variation, n_colors, desat=0.8)
    colormap = sb.blend_palette(base, n_colors=n_colors, as_cmap=True)
    colors = sb.blend_palette(base, n_colors=n_colors)
    return (colormap, colors)
@contextmanager
def session_scope():
    """Provide an SQLAlchemy transactional scope around a series of
    operations.

    Wrap your SQLAlchemy operations (queries, insertions, modifications,
    etc.) in a ``with session_scope() as session`` block to deal with
    sessions easily. Changes are committed when the block finishes. If an
    exception occurs in the block (or while committing), the session is
    rolled back and the exception propagated. The session is closed in every
    case.

    """

    from brainscopypaste.db import Session

    session = Session()
    logger.debug('Opened session %s', session)
    try:
        yield session
        logger.debug('Committing session %s', session)
        session.commit()
    except BaseException:
        # Deliberately catches everything (including KeyboardInterrupt and
        # GeneratorExit): roll back, then let the exception continue.
        logger.debug('Rolling back session %s', session)
        session.rollback()
        raise
    finally:
        logger.debug('Closing session %s', session)
        session.close()
def mkdirp(folder):
    """Create `folder` (and any missing parents) if it doesn't exist.

    Nothing is done when something already exists at `folder`.

    Parameters
    ----------
    folder : str
        Path of the directory to create.

    """

    if not os.path.exists(folder):
        logger.debug("Creating folder '%s'", folder)
        # exist_ok=True: another process may create the directory between
        # the check above and this call; that must not be an error.
        os.makedirs(folder, exist_ok=True)
def iter_parent_dirs(rel_dir):
    """Yield `rel_dir` joined onto the current working directory and onto
    each of its successive parent directories.

    Iteration starts at the current working directory and walks upwards; it
    stops after the directory that is its own parent.

    """

    current = os.path.abspath('.')
    while True:
        yield os.path.join(current, rel_dir)
        parent = os.path.split(current)[0]
        if parent == current:
            # Going up no longer changes the path: the top has been reached.
            return
        current = parent
def find_parent_rel_dir(rel_dir):
    """Find a relative directory in parent directories.

    Looks for `rel_dir` under the current directory, then under each of its
    parent directories in turn, and returns the first match.

    Parameters
    ----------
    rel_dir : string
        The relative directory to search for.

    Returns
    -------
    d : string
        Full path to the first found directory.

    Raises
    ------
    NotFoundError
        If no relative directory is found in the parent directories.

    """

    # `os.path.isdir` is only true for paths that exist, so it covers the
    # existence check as well.
    found = next((candidate for candidate in iter_parent_dirs(rel_dir)
                  if os.path.isdir(candidate)), None)
    if found is None:
        raise NotFoundError('No relative directory found in parent '
                            'directories')
    return found
class NotFoundError(Exception):
    """Raised when a file or directory that was looked for can't be found."""
@memoized
def langdetect(sentence):
    """Detect the language of `sentence`.

    Returns whatever :func:`langdetect.detect` returns for `sentence`, or
    `None` if detection raises `LangDetectException`.

    """

    try:
        language = detect(sentence)
    except LangDetectException:
        language = None
    return language
def execute_raw(engine, statement):
    """Execute the raw SQL statement `statement` on SQLAlchemy engine
    `engine`.

    Useful to run ANALYZE or VACUUM operations on the database. The
    statement is run on the underlying DBAPI connection with its isolation
    level temporarily set to 0 (presumably the driver's autocommit level,
    so the statement runs outside a transaction -- confirm against the
    driver documentation). The previous isolation level is restored and the
    connection closed even if the statement fails.

    Parameters
    ----------
    engine : :class:`sqlalchemy.engine.Engine`
        The engine to run `statement` on.
    statement : str
        A valid SQL statement for `engine`.

    """

    logger.debug("Raw execution of SQL '%s'", statement)
    connection = engine.connect()
    try:
        raw_connection = connection.connection
        old_isolation_level = raw_connection.isolation_level
        raw_connection.set_isolation_level(0)
        try:
            with raw_connection.cursor() as cursor:
                cursor.execute(statement)
        finally:
            # Never hand back a connection left at the temporary level.
            raw_connection.set_isolation_level(old_isolation_level)
    finally:
        connection.close()
@memoized
def is_same_ending_us_uk_spelling(w1, w2):
    """Test if `w1` and `w2` differ by only the last two letters inverted, as
    in `center`/`centre` (words must be at least 4 letters)."""

    # Words too short
    if min(len(w1), len(w2)) < 4:
        return False

    stem1, ending1 = w1[:-2], w1[-2:]
    stem2, ending2 = w2[:-2], w2[-2:]

    # Everything before the last two letters must match, and the last two
    # letters of one word must be those of the other in reverse order.
    return stem1 == stem2 and ending1[::-1] == ending2
@memoized
def is_int(s):
    """Test if `s` is a string that represents an integer; returns `True` if
    so, `False` in any other case.

    Anything that is not a `str` (including `bytes` and actual numbers)
    gives `False`. A `str` gives `True` exactly when `int(s)` accepts it.

    """

    # `bytes` is not a `str`, so this single check rejects it too.
    if not isinstance(s, str):
        return False

    try:
        int(s)
    except (ValueError, TypeError):
        return False
    return True
@memoized
def levenshtein(s1, s2):
    """Compute the levenshtein distance between strings or lists `s1` and
    `s2`."""

    # Make sure the first sequence is the longer one.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # Against an empty sequence, the distance is the other one's length.
    if not s2:
        return len(s1)

    # `above` is the previous row of the edit-distance table and `current`
    # the row being filled; both have one more entry than `s2`.
    above = list(range(len(s2) + 1))
    for row, item1 in enumerate(s1, start=1):
        current = [row]
        for col, item2 in enumerate(s2):
            cost = min(
                above[col + 1] + 1,               # insertion
                current[col] + 1,                 # deletion
                above[col] + (item1 != item2),    # substitution
            )
            current.append(cost)
        above = current

    return above[-1]
@memoized
def hamming(s1, s2):
    """Compute the hamming distance between strings or lists `s1` and `s2`.

    Returns
    -------
    int
        The number of positions at which `s1` and `s2` differ.

    Raises
    ------
    ValueError
        If `s1` and `s2` don't have the same length.

    """

    if len(s1) != len(s2):
        raise ValueError('Strings must be the same length.')

    # Builtin `sum`: passing a generator to `np.sum` is deprecated.
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
@memoized
def sublists(s, l):
    """Get all sublists of `s` of length `l`.

    The result is a tuple of the contiguous slices of `s` of length `l`, in
    order of their start position; it is empty when `l` is 0.

    Raises
    ------
    ValueError
        If `l` is greater than the length of `s`.

    """

    if l == 0:
        return ()

    size = len(s)
    if l > size:
        raise ValueError('Sublists must be shorter or as long as source.')

    last_start = size - l
    return tuple(s[start:start + l] for start in range(last_start + 1))
@memoized
def subhamming(s1, s2):
    """Compute the minimum hamming distance between `s2` and all sublists of
    `s1` as long as `s2`, returning `(distance, sublist start in s1)`.

    When several sublists are at the minimum distance, the start of the
    first one is returned. An empty `s2` gives `(len(s1), 0)`.

    Returns
    -------
    (int, int)
        The minimum distance and the start index in `s1` of the sublist
        achieving it.

    Raises
    ------
    ValueError
        If `s2` is longer than `s1`.

    """

    l1 = len(s1)
    l2 = len(s2)

    if l2 == 0:
        return l1, 0
    if l1 < l2:
        raise ValueError('The second string must be shorter or '
                         'as long as the first one.')
    if l1 == l2:
        return hamming(s1, s2), 0

    distances = [hamming(subs, s2) for subs in sublists(s1, l2)]
    # `min` keeps the first index among equal distances. Plain ints are
    # returned for both values, rather than a NumPy integer for the index.
    start = min(range(len(distances)), key=distances.__getitem__)
    return int(distances[start]), start
class Stopwords:
    """Detect if a word is a stopword.

    The stopwords file is only read the first time a membership test
    (``word in instance``) is made.

    Prefer using this module's :data:`stopwords` instance of this class for
    stopword-checking.

    """

    def __init__(self):
        # Nothing is read from disk until the first membership test.
        self._loaded = False

    def _load(self):
        """Read and load the underlying stopwords file."""

        logger.debug('Loading stopwords')
        from brainscopypaste.conf import settings

        # One entry per line of the file, stripped and lowercased.
        with open(settings.STOPWORDS) as stopwords_file:
            words = {line.strip().lower() for line in stopwords_file}

        self._stopwords = words
        self._loaded = True

    def __contains__(self, word):
        """Test if `word` is a stopword or not."""

        if not self._loaded:
            self._load()
        return word in self._stopwords
#: Instance of :class:`Stopwords` to be used for stopword-testing.
stopwords = Stopwords()


@memoized
def unpickle(filename):
    """Load a pickle file at path `filename`.

    This function is :func:`memoized` so a file is only loaded the first
    time; later calls with the same `filename` get the cached object.

    """

    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files from a trusted source.
    with open(filename, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded
def init_db(echo_sql=False):
    """Connect to the database and bind :mod:`.db`'s `Session` object to it.

    Uses the :data:`~.settings.DB_USER` and :data:`~.settings.DB_PASSWORD`
    credentials to connect to PostgreSQL database
    :data:`~.settings.DB_NAME` on ``localhost:5432``. It binds the `Session`
    object in :mod:`.db` to this engine, creates any missing tables declared
    on `Base`, and returns the engine object.

    Note that once this is done, you can directly use :func:`session_scope`
    since it uses the right `Session` object.

    Parameters
    ----------
    echo_sql : bool, optional
        If `True`, print to stdout all SQL commands sent to the engine;
        defaults to `False`.

    Returns
    -------
    :class:`sqlalchemy.engine.Engine`
        The engine connected to the database.

    """

    from brainscopypaste.db import Base, Session
    from brainscopypaste.conf import settings

    logger.info('Initializing database connection')
    # NOTE(review): credentials are inserted into the URL as-is; confirm
    # they never contain characters that need URL-escaping.
    url = 'postgresql+psycopg2://{user}:{pw}@localhost:5432/{db}'.format(
        user=settings.DB_USER,
        pw=settings.DB_PASSWORD,
        db=settings.DB_NAME)
    engine = create_engine(url, client_encoding='utf8', echo=echo_sql)
    Session.configure(bind=engine)
    logger.info('Database connected')

    logger.debug('Checking tables to create')
    Base.metadata.create_all(engine)

    return engine