# Source code for cam.sgnmt.misc.sparse

# -*- coding: utf-8 -*-
# coding=utf-8
# Copyright 2019 The SGNMT Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module adds support for sparse input or output features. In
standard NMT we normally use a one-hot-representation, and input and
output layers are lookup tables (embedding matrices). The Blocks
NMT implementation in SGNMT supports explicit definition of word 
representations as sparse features, in which more than one neuron can 
be activated at a time.
"""

from abc import abstractmethod
from cam.sgnmt.misc.trie import SimpleTrie
import copy
import logging
import numpy as np
import operator


def sparse_euclidean2(v1, v2):
    """Calculates the squared Euclidean distance between two sparse
    vectors.

    Args:
        v1 (dict): First sparse vector
        v2 (dict): Second sparse vector

    Returns:
        float. Squared distance between ``v1`` and ``v2``.
    """
    total = 0.0
    # Dimensions present in v1 (v2 contributes 0.0 where it has no entry)
    for dim, val in v1.items():
        diff = val - v2.get(dim, 0.0)
        total += diff * diff
    # Dimensions only present in v2
    total += sum(val * val for dim, val in v2.items() if dim not in v1)
    return total
def sparse_euclidean(v1, v2):
    """Calculates the Euclidean distance between two sparse vectors.

    Args:
        v1 (dict): First sparse vector
        v2 (dict): Second sparse vector

    Returns:
        float. Distance between ``v1`` and ``v2``.
    """
    squared = sparse_euclidean2(v1, v2)
    return np.sqrt(squared)
def dense_euclidean2(v1, v2):
    """Calculates the squared Euclidean distance between two dense
    vectors.

    Args:
        v1 (dict): First dense vector
        v2 (dict): Second dense vector

    Returns:
        float. Squared distance between ``v1`` and ``v2``.
    """
    # zip truncates to the shorter of the two sequences
    return sum((x - y) * (x - y) for x, y in zip(v1, v2))
def dense_euclidean(v1, v2):
    """Calculates the Euclidean distance between two dense vectors.

    Args:
        v1 (dict): First dense vector
        v2 (dict): Second dense vector

    Returns:
        float. Distance between ``v1`` and ``v2``.
    """
    squared = dense_euclidean2(v1, v2)
    return np.sqrt(squared)
class SparseFeatMap(object):
    """Super class for mapping strategies between sparse feature
    representations and symbolic word indices. Subclasses must provide
    the two directions of the translation in ``sparse2word`` and
    ``word2sparse``; the dense helpers build on top of them.
    """

    def __init__(self, dim):
        """Initializes this map.

        Args:
            dim (int): Dimensionality of the feature representation
        """
        self.dim = dim
        # All-zero dense vector used as fallback/base representation
        self.null_vec = np.zeros(self.dim, dtype=np.float32)

    @abstractmethod
    def sparse2word(self, feat):
        """Gets the word id for a sparse feature. The sparse feature
        format is a list of tuples [(dim1,val1),..,(dimN,valN)]

        Args:
            feat (list): Sparse feature to look up

        Returns:
            int. Word ID of a match for ``feat`` or ``None`` is no
            match could be found

        Raises:
            NotImplementedError.
        """
        raise NotImplementedError

    @abstractmethod
    def word2sparse(self, word):
        """Gets the sparse feature representation for a word.

        Args:
            word (int): Word ID

        Returns:
            list. Sparse feature representation for ``word`` or
            ``None`` if the word could not be converted

        Raises:
            NotImplementedError.
        """
        raise NotImplementedError

    def sparse2nwords(self, feat, n=1):
        """Returns the n closest words to ``feat``. Subclasses can
        implement this method to implement search for words. The
        default implementation returns the single best word only.

        Args:
            feat (list): Sparse feature
            n (int): Number of words to retrieve

        Returns:
            list: List of (wordid, distance) tuples with words which
            are close to ``feat``.

        Note:
            The default implementation does not use the ``n`` argument
            and always returns distance 0.
        """
        return [(self.sparse2word(feat), 0.0)]

    def word2dense(self, word):
        """Gets the feature representation in dense format, i.e. a
        ``self.dim``-dimensional vector as list.

        Args:
            word(int): Word ID

        Returns:
            list. Dense vector corresponding to ``word`` or the null
            vector if no representation found
        """
        return self.sparse2dense(self.word2sparse(word))

    def words2dense(self, seq):
        """Applies ``word2dense`` to a word sequence. """
        return list(map(self.word2dense, seq))

    def dense2sparse(self, dense, eps=0.5):
        """Converts a dense vector to a sparse vector.

        Args:
            dense (list): Dense vector (list of length n)
            eps (float): Values smaller than this are set to zero in
                         the sparse representation

        Returns:
            list. List of (dimension, value) tuples (sparse vector)
        """
        return [(idx, val) for idx, val in enumerate(dense) if val > eps]

    def sparse2dense(self, sparse):
        """Converts a sparse vector to its dense representation.

        Args:
            sparse (list): Sparse vector (list of tuples)

        Raises:
            IndexError. If the input vector exceeds the dimensionality
            of this map
        """
        vec = copy.copy(self.null_vec)
        # A falsy argument (None / empty list) yields the null vector
        for dim, val in (sparse or []):
            vec[dim] = float(val)
        return vec

    def dense2word(self, feat):
        """Gets the word id for a dense feature.

        Args:
            feat (list): Dense feature vector to look up

        Returns:
            int. Word ID of a match for ``feat`` or ``None`` is no
            match could be found

        Raises:
            NotImplementedError.
        """
        return self.sparse2word(self.dense2sparse(feat))

    def dense2nwords(self, feat, n=1):
        """Returns the n closest words to ``feat``.

        Args:
            feat (list): Dense feature vector
            n (int): Number of words to retrieve

        Returns:
            list: List of (wordid, distance) tuples with words which
            are close to ``feat``.

        Note:
            The default implementation does not use the ``n`` argument
            and always returns distance 0.
        """
        return self.sparse2nwords(self.dense2sparse(feat), n)

    def dense2words(self, seq):
        """Applies ``dense2word`` to a sequence. """
        return list(map(self.dense2word, seq))
class FlatSparseFeatMap(SparseFeatMap):
    """Can be used as replacement if a ``SparseFeatMap`` is required
    but you wish to use flat word ids. It overrides the dense methods
    with the identities; the sparse conversions are unsupported and
    raise ``NotImplementedError``.
    """

    def __init__(self, dim=0):
        """
        Args:
            dim (int): not used
        """
        super(FlatSparseFeatMap, self).__init__(0)

    def sparse2word(self, feat):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: originally ``return NotImplementedError``, which
        # silently handed the exception *class* back to the caller
        # instead of raising it.
        raise NotImplementedError

    def word2sparse(self, word):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def word2dense(self, word):
        """Identity. """
        return word

    def words2dense(self, seq):
        """Identity. """
        return seq

    def dense2sparse(self, dense, eps=0.3):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def sparse2dense(self, sparse):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def dense2word(self, feat):
        """Identity. """
        return feat
class TrivialSparseFeatMap(SparseFeatMap):
    """This is the null-object (GoF) implementation for
    ``SparseFeatMap``. It corresponds to the usual one-hot
    representation: a word maps to a single activated dimension.
    """

    def __init__(self, dim):
        """Pass through to ``SparseFeatMap``.

        Args:
            dim (int): Dimensionality of the feature representation
                       (should be the vocabulary size)
        """
        super(TrivialSparseFeatMap, self).__init__(dim)

    def sparse2word(self, feat):
        """Returns feat[0][0] """
        dim_id, _ = feat[0]
        return dim_id

    def word2sparse(self, word):
        """Returns [(word, 1)] """
        # One-hot: single active dimension at the word index
        return [(word, 1)]
class FileBasedFeatMap(SparseFeatMap):
    """This class loads the mapping from word to sparse feature from a
    file (see ``--src_sparse_feat_map`` and ``--trg_sparse_feat_map``).

    The mapping from word to feature is a simple dictionary lookup.
    The mapping from feature to word is implemented with a Trie based
    nearest neighbor implementation and does not require an exact
    match. However, in case of an exact match, it runs linearly in the
    number of non-zero entries in the vector.
    """

    def __init__(self, dim, path):
        """Loads the feature map from the file system. Each line has
        the format ``<word-id> <dim>:<val>[,<dim>:<val>...]``.

        Args:
            dim (int). Dimensionality of the feature space
            path (string). Path to the mapping file

        Raises:
            IOError. If the file could not be loaded
        """
        super(FileBasedFeatMap, self).__init__(dim)
        self.f2w = None  # Trie for feature->word, built lazily in _load_f2w
        self.w2f = {}    # word id -> sparse feature (list of (dim, val))
        logging.info("Loading sparse feat map from %s", path)
        with open(path) as f:
            # Process lines in reverse so that if a word id occurs more
            # than once, the FIRST occurrence in the file wins (it is
            # assigned last and overwrites the earlier-processed one).
            for line in reversed(f.readlines()):
                (word_str, feat_str) = line.strip().split(None, 1)
                feat = []
                for entry in feat_str.split(","):
                    d, v = entry.split(":", 1)
                    feat.append((int(d), float(v)))
                self.w2f[int(word_str)] = feat
        logging.info("Loaded %d entries from %s", len(self.w2f), path)

    def _load_f2w(self):
        """Builds the feature->word Trie from ``self.w2f``. Deferred
        until the first reverse lookup since it is expensive.
        """
        logging.info("Building Trie with %d elements for sparse vector "
                     "lookup", len(self.w2f))
        self.f2w = SimpleTrie()
        for w, f in sorted(self.w2f.items(),
                           key=operator.itemgetter(0),
                           reverse=True):
            # Insert in descending word id order such that if vector
            # representations clash we keep the first (smallest id) one
            # in f2w
            self.f2w.add_sparse(f, w)

    def sparse2word(self, feat):
        # Fix: test against None explicitly rather than truthiness --
        # an already-built (but falsy) trie must not be rebuilt.
        if self.f2w is None:
            self._load_f2w()
        w, _ = self.f2w.nearest_sparse(feat)
        return w

    def sparse2nwords(self, feat, n=1):
        if self.f2w is None:
            self._load_f2w()
        return self.f2w.n_nearest_sparse(feat, n)

    def word2sparse(self, word):
        # Returns None for unknown words (per the base class contract)
        return self.w2f.get(word, None)