# Source code for cam.sgnmt.misc.sparse

# -*- coding: utf-8 -*-
# coding=utf-8
# Copyright 2019 The SGNMT Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module adds support for sparse input or output features. In
standard NMT we normally use a one-hot-representation, and input and
output layers are lookup tables (embedding matrices). The Blocks
NMT implementation in SGNMT supports explicit definition of word 
representations as sparse features, in which more than one neuron can 
be activated at a time.
"""

from abc import abstractmethod
from cam.sgnmt.misc.trie import SimpleTrie
import copy
import logging
import numpy as np
import operator


def sparse_euclidean2(v1, v2):
    """Calculates the squared Euclidean distance between two sparse
    vectors.

    Args:
        v1 (dict): First sparse vector
        v2 (dict): Second sparse vector

    Returns:
        float. Squared distance between ``v1`` and ``v2``.
    """
    total = 0.0
    # Dimensions present in v1 (v2 contributes 0.0 where it has no entry)
    for dim, val in v1.items():
        diff = val - v2.get(dim, 0.0)
        total += diff * diff
    # Dimensions only present in v2
    total += sum(val * val for dim, val in v2.items() if dim not in v1)
    return total
def sparse_euclidean(v1, v2):
    """Calculates the Euclidean distance between two sparse vectors.

    Args:
        v1 (dict): First sparse vector
        v2 (dict): Second sparse vector

    Returns:
        float. Distance between ``v1`` and ``v2``.
    """
    squared = sparse_euclidean2(v1, v2)
    return np.sqrt(squared)
def dense_euclidean2(v1, v2):
    """Calculates the squared Euclidean distance between two dense
    vectors.

    Args:
        v1 (dict): First dense vector
        v2 (dict): Second dense vector

    Returns:
        float. Squared distance between ``v1`` and ``v2``.
    """
    # zip truncates to the shorter of the two sequences
    return sum((x - y) * (x - y) for x, y in zip(v1, v2))
def dense_euclidean(v1, v2):
    """Calculates the Euclidean distance between two dense vectors.

    Args:
        v1 (dict): First dense vector
        v2 (dict): Second dense vector

    Returns:
        float. Distance between ``v1`` and ``v2``.
    """
    squared = dense_euclidean2(v1, v2)
    return np.sqrt(squared)
class SparseFeatMap(object):
    """Super class for mapping strategies between sparse feature
    representations and symbolic word indices. Subclasses must provide
    the two directions of the translation in ``sparse2word`` and
    ``word2sparse``; the dense helpers build on top of them.
    """

    def __init__(self, dim):
        """Initializes this map.

        Args:
            dim (int): Dimensionality of the feature representation
        """
        self.dim = dim
        # All-zero dense vector used as fallback/base representation
        self.null_vec = np.zeros(self.dim, dtype=np.float32)

    @abstractmethod
    def sparse2word(self, feat):
        """Gets the word id for a sparse feature. The sparse feature
        format is a list of tuples [(dim1,val1),..,(dimN,valN)]

        Args:
            feat (list): Sparse feature to look up

        Returns:
            int. Word ID of a match for ``feat`` or ``None`` is no
            match could be found

        Raises:
            NotImplementedError.
        """
        raise NotImplementedError

    @abstractmethod
    def word2sparse(self, word):
        """Gets the sparse feature representation for a word.

        Args:
            word (int): Word ID

        Returns:
            list. Sparse feature representation for ``word`` or
            ``None`` if the word could not be converted

        Raises:
            NotImplementedError.
        """
        raise NotImplementedError

    def sparse2nwords(self, feat, n=1):
        """Returns the n closest words to ``feat``. Subclasses can
        implement this method to implement search for words. The
        default implementation returns the single best word only.

        Args:
            feat (list): Sparse feature
            n (int): Number of words to retrieve

        Returns:
            list: List of (wordid, distance) tuples with words which
            are close to ``feat``.

        Note:
            The default implementation does not use the ``n`` argument
            and always returns distance 0.
        """
        return [(self.sparse2word(feat), 0.0)]

    def word2dense(self, word):
        """Gets the feature representation in dense format, i.e. a
        ``self.dim``-dimensional vector as list.

        Args:
            word(int): Word ID

        Returns:
            list. Dense vector corresponding to ``word`` or the null
            vector if no representation found
        """
        return self.sparse2dense(self.word2sparse(word))

    def words2dense(self, seq):
        """Applies ``word2dense`` to a word sequence. """
        return list(map(self.word2dense, seq))

    def dense2sparse(self, dense, eps=0.5):
        """Converts a dense vector to a sparse vector.

        Args:
            dense (list): Dense vector (list of length n)
            eps (float): Values smaller than this are set to zero in
                         the sparse representation

        Returns:
            list. List of (dimension, value) tuples (sparse vector)
        """
        return [(idx, val) for idx, val in enumerate(dense) if val > eps]

    def sparse2dense(self, sparse):
        """Converts a sparse vector to its dense representation.

        Args:
            sparse (list): Sparse vector (list of tuples)

        Raises:
            IndexError. If the input vector exceeds the dimensionality
            of this map
        """
        vec = copy.copy(self.null_vec)
        # A falsy argument (None / empty list) yields the null vector
        for dim, val in (sparse or []):
            vec[dim] = float(val)
        return vec

    def dense2word(self, feat):
        """Gets the word id for a dense feature.

        Args:
            feat (list): Dense feature vector to look up

        Returns:
            int. Word ID of a match for ``feat`` or ``None`` is no
            match could be found

        Raises:
            NotImplementedError.
        """
        return self.sparse2word(self.dense2sparse(feat))

    def dense2nwords(self, feat, n=1):
        """Returns the n closest words to ``feat``.

        Args:
            feat (list): Dense feature vector
            n (int): Number of words to retrieve

        Returns:
            list: List of (wordid, distance) tuples with words which
            are close to ``feat``.

        Note:
            The default implementation does not use the ``n`` argument
            and always returns distance 0.
        """
        return self.sparse2nwords(self.dense2sparse(feat), n)

    def dense2words(self, seq):
        """Applies ``dense2word`` to a sequence. """
        return list(map(self.dense2word, seq))
class FlatSparseFeatMap(SparseFeatMap):
    """Can be used as replacement if a ``SparseFeatMap`` is required
    but you wish to use flat word ids. It overrides the dense methods
    with the identities; the sparse conversions are unsupported and
    raise ``NotImplementedError``.
    """

    def __init__(self, dim=0):
        """
        Args:
            dim (int): not used
        """
        super(FlatSparseFeatMap, self).__init__(0)

    def sparse2word(self, feat):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: originally ``return NotImplementedError``, which
        # silently handed the exception *class* back to the caller
        # instead of raising it.
        raise NotImplementedError

    def word2sparse(self, word):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def word2dense(self, word):
        """Identity. """
        return word

    def words2dense(self, seq):
        """Identity. """
        return seq

    def dense2sparse(self, dense, eps=0.3):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def sparse2dense(self, sparse):
        """Not supported.

        Raises:
            NotImplementedError.
        """
        # Bug fix: was ``return NotImplementedError`` (see sparse2word).
        raise NotImplementedError

    def dense2word(self, feat):
        """Identity. """
        return feat
class TrivialSparseFeatMap(SparseFeatMap):
    """This is the null-object (GoF) implementation for
    ``SparseFeatMap``. It corresponds to the usual one-hot
    representation: a word maps to a single activated dimension.
    """

    def __init__(self, dim):
        """Pass through to ``SparseFeatMap``.

        Args:
            dim (int): Dimensionality of the feature representation
                       (should be the vocabulary size)
        """
        super(TrivialSparseFeatMap, self).__init__(dim)

    def sparse2word(self, feat):
        """Returns feat[0][0] """
        dim_id, _ = feat[0]
        return dim_id

    def word2sparse(self, word):
        """Returns [(word, 1)] """
        # One-hot: single active dimension at the word index
        return [(word, 1)]
class FileBasedFeatMap(SparseFeatMap):
    """This class loads the mapping from word to sparse feature from a
    file (see ``--src_sparse_feat_map`` and ``--trg_sparse_feat_map``).

    The mapping from word to feature is a simple dictionary lookup.
    The mapping from feature to word is implemented with a Trie based
    nearest neighbor implementation and does not require an exact
    match. However, in case of an exact match, it runs linearly in the
    number of non-zero entries in the vector.
    """

    def __init__(self, dim, path):
        """Loads the feature map from the file system. Each line has
        the format ``<word-id> <dim>:<val>[,<dim>:<val>...]``.

        Args:
            dim (int). Dimensionality of the feature space
            path (string). Path to the mapping file

        Raises:
            IOError. If the file could not be loaded
        """
        super(FileBasedFeatMap, self).__init__(dim)
        self.f2w = None  # Trie for feature->word, built lazily in _load_f2w
        self.w2f = {}    # word id -> sparse feature (list of (dim, val))
        logging.info("Loading sparse feat map from %s", path)
        with open(path) as f:
            # Process lines in reverse so that if a word id occurs more
            # than once, the FIRST occurrence in the file wins (it is
            # assigned last and overwrites the earlier-processed one).
            for line in reversed(f.readlines()):
                (word_str, feat_str) = line.strip().split(None, 1)
                feat = []
                for entry in feat_str.split(","):
                    d, v = entry.split(":", 1)
                    feat.append((int(d), float(v)))
                self.w2f[int(word_str)] = feat
        logging.info("Loaded %d entries from %s", len(self.w2f), path)

    def _load_f2w(self):
        """Builds the feature->word Trie from ``self.w2f``. Deferred
        until the first reverse lookup since it is expensive.
        """
        logging.info("Building Trie with %d elements for sparse vector "
                     "lookup", len(self.w2f))
        self.f2w = SimpleTrie()
        for w, f in sorted(self.w2f.items(),
                           key=operator.itemgetter(0),
                           reverse=True):
            # Insert in descending word id order such that if vector
            # representations clash we keep the first (smallest id) one
            # in f2w
            self.f2w.add_sparse(f, w)

    def sparse2word(self, feat):
        # Fix: test against None explicitly rather than truthiness --
        # an already-built (but falsy) trie must not be rebuilt.
        if self.f2w is None:
            self._load_f2w()
        w, _ = self.f2w.nearest_sparse(feat)
        return w

    def sparse2nwords(self, feat, n=1):
        if self.f2w is None:
            self._load_f2w()
        return self.f2w.n_nearest_sparse(feat, n)

    def word2sparse(self, word):
        # Returns None for unknown words (per the base class contract)
        return self.w2f.get(word, None)