# Source code for sng.Generator

# coding: utf-8

import os
import numpy as np
import pickle

from .Config import Config
from .helpers import temp_scale

import keras
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, TimeDistributed  # , SimpleRNN, GRU
from keras.callbacks import LambdaCallback


class Generator:
    """Main class that holds the config, wordlist, and the trained model.

    Parameters
    ----------
    config : sng.Config, optional
        A Config instance specifying training and simulation parameters.
        If not supplied, a fresh default configuration is created for
        this instance.
    wordlist_file : str
        Path to a textfile holding the text corpus you want to use.
        Takes precedence over ``wordlist`` if both are given.
    wordlist : list of strings
        Alternatively to ``wordlist_file``, you can provide the already
        processed wordlist, a list of (ideally unique) strings.

    Attributes
    ----------
    config : sng.Config
        The Config object supplied, or a default object if none was supplied
        at initialization.
    wordlist : list of strings
        A sorted list of unique, whitespace-stripped words, each ending in
        a newline. This is the input to the neural network.
    chars : list of strings
        Sorted unique characters occurring in ``wordlist`` (including the
        newline that terminates every word).
    vocab_size : int
        ``len(chars)``.
    corpus_size : int
        ``len(wordlist)``.
    ix_to_char, char_to_ix : dict
        Mappings between a character and its position in ``chars``.
    model : keras model
        Only present after ``fit()`` or ``load()``.

    Raises
    ------
    ValueError
        If neither ``wordlist_file`` nor ``wordlist`` is supplied.

    Examples
    --------
    You can create a word generator like this::

        import sng
        cfg = sng.Config()

        # Folder for pre-installed wordlists:
        wordlist_folder = os.path.join(
            os.path.dirname(os.path.abspath(sng.__file__)), 'wordlists')
        sample_wordlist = os.path.join(wordlist_folder, 'latin.txt')

        # Create a Generator object with some wordlist:
        gen = sng.Generator(wordlist_file=sample_wordlist, config=cfg)

        # Train the model:
        gen.fit()

        # Get a few name suggestions:
        gen.simulate(n=5)
    """

    def __init__(self, config=None, wordlist_file=None, wordlist=None):
        # Build the default per instance. A ``Config()`` default in the
        # signature is evaluated once at definition time and would be
        # shared (and mutated) across every Generator.
        self.config = Config() if config is None else config

        if wordlist_file:
            # text_to_word_sequence only splits by space, not newline.
            # Make all word separators spaces:
            with open(wordlist_file) as infile:
                contents = infile.read().replace('\n', ' ')
            wordlist = text_to_word_sequence(
                contents,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~0123456789–…\'\"’«·»'
            )

        if wordlist is None:
            raise ValueError(
                'Please supply either wordlist_file or wordlist.')

        # Strip first, then de-duplicate, so that 'abc' and ' abc ' (or an
        # already newline-terminated 'abc\n' coming from load()) collapse
        # into one entry. Empty strings carry no characters to learn from.
        # Sorting makes the corpus order independent of set iteration order.
        unique_words = {word.strip() for word in wordlist}
        unique_words.discard('')
        # Terminate each word with a newline:
        self.wordlist = [word + '\n' for word in sorted(unique_words)]

        # Generate the set of unique characters (including newline)
        self.chars = sorted({char for word in self.wordlist for char in word})

        self.vocab_size = len(self.chars)
        self.corpus_size = len(self.wordlist)

        self.ix_to_char = dict(enumerate(self.chars))
        self.char_to_ix = {char: ix for ix, char in enumerate(self.chars)}

        if self.config.verbose:
            print(self.corpus_size, "words\n")
            print(len(self.chars), "characters, including the \\n:")
            print(self.chars)
            print("\nFirst two sample words:")
            print(self.wordlist[:2])

    def fit(self):
        """Fit the model. Adds the 'model' attribute to itself.

        Every word is one-hot encoded into ``X`` with shape
        ``(corpus_size, config.max_word_len, vocab_size)``; ``Y`` has the
        same shape and holds, at each time step, the character that follows
        the one in ``X``. Words longer than ``config.max_word_len`` are
        truncated.
        """

        max_len = self.config.max_word_len
        shape = (self.corpus_size, max_len, self.vocab_size)
        X = np.zeros(shape)
        Y = np.zeros(shape)
        for word_i, word in enumerate(self.wordlist):
            for char_j, char in enumerate(word[:max_len]):
                char_ix = self.char_to_ix[char]
                X[word_i, char_j, char_ix] = 1
                if char_j > 0:
                    # the 'next char' at time point char_j - 1
                    Y[word_i, char_j - 1, char_ix] = 1

        model = Sequential()
        model.add(LSTM(self.config.hidden_dim,
                       input_shape=(None, self.vocab_size),
                       return_sequences=True))
        for _ in range(self.config.n_layers - 1):
            model.add(LSTM(self.config.hidden_dim, return_sequences=True))
        model.add(TimeDistributed(Dense(self.vocab_size)))
        model.add(Activation('softmax'))
        model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

        # TODO how to move this function into helpers.py?
        def on_epoch_end(epoch, logs):
            # Every 10th epoch, show four sample words and the current loss.
            if epoch % 10 == 0 and self.config.verbose:
                print("epoch " + str(epoch) + " words: ", end="")
                for _ in range(4):
                    word = self._generate_word(model)
                    print(word + ", ", end="")

                print("loss: " + str(np.round(logs['loss'], 4)))

        print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
        model.fit(X, Y, batch_size=self.config.batch_size, verbose=0,
                  epochs=self.config.epochs, callbacks=[print_callback])

        self.model = model

    def simulate(self, n=10, temperature=None, min_word_len=None,
                 max_word_len=None):
        """Use the trained model to simulate a few name suggestions.

        Parameters
        ----------

        n : int
            The number of name suggestions to simulate
        temperature : float or None
            Sampling temperature. Lower values are "colder", i.e.
            sampling probabilities will be more conservative.
            If None, will use the value specified in self.config.
        min_word_len : int or None
            Minimum word length of the simulated names.
            If None, will use the value specified in self.config.
        max_word_len : int or None
            Maximum word length of the simulated names.
            If None, will use the value specified in self.config.

        Returns
        -------
        list of strings
            ``n`` generated words, each with ``config.suffix`` appended.

        Raises
        ------
        AssertionError
            If the generator has no ``model`` yet (call ``fit()`` first).
        """

        assert hasattr(self, 'model'), 'Call the fit() method first!'

        # 0 is not a usable temperature or maximum length, so any falsy
        # value falls back to the config, as it always did.
        temperature = temperature or self.config.temperature
        max_word_len = max_word_len or self.config.max_word_len
        # 0 is a legitimate minimum ("no minimum"), so only None falls back.
        if min_word_len is None:
            min_word_len = self.config.min_word_len

        # Forward the resolved settings; previously they were computed here
        # but never reached the sampler.
        return [
            self._generate_word(self.model,
                                temperature=temperature,
                                min_word_len=min_word_len,
                                max_word_len=max_word_len)
            + self.config.suffix
            for _ in range(n)
        ]

    def save(self, directory, overwrite=False):
        """Save the model into a folder.

        Writes ``config.pkl``, ``wordlist.pkl`` and ``model.h5`` into
        ``directory``.

        Parameters
        ----------
        directory : str
            The folder to store the generator in. Should be non-existing.
        overwrite : bool
            If True, the folder contents will be overwritten if it already
            exists. Not recommended, though.

        Raises
        ------
        AssertionError
            If ``directory`` exists and ``overwrite`` is False.
        AttributeError
            If there is no trained model to save.
        """

        if not overwrite:
            assert not os.path.exists(directory), 'Directory already ' + \
                'exists! Please choose a non-existing path.'

        # Fail before touching the disk, so a missing model does not leave
        # a half-written folder behind.
        if not hasattr(self, 'model'):
            raise AttributeError(
                "'Generator' object has no attribute 'model'. "
                "Call the fit() method first!")

        os.makedirs(directory, exist_ok=True)

        with open(os.path.join(directory, 'config.pkl'), 'wb') as outfile:
            pickle.dump(self.config, outfile, pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'wordlist.pkl'), 'wb') as outfile:
            pickle.dump(self.wordlist, outfile, pickle.HIGHEST_PROTOCOL)
        self.model.save(os.path.join(directory, 'model.h5'))

    @classmethod
    def load(cls, directory):
        """Create a Generator object from a stored folder.

        Arguments
        ---------
        directory : str
            Folder where you used Generator.save() to store the contents in.

        Returns
        -------
        Generator
            A generator rebuilt from the stored config and wordlist, with
            the stored model attached.
        """

        # SECURITY: unpickling can execute arbitrary code. Only load folders
        # that you (or someone you trust) created with Generator.save().
        with open(os.path.join(directory, 'config.pkl'), 'rb') as infile:
            config = pickle.load(infile)
        with open(os.path.join(directory, 'wordlist.pkl'), 'rb') as infile:
            wordlist = pickle.load(infile)
        model = keras.models.load_model(os.path.join(directory, 'model.h5'))
        generator = cls(config=config, wordlist=wordlist)
        generator.model = model
        return generator

    def _sample_ix(self, distribution, end_ix, allow_end):
        """Draw one character index from ``distribution``.

        Parameters
        ----------
        distribution : array-like of float, length ``vocab_size``
            Sampling probabilities per character index.
        end_ix : int
            Index of the end-of-word token.
        allow_end : bool
            If False, the end-of-word token is excluded and the remaining
            probabilities are renormalised. This yields the same
            distribution as redrawing until a different token shows up,
            but cannot loop forever.

        Returns
        -------
        int
            The sampled index. ``end_ix`` can still be returned with
            ``allow_end=False`` when no other character has any
            probability mass; a warning is printed in that case.
        """
        probs = np.asarray(distribution, dtype=float)

        if not allow_end:
            masked = probs.copy()
            masked[end_ix] = 0.0
            remaining = masked.sum()
            if remaining > 0:
                probs = masked / remaining
            else:
                print("the sampler can only pick the end-of-word token. "
                      "You might have picked too low a temperature "
                      "and the sampler just keeps sampling \\n's")

        # No ``size`` argument: returns a scalar, which avoids calling
        # int() on a 1-element array.
        return int(np.random.choice(self.vocab_size, p=probs))

    def _generate_word(self, model, temperature=None, min_word_len=None,
                       max_word_len=None):
        """Sample a single word from ``model``, character by character.

        Parameters
        ----------
        model : keras model
            Model used for prediction; its ``predict`` is fed one-hot
            arrays of shape ``(1, t, vocab_size)``.
        temperature, min_word_len, max_word_len : optional
            Overrides for the corresponding ``self.config`` values.
            ``None`` means "use the config value".

        Returns
        -------
        str
            The generated word with its first character upper-cased and
            without the terminating newline. Empty if not even a first
            character could be sampled.
        """
        if temperature is None:
            temperature = self.config.temperature
        if min_word_len is None:
            min_word_len = self.config.min_word_len
        if max_word_len is None:
            max_word_len = self.config.max_word_len

        # Look the end-of-word token up instead of assuming it sorts to
        # index 0 of the vocabulary.
        end_ix = self.char_to_ix['\n']

        X = np.zeros((1, max_word_len, self.vocab_size))

        # Sample the first character from the prediction for an all-zero
        # input; it must not be the end-of-word token.
        initial_char_distribution = temp_scale(
            model.predict(X[:, 0:1, :]).flatten(), temperature
        )
        ix = self._sample_ix(initial_char_distribution, end_ix,
                             allow_end=False)
        if ix == end_ix:
            # Nothing but the end-of-word token was available.
            return ''

        X[0, 0, ix] = 1

        # start with first character, then later successively append chars
        generated_word = [self.ix_to_char[ix].upper()]

        # sample all remaining characters
        for i in range(1, max_word_len):
            next_char_distribution = temp_scale(
                model.predict(X[:, 0:i, :])[:, i - 1, :].flatten(),
                temperature
            )

            # The word may only end once it has reached min_word_len chars.
            next_ix = self._sample_ix(next_char_distribution, end_ix,
                                      allow_end=(i >= min_word_len))

            X[0, i, next_ix] = 1
            if next_ix == end_ix:
                break
            generated_word.append(self.ix_to_char[next_ix])

        return ''.join(generated_word)