Source code for sng.Generator

# coding: utf-8

import os
import numpy as np
import pickle

from .Config import Config
from .helpers import temp_scale

import keras
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, TimeDistributed  # , SimpleRNN, GRU
from keras.callbacks import LambdaCallback
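
# Note on the helper imported above: the actual implementation lives in
# sng.helpers; as a rough sketch (an assumption, not the verbatim source),
# temp_scale rescales a probability vector p by a sampling temperature T:
#
#     def temp_scale(probs, temperature):
#         probs = np.exp(np.log(probs) / temperature)  # p ** (1 / T)
#         return probs / np.sum(probs)                 # renormalize
#
# so temperature < 1 sharpens the distribution and temperature > 1
# flattens it.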


class Generator:
    """Main class that holds the config, wordlist, and the trained model.

    Parameters
    ----------
    config : sng.Config, optional
        A Config instance specifying training and simulation parameters.
        If not supplied, a default configuration will be created.
    wordlist_file : str
        Path to a text file holding the text corpus you want to use.
    wordlist : list of strings
        Alternatively to ``wordlist_file``, you can provide the already
        processed wordlist, a list of (ideally unique) strings.

    Attributes
    ----------
    config : sng.Config
        The Config object supplied, or a default object if none was
        supplied at initialization.
    wordlist : list of strings
        A processed list of unique words, each ending in a newline.
        This is the input to the neural network.

    Examples
    --------
    You can create a word generator like this::

        import os
        import sng

        cfg = sng.Config()

        # Folder for pre-installed wordlists:
        wordlist_folder = os.path.join(
            os.path.dirname(os.path.abspath(sng.__file__)), 'wordlists')
        sample_wordlist = os.path.join(wordlist_folder, 'latin.txt')

        # Create a Generator object with some wordlist:
        gen = sng.Generator(wordlist_file=sample_wordlist, config=cfg)

        # Train the model:
        gen.fit()

        # Get a few name suggestions:
        gen.simulate(n=5)
    """

    def __init__(self, config=None, wordlist_file=None, wordlist=None):
        # Avoid a mutable default argument: create a fresh Config per
        # instance if none was supplied.
        self.config = config if config is not None else Config()

        if wordlist_file:
            # text_to_word_sequence only splits by space, not newline.
            # Make all word separators spaces:
            with open(wordlist_file) as infile:
                contents = infile.read().replace('\n', ' ')
            wordlist = text_to_word_sequence(
                contents,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~0123456789–…\'"’«·»'
            )

        # Keep only unique words:
        self.wordlist = list(set(wordlist))
        # Terminate each word with a newline:
        self.wordlist = [word.strip() + '\n' for word in self.wordlist]

        # Generate the set of unique characters (including the newline):
        # https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
        self.chars = sorted(list(set(
            [char for word in self.wordlist for char in word]
        )))
        self.vocab_size = len(self.chars)
        self.corpus_size = len(self.wordlist)
        self.ix_to_char = {ix: char for ix, char in enumerate(self.chars)}
        self.char_to_ix = {char: ix for ix, char in enumerate(self.chars)}

        if self.config.verbose:
            print(self.corpus_size, "words\n")
            print(len(self.chars), "characters, including the \\n:")
            print(self.chars)
            print("\nFirst two sample words:")
            print(self.wordlist[:2])
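
    # Illustration (not part of the library): for a wordlist like
    # ['ada\n', 'bea\n'], __init__ yields
    #
    #     chars      = ['\n', 'a', 'b', 'd', 'e']
    #     char_to_ix = {'\n': 0, 'a': 1, 'b': 2, 'd': 3, 'e': 4}
    #     ix_to_char = {0: '\n', 1: 'a', 2: 'b', 3: 'd', 4: 'e'}
    #
    # Because '\n' sorts before all letters, index 0 doubles as the
    # end-of-word token during sampling (see _generate_word below).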
    def fit(self):
        """Fit the model. Adds the 'model' attribute to itself."""
        # One-hot encode the corpus: X holds each word's characters,
        # Y holds the same characters shifted left by one timestep.
        X = np.zeros((self.corpus_size, self.config.max_word_len,
                      self.vocab_size))
        Y = np.zeros((self.corpus_size, self.config.max_word_len,
                      self.vocab_size))

        for word_i in range(self.corpus_size):
            word = self.wordlist[word_i]
            chars = list(word)

            for char_j in range(min(len(word), self.config.max_word_len)):
                char = chars[char_j]
                char_ix = self.char_to_ix[char]
                X[word_i, char_j, char_ix] = 1
                if char_j > 0:
                    # this char is the 'next char' target for
                    # timestep char_j - 1
                    Y[word_i, char_j - 1, char_ix] = 1

        model = Sequential()
        model.add(LSTM(self.config.hidden_dim,
                       input_shape=(None, self.vocab_size),
                       return_sequences=True))
        for _ in range(self.config.n_layers - 1):
            model.add(LSTM(self.config.hidden_dim, return_sequences=True))
        model.add(TimeDistributed(Dense(self.vocab_size)))
        model.add(Activation('softmax'))
        model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

        # TODO how to move this function into helpers.py?
        def on_epoch_end(epoch, logs):
            if epoch % 10 == 0 and self.config.verbose:
                print("epoch " + str(epoch) + " words: ", end="")
                for _ in range(4):
                    word = self._generate_word(model)
                    print(word + ", ", end="")
                print("loss: " + str(np.round(logs['loss'], 4)))

        print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

        model.fit(X, Y, batch_size=self.config.batch_size, verbose=0,
                  epochs=self.config.epochs, callbacks=[print_callback])

        self.model = model
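
    # Illustration (not part of the library): with the mapping
    # {'\n': 0, 'a': 1, 'b': 2}, the word 'ab\n' is encoded above as
    #
    #     timestep t:  0    1     2
    #     X[t]:        'a'  'b'   '\n'
    #     Y[t]:        'b'  '\n'  (all zeros)
    #
    # i.e. Y is X shifted left by one step, so the network is trained to
    # predict the next character at every timestep.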
    def simulate(self, n=10, temperature=None, min_word_len=None,
                 max_word_len=None):
        """Use the trained model to simulate a few name suggestions.

        Parameters
        ----------
        n : int
            The number of name suggestions to simulate.
        temperature : float or None
            Sampling temperature. Lower values are "colder", i.e.
            sampling probabilities will be more conservative. If None,
            will use the value specified in self.config.
        min_word_len : int or None
            Minimum word length of the simulated names. If None, will
            use the value specified in self.config.
        max_word_len : int or None
            Maximum word length of the simulated names. If None, will
            use the value specified in self.config.

        Returns
        -------
        list of str
            The simulated names, each with the configured suffix appended.
        """
        temperature = temperature or self.config.temperature
        min_word_len = min_word_len or self.config.min_word_len
        max_word_len = max_word_len or self.config.max_word_len

        assert hasattr(self, 'model'), 'Call the fit() method first!'

        words = []
        for _ in range(n):
            # Pass the (possibly overridden) sampling parameters along,
            # so they actually take effect:
            word = self._generate_word(
                self.model, temperature=temperature,
                min_word_len=min_word_len, max_word_len=max_word_len)
            words.append(word + self.config.suffix)
        return words
    def save(self, directory, overwrite=False):
        """Save the model into a folder.

        Parameters
        ----------
        directory : str
            The folder to store the generator in. Should be non-existing.
        overwrite : bool
            If True, the folder contents will be overwritten if it
            already exists. Not recommended, though.
        """
        if not overwrite:
            assert not os.path.exists(directory), \
                'Directory already exists! Please choose a non-existing path.'
        if not os.path.exists(directory):
            os.makedirs(directory)

        with open(os.path.join(directory, 'config.pkl'), 'wb') as outfile:
            pickle.dump(self.config, outfile, pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'wordlist.pkl'), 'wb') as outfile:
            pickle.dump(self.wordlist, outfile, pickle.HIGHEST_PROTOCOL)
        self.model.save(os.path.join(directory, 'model.h5'))
    @classmethod
    def load(cls, directory):
        """Create a Generator object from a stored folder.

        Parameters
        ----------
        directory : str
            Folder where you used Generator.save() to store the
            contents in.
        """
        with open(os.path.join(directory, 'config.pkl'), 'rb') as infile:
            config = pickle.load(infile)
        with open(os.path.join(directory, 'wordlist.pkl'), 'rb') as infile:
            wordlist = pickle.load(infile)
        model = keras.models.load_model(os.path.join(directory, 'model.h5'))

        generator = cls(config=config, wordlist=wordlist)
        generator.model = model
        return generator
    def _generate_word(self, model, temperature=None, min_word_len=None,
                       max_word_len=None):
        # Fall back to the configured defaults when no overrides are given:
        temperature = temperature or self.config.temperature
        min_word_len = min_word_len or self.config.min_word_len
        max_word_len = max_word_len or self.config.max_word_len

        X = np.zeros((1, max_word_len, self.vocab_size))

        # sample the first character
        initial_char_distribution = temp_scale(
            model.predict(X[:, 0:1, :]).flatten(), temperature
        )
        ix = 0
        # make sure the initial character is not a newline (i.e. index 0)
        while ix == 0:
            ix = int(np.random.choice(self.vocab_size, size=1,
                                      p=initial_char_distribution))
        X[0, 0, ix] = 1

        # start with the first character, then successively append chars
        generated_word = [self.ix_to_char[ix].upper()]

        # sample all remaining characters
        for i in range(1, max_word_len):
            next_char_distribution = temp_scale(
                model.predict(X[:, 0:i, :])[:, i - 1, :].flatten(),
                temperature
            )

            next_ix = int(np.random.choice(
                self.vocab_size, size=1, p=next_char_distribution
            ))

            ctr = 0
            while next_ix == 0 and i < min_word_len:
                # sample again if the end-of-word token was picked too early
                ctr += 1
                next_ix = int(np.random.choice(
                    self.vocab_size, size=1, p=next_char_distribution
                ))
                if ctr > 1000:
                    print("Caught in a near-infinite loop. "
                          "You might have picked too low a temperature "
                          "and the sampler just keeps sampling \\n's.")
                    break

            X[0, i, next_ix] = 1
            if next_ix == 0:
                break
            generated_word.append(self.ix_to_char[next_ix])

        return ''.join(generated_word)
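

# A minimal end-to-end usage sketch of the API defined above. The wordlist
# path is a placeholder; substitute any text file holding your corpus
# (or see the pre-installed lists referenced in the class docstring).
if __name__ == '__main__':
    gen = Generator(wordlist_file='my_words.txt', config=Config())

    # Train the character-level LSTM on the corpus:
    gen.fit()

    # Sample a few suggestions, overriding the configured temperature:
    print(gen.simulate(n=5, temperature=0.6))

    # Persist the trained generator and restore it later:
    gen.save('my_generator')
    restored = Generator.load('my_generator')
    print(restored.simulate(n=3))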