# -*- coding: UTF-8 -*-
import tensorflow as tf
import numpy as np
import math
from dnlp.core.dnn_crf_base import DnnCrfBase
from dnlp.config.config import DnnCrfConfig


class DnnCrf(DnnCrfBase):
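  """DNN-CRF sequence tagger (the checkpoint names suggest Chinese word segmentation).

  A configurable feature extractor (MLP, LSTM or GRU) produces per-tag emission
  scores, and a CRF layer (transition + initial-transition weights) scores tag
  sequences; training uses a Viterbi-decode-and-compare update (see fit_batch).
  """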
  def __init__(self, *, config: DnnCrfConfig, data_path: str = '', dtype: type = tf.float32, mode: str = 'train', nn: str, model_path: str = ''):
    if mode not in ['train', 'predict']:
      raise ValueError('mode must be "train" or "predict"')
    if nn not in ['mlp', 'lstm', 'gru']:
      raise ValueError('neural network type must be "mlp", "lstm" or "gru"')

    DnnCrfBase.__init__(self, config, data_path, mode, model_path)
    self.dtype = dtype
    self.mode = mode

    # build the graph
    tf.reset_default_graph()
    self.transition = self.__get_variable([self.tags_count, self.tags_count], 'transition')
    self.transition_init = self.__get_variable([self.tags_count], 'transition_init')
    self.params = [self.transition, self.transition_init]
    # input layer
    if mode == 'train':
      self.input = tf.placeholder(tf.int32, [self.batch_size, self.batch_length, self.windows_size])
    else:
      self.input = tf.placeholder(tf.int32, [None, self.windows_size])
    # embedding lookup layer
    self.embedding_layer = self.get_embedding_layer()
    # hidden layer
    if nn == 'mlp':
      self.hidden_layer = self.get_mlp_layer(tf.transpose(self.embedding_layer))
    elif nn == 'lstm':
      self.hidden_layer = self.get_lstm_layer(tf.transpose(self.embedding_layer))
    else:
      self.hidden_layer = self.get_gru_layer(tf.transpose(self.embedding_layer))
    # output layer
    self.output = self.get_output_layer(self.hidden_layer)

    if mode == 'predict':
      self.output = tf.squeeze(self.output, axis=2)
    else:
      # build the training ops
      # placeholders for training
      self.ll_corr = tf.placeholder(tf.int32, shape=[None, 3])
      self.ll_curr = tf.placeholder(tf.int32, shape=[None, 3])
      self.trans_corr = tf.placeholder(tf.int32, [None, 2])
      self.trans_curr = tf.placeholder(tf.int32, [None, 2])
      self.trans_init_corr = tf.placeholder(tf.int32, [None, 1])
      self.trans_init_curr = tf.placeholder(tf.int32, [None, 1])
      # loss
      self.loss, self.loss_with_init = self.get_loss()
      self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
      self.train = self.optimizer.minimize(self.loss)
      self.train_with_init = self.optimizer.minimize(self.loss_with_init)

  def fit(self, epochs: int = 100, interval: int = 20):
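    """Train for `epochs` epochs, saving a checkpoint every `interval` epochs."""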
    with tf.Session() as sess:
      tf.global_variables_initializer().run()
      saver = tf.train.Saver(max_to_keep=100)
      for epoch in range(1, epochs + 1):
        print('epoch:', epoch)
        for _ in range(self.batch_count):
          characters, labels, lengths = self.get_batch()
          self.fit_batch(characters, labels, lengths, sess)
        if epoch % interval == 0:
          model_path = '../dnlp/models/cws{0}.ckpt'.format(epoch)
          saver.save(sess, model_path)
          self.save_config(model_path)

  def fit_batch(self, characters, labels, lengths, sess):
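    # Perceptron-style CRF update: Viterbi-decode each sequence under the
    # current parameters, find the positions where the decoded tags differ from
    # the gold tags, and collect index lists for the gold ("corr") and
    # predicted ("curr") emissions/transitions compared by the loss in get_loss.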
    scores = sess.run(self.output, feed_dict={self.input: characters})
    transition = self.transition.eval(session=sess)
    transition_init = self.transition_init.eval(session=sess)
    update_labels_pos = None
    update_labels_neg = None
    current_labels = []
    trans_pos_indices = []
    trans_neg_indices = []
    trans_init_pos_indices = []
    trans_init_neg_indices = []
    for i in range(self.batch_size):
      current_label = self.viterbi(scores[:, :lengths[i], i], transition, transition_init)
      current_labels.append(current_label)
      diff_tag = np.subtract(labels[i, :lengths[i]], current_label)
      update_index = np.where(diff_tag != 0)[0]
      update_length = len(update_index)
      if update_length == 0:
        continue
      update_label_pos = np.stack([labels[i, update_index], update_index, i * np.ones([update_length])], axis=-1)
      update_label_neg = np.stack([current_label[update_index], update_index, i * np.ones([update_length])], axis=-1)
      if update_labels_pos is not None:
        update_labels_pos = np.concatenate((update_labels_pos, update_label_pos))
        update_labels_neg = np.concatenate((update_labels_neg, update_label_neg))
      else:
        update_labels_pos = update_label_pos
        update_labels_neg = update_label_neg

      trans_pos_index, trans_neg_index, trans_init_pos, trans_init_neg, update_init = self.generate_transition_update_index(
        labels[i, :lengths[i]], current_labels[i])

      trans_pos_indices.extend(trans_pos_index)
      trans_neg_indices.extend(trans_neg_index)

      if update_init:
        trans_init_pos_indices.append(trans_init_pos)
        trans_init_neg_indices.append(trans_init_neg)

    if update_labels_pos is not None and update_labels_neg is not None:
      feed_dict = {self.input: characters, self.ll_curr: update_labels_neg, self.ll_corr: update_labels_pos,
                   self.trans_curr: trans_neg_indices, self.trans_corr: trans_pos_indices}

      if not trans_init_pos_indices:
        sess.run(self.train, feed_dict)
      else:
        feed_dict[self.trans_init_corr] = trans_init_pos_indices
        feed_dict[self.trans_init_curr] = trans_init_neg_indices
        sess.run(self.train_with_init, feed_dict)

  def generate_transition_update_index(self, correct_labels, current_labels):
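    # Walk the gold and predicted tag sequences in parallel and collect the
    # [previous_tag, current_tag] transition indices to boost (gold) or
    # penalize (predicted); a mismatch at the first position also flags an
    # update of the initial-transition weights.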
    if correct_labels.shape != current_labels.shape:
      print('sequence length is not equal')
      return None

    before_corr = correct_labels[0]
    before_curr = current_labels[0]
    update_init = False

    trans_init_pos = None
    trans_init_neg = None
    trans_pos = []
    trans_neg = []

    if before_corr != before_curr:
      trans_init_pos = [before_corr]
      trans_init_neg = [before_curr]
      update_init = True

    for corr_label, curr_label in zip(correct_labels[1:], current_labels[1:]):
      if corr_label != curr_label or before_corr != before_curr:
        trans_pos.append([before_corr, corr_label])
        trans_neg.append([before_curr, curr_label])
      before_corr = corr_label
      before_curr = curr_label

    return trans_pos, trans_neg, trans_init_pos, trans_init_neg, update_init

  def predict(self, sentence: str):
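    # Map the sentence to window indices, score it with the network, decode the
    # tag sequence with Viterbi, then convert the tags back into words.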
    if self.mode != 'predict':
      raise RuntimeError('predict() is only available in predict mode')
    with tf.Session() as sess:
      tf.global_variables_initializer().run()
      tf.train.Saver().restore(save_path=self.model_path, sess=sess)
      input_ = self.indices2input(self.sentence2indices(sentence))
      runner = [self.output, self.transition, self.transition_init]
      output, trans, trans_init = sess.run(runner, feed_dict={self.input: input_})
      labels = self.viterbi(output, trans, trans_init)
      return self.tags2words(sentence, labels)

  def get_embedding_layer(self) -> tf.Tensor:
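    # Look up and concatenate the embeddings of each character window;
    # concat_embed_size is presumably windows_size * embed_size (defined in
    # DnnCrfBase).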
    embeddings = self.__get_variable([self.dict_size, self.embed_size], 'embeddings')
    self.params.append(embeddings)
    if self.mode == 'train':
      input_size = [self.batch_size, self.batch_length, self.concat_embed_size]
      layer = tf.reshape(tf.nn.embedding_lookup(embeddings, self.input), input_size)
    else:
      layer = tf.reshape(tf.nn.embedding_lookup(embeddings, self.input), [1, -1, self.concat_embed_size])
    return layer

  def get_mlp_layer(self, layer: tf.Tensor) -> tf.Tensor:
    hidden_weight = self.__get_variable([self.hidden_units, self.concat_embed_size], 'hidden_weight')
    hidden_bias = self.__get_variable([self.hidden_units, 1, 1], 'hidden_bias')
    self.params += [hidden_weight, hidden_bias]
    layer = tf.sigmoid(tf.tensordot(hidden_weight, layer, [[1], [0]]) + hidden_bias)
    return layer

  def get_lstm_layer(self, layer: tf.Tensor) -> tf.Tensor:
    lstm = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_units)
    lstm_output, lstm_out_state = tf.nn.dynamic_rnn(lstm, layer, dtype=self.dtype)
    self.params += [v for v in tf.global_variables() if v.name.startswith('rnn')]
    return tf.transpose(lstm_output)

  def get_gru_layer(self, layer: tf.Tensor) -> tf.Tensor:
    gru = tf.nn.rnn_cell.GRUCell(self.hidden_units)
    gru_output, gru_out_state = tf.nn.dynamic_rnn(gru, layer, dtype=self.dtype)
    self.params += [v for v in tf.global_variables() if v.name.startswith('rnn')]
    return tf.transpose(gru_output)

  def get_dropout_layer(self, layer: tf.Tensor) -> tf.Tensor:
    return tf.layers.dropout(layer, self.dropout_rate)

  def get_output_layer(self, layer: tf.Tensor) -> tf.Tensor:
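    # Linear projection from hidden units to one emission score per tag at each
    # position.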
    output_weight = self.__get_variable([self.tags_count, self.hidden_units], 'output_weight')
    output_bias = self.__get_variable([self.tags_count, 1, 1], 'output_bias')
    self.params += [output_weight, output_bias]
    return tf.tensordot(output_weight, layer, [[1], [0]]) + output_bias

  def get_loss(self) -> (tf.Tensor, tf.Tensor):
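    # Margin-style loss: summed score of the predicted ("curr") indices minus
    # the gold ("corr") indices for emissions and transitions, plus L2
    # regularization; the second returned loss adds the initial-transition term.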
    output_loss = tf.reduce_sum(tf.gather_nd(self.output, self.ll_curr) - tf.gather_nd(self.output, self.ll_corr))
    trans_loss = tf.reduce_sum(tf.gather_nd(self.transition, self.trans_curr) - tf.gather_nd(self.transition, self.trans_corr))
    trans_i_curr = tf.gather_nd(self.transition_init, self.trans_init_curr)
    trans_i_corr = tf.gather_nd(self.transition_init, self.trans_init_corr)
    trans_init_loss = tf.reduce_sum(trans_i_curr - trans_i_corr)
    loss = output_loss + trans_loss
    regu = tf.contrib.layers.apply_regularization(tf.contrib.layers.l2_regularizer(self.lam), self.params)
    l1 = loss + regu
    l2 = l1 + trans_init_loss
    return l1, l2

  def __get_variable(self, size, name) -> tf.Variable:
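    # Truncated-normal initialization, scaled by 1 / sqrt(size of the last
    # dimension).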
    return tf.Variable(tf.truncated_normal(size, stddev=1.0 / math.sqrt(size[-1]), dtype=self.dtype), name=name)
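

# A minimal usage sketch (the paths and the no-argument DnnCrfConfig() call are
# hypothetical; the actual constructor arguments live in dnlp.config.config):
#
#   config = DnnCrfConfig()
#   model = DnnCrf(config=config, data_path='../dnlp/data/cws.pickle', nn='lstm')
#   model.fit(epochs=100, interval=20)
#
#   predictor = DnnCrf(config=config, mode='predict', nn='lstm',
#                      model_path='../dnlp/models/cws100.ckpt')
#   print(predictor.predict('我爱自然语言处理'))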