reformat code and add tag2sequences

supercoderhawk · supercoderhawk · commit 5436518b06e6 · 2017-11-29T21:00:38.000+08:00
diff --git a/python/dnlp/core/__init__.py b/python/dnlp/core/__init__.py
@@ -1 +1,2 @@
-#-*- coding: UTF-8 -*-
+#-*- coding: UTF-8 -*-
+from dnlp.core.dnn_crf import DnnCrf
diff --git a/python/dnlp/core/dnn_crf.py b/python/dnlp/core/dnn_crf.py
@@ -8,10 +8,10 @@
 
 class DnnCrf(DnnCrfBase):
   def __init__(self, *, config: DnnCrfConfig = None, data_path: str = '', dtype: type = tf.float32, mode: str = 'train',
-               train:str='ll',nn: str, model_path: str = ''):
+               train: str = 'll', nn: str, model_path: str = ''):
     if mode not in ['train', 'predict']:
       raise Exception('mode error')
-    if nn not in ['mlp', 'rnn', 'lstm', 'gru']:
+    if nn not in ['mlp', 'rnn', 'lstm', 'bilstm', 'gru']:
       raise Exception('name of neural network entered is not supported')
 
     DnnCrfBase.__init__(self, config, data_path, mode, model_path)
@@ -66,7 +66,6 @@ def __init__(self, *, config: DnnCrfConfig = None, data_path: str = '', dtype: t
       self.train = self.optimizer.minimize(self.loss)
       self.train_with_init = self.optimizer.minimize(self.loss_with_init)
 
-
   def fit(self, epochs: int = 100, interval: int = 20):
     with tf.Session() as sess:
       tf.global_variables_initializer().run()
@@ -130,16 +129,16 @@ def fit_batch(self, characters, labels, lengths, sess):
         feed_dict[self.trans_init_curr] = trans_init_neg_indices
         sess.run(self.train_with_init, feed_dict)
 
-  def fit_ll(self,epochs: int = 100, interval: int = 20):
+  def fit_ll(self, epochs: int = 100, interval: int = 20):
     with tf.Session() as sess:
       tf.global_variables_initializer().run()
       saver = tf.train.Saver(max_to_keep=epochs)
       for epoch in range(1, epochs + 1):
         print('epoch:', epoch)
         for _ in range(self.batch_count):
           characters, labels, lengths = self.get_batch()
-          #scores = sess.run(self.output, feed_dict={self.input: characters})
-          feed_dict = {self.input: characters, self.real_indices:labels, self.seq_length:lengths}
+          # scores = sess.run(self.output, feed_dict={self.input: characters})
+          feed_dict = {self.input: characters, self.real_indices: labels, self.seq_length: lengths}
           sess.run(self.train_ll, feed_dict=feed_dict)
           # self.fit_batch(characters, labels, lengths, sess)
         # if epoch % interval == 0:
@@ -178,7 +177,7 @@ def generate_transition_update_index(self, correct_labels, current_labels):
 
     return trans_pos, trans_neg, trans_init_pos, trans_init_neg, update_init
 
-  def predict(self, sentence: str):
+  def predict(self, sentence: str, return_labels=False):
     if self.mode != 'predict':
       raise Exception('mode is not allowed to predict')
     with tf.Session() as sess:
@@ -188,7 +187,10 @@ def predict(self, sentence: str):
       runner = [self.output, self.transition, self.transition_init]
       output, trans, trans_init = sess.run(runner, feed_dict={self.input: input})
       labels = self.viterbi(output, trans, trans_init)
-      return self.tags2words(sentence, labels)
+      if not return_labels:
+        return self.tags2words(sentence, labels)
+      else:
+        return self.tags2words(sentence, labels), labels
 
   def get_embedding_layer(self) -> tf.Tensor:
     embeddings = self.__get_variable([self.dict_size, self.embed_size], 'embeddings')
@@ -229,10 +231,10 @@ def get_dropout_layer(self, layer: tf.Tensor) -> tf.Tensor:
     return tf.layers.dropout(layer, self.dropout_rate)
 
   def get_output_layer(self, layer: tf.Tensor) -> tf.Tensor:
-    output_weight = self.__get_variable([self.hidden_units,self.tags_count], 'output_weight')
-    output_bias = self.__get_variable([1, 1, self.tags_count ], 'output_bias')
+    output_weight = self.__get_variable([self.hidden_units, self.tags_count], 'output_weight')
+    output_bias = self.__get_variable([1, 1, self.tags_count], 'output_bias')
     self.params += [output_weight, output_bias]
-    return tf.tensordot( layer,output_weight, [[2], [0]]) + output_bias
+    return tf.tensordot(layer, output_weight, [[2], [0]]) + output_bias
 
   def get_loss(self) -> (tf.Tensor, tf.Tensor):
     output_loss = tf.reduce_sum(tf.gather_nd(self.output, self.ll_curr) - tf.gather_nd(self.output, self.ll_corr))
diff --git a/python/dnlp/core/dnn_crf_base.py b/python/dnlp/core/dnn_crf_base.py
@@ -18,6 +18,7 @@ def __init__(self, config: DnnCrfConfig=None, data_path: str = '', mode: str = '
       self.dictionary, self.tags = self.__load_config()
     self.tags_count = len(self.tags) - 1  # &#24573;&#30053;TAG_PAD
     self.tags_map = self.__generate_tag_map()
+    self.reversed_tags_map = dict(zip(self.tags_map.values(),self.tags_map.keys()))
     self.dict_size = len(self.dictionary)
     # 初始化超参数
     self.skip_left = config.skip_left
@@ -171,3 +172,11 @@ def tags2entities(self, sentence: str, tags_seq: np.ndarray, return_start: bool
       return entities, entity_starts
     else:
       return entities
+
+  def tag2sequences(self, tags_seq:np.ndarray):
+    seq = []
+
+    for tag in tags_seq:
+      seq.append(self.reversed_tags_map[tag])
+
+    return seq

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-#-*- coding: UTF-8 -*-`
	`1`	`+#-*- coding: UTF-8 -*-`
	`2`	`+from dnlp.core.dnn_crf import DnnCrf`