Skip to content

Instantly share code, notes, and snippets.

@keiji
Last active April 14, 2021 08:17
Show Gist options
  • Save keiji/fd97edaf5d0a9a14010fedde733d287f to your computer and use it in GitHub Desktop.
Save keiji/fd97edaf5d0a9a14010fedde733d287f to your computer and use it in GitHub Desktop.
Extract glyph set from .xlf files.
import os
import glob
# pip install absl-py
from absl import app
from absl import flags
import xml.etree.ElementTree as ET
# Command-line flags (absl). Values are read through FLAGS after app.run() parses argv.
FLAGS = flags.FLAGS

flags.DEFINE_string(
    name="target_dir",
    default=None,
    help="Target directory that contain xlf files",
)
flags.DEFINE_list(
    name="target_langs",
    # No default filter; an example value would be ['en', 'ja', 'zh-Hant'].
    default=None,
    help="List of BCP47 Tags for Identifying Languages",
)
flags.DEFINE_string(
    name="output_path",
    default='./output.txt',
    help="Output path",
)

# target_dir has no usable default, so it must be supplied on the command line.
flags.mark_flag_as_required("target_dir")

# Characters that are dropped from the final glyph string before it is written.
IGNORE_CHARS = [
    '\n',      # line feed
    '\r',      # carriage return
    ' ',       # space
    '\u200b',  # zero width space
    '\u200e',  # left-to-right mark
    '\u200f',  # right-to-left mark
    '\u202F',  # narrow no-break space
    '\u2007',  # figure space
    '\u2060',  # word joiner
]
def _extract_glyphs(file_path):
    """Collect the distinct characters used in the <target> elements of an XLIFF 1.2 file.

    Args:
        file_path: Path of the .xlf file to parse.

    Returns:
        A list of unique single-character strings, in arbitrary order.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    print(file_path)

    glyphs = set()
    root = ET.parse(file_path).getroot()
    for elem in root.findall('.//{urn:oasis:names:tc:xliff:document:1.2}target'):
        # elem.text is None for an empty <target/> (or one that starts with an
        # inline element), and it never includes text that follows or sits
        # inside inline child elements. itertext() yields every text fragment
        # within the element and skips the missing ones.
        for fragment in elem.itertext():
            glyphs.update(fragment)

    return list(glyphs)
def _load_default_glyphs(file_path):
    """Read the glyphs that should always be part of the output.

    Args:
        file_path: Path of a text file whose characters are the default glyphs.

    Returns:
        The whole file content as a string, or '' when the file does not exist.
    """
    if not os.path.exists(file_path):
        return ''

    # Decode explicitly as UTF-8: the platform default codec (e.g. cp1252 or
    # cp932 on Windows) would fail on, or garble, non-ASCII glyphs.
    with open(file_path, encoding='utf-8') as fp:
        return fp.read()
def main(argv):
    """Gather the glyphs used by the .xlf files in FLAGS.target_dir and write them out.

    The glyphs read from './default_glyphs.txt' are merged with the glyphs of
    every selected .xlf file, de-duplicated, sorted, stripped of the characters
    in IGNORE_CHARS and written as one string to FLAGS.output_path.

    Args:
        argv: Remaining command-line arguments passed by absl; unused.

    Raises:
        FileNotFoundError: If FLAGS.target_dir does not exist.
    """
    del argv  # Unused.

    # Raise explicitly instead of using assert, which is stripped under
    # `python -O` and would let a bad path silently yield an almost empty output.
    if not os.path.exists(FLAGS.target_dir):
        raise FileNotFoundError("%s is not exist" % FLAGS.target_dir)

    glyphs = set(_load_default_glyphs('./default_glyphs.txt'))

    xlf_file_paths = glob.glob(os.path.join(FLAGS.target_dir, "*.xlf"), recursive=False)
    for file_path in xlf_file_paths:
        name = os.path.basename(file_path)

        # BCP47 Tags for Identifying Languages
        # https://tools.ietf.org/html/bcp47
        # The tag is taken to be the second dot-separated part of the file
        # name (presumably files are named like `<base>.<lang>.xlf`); a name
        # with a single dot such as `foo.xlf` yields 'xlf'.
        lang_tag = name.split(".")[1]
        if FLAGS.target_langs and lang_tag not in FLAGS.target_langs:
            continue

        gl = _extract_glyphs(file_path)
        print('%s contains %d glyphs.' % (name, len(gl)))
        glyphs.update(gl)

    glyph_list = sorted(glyphs)
    print('%d glyphs are found.' % len(glyph_list))

    # Write as UTF-8 regardless of the locale: the glyph set routinely contains
    # characters that the platform default codec cannot encode.
    with open(FLAGS.output_path, mode='w', encoding='utf-8') as fp:
        output = ''.join(c for c in glyph_list if c not in IGNORE_CHARS)
        print('Output %d glyphs.' % len(output))
        fp.write(output)
if __name__ == '__main__':
    # Run only when executed as a script; absl parses the flags and then calls main.
    app.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment