Last active
April 14, 2021 08:17
-
-
Save keiji/fd97edaf5d0a9a14010fedde733d287f to your computer and use it in GitHub Desktop.
Extract glyph set from .xlf files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import glob | |
# pip install absl-py | |
from absl import app | |
from absl import flags | |
import xml.etree.ElementTree as ET | |
# Command-line flag registry (absl); the flags defined below are read as
# attributes of FLAGS.
FLAGS = flags.FLAGS

flags.DEFINE_string(
    name="target_dir",
    default=None,
    help="Target directory that contain xlf files",
)
flags.DEFINE_list(
    name="target_langs",
    # Example value: ['en', 'ja', 'zh-Hant']. The default (None) disables
    # language filtering.
    default=None,
    help="List of BCP47 Tags for Identifying Languages",
)
flags.DEFINE_string(
    name="output_path",
    default='./output.txt',
    help="Output path",
)

# Required flag.
flags.mark_flag_as_required("target_dir")

# Characters that are removed from the glyph set before it is written out.
IGNORE_CHARS = [
    '\n',      # line feed
    '\r',      # carriage return
    ' ',       # space
    '\u200b',  # zero width space
    '\u200e',  # left-to-right mark
    '\u200f',  # right-to-left mark
    '\u202F',  # narrow no-break space
    '\u2007',  # figure space
    '\u2060',  # word joiner
]
def _extract_glyphs(file_path):
    """Return the distinct characters used by the <target> texts of an XLIFF file.

    Args:
        file_path: Path to an XLIFF 1.2 (.xlf) file.

    Returns:
        A sorted list of the unique characters found in every ``<target>``
        element, including text nested inside or following inline child
        elements.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    print(file_path)

    root = ET.parse(file_path).getroot()

    glyphs = set()
    for target in root.findall('.//{urn:oasis:names:tc:xliff:document:1.2}target'):
        # itertext() yields nothing for an empty <target/> (where .text is
        # None) and also covers text inside/after inline child elements,
        # which reading .text alone would miss.
        for text in target.itertext():
            glyphs.update(text)

    return sorted(glyphs)
def _load_default_glyphs(file_path):
    """Read the baseline glyph string stored at *file_path*.

    Args:
        file_path: Path to a text file whose characters should always be
            part of the glyph set. The file is decoded as UTF-8.

    Returns:
        The file content as a string, or '' when the file does not exist.
    """
    try:
        # Decode explicitly as UTF-8: the platform default codec is
        # locale-dependent and can fail on non-ASCII glyphs.
        with open(file_path, encoding='utf-8') as fp:
            return fp.read()
    except FileNotFoundError:
        return ''
def main(argv):
    """Collect the glyphs used by the selected .xlf files and write them out.

    Reads ``./default_glyphs.txt`` (if present) as the initial glyph set,
    adds the glyphs of every ``*.xlf`` file directly inside
    ``FLAGS.target_dir`` whose language tag passes the ``FLAGS.target_langs``
    filter, and writes the sorted result, minus ``IGNORE_CHARS``, to
    ``FLAGS.output_path`` as UTF-8.

    Args:
        argv: Positional command-line arguments from absl; unused.

    Raises:
        app.UsageError: If ``FLAGS.target_dir`` is not an existing directory.
    """
    del argv  # Unused.

    # Validate with a real exception: `assert` is stripped under `python -O`.
    if not os.path.isdir(FLAGS.target_dir):
        raise app.UsageError(
            "%s does not exist or is not a directory" % FLAGS.target_dir)

    glyphs = set(_load_default_glyphs('./default_glyphs.txt'))

    # Sorted so the progress output is in a stable order between runs.
    xlf_file_paths = sorted(glob.glob(os.path.join(FLAGS.target_dir, "*.xlf")))

    for file_path in xlf_file_paths:
        name = os.path.basename(file_path)

        # BCP47 Tags for Identifying Languages
        # https://tools.ietf.org/html/bcp47
        # NOTE: assumes file names look like "<base>.<lang>.xlf"; the second
        # dot-separated component is taken as the language tag.
        lang_tag = name.split(".")[1]
        if FLAGS.target_langs and lang_tag not in FLAGS.target_langs:
            continue

        file_glyphs = _extract_glyphs(file_path)
        print('%s contains %d glyphs.' % (name, len(file_glyphs)))
        glyphs.update(file_glyphs)

    print('%d glyphs are found.' % len(glyphs))

    output = ''.join(c for c in sorted(glyphs) if c not in IGNORE_CHARS)
    print('Output %d glyphs.' % len(output))

    # Explicit UTF-8: the locale default codec may be unable to encode the
    # collected glyphs.
    with open(FLAGS.output_path, mode='w', encoding='utf-8') as fp:
        fp.write(output)
# Script entry point: hand control to absl, which calls main().
if __name__ == '__main__':
    app.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment