-
-
Save santhoshtr/950405 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Split Devanagari words into orthographic syllables and print them.

Each output line is "<word number> <syllable number> <syllable>", with both
counters starting at 1.
"""

texts = [u"वाराणसी", u"भौगोलिक", u"उपदर्शन"]

# Dependent signs (anusvara, visarga, vowel signs and the virama). They never
# start a syllable on their own; they attach to the preceding character.
signs = [
    u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
    u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948',
    u'\u094a', u'\u094b', u'\u094c', u'\u094d',
]

# Punctuation that always forms a unit of its own.
limiters = ['.', '"', "'", '`', '!', ';', ',', '?']

# A consonant that follows a virama joins the previous unit (conjunct).
virama = u'\u094d'


def syllabify(text):
    """Return the list of orthographic syllables of *text*.

    Rules, applied character by character:
      * a limiter always starts a new unit;
      * a dependent sign is appended to the previous unit;
      * any other character is appended to the previous unit when that
        unit ends in a virama, otherwise it starts a new unit.

    A character with no previous unit to attach to (for example a text
    that begins with a vowel sign) starts a new unit instead of raising
    IndexError.
    """
    units = []
    for char in text:
        if not units or char in limiters:
            units.append(char)
        elif char in signs or units[-1][-1] == virama:
            units[-1] += char
        else:
            units.append(char)
    return units


for text_index, text in enumerate(texts, 1):
    lst_chars = syllabify(text)
    for index, syllable in enumerate(lst_chars, 1):
        # Single-argument form prints identically on Python 2 and 3.
        print(u"%d %d %s" % (text_index, index, syllable))
I can import data from a text file:
import codecs

# Read the words to process from a UTF-8 text file, split on whitespace.
# The with-block closes the file as soon as its contents have been read.
with codecs.open('testfile.txt', encoding='utf-8') as f:
    texts = f.read().split()
And insert the syllabified characters into a table:
# Python 2: emit one shell command per syllable. The SQL statement is wrapped
# in double quotes for the shell and the syllable in single quotes for MySQL.
# Formatting with % avoids the spaces that comma-separated print arguments
# would insert inside the quoted value.
# NOTE(review): the value is not escaped; limiter syllables such as ' " or `
# would break the quoting.
print('mysql -e "insert into test.syllabalize values (%d, %d, \'%s\')"'
      % (text_index, index, syllable.encode('UTF-8')))
# Alternative: orthographic syllabification with the Indic NLP Library.
# NOTE(review): presumably the library resources must be loaded first, as in
# the "load required libraries" snippet -- confirm.
from indicnlp.syllable import syllabifier
w='जगदीशचंद्र'
# lang='hi' is presumably the Hindi language code; the call returns the list
# of syllables for w.
syllabifier.orthographic_syllabify(w, lang='hi')
returns ['ज', 'ग', 'दी', 'श', 'च', 'ंद्र']
load required libraries:
import sys

# Where the library checkout and its resources directory live.
INDIC_NLP_LIB_HOME = r"/root/marathi/indic_nlp_library/"
INDIC_NLP_RESOURCES = r"/root/marathi/indic_nlp_library/indic_nlp_resources/"

# The checkout has to be on sys.path before anything is imported from it.
sys.path.append(INDIC_NLP_LIB_HOME)

# Point the library at its resources, then load them.
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()
install:
# Clone the library, then clone its resources inside the library checkout.
# -p keeps the step from failing when the directory already exists.
mkdir -p /root/marathi
cd /root/marathi
git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
cd indic_nlp_library/
git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
import sys
from itertools import permutations

mylist = ['ज', 'ग', 'दी', 'श', 'च', 'ंद्र']

# Write every ordering of the syllables to anagram.txt, one per line.
# Writing through a dedicated handle (instead of rebinding sys.stdout) keeps
# later prints on the console and guarantees the file is flushed and closed.
with open("anagram.txt", "w") as out:
    for i in permutations(mylist):
        out.write(''.join(i) + "\n")
This writes all 720 (6! = 6 × 5 × 4 × 3 × 2 × 1) possible orderings to anagram.txt.
Compare them against a spell-check dictionary to find the valid words.
Sample output
$ python syllabify.py
1 1 वा
1 2 रा
1 3 ण
1 4 सी
2 1 भौ
2 2 गो
2 3 लि
2 4 क
3 1 उ
3 2 प
3 3 द
3 4 र्श
3 5 न