1058 lines
36 KiB
Python
Executable File
1058 lines
36 KiB
Python
Executable File
# from transliteration_resources import (
|
||
# azure_transliteration,
|
||
# indic_trans,
|
||
# indic_transliteration_OTHER_GUJARATI,
|
||
# indic_transliteration_OTHER_GURMUKHI,
|
||
# indic_transliteration_OTHER_ORIYA,
|
||
# om_transliterator,
|
||
# libindic,
|
||
# indic_transliteration_IAST,
|
||
# indic_transliteration_ITRANS,
|
||
# # polyglot_trans,
|
||
# sheetal,
|
||
# unicode_transliteration_GURMUKHI,
|
||
# indic_transliteration_GURMUKHI,
|
||
# transliteration_LATIN_CYRILLIC,
|
||
# indic_transliteration_TELUGU,
|
||
# unicode_transliteration_GURMUKHI_LATIN,
|
||
# indic_transliteration_GURMUKHI_LATIN,
|
||
# transliteration_CYRILIC_LATIN,
|
||
# ConvertToLatin,
|
||
# readonly,
|
||
# indic_transliteration_OTHER_DEVANAGRI,
|
||
# indic_transliteration_DEVANAGRI_OTHER,
|
||
# indic_transliteration_KANNADA_OTHER,
|
||
# indic_transliteration_OTHER_KANNADA,
|
||
# indic_transliteration_TAMIL_OTHER,
|
||
# indic_transliteration_OTHER_TAMIL,
|
||
# indic_transliteration_TELUGU_OTHER,
|
||
# indic_transliteration_MALAYALAM_OTHER,
|
||
# indic_transliteration_OTHER_GUJARATI,
|
||
# indic_transliteration_OTHER_GURMUKHI,
|
||
# indic_transliteration_OTHER_ORIYA,
|
||
# translit_CHINESE_LATIN,
|
||
# translit_th_sin_mng_heb_to_latin
|
||
# ) # , translit_THAI_LATIN
|
||
import subprocess
|
||
import sys
|
||
import os
|
||
import requests, uuid, json
|
||
from indictrans import Transliterator
|
||
from om_transliterator import Transliterator as om_Transliterator
|
||
from indic_transliteration import sanscript
|
||
from indic_transliteration.sanscript import transliterate
|
||
from libindic.transliteration import getInstance
|
||
|
||
# Module-level libindic transliterator instance, created once at import time
# and reused by libindic() further down in this file.
t = getInstance()
|
||
|
||
from indic_transliteration import sanscript
|
||
from indic_transliteration.sanscript import transliterate
|
||
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
|
||
from transliterate import translit # , get_available_language_codes
|
||
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
|
||
# import polyglot
|
||
# from polyglot.transliteration import Transliterator as poly
|
||
# from polyglot.text import Text
|
||
import pinyin
|
||
from anyascii import anyascii
|
||
# from MNF.settings import BasePath
|
||
|
||
# Project root used to build the paths of the helper scripts that sheetal()
# and ritwik() execute.
# NOTE(review): hard-coded absolute path; the commented-out import above
# suggests it was meant to come from MNF.settings.BasePath — confirm.
basePath = "/home/user/mnf/project/MNF"
|
||
|
||
|
||
# -> Directly Usable azure api for transliteration
|
||
# Script names used by callers, mapped to the four-letter script codes sent
# to the Azure transliterate endpoint. Names not listed here are forwarded
# unchanged, so callers may also pass a script code directly.
_AZURE_SCRIPT_CODES = {
    "Devanagari": "Deva",
    "Arabic": "Arab",
    "Latin": "Latn",
    "Kannada": "Knda",
    "Tamil": "Taml",
    "Bengali": "Beng",
    "Telugu": "Telu",
    "Malayalam": "Mlym",
    "Cyrillic": "Cyrl",
    "Gurmukhi": "Guru",
    "Gujarati": "Gujr",
    "Oriya": "Orya",
    "Sinhala": "Sinh",
    "Hanji": "Hans",
    "Thai": "Thai",
    "Hebrew": "Hebr",
}


def azure_transliteration(text, source_lang, source_script, dest_script):
    """Transliterate *text* with the Azure Translator ``transliterate`` API.

    Args:
        text: The text to transliterate.
        source_lang: Language code sent as the ``language`` query parameter.
        source_script: Script name (e.g. ``"Devanagari"``) or script code of
            the input text.
        dest_script: Script name or script code of the desired output.

    Returns:
        The ``text`` field of the first element of the JSON response.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
    """
    source_script = _AZURE_SCRIPT_CODES.get(source_script, source_script)
    dest_script = _AZURE_SCRIPT_CODES.get(dest_script, dest_script)

    # NOTE(review): this key is committed to source control and should be
    # rotated. The environment variable takes precedence so the literal can
    # be removed without touching callers.
    subscription_key = os.environ.get(
        "AZURE_TRANSLATOR_KEY", "959354878e73458e898a69f1f5887b69"
    )
    endpoint = "https://api.cognitive.microsofttranslator.com"
    location = "eastus"
    url = endpoint + "/transliterate?api-version=3.0"
    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    print("source_script", source_script)
    print("dest_script", dest_script)
    print("source_lang", source_lang)
    print("text", text)

    params = {'language': source_lang, 'fromScript': source_script, 'toScript': dest_script}
    body = [{'text': text}]

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(url, params=params, headers=headers, json=body, timeout=30)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError when the error payload (a dict) is indexed like a list.
    response.raise_for_status()
    payload = response.json()
    print(payload)
    return payload[0]['text']
|
||
# NOTE(review): this runs every time the module is imported and calls
# azure_transliteration(), which issues an HTTP POST and prints the result.
# It looks like leftover manual testing — consider moving it under an
# `if __name__ == "__main__":` guard.
print(azure_transliteration("mera naam dharmesh hai", "hi", "Latn", "Deva"))
|
||
from collections import Counter
|
||
|
||
def two_sources_two_outputs(sources_name, O):
    """Return the two most frequent outputs together with their sources.

    Args:
        sources_name: Mapping from the *string* form of an index into ``O``
            (``"0"``, ``"1"``, ...) to the name of the source that produced
            that output.
        O: List of outputs, one per source.

    Returns:
        ``((out1, sources1), (out2, sources2))`` where ``out1`` is the most
        frequent output and ``sources1`` lists the names of the sources that
        produced it. Ties keep first-occurrence order. When ``O`` holds only
        one distinct output the second pair is ``("", "")``.

    Raises:
        IndexError: if ``O`` is empty.
    """
    print("sources name is", sources_name, O)

    tally = Counter(O)
    print("dict1", tally)

    counts_desc = sorted(tally.values(), reverse=True)
    print("sorted_value", counts_desc)

    # sorted() is stable, so equally frequent outputs keep the order in
    # which they first appeared in O.
    ranked = dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))
    print("sorted_Dict", ranked)

    sources = list(ranked)
    print(sources)

    # Output value -> positions in O where it occurs (first-seen order).
    positions = {}
    for index, value in enumerate(O):
        positions.setdefault(value, []).append(index)
    print("rm", list(positions.values()))
    print("fs", list(positions))
    print("here it is", positions)

    out1 = sources[0]
    source1 = [sources_name[str(index)] for index in positions[out1]]
    print(source1)

    if len(sources) == 1:
        return (out1, source1), ("", "")

    out2 = sources[1]
    source2 = [sources_name[str(index)] for index in positions[out2]]
    print((out1, source1), (out2, source2))
    return (out1, source1), (out2, source2)
|
||
|
||
def _first_unused_priority_source(priority_list, used_sources):
    """Return the first name in *priority_list* not in *used_sources*, else None."""
    for name in priority_list:
        if name not in used_sources:
            return name
    return None


def selection_source_transliteration(sources_name, O, priority_list):
    """Choose a primary and a secondary transliteration from competing outputs.

    Outputs are ranked by how many sources agree on them. If one output
    strictly wins the vote it becomes the primary; otherwise the output of
    ``priority_list[0]`` is preferred. The secondary is the runner-up when
    it is unambiguous, otherwise the output of the highest-priority source
    that did not contribute to the primary.

    Args:
        sources_name: Mapping of ``str(index)`` to source name, as expected
            by ``two_sources_two_outputs``.
        O: List of outputs, one per source.
        priority_list: Source names, most trusted first.
            NOTE(review): outputs are looked up with
            ``O[priority_list.index(name)]``, so this presumably must be in
            the same order as ``O`` — confirm against callers.

    Returns:
        ``((output1, source1), (output2, source2))``.
    """
    counts = sorted(Counter(O).values(), reverse=True)

    # True when the top count is strictly greater than every other count.
    has_clear_winner = bool(counts) and all(counts[0] > other for other in counts[1:])

    (o1, s1), (o2, s2) = two_sources_two_outputs(sources_name, O)

    if has_clear_winner:
        output1, source1 = o1, s1
        # The original appended to `check` instead of `check1` here, so the
        # runner-up test could never pass with three or more distinct
        # outputs. Test the runner-up against the remaining counts directly.
        runner_up_is_clear = len(counts) >= 2 and all(
            counts[1] > other for other in counts[2:]
        )
        if runner_up_is_clear:
            return (output1, source1), (o2, s2)
    elif priority_list[0] in s1:
        output1, source1 = o1, s1
    elif priority_list[0] in s2:
        output1, source1 = o2, s2
    else:
        output1, source1 = O[0], priority_list[0]

    # source1 can be a single name (a str) in the last branch above; wrap it
    # so the membership test compares whole names rather than substrings.
    used_sources = source1 if isinstance(source1, list) else [source1]
    fallback = _first_unused_priority_source(priority_list, used_sources)
    if fallback is None:
        output2, source2 = o2, s2
    else:
        output2, source2 = O[priority_list.index(fallback)], fallback

    return (output1, source1), (output2, source2)
|
||
|
||
def space_after_punct(text):
    """Insert a space after selected punctuation and collapse whitespace runs.

    ``". . ."`` becomes ``" ... "``; a space is added after each of
    ``, ! ? ( ) … -``; any run of two or more whitespace characters is then
    reduced to a single space.
    """
    import re

    text = text.replace(". . .", " ... ")
    text = re.sub(r"([,!?()…-])", r"\1 ", text)
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"\s{2,}", " ", text)
    return text
|
||
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of *original* to the words of *transliterated*.

    ``original`` is spaced out with ``space_after_punct`` and split on
    whitespace. Tokens that are themselves punctuation are copied through;
    every other token consumes the next word of ``transliterated`` and, if
    the original token ended with a punctuation character, that character
    is appended to the word.

    Args:
        original: Source sentence, punctuation included.
        transliterated: Transliteration of the words of ``original``, in
            the same order, separated by whitespace.

    Returns:
        The transliterated sentence with punctuation restored.

    Raises:
        IndexError: if ``transliterated`` has fewer words than ``original``
            has word tokens.
    """
    punctuation = frozenset([
        "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", " ",
        "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]",
        "^", "_", "`", "{", "|", "}", "~", "…", "...", "।",
    ])

    # Split once instead of re-splitting both strings on every access.
    source_tokens = space_after_punct(original).split()
    target_words = transliterated.split()

    sentence = []
    next_word = 0
    for token in source_tokens:
        if token in punctuation:
            sentence.append(token)
            continue
        word = target_words[next_word]
        next_word += 1
        if token[-1] in punctuation:
            word += token[-1]
        sentence.append(word)

    transliterated_sentence = " ".join(sentence)
    # str.replace returns a new string; the original discarded these
    # results, so the spacing added earlier was never undone.
    transliterated_sentence = transliterated_sentence.replace(" ... ", "...")
    transliterated_sentence = transliterated_sentence.replace("… ", "…")
    return transliterated_sentence
|
||
|
||
|
||
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Return the preferred transliteration among *outputs*.

    Delegates to ``selection_source_transliteration`` and returns only the
    text of the primary choice. ``word`` is not used by the current body;
    it was only referenced by a since-disabled docx comparison-table export.
    """
    primary, _secondary = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    print(primary, "compare all transliterations")
    best_text = primary[0]
    return best_text
|
||
# -> Directly Usable Polyglot api for transliteration
|
||
# def polyglot_trans(text, source_script, dest_script):
|
||
# # from polyglot.downloader import downloader
|
||
|
||
# if source_script=="Latin":
|
||
# source_script="en"
|
||
# elif source_script=="Arabic":
|
||
# source_script="ar"
|
||
# elif source_script=="Hanji":
|
||
# source_script="zh"
|
||
|
||
# if dest_script=="Latin":
|
||
# dest_script="en"
|
||
# elif dest_script=="Arabic":
|
||
# dest_script="ar"
|
||
# elif source_script=="Hanji":
|
||
# source_script="zh"
|
||
|
||
# new_text = ""
|
||
# text_break = Text(text)
|
||
# for x in text_break.transliterate(dest_script):
|
||
# new_text = new_text + str(x)
|
||
# return new_text
|
||
|
||
|
||
# -> Directly Usable indic_trans api for transliteration
|
||
def indic_trans(text, source_script, dest_script):
    """Transliterate *text* with the ``indictrans`` ``Transliterator``.

    Args:
        text: Text to transliterate.
        source_script: Script name of the input (e.g. ``"Devanagari"``).
        dest_script: Script name of the output.

    Script names are translated to the three-letter codes passed to
    ``Transliterator``; names that are not in the table are forwarded
    unchanged.

    Returns:
        The result of ``Transliterator.transform(text)``.
    """
    # One table for both directions. The original dest chain had dead
    # duplicate branches, one branch that tested source_script by mistake,
    # and spelled Bengali as "Ben" where the source chain uses "ben".
    script_codes = {
        "Devanagari": "hin",
        "Arabic": "urd",
        "Kannada": "kan",
        "Tamil": "tam",
        "Latin": "eng",
        "Bengali": "ben",
        "Telugu": "tel",
        "Malayalam": "mal",
        "Oriya": "ori",
        "Gujarati": "guj",
        "Gurmukhi": "pan",
    }
    source_script = script_codes.get(source_script, source_script)
    dest_script = script_codes.get(dest_script, dest_script)

    trn = Transliterator(source=source_script, target=dest_script, build_lookup=True)
    return trn.transform(text)
|
||
|
||
|
||
# -> Directly Usable om_translator api for transliteration
|
||
def om_transliterator(text):
    """Return ``om_Transliterator().knda_to_latn(text)``.

    A fresh ``om_Transliterator`` instance is created on every call.
    """
    return om_Transliterator().knda_to_latn(text)
|
||
|
||
|
||
# -> Directly Usable libindic api for transliteration
|
||
def libindic(text, dest_script):
    """Transliterate *text* into *dest_script* with the shared libindic instance.

    Args:
        text: Text to transliterate.
        dest_script: Script name (e.g. ``"Devanagari"``). Names not in the
            table are used as-is as the language prefix.

    Returns:
        The result of ``t.transliterate(text, "<lang>_IN")``.
    """
    language_codes = {
        "Devanagari": "hi",
        "Latin": "en",
        "Malayalam": "ml",
        "Gujarati": "gu",
        "Oriya": "or",
        "Telugu": "te",
        "Bengali": "bn",
        "Tamil": "ta",
        "Kannada": "kn",
        # Was "gu", which silently produced Gujarati for Gurmukhi requests.
        "Gurmukhi": "pa",
    }
    code = language_codes.get(dest_script, dest_script) + '_IN'
    return t.transliterate(text, code)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_IAST api for transliteration
|
||
def indic_transliteration_IAST(text):
    """Transliterate *text* from the IAST scheme to Devanagari via sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_ITRANS api for transliteration
|
||
def indic_transliteration_ITRANS(text):
    """Transliterate *text* from the ITRANS scheme to Devanagari via sanscript."""
    return transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
|
||
|
||
|
||
# -> Directly Usable sheetal api for transliteration
|
||
def sheetal(text):
    """Run the ``dev-rom-sheetal.py`` helper script on *text*.

    The script is executed with the current Python interpreter and ``text``
    as its only argument; its standard output is returned decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-sheetal.py"
    raw_output = subprocess.check_output([sys.executable, script_path, text])
    return raw_output.decode('utf-8')
|
||
|
||
|
||
# -> Directly Usable ritwik code for transliteration
|
||
def ritwik(text):
    """Run the ``dev-rom-ritwik.py`` helper script on *text*.

    The script is executed with the current Python interpreter and ``text``
    as its only argument; its standard output is returned decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero.
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-ritwik.py"
    raw_output = subprocess.check_output([sys.executable, script_path, text])
    return raw_output.decode('utf-8')
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_GURMUKHI api for transliteration
|
||
def indic_transliteration_GURMUKHI(text):
    """Transliterate *text* from the IAST scheme to Gurmukhi via sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.GURMUKHI)
|
||
|
||
|
||
# -> Directly Usable unicode_transliteration_GURMUKHI api for transliteration
|
||
def unicode_transliteration_GURMUKHI(text):
    """Convert IAST *text* to Devanagari, then map it from "hi" to "pa".

    The first step uses sanscript; the second uses
    ``UnicodeIndicTransliterator.transliterate`` with the language codes
    ``"hi"`` and ``"pa"``.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
|
||
|
||
|
||
# -> Directly Usable transliteration_LATIN_CYRILLIC api for transliteration
|
||
def transliteration_LATIN_CYRILLIC(text):
    """Return ``translit(text, 'bg')`` from the ``transliterate`` package."""
    return translit(text, 'bg')
|
||
|
||
|
||
# -> Directly Usable translit_CHINESE_LATIN api for transliteration
|
||
def translit_CHINESE_LATIN(text):
    """Return ``pinyin.get`` of *text* in "strip" format, space-delimited."""
    return pinyin.get(text, format="strip", delimiter=" ")
|
||
|
||
|
||
def translit_th_sin_mng_heb_to_latin(text):
    """Return ``anyascii(text)``.

    NOTE(review): the name suggests this is used for Thai, Sinhala and
    Hebrew input (and one more script, "mng") — confirm against callers.
    """
    return anyascii(text)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_TELUGU api for transliteration
|
||
def indic_transliteration_TELUGU(text):
    """Transliterate *text* from the IAST scheme to Telugu via sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.TELUGU)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_GURMUKHI_LATIN api for transliteration
|
||
def indic_transliteration_GURMUKHI_LATIN(text):
    """Transliterate *text* from Gurmukhi to the ITRANS scheme via sanscript."""
    return transliterate(text, sanscript.GURMUKHI, sanscript.ITRANS)
|
||
|
||
|
||
# -> Directly Usable unicode_transliteration_GURMUKHI_LATIN api for transliteration
|
||
def unicode_transliteration_GURMUKHI_LATIN(text):
    """Convert IAST *text* to Devanagari, then map it from "hi" to "pa".

    NOTE(review): despite the ``_GURMUKHI_LATIN`` name, the body performs
    the same IAST -> Devanagari -> "pa" conversion as
    ``unicode_transliteration_GURMUKHI`` — confirm which direction is
    intended.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
|
||
|
||
|
||
# -> Directly Usable transliteration_CYRILIC_LATIN api for transliteration
|
||
def transliteration_CYRILIC_LATIN(text):
    """Return ``translit(text, 'bg', reversed=True)`` from ``transliterate``."""
    return translit(text, 'bg', reversed=True)
|
||
|
||
|
||
# -> Some Random Code to replace special characters
|
||
# Cyrillic letter -> Latin replacement used by readonly(). Built at module
# level, where `str` still refers to the builtin type.
_READONLY_CYRILLIC_TABLE = str.maketrans({
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e",
    "ё": "yo", "ж": "zh", "з": "z", "и": "i", "й": "j", "к": "k",
    "л": "l", "м": "m", "н": "n", "о": "o", "п": "p", "р": "r",
    "с": "s", "т": "t", "у": "u", "ф": "f", "х": "h", "ц": "c",
    "ч": "ch", "ш": "sh", "щ": "sch", "ъ": "j", "ы": "i", "ь": "j",
    "э": "e", "ю": "yu", "я": "ya",
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E",
    "Ё": "Yo", "Ж": "Zh", "З": "Z", "И": "I", "Й": "J", "К": "K",
    "Л": "L", "М": "M", "Н": "N", "О": "O", "П": "P", "Р": "R",
    "С": "S", "Т": "T", "У": "U", "Ф": "F", "Х": "H", "Ц": "C",
    "Ч": "Ch", "Ш": "Sh", "Щ": "Sch", "Ъ": "J", "Ы": "I", "Ь": "J",
    "Э": "E", "Ю": "Yu", "Я": "Ya",
})


def readonly(str):
    """Replace each Cyrillic letter in *str* with its Latin spelling.

    Characters without an entry in the table are left unchanged. The
    parameter keeps its original name (which shadows the builtin) so
    keyword callers are unaffected.
    """
    # `str` here is the argument, so this is the instance method translate().
    return str.translate(_READONLY_CYRILLIC_TABLE)
|
||
|
||
|
||
# -> Code to Convert Letters to Latin Script
|
||
def ConvertToLatin(source):
    """Apply ``readonly`` to every element of *source* and join the results.

    Iterating a string yields single characters, so each character is
    converted on its own and the pieces are concatenated in order.
    """
    return ''.join(readonly(letter) for letter in source)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_DEVANAGRI api for transliteration
|
||
def indic_transliteration_OTHER_DEVANAGRI(text, src_script):
    """Transliterate *text* from *src_script* into Devanagari via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: One of ``"Malayalam"``, ``"Gujarati"``, ``"Telugu"``,
            ``"Oriya"``, ``"Bengali"``, ``"Kannada"``, ``"Gurmukhi"``,
            ``"Tamil"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Gujarati": sanscript.GUJARATI,
        "Telugu": sanscript.TELUGU,
        "Oriya": sanscript.ORIYA,
        "Bengali": sanscript.BENGALI,
        "Kannada": sanscript.KANNADA,
        "Gurmukhi": sanscript.GURMUKHI,
        "Tamil": sanscript.TAMIL,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Devanagari output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.DEVANAGARI)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_DEVANAGRI_OTHER api for transliteration
|
||
def indic_transliteration_DEVANAGRI_OTHER(text, dest_script):
    """Transliterate Devanagari *text* into *dest_script* via sanscript.

    Args:
        text: Text written in Devanagari.
        dest_script: One of ``"Malayalam"``, ``"Gujarati"``, ``"Telugu"``,
            ``"Oriya"``, ``"Bengali"``, ``"Kannada"``, ``"Gurmukhi"``,
            ``"Tamil"``.

    Raises:
        ValueError: if ``dest_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Gujarati": sanscript.GUJARATI,
        "Telugu": sanscript.TELUGU,
        "Oriya": sanscript.ORIYA,
        "Bengali": sanscript.BENGALI,
        "Kannada": sanscript.KANNADA,
        "Gurmukhi": sanscript.GURMUKHI,
        "Tamil": sanscript.TAMIL,
    }
    if dest_script not in target_schemes:
        raise ValueError(f"Unsupported destination script for Devanagari input: {dest_script!r}")
    return transliterate(text, sanscript.DEVANAGARI, target_schemes[dest_script])
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_KANNADA_OTHER api for transliteration
|
||
def indic_transliteration_KANNADA_OTHER(text, dest_script):
    """Transliterate Kannada *text* into *dest_script* via sanscript.

    Args:
        text: Text written in Kannada.
        dest_script: One of ``"Malayalam"``, ``"Telugu"``, ``"Tamil"``,
            ``"Bengali"``.

    Raises:
        ValueError: if ``dest_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
        "Tamil": sanscript.TAMIL,
        "Bengali": sanscript.BENGALI,
    }
    if dest_script not in target_schemes:
        raise ValueError(f"Unsupported destination script for Kannada input: {dest_script!r}")
    return transliterate(text, sanscript.KANNADA, target_schemes[dest_script])
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_KANNADA api for transliteration
|
||
def indic_transliteration_OTHER_KANNADA(text, src_script):
    """Transliterate *text* from *src_script* into Kannada via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: One of ``"Malayalam"``, ``"Telugu"``, ``"Tamil"``,
            ``"Bengali"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
        "Tamil": sanscript.TAMIL,
        "Bengali": sanscript.BENGALI,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Kannada output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.KANNADA)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_TAMIL_OTHER api for transliteration
|
||
def indic_transliteration_TAMIL_OTHER(text, dest_script):
    """Transliterate Tamil *text* into *dest_script* via sanscript.

    Args:
        text: Text written in Tamil.
        dest_script: ``"Malayalam"`` or ``"Telugu"``.

    Raises:
        ValueError: if ``dest_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
    }
    if dest_script not in target_schemes:
        raise ValueError(f"Unsupported destination script for Tamil input: {dest_script!r}")
    return transliterate(text, sanscript.TAMIL, target_schemes[dest_script])
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_TAMIL api for transliteration
|
||
def indic_transliteration_OTHER_TAMIL(text, src_script):
    """Transliterate *text* from *src_script* into Tamil via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: ``"Malayalam"`` or ``"Telugu"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Tamil output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.TAMIL)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_TELUGU_OTHER api for transliteration
|
||
def indic_transliteration_TELUGU_OTHER(text, desc_script):
    """Transliterate Telugu *text* into *desc_script* via sanscript.

    Args:
        text: Text written in Telugu.
        desc_script: Destination script; only ``"Malayalam"`` is supported.

    Raises:
        ValueError: if ``desc_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
    }
    if desc_script not in target_schemes:
        raise ValueError(f"Unsupported destination script for Telugu input: {desc_script!r}")
    return transliterate(text, sanscript.TELUGU, target_schemes[desc_script])
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_MALAYALAM_OTHER api for transliteration
|
||
def indic_transliteration_MALAYALAM_OTHER(text, desc_script):
    """Transliterate Malayalam *text* into *desc_script* via sanscript.

    Args:
        text: Text written in Malayalam.
        desc_script: Destination script; only ``"Telugu"`` is supported.

    Raises:
        ValueError: if ``desc_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    target_schemes = {
        "Telugu": sanscript.TELUGU,
    }
    if desc_script not in target_schemes:
        raise ValueError(f"Unsupported destination script for Malayalam input: {desc_script!r}")
    return transliterate(text, sanscript.MALAYALAM, target_schemes[desc_script])
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_GUJARATI api for transliteration
|
||
def indic_transliteration_OTHER_GUJARATI(text, src_script):
    """Transliterate *text* from *src_script* into Gujarati via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: ``"Gurmukhi"`` or ``"Oriya"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Gurmukhi": sanscript.GURMUKHI,
        "Oriya": sanscript.ORIYA,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Gujarati output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.GUJARATI)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_GURMUKHI api for transliteration
|
||
def indic_transliteration_OTHER_GURMUKHI(text, src_script):
    """Transliterate *text* from *src_script* into Gurmukhi via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: ``"Gujarati"`` or ``"Oriya"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Gujarati": sanscript.GUJARATI,
        "Oriya": sanscript.ORIYA,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Gurmukhi output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.GURMUKHI)
|
||
|
||
|
||
# -> Directly Usable indic_transliteration_OTHER_ORIYA api for transliteration
|
||
def indic_transliteration_OTHER_ORIYA(text, src_script):
    """Transliterate *text* from *src_script* into Oriya via sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: ``"Gujarati"`` or ``"Gurmukhi"``.

    Raises:
        ValueError: if ``src_script`` is not supported. (The original body
            failed with UnboundLocalError in that case.)
    """
    source_schemes = {
        "Gujarati": sanscript.GUJARATI,
        "Gurmukhi": sanscript.GURMUKHI,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Oriya output: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.ORIYA)
|
||
from indicnlp.tokenize import sentence_tokenize
|
||
# Characters that punct_remover() turns into spaces. The backslash is
# written as an explicit "\\": the original literal relied on the invalid
# escape sequence "\," to get it.
_PUNCT_REMOVER_CHARS = "!()-[]{};:'\"\\,<>./?@#$%^&*_~…।"
_PUNCT_REMOVER_TABLE = str.maketrans(
    _PUNCT_REMOVER_CHARS, " " * len(_PUNCT_REMOVER_CHARS)
)


def punct_remover(string):
    """Return *string* with every listed punctuation character replaced by a space.

    Each punctuation character becomes exactly one space, so the length of
    the string is preserved and runs of punctuation yield runs of spaces.
    """
    # One pass over the string instead of a full replace() per character.
    return string.translate(_PUNCT_REMOVER_TABLE)
|
||
# Sample module-level inputs: a Hindi sentence plus its language code and
# the source/destination script names. The commented-out driver code at the
# end of this file refers to these names.
text = "सड़क के बीच में एक बड़ा ट्रक क्यों है?"
source_lang, source_script, dest_script = "hi", "Devanagari", "Latin"
|
||
|
||
|
||
|
||
# from fuzzywuzzy import fuzz
|
||
from difflib import SequenceMatcher
|
||
import Levenshtein
|
||
from rapidfuzz import fuzz
|
||
|
||
|
||
def calculate_edit_distance(original_word: str, transliterated_word: str) -> float:
    """Return ``Levenshtein.distance`` between the two words."""
    distance = Levenshtein.distance(original_word, transliterated_word)
    return distance
|
||
|
||
|
||
def calculate_similarity(original_word: str, transliterated_word: str) -> float:
    """Return a length-normalised similarity derived from edit distance.

    Computed as ``1 - distance / max(len(a), len(b))``: ``1.0`` for
    identical strings and ``0.0`` when the distance equals the longer
    length.
    """
    longest = max(len(original_word), len(transliterated_word))
    if longest == 0:
        # Two empty strings are identical; avoid dividing by zero.
        return 1.0
    return 1 - Levenshtein.distance(original_word, transliterated_word) / longest
|
||
|
||
|
||
def calculate_fuzz_similarity(original_word: str, transliterated_word: str) -> float:
    """Return ``rapidfuzz.fuzz.ratio`` for the two words."""
    ratio = fuzz.ratio(original_word, transliterated_word)
    return ratio
|
||
|
||
|
||
|
||
def get_best_output(inside_func: callable, original_word: str, transliteration_outputs: list, reverse: bool=False):
    """Return the candidate that scores best against *original_word*.

    Args:
        inside_func: Called as ``inside_func(original_word, candidate)``
            and expected to return a comparable score.
        original_word: Reference word; also the fallback result.
        transliteration_outputs: Candidates to score.
        reverse: When false, the highest score wins and a candidate must
            score strictly above ``0``. When true, the lowest score wins,
            starting from ``+inf``.

    Returns:
        The first candidate reaching the best score, or ``original_word``
        if no candidate beats the starting threshold.
    """
    best_score = float('inf') if reverse else 0
    winner = original_word

    for candidate in transliteration_outputs:
        score = inside_func(original_word, candidate)
        print("total paramter", score, best_score, original_word, candidate)

        # Strict comparison: on a tie the earlier candidate is kept.
        improved = score < best_score if reverse else score > best_score
        if improved:
            best_score, winner = score, candidate

    print(winner)
    return winner
|
||
|
||
|
||
|
||
def compare_transliteration_outputs(original_word: str, transliterated_words: list) -> str:
    """Choose one transliteration by majority vote across three metrics.

    Each metric (edit distance, normalised similarity, fuzz ratio) nominates
    its preferred candidate through ``get_best_output``; the nomination that
    occurs most often is returned. When all three disagree,
    ``Counter.most_common`` keeps first-seen order, so the edit-distance
    nomination wins.

    Args:
        original_word: Reference string the candidates are compared with.
        transliterated_words: Candidate transliterations.

    Returns:
        The winning candidate, or ``original_word`` unchanged when either
        argument is ``None``.
    """
    # Nothing to compare: hand the input straight back.
    if not (original_word is not None and transliterated_words is not None):
        return original_word

    # (metric, lower_is_better) pairs, evaluated in voting order.
    strategies = (
        (calculate_edit_distance, True),
        (calculate_similarity, False),
        (calculate_fuzz_similarity, False),
    )
    nominations = [
        get_best_output(metric, original_word, transliterated_words, lower_is_better)
        for metric, lower_is_better in strategies
    ]

    majority_pick = Counter(nominations).most_common(1)[0][0]

    print(nominations[0], nominations[1], nominations[2], "89999999999999999999")
    return majority_pick
|
||
|
||
|
||
|
||
|
||
|
||
# sources_name = {"0": "indic_trans", "1": "Azure","2": "libindic", "3": "sheetal"}
|
||
# priority_list = ["indic_trans", "Azure", "libindic", "sheetal"]
|
||
# etc_punctuation = ["", " . . .", " . .", " . . ”"]
|
||
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
|
||
# if source_lang == "ne":
|
||
# source_lang = "hi"
|
||
# transliterated_text = []
|
||
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
|
||
# if source_lang == "ne":
|
||
# source_lang = "hi"
|
||
# transliterated_text = []
|
||
# Out = []
|
||
# print("sentences", sentences)
|
||
# for sentence in sentences[0].split():
|
||
# print("full word -> ", sentence)
|
||
# if sentence in etc_punctuation:
|
||
# continue
|
||
# temp_sentence = punct_remover(sentence)
|
||
# t0 = indic_trans(temp_sentence, source_script, dest_script)
|
||
# t1 = azure_transliteration(
|
||
# temp_sentence, source_lang, source_script, dest_script
|
||
# )
|
||
# t2 = libindic(temp_sentence, dest_script).rstrip()
|
||
# t3 = sheetal(temp_sentence).replace("\n", "")
|
||
|
||
# Out = []
|
||
|
||
# for i in range(len(temp_sentence.split())):
|
||
# word = temp_sentence.split()[i]
|
||
# T0 = t0.split()[i]
|
||
# T1 = t1.split()[i]
|
||
# T2 = t2.split()[i]
|
||
# T3 = t3.split()[i]
|
||
# outputs = [T0, T1, T2, T3]
|
||
# out = compare_outputs_transliteration(
|
||
# word, outputs, sources_name, priority_list
|
||
# )
|
||
# Out.append(out)
|
||
# trans_sent_wo_punct = " ".join(Out)
|
||
# out = compare_transliteration_outputs(temp_sentence, [t0, t1, t2, t3])
|
||
# # print("this words output is -> ", out)
|
||
# # out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
|
||
# Out.append(out)
|
||
# trans_sent_wo_punct = " ".join(Out)
|
||
# print("trans_sent_wo_punct", trans_sent_wo_punct)
|
||
# transliterated_sentence = final_transliterated_sentence(
|
||
# sentence, trans_sent_wo_punct
|
||
# )
|
||
# print("trans_sent_w_punct", transliterated_sentence)
|
||
# transliterated_text.append(transliterated_sentence)
|
||
# print(transliterated_sentence)
|
||
|
||
# print("Entered Exiting Here1212", Out)
|
||
# f = open("output.txt", "w")
|
||
# f.write(" ".join(Out))
|
||
# f.close()
|
||
# return " ".join(transliterated_text)
|
||
# # print("original_sentence", sentence)
|
||
# temp_sentence = punct_remover(text)
|
||
# t0 = indic_trans(temp_sentence, source_script, dest_script)
|
||
# t1 = azure_transliteration(
|
||
# temp_sentence, source_lang, source_script, dest_script
|
||
# )
|
||
# t2 = libindic(temp_sentence, dest_script).rstrip()
|
||
# t3 = sheetal(temp_sentence).replace("\n", "")
|
||
# Out = []
|
||
# print(t0, t1, t2, t3)
|
||
# outputs_len = [len(T.split(" ")) for T in [t0, t1, t2, t3]]
|
||
# print(outputs_len, "outputs len")
|
||
# for i in range(len(temp_sentence.split())):
|
||
# word = temp_sentence.split()[i]
|
||
# T0 = t0.split()[i]
|
||
# T1 = t1.split()[i]
|
||
# T2 = t2.split()[i]
|
||
# T3 = t3.split()[i]
|
||
# outputs = [T0, T1, T2, T3]
|
||
#
|
||
# out = compare_outputs_transliteration(
|
||
# word, outputs, sources_name, priority_list
|
||
# )
|
||
# Out.append(out)
|
||
# trans_sent_wo_punct = " ".join(Out)
|
||
# out = compare_outputs_transliteration(temp_sentence, [t0,t1,t2,t3], sources_name, priority_list)
|
||
# # print("trans_sent_wo_punct", trans_sent_wo_punct)
|
||
# transliterated_sentence = final_transliterated_sentence(temp_sentence, out)
|
||
# print("trans_sent_", transliterated_sentence)
|
||
# transliterated_text.append(transliterated_sentence)
|
||
# print("Entered Exiting Here1212")
|
||
# print(" ".join(transliterated_text))
|
||
# if text in etc_punctuation:
|
||
# return text
|
||
# # print("original_sentence", sentence)
|
||
# temp_sentence = punct_remover(text)
|
||
# tt = 0
|
||
# try:
|
||
# t0 = indic_trans(temp_sentence, source_script, dest_script)
|
||
# outputa = t0
|
||
# except:
|
||
# tt += 1
|
||
# try:
|
||
# if tt == 1:
|
||
# t1 = azure_transliteration(
|
||
# temp_sentence, source_lang, source_script, dest_script
|
||
# )
|
||
# outputa = t1
|
||
# except:
|
||
# tt += 1
|
||
# # print("before t1111111111")
|
||
# try:
|
||
# if tt == 2:
|
||
# t2 = libindic(temp_sentence, dest_script).rstrip()
|
||
# outputa = t2
|
||
# except:
|
||
# tt += 1
|
||
# # print("before sheetal", t2)
|
||
# try:
|
||
# if tt == 3:
|
||
# t3 = sheetal(temp_sentence).replace("\n", "")
|
||
# outputa = t3
|
||
# except:
|
||
# tt += 1
|
||
#
|
||
# if tt == 4:
|
||
# outputa = text
|
||
# else:
|
||
# trans_sent_wo_punct = outputa
|
||
# print("trans_sent_wo_punct", trans_sent_wo_punct)
|
||
# transliterated_sentence = final_transliterated_sentence(
|
||
# sentence, trans_sent_wo_punct
|
||
# )
|
||
# print("trans_sent_w_punct", transliterated_sentence)
|
||
# transliterated_text.append(transliterated_sentence)
|
||
# print("Entered Exiting Here1212")
|
||
# return outputa
|
||
|
||
# source_lang = "hi"
|
||
# source_script = "Latin"
|
||
# dest_script = "Devanagari"
|
||
# sources_name = {
|
||
# "0": "Azure",
|
||
# "1": "indic_trans",
|
||
# "2": "google",
|
||
# "3": "indic_trans_IAST",
|
||
# }
|
||
# sentences = sentence_tokenize.sentence_split(text, lang="en")
|
||
# priority_list = [
|
||
# "Azure",
|
||
# "indic_trans",
|
||
# "google",
|
||
# "indic_trans_IAST",
|
||
# ]
|
||
# transliterated_text = []
|
||
# for sentence in sentences:
|
||
# if (
|
||
# sentence == ""
|
||
# or sentence == " . . ."
|
||
# or sentence == " . ."
|
||
# or sentence == " . . ”"
|
||
# ):
|
||
# continue
|
||
# OUT = []
|
||
# for word in sentence.split():
|
||
# if word == ".":
|
||
# continue
|
||
# t0 = azure_transliteration(
|
||
# word, source_lang, source_script, dest_script)
|
||
# t1 = indic_trans(word, source_script, dest_script)
|
||
# t2 = google(word, "en", "hi")
|
||
# t3 = indic_transliteration_IAST(word)
|
||
# outputs = [t0, t1, t2, t3]
|
||
# out = compare_outputs_transliteration(
|
||
# word, outputs, sources_name, priority_list
|
||
# )
|
||
# OUT.append(out)
|
||
# transliterated_text.append(" ".join(OUT))
|
||
# print("running perfectly")
|
||
# return " ".join(transliterated_text)
|
||
|
||
# print(indic_transliteration_IAST("mera naam dharmesh hai"))
|