#!/usr/bin/python
'''ISO 15919 transliteration for Devanagari text.

Simple usage:

    import iso15919
    romanised_unicode = iso15919.transliterate(indic_unicode)

Copyright (c) 2008 by Mublin <mublin@dealloc.org>
This module is free software, and you may redistribute it and/or modify
it under the same terms as Python itself, so long as this copyright message
and disclaimer are retained in their original form.
IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
TODO
* U+0904 = short a: transliteration?
* nasalisation of inherent vowel?
* check danda and double danda transliteration
Sources.
* http://www.unicode.org/charts/PDF/U0900.pdf
* http://transliteration.eki.ee/pdf/Hindi-Marathi-Nepali.pdf
* http://homepage.ntlworld.com/stone-catend/triunico.htm'''
# Module metadata: author contact, release date and version string.
__author__ = "Mublin <mublin@dealloc.org>"
__date__ = "20 April 2008"
__version__ = "0.1.8"
class TransliterationError(Exception):
    '''Error raised by transliterate() for a Devanagari character that
    has no entry in the transliteration table.'''
# Code points and inclusive code-point ranges of the Unicode Devanagari
# block.  transliterate() classifies characters by comparing them against
# these bounds, so every *_START/*_END pair is inclusive at both ends.

# The Devanagari block starts at U+0900.  Starting the range there makes
# transliterate() raise for an untransliterable U+0900 like it does for
# every other character of the block, instead of copying it through.
DEVANAGARI_START = u'\u0900'

# Signs written after a vowel or syllable.
CANDRABINDU = u'\u0901'
ANUSVARA = u'\u0902'
VISARGA = u'\u0903'

# Independent vowels.
VOWEL_START = u'\u0904'
VOWEL_END = u'\u0914'

# Consonants.
CONSONANT_START = u'\u0915'
CONSONANT_END = u'\u0939'

NUKTA = u'\u093c'
AVAGRAHA = u'\u093d'

# Dependent vowel signs (matras).
MATRA_START = u'\u093e'
MATRA_END = u'\u094c'

VIRAMA = u'\u094d'
OM = u'\u0950'

# Stress and accent marks.
UDATTA = u'\u0951'
ANUDATTA = u'\u0952'
GRAVE = u'\u0953'
ACUTE = u'\u0954'

# Precomposed consonant + nukta letters.
CONSONANT2_START = u'\u0958'
CONSONANT2_END = u'\u095f'

# Additional independent vowels and their dependent signs.
VOWEL2_START = u'\u0960'
VOWEL2_END = u'\u0961'
MATRA2_START = u'\u0962'
MATRA2_END = u'\u0963'

# Danda and double danda.
PUNCTUATION_START = u'\u0964'
DANDA = u'\u0964'
DOUBLEDANDA = u'\u0965'
PUNCTUATION_END = u'\u0965'

# Digits 0-9.
DIGIT_START = u'\u0966'
DIGIT_END = u'\u096f'

PUNCTUATION2_START = u'\u0970'
PUNCTUATION2_END = u'\u0971'

VOWEL3 = u'\u0972'

# Further letters at the end of the block.
CONSONANT3_START = u'\u097b'
CONSONANT3_END = u'\u097c'
GLOTTALSTOP = u'\u097d'
CONSONANT4_START = u'\u097e'
CONSONANT4_END = u'\u097f'

DEVANAGARI_END = u'\u097f'
# Main transliteration table, one row per Devanagari character:
#
#     <Devanagari character> TAB <transliteration>
#
# The separator is written as an explicit \t escape so that it cannot be
# lost when an editor or a formatting tool rewrites whitespace; the code
# that turns this string into a dictionary splits each row on it.
# Rows whose transliteration is empty are left out of that dictionary,
# so transliterate() reports those characters as untransliterable unless
# one of its special cases handles them first.
# Consonants carry the inherent vowel "a"; U+095A previously lacked it
# ("\u0121" instead of "\u0121a"), unlike every other consonant row.
iso15919 = u'''\
\u0901\tm\u0310
\u0902\t\u1e41
\u0903\t\u1e25
\u0904\t
\u0905\ta
\u0906\t\u0101
\u0907\ti
\u0908\t\u012b
\u0909\tu
\u090a\t\u016b
\u090b\t\u1e5b
\u090c\t\u1e37
\u090d\t\u00ea
\u090e\te
\u090f\t\u0113
\u0910\tai
\u0911\t\u00f4
\u0912\to
\u0913\t\u014d
\u0914\tau
\u0915\tka
\u0916\tkha
\u0917\tga
\u0918\tgha
\u0919\t\u1e45a
\u091a\tca
\u091b\tcha
\u091c\tja
\u091d\tjha
\u091e\t\xf1a
\u091f\t\u1e6da
\u0920\t\u1e6dha
\u0921\t\u1e0da
\u0922\t\u1e0dha
\u0923\t\u1e47a
\u0924\tta
\u0925\ttha
\u0926\tda
\u0927\tdha
\u0928\tna
\u0929\tn\u0331a
\u092a\tpa
\u092b\tpha
\u092c\tba
\u092d\tbha
\u092e\tma
\u092f\tya
\u0930\tra
\u0931\tr\u0331a
\u0932\tla
\u0933\t\u1e37a
\u0934\tl\u0331a
\u0935\tva
\u0936\t\u015ba
\u0937\t\u1e63a
\u0938\tsa
\u0939\tha
\u093c\t
\u093d\t'
\u093e\t\u0101
\u093f\ti
\u0940\t\u012b
\u0941\tu
\u0942\t\u016b
\u0943\t\u1e5b
\u0944\t\u1e5d
\u0945\t\u00ea
\u0946\te
\u0947\t\u0113
\u0948\tai
\u0949\t\u00f4
\u094a\to
\u094b\t\u014d
\u094c\tau
\u094d\t
\u0950\to\u1e43
\u0951\t
\u0952\t
\u0953\t
\u0954\t
\u0958\tqa
\u0959\tk\u0331h\u0331a
\u095a\t\u0121a
\u095b\tza
\u095c\t\u1e5ba
\u095d\t\u1e5bha
\u095e\tfa
\u095f\t\u1e8fa
\u0960\t\u1e5d
\u0961\t\u1e39
\u0962\t\u1e37
\u0963\t\u1e39
\u0964\t.
\u0965\t..
\u0966\t0
\u0967\t1
\u0968\t2
\u0969\t3
\u096a\t4
\u096b\t5
\u096c\t6
\u096d\t7
\u096e\t8
\u096f\t9
\u0970\t\u2026
\u0971\t
\u0972\t
\u097b\t
\u097c\t
\u097d\t
\u097e\t
\u097f\t'''
# Special transliterations for consonant triples which have a virama in
# the centre, as well as for some consonant + nukta pairs which are not
# equivalent to a single Unicode character.
# Row format: <cluster> TAB <transliteration>.  The separator is an
# explicit \t escape so it survives whitespace-rewriting tools.
clusters = u'''\
\u0939\u093c\th\u0324a
\u0938\u093c\ts\u0324a
\u0924\u093c\tt\u0324a
\u0915\u094d\u0937\tk\u1e63a
\u091c\u094d\u091e\tj\xf1a
\u0924\u094d\u0930\ttra
\u0936\u094d\u0930\t\u015bra'''
# Combinations of consonant and nukta which are equivalent to a single
# Unicode character.
# Row format: <precomposed character> TAB <consonant + nukta>.  The
# separator is an explicit \t escape so it survives whitespace-rewriting
# tools.
nukta_consonants = u'''\
\u0929\t\u0928\u093c
\u0931\t\u0930\u093c
\u0934\t\u0933\u093c
\u0958\t\u0915\u093c
\u0959\t\u0916\u093c
\u095a\t\u0917\u093c
\u095b\t\u091c\u093c
\u095c\t\u0921\u093c
\u095d\t\u0922\u093c
\u095e\t\u092b\u093c
\u095f\t\u092f\u093c'''
# Transliteration of anusvara where it is followed by a consonant.
# Row format: <nasal> TAB <space-separated consonants taking that nasal>.
# The TAB is an explicit \t escape so it survives whitespace-rewriting
# tools.
# The first row previously listed U+0915 twice and omitted U+0916 (kha),
# so anusvara before kha never received its class nasal.
# NOTE(review): the velar consonants U+0915..U+0919 share the plain "n"
# row with the dentals; ISO 15919 usually writes the velar nasal as
# U+1E45 -- confirm whether plain "n" is intended here.
anusvara_consonants = u'''\
n\t\u0915 \u0916 \u0917 \u0918 \u0919 \u0924 \u0925 \u0926 \u0927 \u0928
\u00f1\t\u091a \u091b \u091c \u091d \u091e
\u1e47\t\u091f \u0920 \u0921 \u0922 \u0923
m\t\u092a \u092b \u092c \u092d \u092e'''
# Convert the table strings defined above into lookup dictionaries.
# Each table name is rebound from its source string to the parsed dict.


def _table_rows(table):
    '''Return the (key, value) pairs of a table string.

    Each line holds a key, a run of whitespace and a value.  The line is
    split at the first whitespace run only, so a value may itself contain
    spaces.  A line without a value yields u'' as the value, and blank
    lines are skipped.  Splitting on any whitespace rather than on a
    literal tab keeps the tables readable even after an editor has
    converted tabs to spaces or stripped trailing whitespace.
    '''
    rows = []
    for row in table.split('\n'):
        fields = row.split(None, 1)
        if not fields:
            continue
        if len(fields) == 1:
            value = u''
        else:
            value = fields[1].strip()
        rows.append((fields[0], value))
    return rows


# character -> transliteration; characters with an empty transliteration
# are omitted, so a lookup for them fails.
_iso15919 = _table_rows(iso15919)
iso15919 = dict((char, trans) for char, trans in _iso15919 if trans)

# cluster -> transliteration, plus the set of characters which can start
# a cluster (stored as dict keys).
clusters = dict(_table_rows(clusters))
clusterables = dict.fromkeys(cluster[0] for cluster in clusters)

# precomposed character -> equivalent consonant + nukta sequence.
nukta_consonants = dict(_table_rows(nukta_consonants))

# consonant -> transliteration of an anusvara which precedes it.
anusvara_consonants, _anusvara_consonants = {}, anusvara_consonants
for nasal, consonants in _table_rows(_anusvara_consonants):
    for consonant in consonants.split():
        anusvara_consonants[consonant] = nasal
def _is_vowel_or_matra(char):
    '''Return True if char is an independent vowel or a dependent vowel sign.'''
    return (VOWEL_START <= char <= VOWEL_END
            or VOWEL2_START <= char <= VOWEL2_END
            or VOWEL3 == char
            or MATRA_START <= char <= MATRA_END
            or MATRA2_START <= char <= MATRA2_END)


def _is_consonant(char):
    '''Return True if char lies in one of the two main consonant ranges.'''
    return (CONSONANT_START <= char <= CONSONANT_END
            or CONSONANT2_START <= char <= CONSONANT2_END)


def transliterate(source):
    '''Transliterate Devanagari to the Latin alphabet (ISO 15919).

    transliterate(unicode) -> unicode

    source is scanned left to right.  Characters without an entry in the
    transliteration table are copied unchanged to the result unless they
    lie in the Devanagari range.

    Raises TransliterationError if a character from the Devanagari range
    cannot be transliterated.

    The function uses only constructs valid in both Python 2 and
    Python 3.'''
    # Normalisation: replace consonant + nukta by the equivalent
    # precomposed consonant.
    for precomposed, combination in nukta_consonants.items():
        source = source.replace(combination, precomposed)

    # Transliterate character by character.
    result, i = [], 0
    length = len(source)
    while i < length:
        char = source[i]

        # Anusvara + consonant?  Only the anusvara is consumed here; the
        # consonant is transliterated by the next iteration.
        if char == ANUSVARA and i + 1 < length:
            nasal = anusvara_consonants.get(source[i + 1])
            if nasal is not None:
                result.append(nasal)
                i += 1
                continue

        # Vowel + anusvara/candrabindu?  Emit a combining tilde.
        if (i and char in (ANUSVARA, CANDRABINDU)
                and _is_vowel_or_matra(source[i - 1])):
            result.append(u'\u0303')
            i += 1
            continue

        # Consonant + virama or matra?  Replace the inherent "a" of the
        # previous output item.
        if i and (char == VIRAMA or
                  MATRA_START <= char <= MATRA_END or
                  MATRA2_START <= char <= MATRA2_END):
            prev = source[i - 1]
            if prev == NUKTA and i > 1:
                # Look through a nukta to the consonant carrying it.
                prev = source[i - 2]
            if _is_consonant(prev):
                consonant = result[-1]
                # If the previous output has no inherent "a" to replace,
                # fall through to the generic handling below.
                if consonant.endswith('a'):
                    if char == VIRAMA:
                        result[-1] = consonant[:-1]
                    else:
                        result[-1] = consonant[:-1] + iso15919[char]
                    i += 1
                    continue

        # Special transliteration for a consonant cluster?
        if char in clusterables and i + 1 < length:
            following = source[i + 1]
            if following == VIRAMA:
                width = 3       # consonant + virama + consonant
            elif following == NUKTA:
                width = 2       # consonant + nukta
            else:
                width = 0
            if width:
                cluster = clusters.get(source[i:i + width])
                if cluster is not None:
                    result.append(cluster)
                    i += width
                    continue

        # Vowel + nukta?
        if i and char == NUKTA and _is_vowel_or_matra(source[i - 1]):
            result.append(u'\u2018')
            i += 1
            continue

        # Default: plain table lookup.
        trans = iso15919.get(char)
        if trans is None:
            if DEVANAGARI_START <= char <= DEVANAGARI_END:
                # Quote a few characters of context around the offender.
                start, end = i - 3, i + 3
                if start < 0:
                    start, end = 0, end - start
                raise TransliterationError(
                    'no transliteration for Devanagari %r (%r)'
                    % (char, source[start:end]))
            trans = char
        result.append(trans)
        i += 1

    # A unicode separator keeps the result unicode even for empty input.
    return u''.join(result)
# Command-line filter: read UTF-8 text from standard input and write its
# transliteration to standard output, line by line.  A line which cannot
# be transliterated is reported on standard error and skipped; the exit
# status is then 1, otherwise 0.
if __name__ == '__main__':
    import sys

    # Use the underlying byte streams where they exist (Python 3) so the
    # explicit UTF-8 decoding/encoding below works on both Python 2 and 3.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)

    status = 0
    for line in stdin:
        try:
            stdout.write(
                transliterate(line.decode('utf-8')).encode('utf-8'))
        except TransliterationError as e:
            # str(e) instead of the deprecated e.message attribute.
            sys.stderr.write('%s\n' % e)
            status = 1
    sys.exit(status)
# Generated by GNU
# enscript 1.6.4.