[Unicode]   Unicode Localization Interoperability Technical Committee : Bug Tracking Home | Site Map | Search
 

source: trunk/abbrs/src/json2cldr.py @ 24

Revision 24, 2.9 KB checked in by srloomis, 13 months ago (diff)

ticket:7336: commit generated xml

Line 
1# -*- coding: utf-8 -*-
2# srl
3import sys
4
# Python 2 only: site.py removes sys.setdefaultencoding() at startup, so the
# module has to be reloaded to get it back.  Setting the default codec to
# UTF-8 makes implicit str <-> unicode conversions in the rest of the script
# use UTF-8 instead of ASCII.
# On Python 3 neither the `reload` builtin nor sys.setdefaultencoding exists
# (and the default encoding is already UTF-8), so the hack is skipped there
# instead of failing with a NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")
7
8import json
9import os
10from lxml import etree
11
# Set to True to print intermediate parsing results and the generated XML.
dbg = False

# Input directory holding one <locale>.json file per locale.
src_dir = '../json-cooked'
# Output directory receiving one <locale>.xml file per locale.
dst_dir = '../xml/common/segments'

# Text of the XML comment placed inside each <segmentation> element.
comment = 'From ULI data, http://uli.unicode.org'
# Value of the draft attribute on the generated <exceptions> element.
draft = 'provisional'
encoding = 'UTF-8' # of course
doctype = '<!DOCTYPE ldml SYSTEM "../../common/dtd/ldml.dtd">'


def list_locales(directory):
    """Return the sorted locale IDs that have a ``.json`` file in *directory*.

    Only files directly inside *directory* are considered: the converter
    later opens ``<directory>/<locale>.json``, so a file found in a
    subdirectory could never be read under its bare name.  The locale ID is
    the file name with just the ``.json`` suffix removed.

    Returns an empty list when *directory* does not exist or holds no
    ``.json`` files (os.walk yields nothing for an unreadable directory).
    """
    for _path, _dirs, names in os.walk(directory):
        # The first entry produced by os.walk is the top-level directory;
        # returning here keeps the walk from descending any further
        # (which also means version-control subdirectories are never seen).
        return sorted(
            os.path.splitext(name)[0]
            for name in names
            if name.endswith('.json')
        )
    return []


# list of locales
locs = list_locales(src_dir)

# For testing, restrict the run to a single locale:
#locs = ['en']

print('# Converting ULI JSON from %s to CLDR XML in %s' % (src_dir, dst_dir))
39
def _split_locale(loc):
    """Split an underscore-separated locale ID into its subtags.

    Returns a ``(language, script, region, variant)`` tuple; subtags that
    are not present are ``None``.  This is a positional heuristic rather
    than a real BCP 47 parser (todo use a real parser): the first segment
    is the language, a following 4-character segment is a script, a
    following segment shorter than 4 characters is a region (AB or 123),
    and the next remaining segment is the variant.  Any further segments
    are ignored.
    """
    parts = loc.split('_')
    count = len(parts)
    if dbg:
        print("locsplit = %s, len=%d" % (parts, count))

    language = script = region = variant = None
    n = 0
    if count > n:
        language = parts[n]
        n += 1
    if count > n and len(parts[n]) == 4:  # it is a script code
        script = parts[n]
        n += 1
    if count > n and len(parts[n]) < 4:  # it is AB or 123 - a region
        region = parts[n]
        n += 1
    if count > n:
        variant = parts[n]
    return language, script, region, variant


# Convert every discovered locale: read <src_dir>/<loc>.json, build the LDML
# document and write it to <dst_dir>/<loc>.xml (dst_dir must already exist).
for loc in locs:
    print('#   %s' % (loc,))
    fni = '%s/%s.json' % (src_dir, loc)
    with open(fni, "rb") as fi:
        data = json.load(fi)

    # De-duplicate the abbreviations and sort them so the generated XML is
    # reproducible from run to run instead of following set iteration order.
    abbrs = sorted(set(data['data']['abbrs']))

    language, script, region, variant = _split_locale(loc)

    ldml = etree.Element('ldml')
    root = etree.ElementTree(ldml)

    # <identity>: version/generation placeholders plus one element per
    # subtag that is present in the locale ID.
    identity = etree.Element('identity')
    identity.append(etree.Element('version', number='$Revision: $'))
    identity.append(etree.Element('generation', date='$Date: $'))
    for tag, value in (('language', language),
                       ('script', script),
                       ('region', region),
                       ('variant', variant)):
        if value:
            identity.append(etree.Element(tag, type=value))
    ldml.append(identity)

    # <segmentations>/<segmentation type="SentenceBreak">/<exceptions>:
    # the abbreviations are written one per line as the element's text.
    segmentations = etree.Element('segmentations')
    segmentation = etree.Element('segmentation', type='SentenceBreak')
    exceptions = etree.Element('exceptions', draft=draft)
    exceptions.text = ''.join('\n       ' + k for k in abbrs) + '\n     '
    segmentation.append(etree.Comment(comment))
    segmentation.append(exceptions)
    segmentations.append(segmentation)
    ldml.append(segmentations)

    xml_bytes = etree.tostring(root, xml_declaration=True, pretty_print=True,
                               encoding=encoding, doctype=doctype)
    if dbg:
        print(xml_bytes)

    fn = '%s/%s.xml' % (dst_dir, loc)
    # The with-block closes (and therefore flushes) the output file for
    # every locale instead of leaving that to garbage collection.
    with open(fn, 'wb') as f:
        f.write(xml_bytes)
        # Keep the extra trailing newline that `print >>f` used to append.
        f.write(b'\n')
    print('#        Wrote %d abbrs. to %s' % (len(abbrs), fn))
Note: See TracBrowser for help on using the repository browser.