[Unicode]   Unicode Localization Interoperability Technical Committee : Bug Tracking Home | Site Map | Search
 

source: trunk/abbrs/src/json2cldr.py @ 34

Revision 34, 2.8 KB checked in by srloomis, 16 months ago (diff)

ticket:7336: move pt_BR to pt. Update XML generator with latest CLDR DTD and latest proposal.

Line 
1# -*- coding: utf-8 -*-
2# srl
3import sys
4
5reload(sys)
6sys.setdefaultencoding("utf-8")
7
8import json
9import os
10from lxml import etree
11
# Set to True to print the parsed locale parts and the generated XML.
dbg = False

# Input directory holding one <locale>.json per locale, and the output
# directory that receives one <locale>.xml per locale.
src_dir = '../json-cooked'
dst_dir = '../xml/common/segments'

# Text of the XML comment placed inside each generated <segmentation>.
comment = 'From ULI data, http://uli.unicode.org'
# Value of the draft attribute on every generated <exception>.
draft = 'provisional'
encoding = 'UTF-8' # of course
doctype = '<!DOCTYPE ldml SYSTEM "../../common/dtd/ldml.dtd">'


def find_locales(src):
    """Return the sorted locale ids that have a ``<id>.json`` file in *src*.

    Only files directly inside *src* are considered, because the converter
    later opens ``<src>/<id>.json`` and would never find a file that lives
    in a subdirectory.  The id is the file name with just the ``.json``
    suffix removed, so it always maps back to the file it came from.

    A missing or unreadable *src* yields an empty list.
    """
    suffix = '.json'
    for _path, _dirs, names in os.walk(src):
        # The first tuple os.walk yields describes src itself; returning
        # here keeps the scan from descending into subdirectories.
        return sorted(name[:-len(suffix)]
                      for name in names
                      if name.endswith(suffix))
    # os.walk yielded nothing: src does not exist or cannot be listed.
    return []


# list of locales
locs = find_locales(src_dir)
34
35#testing
36#locs = ['en']
37
def split_locale(loc):
    """Split an underscore-separated locale id into its identity fields.

    Returns a ``(language, script, region, variant)`` tuple; fields that
    are absent from *loc* are None.  This is a positional heuristic, not a
    full BCP 47 parser (todo use a real parser).
    """
    parts = loc.split('_')
    count = len(parts)
    language = script = region = variant = None
    n = 0
    if count > n:
        language = parts[n]
        n += 1
    # A script code is four letters.  Requiring letters keeps a
    # four-character variant such as "1996" from being taken for a script.
    if count > n and len(parts[n]) == 4 and parts[n].isalpha():
        script = parts[n]
        n += 1
    if count > n and len(parts[n]) < 4:  # it is AB or 123 - a region
        region = parts[n]
        n += 1
    if count > n:
        variant = parts[n]
    return language, script, region, variant


# The single-argument parenthesised form prints the same text under the
# Python 2 print statement this script was written for.
print('# Converting ULI JSON from %s to CLDR XML in %s' % (src_dir, dst_dir))

for loc in locs:
    print('#   %s' % (loc))
    fni = '%s/%s.json' % (src_dir, loc)
    # The with-block closes the input even if the JSON fails to parse.
    with open(fni, "rb") as fi:
        data = json.load(fi)

    abbrs = set(data['data']['abbrs'])
    language, script, region, variant = split_locale(loc)
    if dbg:
        locsplit = loc.split('_')
        print("locsplit = %s, len=%d" % (locsplit, len(locsplit)))

    # <ldml><identity>...</identity><segmentations>...</segmentations></ldml>
    ldml = etree.Element('ldml')
    root = etree.ElementTree(ldml)
    identity = etree.Element('identity')
    identity.append(etree.Element('version', number='$Revision: $'))
    identity.append(etree.Element('generation', date='$Date: $'))
    # Only the fields present in the locale id get an element.
    for tag, value in (('language', language),
                       ('script', script),
                       ('region', region),
                       ('variant', variant)):
        if value:
            identity.append(etree.Element(tag, type=value))

    ldml.append(identity)
    segmentations = etree.Element('segmentations')
    segmentation = etree.Element('segmentation', type='SentenceBreak')
    exceptions = etree.Element('exceptions')
    # Sorted so the generated file does not depend on set iteration order
    # and stays stable from one run to the next.
    for k in sorted(abbrs):
        exception = etree.Element('exception', draft=draft)
        exception.text = k
        exceptions.append(exception)
    segmentation.append(etree.Comment(comment))
    segmentation.append(exceptions)
    segmentations.append(segmentation)
    ldml.append(segmentations)

    # An explicit encoding makes tostring return an encoded byte string,
    # which is why the output file is opened in binary mode below.
    xml_bytes = etree.tostring(root, xml_declaration=True, pretty_print=True,
                               encoding=encoding, doctype=doctype)
    if dbg:
        print(xml_bytes)

    fn = '%s/%s.xml' % (dst_dir, loc)
    # The with-block flushes and closes each output file before moving on.
    with open(fn, 'wb') as f:
        f.write(xml_bytes)
        # Keep the trailing newline the original print statement added.
        f.write(b'\n')
    print('#        Wrote %d abbrs. to %s' % (len(abbrs), fn))
Note: See TracBrowser for help on using the repository browser.