#!/usr/bin/env python
import sys, re, os, math
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
# Value that BoxHandler subtracts y coordinates from (e.g. endy = height - y),
# flipping the vertical axis.  Presumably the page image height in pixels --
# TODO confirm against the SVG being converted.
height = 3119
# NOTE(review): not referenced anywhere in the visible code.
offset = -38
# (startx, starty, endx, endy) tuples, one appended per <rect> element parsed.
rects = []
# Maps the (x, y) position of a <text> element to its character data.
texts = {}
class BoxHandler(ContentHandler):
    """SAX content handler collecting ``<rect>`` boxes and ``<text>`` labels.

    For every ``rect`` element a ``(startx, starty, endx, endy)`` tuple is
    appended to the rectangle list.  For every ``text`` element the character
    data is stored in the text dict under the element's ``(x, y)`` position,
    taken from its ``transform="matrix(1 0 0 1 X Y)"`` attribute.  All y
    values are flipped by subtracting them from the page height.

    Constructor arguments (all optional, so ``BoxHandler()`` keeps working):
        page_height: value y coordinates are subtracted from; defaults to the
            module-level ``height``.
        rect_list: list receiving the rectangle tuples; defaults to the
            module-level ``rects``.
        text_map: dict receiving the text entries; defaults to the
            module-level ``texts``.
    """

    # Only a pure translation matrix is accepted; group 1 is x, group 2 is y.
    _TRANSFORM_RE = re.compile(r"matrix\(1 0 0 1 ([\d.]+) ([\d.]+)\)")

    def __init__(self, page_height=None, rect_list=None, text_map=None):
        # Explicit base call (not super()) so this also works where
        # ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.inside_text = False
        self.textxy = (0, 0)
        # Character chunks of the <text> element currently being read.
        self._chunks = []
        self._page_height = height if page_height is None else page_height
        self._rects = rects if rect_list is None else rect_list
        self._texts = texts if text_map is None else text_map

    def getFeature(self, name):
        """Return 0 for the three known feature names; KeyError otherwise."""
        features = {'feature_namespaces': 0, 'feature_external_ges': 0, 'feature_external_pes': 0}
        return features[name]

    def startElement(self, name, attrs):
        """Record a rectangle, or note the position of a starting text.

        Exits the process with status 2 when a ``text`` element has a missing
        or unsupported ``transform`` attribute.
        """
        if name == 'rect':
            x, y, rectwidth, rectheight = [
                int(round(float(attrs.get(key))))
                for key in ('x', 'y', 'width', 'height')
            ]
            endy = self._page_height - y
            starty = endy - rectheight
            self._rects.append((x, starty, x + rectwidth, endy))
        elif name == 'text':
            self.inside_text = True
            self._chunks = []
            tf = attrs.get('transform')
            # A missing attribute must reach the error message below instead
            # of crashing inside the regex engine with a TypeError.
            q = self._TRANSFORM_RE.match(tf) if tf is not None else None
            if not q:
                print("Error parsing text element: got transform " + str(tf))
                sys.exit(2)
            textx = int(round(float(q.group(1))))
            texty = self._page_height - int(round(float(q.group(2))))
            self.textxy = (textx, texty)

    def endElement(self, name):
        """On ``</text>``, store the accumulated characters for its position."""
        if name == 'text':
            self.inside_text = False
            # No entry is created for a text element without character data,
            # so the lookup for such a box still fails as it did before.
            if self._chunks:
                self._texts[self.textxy] = "".join(self._chunks)
            self._chunks = []

    def characters(self, ch):
        """Accumulate character data while inside a ``text`` element.

        A parser may report one run of text in several calls, so the chunks
        are concatenated verbatim rather than each replacing the previous.
        """
        if self.inside_text:
            self._chunks.append(ch)
# --- command line handling -------------------------------------------------
if len(sys.argv) < 2:
    print("Usage: svg2box.py <filename.svg>")
    sys.exit(2)
svgfile = sys.argv[1]
# Escape the dot and anchor the pattern so that only names really ending in
# ".svg" are accepted (the unanchored "(.*).svg" also matched e.g. "foosvg"
# or "a.svg.bak" and then derived the wrong base name).
q = re.match(r"(.*)\.svg$", svgfile)
if not q:
    print("Input is not a svg file!")
    sys.exit(2)
basefile = q.group(1)

# --- open input, back up any existing output, open output -------------------
inf = open(basefile + '.svg', 'r')
if os.path.exists(basefile + '.box'):
    # Back up the original file.  os.rename avoids handing the file name to a
    # shell (spaces/metacharacters) and raises if the move fails, so the
    # message below is only printed once the backup really exists.
    os.rename(basefile + '.box', basefile + '.box.orig')
    print("%s.box has been saved to %s.box.orig" % (basefile, basefile))
outf = open(basefile + '.box', 'w')

# --- parse the SVG; BoxHandler collects the rectangles and texts ------------
print("Converting %s.svg to %s.box" % (basefile, basefile))
parser = make_parser()
parser.setContentHandler(BoxHandler())
parser.parse(inf)
print("Done parsing, sorting the results...")
def sorthelper(a, b):
    """cmp-style comparator that orders boxes line by line.

    ``a`` and ``b`` are sequences whose items 0 and 1 are an x and a y value.
    If the y values differ by less than 35 the two boxes are treated as being
    on the same line and compared by x (ascending); otherwise the box with
    the larger y sorts first.

    Returns -1, 0 or 1.  Implemented without the ``cmp`` builtin, which no
    longer exists in Python 3.

    NOTE(review): the 35-unit tolerance makes this comparison non-transitive
    (a~b and b~c does not imply a~c), so the resulting order can depend on
    the input order for boxes whose y values straddle the threshold.
    """
    def three_way(left, right):
        # Equivalent to the old cmp(left, right).
        return (left > right) - (left < right)

    if abs(a[1] - b[1]) < 35:
        # Same line: sort horizontally.
        return three_way(a[0], b[0])
    # Different lines: sort vertically, larger y first.
    return three_way(b[1], a[1])
# now sort the database to our satisfaction
# (cmp_to_key wraps the cmp-style sorthelper so the sort call works on both
# Python 2.7 and Python 3, where list.sort no longer accepts a comparator.)
import functools
rects.sort(key=functools.cmp_to_key(sorthelper))

# Write one "<char> <startx> <starty> <endx> <endy>" line per rectangle.
try:
    for (startx, starty, endx, endy) in rects:
        try:
            # A text belongs to a box when its position equals the box's
            # (startx, endy) corner exactly.
            char = texts[(startx, endy)]
        except KeyError:
            print("No character found for box (%d, %d, %d, %d)" % (startx, starty, endx, endy))
            char = "?"
        line = "%s %d %d %d %d\n" % (char, startx, starty, endx, endy)
        # On Python 2 the parsed text is unicode, which makes the whole line
        # unicode; writing that to a plain file fails for non-ASCII glyphs,
        # so encode it explicitly.  On Python 3 the line is already str.
        if not isinstance(line, str):
            line = line.encode('utf-8')
        outf.write(line)
finally:
    # Close both files even if writing fails part-way through.
    outf.close()
    inf.close()