#!/usr/bin/env python

import functools
import math
import os
import re
import sys

from xml.sax import make_parser
from xml.sax.handler import ContentHandler

height = 3119
offset = -38

rects = []
texts = {}

class BoxHandler(ContentHandler):
	"""SAX content handler that harvests boxes and their labels from an SVG.

	Every <rect> element is appended to the module-level ``rects`` list as a
	``(startx, starty, endx, endy)`` tuple, and the character data of every
	<text> element is stored in the module-level ``texts`` dict under the key
	``(textx, texty)``.  All y coordinates are flipped with the module-level
	``height`` (``height - y``), so the stored values are measured from the
	opposite vertical edge compared to the SVG input.
	"""

	# Only transforms of the exact form "matrix(1 0 0 1 <x> <y>)" are
	# understood; groups 1 and 3 capture the x and y translation.
	_TRANSFORM_RE = re.compile(r"matrix\(1 0 0 1 ((\d|\.)+) ((\d|\.)+)\)")

	def __init__(self):
		ContentHandler.__init__(self)
		# True while the parser is between <text> and </text>.
		self.inside_text = False
		# Key under which the current <text> content is stored in ``texts``.
		self.textxy = (0, 0)
		# Character chunks received so far for the current <text> element.
		self._text_chunks = []

	def getFeature(self, name):
		"""Return 0 for the three known feature names; KeyError otherwise."""
		features = {'feature_namespaces': 0, 'feature_external_ges': 0, 'feature_external_pes': 0}
		return features[name]

	def startElement(self, name, attrs):
		"""Record a <rect> box, or start collecting a <text> label.

		Exits the process with status 2 if a <text> element has a missing or
		unsupported ``transform`` attribute.
		"""
		if name == 'rect':
			x, y, rectwidth, rectheight = [
				int(round(float(attrs.get(key)), 0))
				for key in ('x', 'y', 'width', 'height')
			]

			# Flip the vertical axis: the rect's y becomes the far edge (endy).
			endy = height - y
			starty = endy - rectheight

			endx = x + rectwidth
			startx = x

			rects.append((startx, starty, endx, endy))

		elif name == 'text':
			self.inside_text = True
			# Start a fresh buffer so chunks from a previous <text> never leak in.
			self._text_chunks = []

			tf = attrs.get('transform')
			# A missing attribute comes back as None, which re.match() would
			# reject with a TypeError; route it to the same error path instead.
			q = self._TRANSFORM_RE.match(tf) if tf is not None else None
			if not q:
				print("Error parsing text element: got transform " + str(tf))
				sys.exit(2)
			textx = int(round(float(q.group(1))))
			texty = int(round(float(q.group(3))))

			texty = height - texty

			self.textxy = (textx, texty)

	def endElement(self, name):
		"""Stop collecting character data once the <text> element closes."""
		if name == 'text':
			self.inside_text = False

	def characters(self, ch):
		"""Accumulate character data for the current <text> element.

		SAX parsers are allowed to deliver one run of text in several calls,
		so the chunks are joined rather than letting the last one win.
		"""
		if self.inside_text:
			self._text_chunks.append(ch)
			texts[self.textxy] = ''.join(self._text_chunks)


# Command line: exactly one argument is used, the path of the SVG to convert.
if len(sys.argv) < 2:
	print("Usage: svg2box.py <filename.svg>")
	sys.exit(2)

svgfile = sys.argv[1]

# The dot is escaped and the pattern anchored so that only names that really
# end in ".svg" are accepted (an unescaped "." also matched e.g. "my_svg.txt").
q = re.match(r"(.*)\.svg$", svgfile)
if not q:
	print("Input is not a svg file!")
	sys.exit(2)
else:
	basefile = q.group(1)


inf = open(basefile + '.svg', 'r')

if os.path.exists(basefile + '.box'):
	# Back up the existing box file.  os.rename avoids building a shell
	# command from the file name (which broke on spaces and metacharacters)
	# and raises on failure instead of silently losing the backup.
	os.rename(basefile + '.box', basefile + '.box.orig')
	print("%s.box has been saved to %s.box.orig" % (basefile, basefile))

outf = open(basefile + '.box', 'w')

print("Converting %s.svg to %s.box" % (basefile, basefile))

# BoxHandler fills the module-level ``rects`` and ``texts`` while parsing.
parser = make_parser()
parser.setContentHandler(BoxHandler())

parser.parse(inf)

print("Done parsing, sorting the results...")

def sorthelper(a, b, line_tolerance=35):
	"""Three-way comparator ordering boxes in reading order.

	``a`` and ``b`` are sequences whose item 0 is the x position and item 1
	the y position used for ordering.  Boxes whose y positions differ by less
	than ``line_tolerance`` are treated as being on the same line and ordered
	by ascending x; otherwise the box with the larger y comes first.

	Returns -1, 0 or 1, like the Python 2 ``cmp`` builtin (which no longer
	exists on Python 3, hence the explicit comparison below).
	"""
	if abs(a[1] - b[1]) < line_tolerance:
		# Same line: sort horizontally, left to right.
		first, second = a[0], b[0]
	else:
		# Different lines: sort vertically, larger y first.
		first, second = b[1], a[1]
	return (first > second) - (first < second)

# Now sort the boxes into reading order.  cmp_to_key adapts the two-argument
# comparator to a key function; passing it positionally to list.sort() only
# works on Python 2.
rects.sort(key=functools.cmp_to_key(sorthelper))

for (startx, starty, endx, endy) in rects:
	# Labels are keyed by (x, flipped y), which for a matching box equals
	# (startx, endy).
	try:
		char = texts[(startx, endy)]
	except KeyError:
		print("No character found for box (%d, %d, %d, %d)" % (startx, starty, endx, endy))
		char = "?"

	line = u"%s %d %d %d %d\n" % (char, startx, starty, endx, endy)
	if not isinstance(line, str):
		# Python 2: ``line`` is a unicode object; encode it explicitly so a
		# non-ASCII character does not raise UnicodeEncodeError on write.
		line = line.encode('utf-8')
	outf.write(line)

outf.close()
inf.close()