# RemoveBOM v2.2
# Clones a directory structure while removing the BOM from UTF-8 files
# (C) 2007 Niall Douglas
# 23rd April 2007 (last modified: 10th March 2012)
 
import sys, os, shutil, re, time, urllib, hashlib
from lxml import etree
 
# Input files that are exempt from XHTML DTD validation.  Entries are compared
# verbatim against the paths found under the input directory, so they are
# spelled with backslash separators and the default "public_html" root.
dontbothervalidating = [
    "public_html\\Niall_stuff\\john_templeton_online_funding_inquiry.html",
    "public_html\\Niall_stuff\\vdiary\\archives\\june07.html",
    "public_html\\Niall_stuff\\vdiary\\archives\\march08.html",
    "public_html\\uploadspam\\uploadspam.html",
    "public_html\\uploadspam\\submit.html",
    "public_html\\google5fc8a1f8de4b73f0.html",
    "public_html\\private\\geshi\\docs\\geshi-doc.html",
    "public_html\\admin\\phpMyAdmin\\Documentation.html",
]
 
class DTDResolver(etree.Resolver):
    """Resolver that serves the XHTML DTDs and entity sets from local files.

    Requests for the W3C-hosted XHTML DTDs, and for the entity files they
    reference, are answered from files in the current working directory.
    Any other reference without a '/' in it is looked up under "public_html/".
    """

    # Requested system URL -> local file (relative to the working directory)
    # that is served in its place.
    _LOCAL_FILES = {
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd": "xhtml1-strict.dtd",
        "xhtml-lat1.ent": "xhtml-lat1.ent",
        "xhtml-symbol.ent": "xhtml-symbol.ent",
        "xhtml-special.ent": "xhtml-special.ent",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd": "xhtml1-transitional.dtd",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd": "xhtml1-frameset.dtd",
        "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd": "xhtml11-flat.dtd",
    }

    def resolve(self, url, id, context):
        """Map *url* to a local file, or return None if it is not handled here.

        url     -- system URL of the DTD/entity being requested
        id      -- public identifier (unused)
        context -- opaque resolver context, passed through to resolve_filename
        """
        local_name = self._LOCAL_FILES.get(url)
        if local_name is not None:
            return self.resolve_filename(local_name, context)
        # Bare file names that are not in the table are taken to live in the
        # site root.
        if '/' not in url:
            return self.resolve_filename("public_html/"+url, context)
        # Anything else is left unresolved (implicit None in the original).
        return None
 
 
def enumeratedir(path):
    """Index everything below *path*.

    Returns a dict mapping each full path (as built by os.path.join from the
    os.walk results) to a tuple (kind, stat) where kind is 1 for a file and
    2 for a directory, and stat is the os.stat() result for that path.
    The tree is walked bottom-up; *path* itself is not included.
    """
    entries = {}
    for dirpath, dirnames, filenames in os.walk(path, topdown=False):
        # Files first, then sub-directories, exactly one entry per name.
        for kind, names in ((1, filenames), (2, dirnames)):
            for name in names:
                full = os.path.join(dirpath, name)
                entries[full] = (kind, os.stat(full))
    return entries
def replaceroot(root, path, pathroot):
    """Re-home *path* from under *pathroot* to under *root*.

    *path* is expected to be *pathroot*, one separator character, and a
    relative remainder; the remainder is joined onto *root*.  *pathroot* must
    not carry a trailing separator.

    The remainder is taken with a forward slice.  The previous negative slice
    collapsed to path[-0:] (the whole path) when *path* was exactly
    *pathroot* plus the separator.
    """
    return os.path.join(root, path[len(pathroot)+1:])
def ensuredir(path):
    """Create directory *path* and any missing parents, announcing each one.

    Does nothing if *path* already exists or is empty.  An empty path is what
    os.path.dirname() returns for a bare file name, and previously sent this
    function into unbounded recursion.
    """
    if not path or os.path.exists(path):
        return
    parent = os.path.dirname(path)
    # dirname() of a filesystem root is the root itself; do not recurse on it.
    if parent != path:
        ensuredir(parent)
        # A path with a trailing separator is its own parent's directory and
        # has just been created by the recursive call.
        if os.path.exists(path):
            return
    print('Making directory %s' % path)
    os.mkdir(path)
# The three characters EF BB BF: the UTF-8 byte order mark, matched against the
# start of each .html file's contents.
UTF8BOM='\xef\xbb\xbf'
 
# Usage: RemoveBOM.py [indir outdir]
# Unless both directories are given, both fall back to the defaults.
try:
    indir=sys.argv[1]
    outdir=sys.argv[2]
except IndexError:
    # Only a missing argument is expected here; anything else should surface.
    indir="public_html"
    outdir="public_html_bomfixed"
if not indir or not outdir: raise Exception("Missing input or output dirs")
print("Enumerating directories ...")
if not os.path.exists(outdir): os.mkdir(outdir)
indircontents=enumeratedir(indir)
outdircontents=enumeratedir(outdir)
# Drop FrontPage metadata ("_vti_", "_private") and dot-files other than
# .htaccess from the input set.  The paths are collected first because the
# dict cannot be modified while it is being iterated.
frontpagestuff=[]
for path in indircontents:
    if "_vti_" in path or "_private" in path or ("\\." in path and "\\.htaccess" not in path):
        frontpagestuff.append(path)
for path in frontpagestuff:
    del indircontents[path]
 
# Write a fresh sitemap to sitemap.xml.tmp in the input directory.  It lists
# every regular file that is not excluded below, stamped with its
# modification time in UTC.
print("Generating sitemap.xml ...")
# "with" guarantees the temporary file is closed even if a write fails.
with open(indir+"/sitemap.xml.tmp", "wt") as sitemaph:
    sitemaph.write('<?xml version="1.0" encoding="utf-8"?>\n')
    sitemaph.write('<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    sitemaph.write('  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n')
    sitemaph.write('  http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"\n')
    sitemaph.write('  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    # Sorted so the generated file does not depend on dict iteration order;
    # the result is compared by hash against the previous sitemap.
    for path in sorted(indircontents):
        st=indircontents[path]
        # URL path relative to the site root, with forward slashes.
        segpath=urllib.quote(path[len(indir):].replace('\\', '/'))
        if (st[0]!=1
            or "/sitemap.xml" in segpath
            or "/_vti_" in segpath
            or "/private" in segpath
            # Of the diary archives only the index page is listed.
            or ("/Niall_stuff/vdiary/archives/" in segpath and not segpath[-10:]=="index.html")
            or "/." in segpath):
            continue
        sitemaph.write('<url><loc>http://www.nedprod.com'+segpath+'</loc><lastmod>'+time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(st[1].st_mtime))+'</lastmod></url>\n')
    sitemaph.write('</urlset>\n')
 
# Keep the existing sitemap.xml (and its timestamp) when the freshly generated
# one is identical; otherwise swap the new one in.
with open(indir+"/sitemap.xml.tmp", "rt") as sitemaph:
    newsitemaphash=hashlib.md5(sitemaph.read())
# On a first run there is no previous sitemap; treat that as "changed"
# instead of failing to open it.
if os.path.exists(indir+"/sitemap.xml"):
    with open(indir+"/sitemap.xml", "rt") as sitemaph:
        oldsitemaphash=hashlib.md5(sitemaph.read())
else:
    oldsitemaphash=None
if oldsitemaphash is not None and oldsitemaphash.digest()==newsitemaphash.digest():
    print("Site hasn't changed!")
    os.remove(indir+"/sitemap.xml.tmp")
else:
    print("Site has changed, replacing sitemap.xml!")
    # The old file is removed before the rename, as in the original sequence.
    if oldsitemaphash is not None:
        os.remove(indir+"/sitemap.xml")
    os.rename(indir+"/sitemap.xml.tmp", indir+"/sitemap.xml")
 
# Remove everything from the output tree that has no counterpart left in the
# (filtered) input tree.
print("Deleting deleted items ...")
# Longest paths first: a child's path is always longer than its parent's, so
# a stale directory has been emptied before os.rmdir() is called on it.
for path in sorted(outdircontents, key=len, reverse=True):
    ipath=replaceroot(indir, path, outdir)
    if ipath not in indircontents:
        print('Deleting %s' % path)
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        elif os.path.isdir(path):
            os.rmdir(path)
 
# Path endings of the CV documents whose <privatedetails> content is stripped.
_PRIVATE_XML_SUFFIXES=(
    os.sep+'xmlcv'+os.sep+'cv_v5.0.xml',
    os.sep+'xmlcv'+os.sep+'cv_v5.1.xml',
    os.sep+'xmlcv'+os.sep+'cv_v6.xml',
)

def _strip_private_details(data):
    """Return *data* with the content of its <privatedetails> element removed.

    The opening and closing tags themselves are kept.  Data without either
    tag is returned unchanged.  Raises ValueError when only one tag is
    present or the closing tag precedes the opening one, instead of slicing
    on a -1 index and producing a mangled file.
    """
    open_tag='<privatedetails>'
    start=data.find(open_tag)
    end=data.find('</privatedetails>')
    if start==-1 and end==-1:
        return data
    if start==-1 or end<start+len(open_tag):
        raise ValueError("unbalanced <privatedetails> element, file not written")
    return data[:start+len(open_tag)]+data[end:]

def _write_php_wrapped(f2, data, isUTF8):
    """Write *data* to *f2* between a PHP prologue and epilogue.

    The prologue rejects requests with trailing path components, sends
    Last-Modified (and the UTF-8 content type when *isUTF8*), and starts
    output buffering; the epilogue sends Content-Length and a page build time
    header and flushes the buffers.
    """
    f2.write('<?php \r\n')
    f2.write('$time = microtime(); $time = explode(" ", $time); $time = $time[1] + $time[0]; $startload = $time;\r\n')
    f2.write('$uri=parse_url("http://".$_SERVER["HTTP_HOST"].$_SERVER["REQUEST_URI"]);\r\n')
    f2.write('if(!$uri) { header("Status: 400 Bad Request"); exit; }\r\n')
    f2.write('$path_excess=substr($uri["path"], strlen($_SERVER["SCRIPT_NAME"]));\r\n')
    f2.write('if(strlen($path_excess)) { header("Status: 404 Not Found"); exit; }\r\n')
    if isUTF8:
        f2.write('header("Content-type: text/html; charset=utf-8");\r\n')
    f2.write('header("Last-Modified: " . gmdate("D, d M Y H:i:s", getlastmod()) . " GMT");\r\n')
    f2.write('ob_start();\r\n')
    f2.write('$gzipthis = (filesize(__FILE__)>65535);\r\n')
    f2.write('if($gzipthis) ob_start("ob_gzhandler");\r\n')
    f2.write('?>\r\n')
    f2.write(data)
    f2.write('<?php ')
    f2.write('if($gzipthis) ob_end_flush();\r\n')
    f2.write('header("Content-Length: " . ob_get_length());\r\n')
    f2.write('$time = microtime(); $time = explode(" ", $time); $time = $time[1] + $time[0]; $finish = $time; $total_time = round(($finish - $startload), 4); header("X-Page-Build-Time: " . $total_time);\r\n')
    f2.write('ob_end_flush();\r\n')
    f2.write('?>\r\n')

# Bring the output tree up to date.  A file is (re)written when it is missing
# from the output or its modification time differs from the output copy's by
# two seconds or more.  Problems found on the way are collected in
# xmlerrors.txt; a file that fails validation is skipped, not written.
print("Writing changed files ...")
xmlerrorsh=open("xmlerrors.txt", "wt")
for path, st in indircontents.items():
    if path=='public_html'+os.sep+'RemoveBOM.py': continue
    if st[0]!=1: continue   # directories are created on demand by ensuredir()
    f2path=replaceroot(outdir, path, indir)
    if f2path in outdircontents and \
        abs(st[1].st_mtime-outdircontents[f2path][1].st_mtime)<2:
        # Unchanged
        continue
    if path.endswith(_PRIVATE_XML_SUFFIXES):
        print('Fixing up XML at %s' % path)
        with open(path, 'rb') as f:
            data=f.read()
        try:
            data=_strip_private_details(data)
        except ValueError:
            xmlerrorsh.write("%s: %s\n\n" % (path, sys.exc_info()[1]))
            continue
        # The output directory may not exist yet when this is the first file
        # of its directory to be processed.
        ensuredir(os.path.dirname(f2path))
        with open(f2path, 'wb') as f2:
            f2.write(data)
        # NOTE(review): timestamps are not copied for these files (as in the
        # original), so they look changed and are rewritten on every run --
        # confirm whether that is intended.
    elif path.endswith('.html') and '_vti_cnf' not in path:
        with open(path, 'rb') as f:
            data=f.read()
        ensuredir(os.path.dirname(f2path))
        header=data[:64]
        if '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML' in header and path not in dontbothervalidating:
            # Replace whatever XHTML schema it uses with our own
            # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict with HTML5 microdata//EN" "xhtml1-strict-with-html5-microdata.dtd">
            doctype_start=data.find("<!DOCTYPE", 0, 256)
            doctype_end=-1
            if doctype_start!=-1:
                doctype_end=data.find(">", doctype_start, 256)
            if doctype_end==-1:
                # Reported like any other validation failure rather than
                # aborting the whole run on an assert.
                xmlerrorsh.write("%s: %s\n\n" % (path, "DOCTYPE declaration is not terminated within the first 256 bytes"))
                continue
            validate_data=data[:doctype_start]+'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict with HTML5 microdata//EN" "xhtml1-strict-with-html5-microdata.dtd">'+data[doctype_end+1:]
            # Validate according to the appropriate XHTML schema
            try:
                parser = etree.XMLParser(dtd_validation=True)
                parser.resolvers.add( DTDResolver() )
                etree.fromstring(validate_data, parser)
            except Exception:
                # "Exception" rather than a bare except, so that Ctrl-C is
                # not recorded as a validation error.
                xmlerrorsh.write("%s: %s\n\n" % (path, sys.exc_info()[1]))
                continue
        with open(f2path, 'wb') as f2:
            isUTF8=data.startswith(UTF8BOM)
            if isUTF8: # It's the UTF-8 BOM
                data=data[3:]
                try:
                    data.decode('utf-8')
                except UnicodeDecodeError:
                    # Reported, but the file is still written.
                    xmlerrorsh.write("%s: %s\n\n" % (path, sys.exc_info()[1]))
            if not data.startswith('<?php') or data.startswith('<?php // RemoveBOM this'):
                # Get PHP to mark server side that this is UTF-8 html
                if isUTF8:
                    print('File %s changed, removing UTF-8 BOM to %s' % (path, f2path))
                else:
                    print('File %s changed, modifying to %s' % (path, f2path))
                _write_php_wrapped(f2, data, isUTF8)
            else:
                # Already a PHP file of its own: written without the wrapper.
                f2.write(data)
        # copystat() copies the permission bits as well as the timestamps, so
        # the separate copymode() call was redundant.
        shutil.copystat(path, f2path)
    else:
        print('File %s changed, copying to %s' % (path, f2path))
        ensuredir(os.path.dirname(f2path))
        shutil.copy2(path, f2path)
 
# Close the error log and report; a non-empty log means something failed
# validation during the run.
xmlerrorsh.close()
if os.path.getsize("xmlerrors.txt"):
    print("\nThere were some XHTML validation errors!")
    # os.startfile() exists only on Windows; elsewhere the log is simply left
    # in the working directory.
    if hasattr(os, "startfile"):
        os.startfile("xmlerrors.txt")
else:
    print("\nAll up to date!")