ned Productions - RemoveBOM v2

I found a second showstopping problem with Microsoft Expression Web other than the BOM breaking PHP, and this includes all versions up to v4: there is a bug where the formatting engine will slowly & randomly corrupt your HTML! This is a rather serious problem, so I resolved to fix it by having my preflight python script perform validation of new content to catch any errors.

The following Python script fixes this problem and is an enhancement of RemoveBOM v1. It takes your Microsoft Expression Web directory tree and copies it to another location, performing the following operations as it goes:

It generates a full sitemap.xml in the document root.
Tests .html files (not .htm; using .htm is an easy way to mark a file for non-validation) for the UTF-8 BOM. If present, it removes the BOM and checks that the remaining data is valid UTF-8.
For .html files it prepends and appends php header rewriting code which spits out headers setting the content type to UTF-8 if the BOM was present.
For .html files it also sets HTTP Last-Modified to the last modified time of the php-containing html file, which ensures that an HTTP 304 Not Modified response is given by Apache should the web browser send an If-Modified-Since ("send only if modified since X") request (which most do), thus greatly lowering bandwidth costs and indeed server load thanks to idiotic spider robots.
For .html files it also uses PHP output buffering to determine a correct Content-Length header and enables zlib compression should the source file exceed 64Kb - this adds latency for the compression and decompression, but halves or quarters the amount of data needing to be transmitted.
It passes all XHTML declaring itself as such through a validating XHTML parser and opens a list of found errors, if any, after completion. It uses an HTML5 microdata enabled XHTML DTD, so you can use HTML5 microdata just fine.
It knows when to not copy files which are unchanged, so it is fast to run just before you upload your changes.

You may find this script useful as a base for writing your own. No guarantees or support are given with this code. Enjoy!

# RemoveBOM v2.2
# Clones a directory structure but removing the BOM from UTF-8 files
# (C) 2007 Niall Douglas
# 23rd April 2007 (last modified: 10th March 2012)

import sys, os, shutil, re, time, urllib, hashlib
from lxml import etree

# Pages that skip XHTML validation even when they declare an XHTML doctype.
# Entries are compared verbatim against the walked source path, so they must
# be spelled exactly as that path is (same root directory, backslash
# separators).
dontbothervalidating=[
"public_html\\Niall_stuff\\john_templeton_online_funding_inquiry.html",
"public_html\\Niall_stuff\\vdiary\\archives\\june07.html",
"public_html\\Niall_stuff\\vdiary\\archives\\march08.html",
"public_html\\uploadspam\\uploadspam.html",
"public_html\\uploadspam\\submit.html",
"public_html\\google5fc8a1f8de4b73f0.html",
"public_html\\private\\geshi\\docs\\geshi-doc.html",
"public_html\\admin\\phpMyAdmin\\Documentation.html"
]
# Pages whose emitted Last-Modified header is taken from the modification
# time of feed/atom-cached.xml instead of from the page's own file. Matched
# verbatim against the walked source path, like the list above.
dynamicallygenerated=[
"public_html\\index.html",
"public_html\\Niall_stuff\\vdiary\\archives\\entry.html",
"public_html\\Niall_stuff\\vdiary\\archives\\index.html"
]

class DTDResolver(etree.Resolver):
    """Resolver that serves the XHTML DTDs and entity sets from local files.

    Known W3C DTD URLs and the three XHTML entity files map to files named
    relative to the working directory. Any other identifier without a '/'
    is looked up inside public_html/. Everything else is left unresolved
    (resolve() returns None).
    """

    # External identifier -> local file that stands in for it.
    _localfiles={
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd": "xhtml1-strict.dtd",
        "xhtml-lat1.ent": "xhtml-lat1.ent",
        "xhtml-symbol.ent": "xhtml-symbol.ent",
        "xhtml-special.ent": "xhtml-special.ent",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd": "xhtml1-transitional.dtd",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd": "xhtml1-frameset.dtd",
        "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd": "xhtml11-flat.dtd",
    }

    def resolve(self, url, id, context):
        """Return the local replacement for url, or None to decline.

        url is the identifier being resolved, id is accepted but not used,
        and context is handed through to resolve_filename().
        """
        localname=self._localfiles.get(url)
        if localname is None and '/' not in url:
            # Bare file names (e.g. the site's own DTD) live in public_html.
            localname="public_html/"+url
        if localname is None:
            return None
        return self.resolve_filename(localname, context)


def enumeratedir(path):
    """Describe everything found below path.

    Returns a dict mapping each full path (root joined with the entry name)
    to a tuple (kind, stat), where kind is 1 for a file and 2 for a
    directory and stat is the os.stat() result for that path. The tree is
    walked bottom-up; within each directory files are recorded before
    subdirectories.
    """
    found={}
    for root, dirs, files in os.walk(path, topdown=False):
        for kind, names in ((1, files), (2, dirs)):
            for name in names:
                full=os.path.join(root, name)
                found[full]=(kind, os.stat(full))
    return found
def replaceroot(root, path, pathroot):
    """Re-root path from the pathroot tree onto root.

    path is expected to start with pathroot. The remainder, with any
    leading path separators removed, is joined onto root. Stripping the
    separators explicitly (instead of assuming exactly one) keeps the
    result correct when pathroot was given with a trailing separator.
    If nothing remains after pathroot, root itself is returned.
    """
    separators=os.sep+(os.altsep or '')
    relative=path[len(pathroot):].lstrip(separators)
    if not relative:
        return root
    return os.path.join(root, relative)
def ensuredir(path):
    """Create directory path and any missing parents, announcing each one.

    Does nothing when path already exists. An empty path is treated as
    "nothing to create": os.path.dirname() of a single relative component
    is '', which never exists, so without this guard the recursion would
    never terminate.
    """
    if not path or os.path.exists(path): return
    ensuredir(os.path.dirname(path))
    print('Making directory %s' % path)
    os.mkdir(path)
# The three bytes of the UTF-8 byte order mark.
UTF8BOM=chr(0xef)+chr(0xbb)+chr(0xbf)

# Source and destination trees come from the command line. Unless both are
# given, both fall back to the defaults.
try:
    indir=sys.argv[1]
    outdir=sys.argv[2]
except IndexError:
    indir="public_html"
    outdir="public_html_bomfixed"
if not indir or not outdir: raise Exception("Missing input or output dirs")
print("Enumerating directories ...")
if not os.path.exists(outdir): os.mkdir(outdir)
indircontents=enumeratedir(indir)
outdircontents=enumeratedir(outdir)
# Drop FrontPage/Expression Web metadata, dot-files (except .htaccess) and
# thumbnail caches from the source listing so they are never published.
frontpagestuff=[
    path for path in indircontents
    if "_vti_" in path
    or "_private" in path
    or ("\\." in path and "\\.htaccess" not in path)
    or "Thumbs.db" in path
]
for path in frontpagestuff:
    del indircontents[path]

print("Generating sitemap.xml ...")
sitemappath=indir+"/sitemap.xml"
sitemaptmppath=sitemappath+".tmp"
# Write the candidate sitemap to a temporary file first; the with block
# closes it even if a write fails part-way.
with open(sitemaptmppath, "wt") as sitemaph:
    sitemaph.write('<?xml version="1.0" encoding="utf-8"?>\n')
    sitemaph.write('<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n')
    sitemaph.write('  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n')
    sitemaph.write('  http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"\n')
    sitemaph.write('  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    for path, st in indircontents.iteritems():
        # URL path of the entry relative to the site root, '/' separated.
        segpath=urllib.quote(path[len(indir):].replace('\\', '/'))
        # Only files are listed; the sitemap itself, FrontPage metadata,
        # private/admin areas, diary archive entries other than their
        # index.html, and dot-files are left out.
        if st[0]!=1 \
            or "/sitemap.xml" in segpath \
            or "/_vti_" in segpath \
            or "/private" in segpath \
            or "/admin" in segpath \
            or ("/Niall_stuff/vdiary/archives/" in segpath and not segpath[-10:]=="index.html") \
            or "/." in segpath: continue
        sitemaph.write('<url><loc>http://www.nedprod.com'+segpath+'</loc><lastmod>'+time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(st[1].st_mtime))+'</lastmod></url>\n')
    sitemaph.write('</urlset>\n')

# Compare against the existing sitemap so it is only replaced (and its
# modification time only changes) when the content differs. On a first run
# there is no previous sitemap; that counts as "changed" instead of failing
# to open the missing file.
if os.path.exists(sitemappath):
    with open(sitemappath, "rt") as sitemaph:
        oldsitemaphash=hashlib.md5(sitemaph.read()).digest()
else:
    oldsitemaphash=None
with open(sitemaptmppath, "rt") as sitemaph:
    newsitemaphash=hashlib.md5(sitemaph.read()).digest()
if oldsitemaphash==newsitemaphash:
    print("Site hasn't changed!")
    os.remove(sitemaptmppath)
else:
    print("Site has changed, replacing sitemap.xml!")
    # Remove the old file first: os.rename() will not overwrite an existing
    # destination on Windows.
    if oldsitemaphash is not None:
        os.remove(sitemappath)
    os.rename(sitemaptmppath, sitemappath)

print("Deleting deleted items ...")
# Remove everything in the output tree that no longer has a counterpart in
# the (filtered) source listing. Longest paths go first: a child's path is
# always longer than its parent's, so a stale directory has been emptied
# by the time os.rmdir() reaches it, whatever order the dict yields.
for path in sorted(outdircontents, key=len, reverse=True):
    ipath=replaceroot(indir, path, outdir)
    if ipath in indircontents: continue
    print('Deleting %s' % path)
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)
    elif os.path.isdir(path):
        os.rmdir(path)

def _stripprivatedetails(data):
    """Return data with everything between <privatedetails> and </privatedetails> removed.

    Both tags are kept. If either tag is missing the data is returned
    unchanged, rather than being sliced with find()'s -1 "not found" value.
    """
    opentag='<privatedetails>'
    start=data.find(opentag)
    if start==-1: return data
    end=data.find('</privatedetails>', start)
    if end==-1: return data
    return data[:start+len(opentag)]+data[end:]


def _xhtmlproblem(data):
    """Validate data as XHTML against the local microdata-enabled DTD.

    The document's own DOCTYPE is swapped for the local
    "xhtml1-strict-with-html5-microdata.dtd" one before parsing. Returns
    None when the document validates, otherwise the problem found (the
    exception raised by the parser, or a message string when the DOCTYPE
    cannot be located within the first 256 bytes).
    """
    doctype_start=data.find("<!DOCTYPE", 0, 256)
    doctype_end=-1
    if doctype_start!=-1:
        doctype_end=data.find(">", doctype_start, 256)
    if doctype_end==-1:
        return "DOCTYPE declaration not found or not terminated within the first 256 bytes"
    validate_data=data[:doctype_start]+'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict with HTML5 microdata//EN" "xhtml1-strict-with-html5-microdata.dtd">'+data[doctype_end+1:]
    try:
        parser = etree.XMLParser(dtd_validation=True)
        parser.resolvers.add( DTDResolver() )
        etree.fromstring(validate_data, parser)
    except Exception:
        return sys.exc_info()[1]
    return None


def _writephpwrapped(f2, data, isUTF8, path):
    """Write data to f2 between the PHP prologue and epilogue.

    The emitted PHP rejects requests with extra path components, sets the
    UTF-8 content type when isUTF8 is true, sets Last-Modified, buffers the
    output so it can send Content-Length, gzips files over 65535 bytes and
    reports the build time in X-Page-Build-Time. path selects the
    Last-Modified source: pages in dynamicallygenerated use the
    modification time of feed/atom-cached.xml, all others getlastmod().
    """
    f2.write('<?php \r\n')
    f2.write('$time = microtime(); $time = explode(" ", $time); $time = $time[1] + $time[0]; $startload = $time;\r\n')
    f2.write('$uri=parse_url("http://".$_SERVER["HTTP_HOST"].$_SERVER["REQUEST_URI"]);\r\n')
    f2.write('if(!$uri) { header("Status: 400 Bad Request"); exit; }\r\n')
    f2.write('$path_excess=substr($uri["path"], strlen(urlencode($_SERVER["SCRIPT_NAME"])));\r\n')
    f2.write('if(strlen($path_excess)) { header("Status: 404 Not Found"); exit; }\r\n')
    if isUTF8:
        f2.write('header("Content-type: text/html; charset=utf-8");\r\n')
    if path not in dynamicallygenerated:
        f2.write('header("Last-Modified: " . gmdate("D, d M Y H:i:s", getlastmod()) . " GMT");\r\n')
    else:
        # One "../" per directory level below the source root.
        toroot="../"*(path.count('\\')-1)
        f2.write('header("Last-Modified: " . gmdate("D, d M Y H:i:s", filemtime("'+toroot+'feed/atom-cached.xml")) . " GMT");\r\n')
    f2.write('ob_start();\r\n')
    f2.write('$gzipthis = (filesize(__FILE__)>65535);\r\n')
    f2.write('if($gzipthis) ob_start("ob_gzhandler");\r\n')
    f2.write('?>\r\n')
    f2.write(data)
    f2.write('<?php ')
    f2.write('if($gzipthis) ob_end_flush();\r\n')
    f2.write('header("Content-Length: " . ob_get_length());\r\n')
    f2.write('$time = microtime(); $time = explode(" ", $time); $time = $time[1] + $time[0]; $finish = $time; $total_time = round(($finish - $startload), 4); header("X-Page-Build-Time: " . $total_time);\r\n')
    f2.write('ob_end_flush();\r\n')
    f2.write('?>\r\n')


print("Writing changed files ...")
# CV files whose <privatedetails> content is stripped from the published copy.
privatexmlsuffixes=(
    os.sep+'xmlcv'+os.sep+'cv_v5.0.xml',
    os.sep+'xmlcv'+os.sep+'cv_v5.1.xml',
    os.sep+'xmlcv'+os.sep+'cv_v6.xml',
)
xmlerrorsh=open("xmlerrors.txt", "wt")
try:
    for path, st in indircontents.iteritems():
        if path=='public_html'+os.sep+'RemoveBOM.py': continue
        # Only files are written; directories are created on demand.
        if st[0]!=1: continue
        f2path=replaceroot(outdir, path, indir)
        if f2path in outdircontents and \
            abs(st[1].st_mtime-outdircontents[f2path][1].st_mtime)<2:
            # Unchanged: modification times agree to within two seconds.
            continue
        if path.endswith(privatexmlsuffixes):
            print('Fixing up XML at %s' % path)
            f=open(path, 'rb')
            try:
                data=f.read()
            finally:
                f.close()
            # The destination directory may not exist yet on a fresh tree.
            ensuredir(os.path.dirname(f2path))
            f2=open(f2path, 'wb')
            try:
                f2.write(_stripprivatedetails(data))
            finally:
                f2.close()
        elif path.endswith('.html') and '_vti_cnf' not in path:
            f=open(path, 'rb')
            try:
                data=f.read()
            finally:
                f.close()
            ensuredir(os.path.dirname(f2path))
            # Drop every carriage return (turns CRLF line endings into LF).
            data=data.replace('\r', '')
            if '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML' in data[:64] and path not in dontbothervalidating:
                problem=_xhtmlproblem(data)
                if problem is not None:
                    # Invalid pages are reported and NOT published.
                    xmlerrorsh.write("%s: %s\n\n" % (path, problem))
                    continue
            f2=open(f2path, 'wb')
            try:
                isUTF8=data.startswith(UTF8BOM)
                if isUTF8: # It's the UTF-8 BOM
                    data=data[3:]
                    try:
                        data.decode('utf-8')
                    except UnicodeDecodeError:
                        # Reported, but the page is still written out.
                        xmlerrorsh.write("%s: %s\n\n" % (path, sys.exc_info()[1]))
                if not data.startswith('<?php') or data.startswith('<?php // RemoveBOM this'):
                    # Get PHP to mark server side that this is UTF-8 html
                    if isUTF8:
                        print('File %s changed, removing UTF-8 BOM to %s' % (path, f2path))
                    else:
                        print('File %s changed, modifying to %s' % (path, f2path))
                    _writephpwrapped(f2, data, isUTF8, path)
                else:
                    # Already starts with its own PHP: copy through as is.
                    f2.write(data)
            finally:
                f2.close()
            # Carry over permission bits and timestamps so the unchanged
            # test above succeeds on the next run.
            shutil.copystat(path, f2path)
        else:
            print('File %s changed, copying to %s' % (path, f2path))
            ensuredir(os.path.dirname(f2path))
            shutil.copy2(path, f2path)
finally:
    xmlerrorsh.close()

if os.path.getsize("xmlerrors.txt"):
    print("\nThere were some XHTML validation errors!")
    # os.startfile() only exists on Windows.
    if hasattr(os, 'startfile'):
        os.startfile("xmlerrors.txt")
    else:
        print("See xmlerrors.txt for details.")
else:
    print("\nAll up to date!")

ned Productions – RemoveBOM v2