RemoveBOM v2 webpage

This page has been accessed 3,128 times since the 21st August 2007.

 

View this page in: flag
English
Any language:
flag
Chinese
flag
French
flag
German
flag
Japanese
flag
Portuguese
flag
Russian
flag
Spanish

Translation to non-English languages provided by Google Language

You are connecting to the IPv4 version of this website from the IP address 38.107.191.95. You can try the IPv6-only version if you want.

 

 

I found another problem with Microsoft Expression Web: there is a bug that if you do not start a page with <!DOCTYPE html etc. then the formatting engine will slowly & randomly corrupt your HTML! This is a rather serious problem as I had been prefixing my UTF-8 HTML pages with <?php header("Content-type: text/html; charset=utf-8"); ?>.

The following Python script fixes this problem and is an enhancement of RemoveBOM v1. It takes your Microsoft Expression Web directory tree and copies it to another location, testing HTML files for the UTF-8 BOM and replacing that with the above PHP header-rewriting code. It knows not to copy files which are unchanged, so it is fast to run just before you upload your changes. It also rewrites the header via PHP such that HTTP Last-Modified is set to the last modified time of the PHP-containing HTML file, which ensures that an HTTP 304 Not Modified response is given by Apache should the web browser send an If-Modified-Since ("send if modified since X") request (which most do), thus greatly lowering bandwidth costs and indeed server load thanks to idiotic spider robots. Lastly, it also uses PHP output buffering to determine a correct Content-Length header and enables zlib compression should the source file exceed 64 KB - this adds latency for the compression and decompression, but halves or quarters the amount of data needing to be transmitted.

No guarantees or support are given with this code. Enjoy!

# RemoveBOM v2.0
# Clones a directory structure but removing the BOM from UTF-8 files
# (C) 2007 Niall Douglas
# 23rd April 2007 (last modified: 21st August 2007)

import sys, os, shutil

def enumeratedir(path):
    """Index everything found beneath *path*.

    Walks the tree bottom-up and returns a dict mapping each entry's joined
    path to a ``(kind, stat_result)`` tuple, where ``kind`` is 1 for names
    that os.walk reports as files and 2 for names it reports as directories.
    *path* itself is not included. A *path* that os.walk cannot list yields
    an empty dict.
    """
    entries = {}
    for parent, subdirs, filenames in os.walk(path, topdown=False):
        # Files are tagged 1 and directories 2; both are stat'ed the same way.
        for kind, names in ((1, filenames), (2, subdirs)):
            for name in names:
                full = os.path.join(parent, name)
                entries[full] = (kind, os.stat(full))
    return entries
def replaceroot(root, path, pathroot):
    """Re-home *path* from underneath *pathroot* to underneath *root*.

    The leading ``len(pathroot)`` characters of *path* are dropped, any
    path separators left at the front are stripped, and the remainder is
    joined onto *root*.

    Stripping separators explicitly (rather than assuming exactly one
    follows *pathroot*) keeps the result correct when *pathroot* was given
    with a trailing separator, e.g. ``"public_html/"``, or is the
    filesystem root.
    """
    separators = os.sep + (os.altsep or '')
    relative = path[len(pathroot):].lstrip(separators)
    return os.path.join(root, relative)
def ensuredir(path):
    """Create directory *path*, first creating any missing ancestors.

    Does nothing if *path* already exists or is empty. Prints a
    'Making directory' line for every directory it actually creates.
    """
    # An empty path is what os.path.dirname() returns for a bare relative
    # name such as "public_html_bomfixed". os.path.exists('') is False, so
    # without this guard the recursion below would never terminate.
    if not path or os.path.exists(path):
        return
    ensuredir(os.path.dirname(path))
    # With a trailing separator ("a/b/") the recursive call has just created
    # this very directory; a second mkdir would fail with "already exists".
    if os.path.exists(path):
        return
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('Making directory %s' % path)
    os.mkdir(path)
# UTF-8 byte order mark as raw bytes, so it can be compared directly against
# file contents read in binary mode.
UTF8BOM = b'\xef\xbb\xbf'

# ---------------------------------------------------------------------------
# Script body: mirror <indir> into <outdir>, rewriting changed .html files.
# Usage: script [indir outdir]
# This section is written to be valid under both Python 2.6+ and Python 3.
# ---------------------------------------------------------------------------

try:
    indir = sys.argv[1]
    outdir = sys.argv[2]
except IndexError:
    # Both directories must be given together; otherwise use the defaults.
    indir = "public_html"
    outdir = "public_html_bomfixed"
if not indir or not outdir:
    raise Exception("Missing input or output dirs")

indircontents = enumeratedir(indir)
outdircontents = enumeratedir(outdir)

# Remove anything in the output tree that no longer exists in the input tree.
# Every child path has its parent directory as a strict prefix, so walking the
# paths in reverse-sorted order always visits children before their parent.
# That guarantees a stale directory is already empty when os.rmdir reaches it.
for path in sorted(outdircontents, reverse=True):
    ipath = replaceroot(indir, path, outdir)
    if ipath not in indircontents:
        print('Deleting %s' % path)
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        elif os.path.isdir(path):
            os.rmdir(path)

# Copy (or rewrite) every input file that is new or whose mtime has changed.
for path, st in indircontents.items():
    if st[0] != 1:
        # Directories are created on demand by ensuredir() below.
        continue
    f2path = replaceroot(outdir, path, indir)
    if f2path in outdircontents and \
            abs(st[1].st_mtime - outdircontents[f2path][1].st_mtime) < 2:
        # Unchanged: modification times agree to within two seconds.
        continue

    if path.endswith('.html') and '_vti_cnf' not in path:
        # Read the whole source before touching the destination, so a failed
        # read cannot leave a truncated output file behind.
        with open(path, 'rb') as f:
            data = f.read()
        isUTF8 = data.startswith(UTF8BOM)
        if isUTF8:  # It's the UTF-8 BOM
            data = data[3:]
        ensuredir(os.path.dirname(f2path))
        with open(f2path, 'wb') as f2:
            if not data.startswith(b'<?php'):
                # Get PHP to mark server side that this is UTF-8 html
                f2.write(b'<?php ')
                if isUTF8:
                    print('File %s changed, removing UTF-8 BOM to %s' % (path, f2path))
                    f2.write(b'header("Content-type: text/html; charset=utf-8");\r\n')
                else:
                    print('File %s changed, modifying to %s' % (path, f2path))
                f2.write(b'header("Last-Modified: " . gmdate("D, d M Y H:i:s", getlastmod()) . " GMT");\r\n')
                f2.write(b'ob_start();\r\n')
                f2.write(b'$gzipthis = (filesize(__FILE__)>65535);\r\n')
                f2.write(b'if($gzipthis) ob_start("ob_gzhandler");\r\n')
                f2.write(b'?>\r\n')
                f2.write(data)
                f2.write(b'<?php ')
                f2.write(b'if($gzipthis) ob_end_flush();\r\n')
                f2.write(b'header("Content-Length: " . ob_get_length());\r\n')
                f2.write(b'ob_end_flush();\r\n')
                f2.write(b'?>\r\n')
            else:
                # Already starts with a PHP block: write it through (minus
                # any BOM) without adding the header code.
                f2.write(data)
        # Give the output the source's permissions and timestamps so the
        # mtime comparison above treats it as unchanged on the next run.
        # copystat copies the permission bits too, so no separate copymode.
        shutil.copystat(path, f2path)
    else:
        print('File %s changed, copying to %s' % (path, f2path))
        ensuredir(os.path.dirname(f2path))
        shutil.copy2(path, f2path)

print("All up to date!")

Contact the webmaster: Niall Douglas @ webmaster2<at symbol>nedprod.com (Last updated: 15 March 2009 19:01:44 -0000)