This page has been accessed 2,782 times since the 23rd April 2007.
| View this page in: |
English |
Any language: |
Chinese |
French |
German |
Japanese |
Portuguese |
Russian |
Spanish |
Translation to non-English languages provided by Google Language
You are connecting to the IPv4 version of this website from the IP address 38.107.179.230. You can try the IPv6-only version if you want.
|
|
In my recent move to Microsoft Expression Web, I came across a problem
which, according to Google, has been plaguing users across the internet.
If you use UTF-8 to encode your web pages in Expression Web so you can
be XHTML compliant, it inserts a hidden UTF-8 BOM marker at the start of
the file. This is very, very bad, because PHP and Perl don't
understand BOM markers (yet) and neither do web browsers which then
display three squiggly characters at the top of each page. In
particular, these three BOM bytes cause PHP to think <body> has started
and thus to refuse the use of the header() function to reset
Content-Type to utf-8, thus causing your UTF-8 pages to appear as
ISO-8859! The following Python script fixes this problem. It takes your Microsoft Expression Web directory tree and copies it to another location, testing HTML files for the UTF-8 BOM and removing it if present. It knows not to copy files which are unchanged, so it is fast to run just before you upload your changes. You might also be interested in RemoveBOM v2. No guarantees or support are given with this code. Enjoy!
# RemoveBOM
# Clones a directory structure but removing the BOM from UTF-8 files
# (C) 2007 Niall Douglas
# 23rd April 2007
import sys, os, shutil
def enumeratedir(path):
    """Index everything found beneath *path*.

    Walks the tree bottom-up with ``os.walk`` and returns a dict that maps
    the joined path of every entry to a ``(kind, stat_result)`` tuple, where
    *kind* is ``1`` for names os.walk reports as files and ``2`` for names it
    reports as directories.  *path* itself is not included.
    """
    entries = {}
    for parent, subdirs, filenames in os.walk(path, topdown=False):
        # Files are tagged 1 and handled first, then directories tagged 2.
        tagged = [(1, name) for name in filenames]
        tagged += [(2, name) for name in subdirs]
        for kind, name in tagged:
            full = os.path.join(parent, name)
            entries[full] = (kind, os.stat(full))
    return entries
def replaceroot(root, path, pathroot):
    """Return *path* re-rooted from *pathroot* onto *root*.

    The part of *path* that lies below *pathroot* is kept and joined onto
    *root*; on POSIX, ``replaceroot("out", "in/a/b.html", "in")`` gives
    ``"out/a/b.html"``.

    The relative part is computed with ``os.path.relpath`` instead of by
    subtracting string lengths.  The length arithmetic assumed exactly one
    separator between *pathroot* and the rest, so a *pathroot* written with
    a trailing separator (``"in/"``) lost the first character of the result.
    """
    return os.path.join(root, os.path.relpath(path, pathroot))
def ensuredir(path):
    """Create directory *path* and any missing parents, announcing each one.

    Does nothing when *path* already exists or is the empty string.  The
    empty-string guard is what terminates the recursion for a bare relative
    name such as ``"out"``: ``os.path.dirname("out")`` is ``""``, which never
    exists, so without the guard the function would call itself forever.
    """
    if not path or os.path.exists(path):
        return
    # Parents first, so os.mkdir() always finds the containing directory.
    ensuredir(os.path.dirname(path))
    # A path written with a trailing separator ("a/b/") has "a/b" as its
    # dirname, so the call above may already have created it.
    if os.path.exists(path):
        return
    print('Making directory %s' % path)
    os.mkdir(path)
# The three bytes of the UTF-8 byte order mark.  Written as a bytes literal so
# that the comparison with data read in 'rb' mode also holds on Python 3; on
# Python 2 this is the same str value as chr(0xef)+chr(0xbb)+chr(0xbf).
UTF8BOM = b'\xef\xbb\xbf'

# Usage: <script> <indir> <outdir>.  If either argument is missing, BOTH
# directories fall back to the defaults below.
try:
    indir = sys.argv[1]
    outdir = sys.argv[2]
except IndexError:
    indir = "public_html"
    outdir = "public_html_bomfixed"
if not indir or not outdir:
    raise Exception("Missing input or output dirs")

indircontents = enumeratedir(indir)
outdircontents = enumeratedir(outdir)

# Pass 1: delete from the output tree whatever no longer exists in the input.
# Every entry below a directory has that directory's path as a prefix, so
# reverse-sorted order visits a directory's contents before the directory
# itself and os.rmdir() is only called once the directory has been emptied.
for path in sorted(outdircontents, reverse=True):
    ipath = replaceroot(indir, path, outdir)
    if ipath not in indircontents:
        print('Deleting %s' % path)
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        elif os.path.isdir(path):
            os.rmdir(path)

# Pass 2: copy new or changed files, stripping the BOM from .html files.
for path in sorted(indircontents):
    kind, srcstat = indircontents[path]
    if kind != 1:
        # Directories are not copied as such; ensuredir() creates them when
        # a file inside them is written.
        continue
    f2path = replaceroot(outdir, path, indir)
    if f2path in outdircontents:
        dststat = outdircontents[f2path][1]
        # NOTE(review): the access time is compared as well as the
        # modification time, so on filesystems that record access times a
        # file that was merely read may be treated as changed -- confirm
        # this is intended.
        if abs(srcstat.st_atime - dststat.st_atime) < 2 and \
                abs(srcstat.st_mtime - dststat.st_mtime) < 2:
            # Unchanged
            continue
    done = False
    if path.endswith('.html'):
        with open(path, 'rb') as f:
            if f.read(3) == UTF8BOM:  # It's the UTF-8 BOM
                print('File %s changed, removing UTF-8 BOM to %s'
                      % (path, f2path))
                ensuredir(os.path.dirname(f2path))
                # The first three bytes are already consumed, so the rest
                # of the stream is the file without its BOM.
                with open(f2path, 'wb') as f2:
                    f2.write(f.read())
                done = True
        if done:
            # Carry permissions and timestamps over so the next run sees
            # the file as unchanged.
            shutil.copymode(path, f2path)
            shutil.copystat(path, f2path)
    if not done:
        print('File %s changed, copying to %s' % (path, f2path))
        ensuredir(os.path.dirname(f2path))
        shutil.copy2(path, f2path)
print("All up to date!")
blog comments powered by Disqus
|