# webch: Web Characters, version 0.1 (C) 23-6-2003 Daniel Clemente Laboreo
# webch: Change strange characters of web pages to ISO-Latin1 HTML entities (e.g. á to &aacute;)
# www.danielclemente.com/webch danielclemente@ozu.es
# This program is GPL; you can modify it at your own.
# You need awk and basename (from coreutils) to run the script.
# TODO:
# * Solve the DO-NOT-CHANGE-TAGS thing, maybe by using another language -C- or using tricky tricks with diff, awk and patch (I have some ideas)
# * Accept input from pipes
# * Learn about the euro symbol
# * Add a -u option for undoing changes
# Banner, usage line and known-limitation warning; printed on every run.
# Every message is quoted so that characters such as * and ? are never
# treated as glob patterns by the shell.
echo "************************************************************************"
echo "* starting Web Characters 0.1 23th June 2003 by Daniel Clemente *"
echo "************ GPL'ed ****************** www.danielclemente.com/webch ****"
echo " danielclemente@ozu.es"
echo
echo "Use: $0 files... (example: $0 *.htm). Output goes to 'converted' dir"
echo
echo 'Remember this version will also change things like:'
# NOTE(review): the two examples below are reconstructed; the markup of the
# original examples was lost. They show the known limitation: characters
# inside tag attributes (paths) are converted too.
echo ' <a href="camión.htm"> to <a href="cami&oacute;n.htm">'
echo ' <img src="señal.gif"> to <img src="se&ntilde;al.gif">'
echo 'this is unwanted! But what did you hope of a 0.1 version?'
echo "Anyway, wise webmasters don't use strange characters in paths."
echo
# The Convert function; it has to be declared here
Convert () {
  # Copy the file "$1" to converted/<basename of $1>, replacing every special
  # character listed below with the corresponding HTML entity.
  # Arguments: $1 - path of the file to convert
  # Outputs:   prints "ok" on success, an "error!" message otherwise
  # Returns:   0 on success, 1 if reading, converting or writing failed
  # The "converted" directory must already exist in the current directory.
  # The input is fed through stdin so that a file name such as "a=b.htm" is
  # not mistaken by awk for a variable assignment.
  if awk '{
    # In a gsub replacement a bare & means "the matched text";
    # "\\&" produces a literal ampersand, which starts the entity.
    # Vowels with acute accent
    gsub (/á/, "\\&aacute;");
    gsub (/é/, "\\&eacute;");
    gsub (/í/, "\\&iacute;");
    gsub (/ó/, "\\&oacute;");
    gsub (/ú/, "\\&uacute;");
    gsub (/Á/, "\\&Aacute;");
    gsub (/É/, "\\&Eacute;");
    gsub (/Í/, "\\&Iacute;");
    gsub (/Ó/, "\\&Oacute;");
    gsub (/Ú/, "\\&Uacute;");
    # Vowels with grave accent
    gsub (/à/, "\\&agrave;");
    gsub (/è/, "\\&egrave;");
    gsub (/ì/, "\\&igrave;");
    gsub (/ò/, "\\&ograve;");
    gsub (/ù/, "\\&ugrave;");
    gsub (/À/, "\\&Agrave;");
    gsub (/È/, "\\&Egrave;");
    gsub (/Ì/, "\\&Igrave;");
    gsub (/Ò/, "\\&Ograve;");
    gsub (/Ù/, "\\&Ugrave;");
    # Vowels with umlaut
    gsub (/ä/, "\\&auml;");
    gsub (/ë/, "\\&euml;");
    gsub (/ï/, "\\&iuml;");
    gsub (/ö/, "\\&ouml;");
    gsub (/ü/, "\\&uuml;");
    gsub (/Ä/, "\\&Auml;");
    gsub (/Ë/, "\\&Euml;");
    gsub (/Ï/, "\\&Iuml;");
    gsub (/Ö/, "\\&Ouml;");
    gsub (/Ü/, "\\&Uuml;");
    # Vowels with circumflex accent
    gsub (/â/, "\\&acirc;");
    gsub (/ê/, "\\&ecirc;");
    gsub (/î/, "\\&icirc;");
    gsub (/ô/, "\\&ocirc;");
    gsub (/û/, "\\&ucirc;");
    gsub (/Â/, "\\&Acirc;");
    gsub (/Ê/, "\\&Ecirc;");
    gsub (/Î/, "\\&Icirc;");
    gsub (/Ô/, "\\&Ocirc;");
    gsub (/Û/, "\\&Ucirc;");
    # C cedil used in french, catalan, ... Similar to "s"
    gsub (/ç/, "\\&ccedil;");
    gsub (/Ç/, "\\&Ccedil;");
    # "Eñe" used in spanish (pronounced as in "caNYon")
    gsub (/ñ/, "\\&ntilde;");
    gsub (/Ñ/, "\\&Ntilde;");
    # To use at the beginning of long sentences ending with ! or ?
    gsub (/¡/, "\\&iexcl;");
    gsub (/¿/, "\\&iquest;");
    # The following are not so common
    # The accent marks themselves, without vowels.
    # (These two patterns had been mangled into Ž and š, which is how
    # ISO-8859-15 displays the ISO-8859-1 bytes of the two accent marks.)
    gsub (/´/, "\\&acute;");
    # No entry for the grave accent ( ` ): according to the original note,
    # &grave; was not supported by browsers.
    gsub (/¨/, "\\&uml;");
    gsub (/\^/, "\\&circ;");
    # We use this in catalan with letter l: "l·l". Similar to "l"
    gsub (/·/, "\\&middot;");
    # As "1st" means "first", in spanish "1º"="primero" and "1ª"="primera"
    gsub (/º/, "\\&ordm;");
    gsub (/ª/, "\\&ordf;");
    # a-e ligature
    gsub (/æ/, "\\&aelig;");
    gsub (/Æ/, "\\&AElig;");
    # Very nice for quotations
    gsub (/«/, "\\&laquo;");
    gsub (/»/, "\\&raquo;");
    # Currency symbols
    gsub (/€/, "\\&euro;"); # This does not work very well, tell me if you know something about it.
    gsub (/¢/, "\\&cent;");
    gsub (/£/, "\\&pound;");
    gsub (/¥/, "\\&yen;");
    # Use this to refer to sections (text paragraphs, for example)
    gsub (/§/, "\\&sect;");
    # Logical NOT (hypotheses, etc)
    gsub (/¬/, "\\&not;");
    # Macron: indicates a longer sound for a vowel
    gsub (/¯/, "\\&macr;");
    # Add your own changes here. E-mail me the missing ones!
    # Esperanto letters, as numeric character references (Unicode code points)
    gsub (/ĉ/, "\\&#265;"); gsub (/Ĉ/, "\\&#264;");
    gsub (/ĝ/, "\\&#285;"); gsub (/Ĝ/, "\\&#284;");
    gsub (/ĥ/, "\\&#293;"); gsub (/Ĥ/, "\\&#292;");
    gsub (/ĵ/, "\\&#309;"); gsub (/Ĵ/, "\\&#308;");
    gsub (/ŝ/, "\\&#349;"); gsub (/Ŝ/, "\\&#348;");
    gsub (/ŭ/, "\\&#365;"); gsub (/Ŭ/, "\\&#364;");
    print;
  }' <"$1" >"converted/$(basename -- "$1")"
  # basename strips the path and returns the file name.
  # It's in package coreutils. Install it, it's useful.
  # The "--" keeps a file name that starts with "-" from being read as an option.
  then
    echo ok
  else
    echo "error! Conversion failed."
    return 1
  fi
}
# HERE BEGINS THE SCRIPT
# Convert every file named on the command line into the 'converted' directory.
# Arguments: the files to convert
# Outputs:   one "***> PROCESSING: <file>... <result>" line per argument
# Returns:   0 if every argument was converted, 1 otherwise
ProcessFiles () {
  local file status=0
  # -p: an already existing 'converted' directory is not an error
  if ! mkdir -p converted; then
    echo "error! Can't create the 'converted' directory." >&2
    return 1
  fi
  for file in "$@"; do
    # printf keeps the trailing space (no newline) and never globs or
    # word-splits the file name.
    printf '***> PROCESSING: %s... ' "$file"
    # Check if it's a good file
    if ! test -r "$file"; then
      echo "error! Can't read file. Skipping it."
      status=1
    elif test -d "$file"; then
      echo "directory! Recursive processing not yet implemented; use $0 \`find\`"
      status=1
    else
      # Start the processing; Convert prints the result that ends the line
      Convert "$file" || status=1
    fi
  done
  # Remove the output directory again if nothing was written to it;
  # a failure here only means that it is not empty.
  rmdir converted >/dev/null 2>&1
  return "$status"
}
ProcessFiles "$@"
# THE SCRIPT ENDS HERE. I hope you liked it.