# webch: Web Characters, version 0.1  (C) 23-6-2003 Daniel Clemente Laboreo
# webch: Change strange characters of web pages to ISO-Latin1
# www.danielclemente.com/webch   danielclemente@ozu.es
# This program is licensed under the GPL; you may modify it under its terms.

# You need awk and basename (from coreutils) to run the script.

# TODO:
# * Solve the DO-NOT-CHANGE-TAGS problem (text inside tags and comments should be left alone), maybe by using another language (C) or by combining diff, awk and patch (I have some ideas)
# * Accept input from pipes
# * Learn about the euro symbol
# * Add a -u option for undoing changes


echo "************************************************************************"
echo "* starting Web Characters 0.1        23th June 2003 by Daniel Clemente *"
echo "************ GPL'ed ****************** www.danielclemente.com/webch ****"
echo "                                              danielclemente@ozu.es"
echo
echo "Use: $0 files...  (example: $0 *.htm). Output goes to 'converted' dir"
echo
echo Remember this version will also change things like:
echo "<!-- Año -->  to  <!-- A&ntilde;o -->"
echo "<IMG SRC=pingüino.png>  to  <IMG SRC=ping&uuml;ino.png>"
echo  this is unwanted! But what did you hope of a 0.1 version?
echo "Anyway, wise webmasters don't use strange characters in paths."
echo

# The Convert function; it has to be declared here
Convert () {
	awk '{


	# Vowels with acute accent
	gsub (/á/, "\\&aacute;");
	gsub (/é/, "\\&eacute;");
	gsub (/í/, "\\&iacute;");
	gsub (/ó/, "\\&oacute;");
	gsub (/ú/, "\\&uacute;");

	gsub (/Á/, "\\&Aacute;");
	gsub (/É/, "\\&Eacute;");
	gsub (/Í/, "\\&Iacute;");
	gsub (/Ó/, "\\&Oacute;");
	gsub (/Ú/, "\\&Uacute;");

	# Vowels with grave accent
	gsub (/à/, "\\&agrave;");
	gsub (/è/, "\\&egrave;");
	gsub (/ì/, "\\&igrave;");
	gsub (/ò/, "\\&ograve;");
	gsub (/ù/, "\\&ugrave;");

	gsub (/À/, "\\&Agrave;");
	gsub (/È/, "\\&Egrave;");
	gsub (/Ì/, "\\&Igrave;");
	gsub (/Ò/, "\\&Ograve;");
	gsub (/Ù/, "\\&Ugrave;");

	# Vowels with umlaut
	gsub (/ä/, "\\&auml;");
	gsub (/ë/, "\\&euml;");
	gsub (/ï/, "\\&iuml;");
	gsub (/ö/, "\\&ouml;");
	gsub (/ü/, "\\&uuml;");

	gsub (/Ä/, "\\&Auml;");
	gsub (/Ë/, "\\&Euml;");
	gsub (/Ï/, "\\&Iuml;");
	gsub (/Ö/, "\\&Ouml;");
	gsub (/Ü/, "\\&Uuml;");

	# Vowels with circumflex accent
	gsub (/â/, "\\&acirc;");
	gsub (/ê/, "\\&ecirc;");
	gsub (/î/, "\\&icirc;");
	gsub (/ô/, "\\&ocirc;");
	gsub (/û/, "\\&ucirc;");

	gsub (/Â/, "\\&Acirc;");
	gsub (/Ê/, "\\&Ecirc;");
	gsub (/Î/, "\\&Icirc;");
	gsub (/Ô/, "\\&Ocirc;");
	gsub (/Û/, "\\&Ucirc;");

	# C cedil used in french, catalan, ... Similar to "s"
	gsub (/ç/, "\\&ccedil;");
	gsub (/Ç/, "\\&Ccedil;");
	
	# "Eñe" used in spanish (pronounced as in "caNYon")
	gsub (/ñ/, "\\&ntilde;");
	gsub (/Ñ/, "\\&Ntilde;");

	# To use at the beginning of long sentences ending with ! or ?
	gsub (/¡/, "\\&iexcl;");
	gsub (/¿/, "\\&iquest;");


	# The following are not so common
	
	# The accent marks themselves, without vowels
	gsub (/Ž/, "\\&acute;");
	# No, &grave; ( `````` ) is not supported by browsers... Why?
	gsub (/š/, "\\&uml;");
	gsub (/\^/, "\\&circ;");

	# We use this in catalan with leter l: "l·l". Similar to "l"
	gsub (/·/, "\\&middot;");

	# As "1st" means "first", in spanish "1º"="primero" and "1ª"="primera"
	gsub (/º/, "\\&ordm;");
	gsub (/ª/, "\\&ordf;");

	# a-e ligature
	gsub (/æ/, "\\&aelig;");
	gsub (/Æ/, "\\&AElig;");

	# Very nice for quotations
	gsub (/«/, "\\&laquo;");
	gsub (/»/, "\\&raquo;");


	# Currency symbols
	gsub (/€/, "\\&euro;"); # This does not work very well, tell me if you know something about it.
	gsub (/¢/, "\\&cent;");
	gsub (/£/, "\\&pound;");
	gsub (/¥/, "\\&yen;");
	
	# Use this to refer to sections (text paragrahs, for example)
	gsub (/§/, "\\&sect;");

	# Logical NOT (hypotheses, etc)
	gsub (/¬/, "\\&not;");

	# Macron: indicates a longer sound for a vowel
	gsub (/¯/, "\\&macr;");


        # Add your own changes here. E-mail me the missing ones!

        gsub (/ĉ/, "\\&#265;"); gsub (/Ĉ/, "\\&#264;");
        gsub (/ĝ/, "\\&#285;"); gsub (/Ĝ/, "\\&#284;");
        gsub (/ĥ/, "\\&#293;"); gsub (/Ĥ/, "\\&#292;");
        gsub (/ĵ/, "\\&#309;"); gsub (/Ĵ/, "\\&#308;");
        gsub (/ŝ/, "\\&#349;"); gsub (/Ŝ/, "\\&#348;");
        gsub (/ŭ/, "\\&#365;"); gsub (/Ŭ/, "\\&#364;");



 

	print;

	}' "$1" >converted/"`basename \"$1\"`"
	# basename strips the path and returns the file name.
	# It's in package coreutils. Install it, it's useful.

	echo ok
	
}


# HERE BEGINS THE SCRIPT

mkdir converted &> /dev/null
while [ $# -ne 0 ]
do
	echo -n ***\> PROCESSING: $1...\ 


	# Check if it's a good file
	
	if ! test -r "$1"
	then
		echo error! Can\'t read file. Skipping it.
		
	elif test -d "$1"
	then
		echo "directory! Recursive processing not yet implemented; use $0 \`find\`"

	else
		# Start the processing
		Convert "$1"
	fi

	
	shift


done
rmdir converted  &> /dev/null # (if it's empty)
# THE SCRIPT ENDS HERE. I hope you liked it.


