#!/usr/bin/perl -w
#
# dislines: distribute in several files the lines of a tagged text.
# Version 1.2 (c) 12-7 to 30-7-2005 Daniel Clemente Laboreo
# http://www.danielclemente.com/dislines/
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
#
# Super-quick syntax guide:
# (full syntax at): http://www.danielclemente.com/dislines/syntax.en.html
#
# Simple: @tag
# Multiple: @tag1,tag2,tag3
# Blocks: open with @{tag or @tag{ and close them with @}tag @tag} or @}
# Comments: @----- or in a block. Any number of -
# Repeat last tag used: @""""""" Any number of "
# You can use a lot of @ if you want: @@@@@@@@tag
# Common lines can also have it: @@ This is common
#
#
#
#
# The idea for the program is:
#
# 1. Create a temporary file for the common lines
# 2. For every line of the file:
# 3. | If it's common (no tag), add it to the global file created in (1)
# 4. | If it has a tag or more attached, do for each tag:
# 5. | | If it's the first time we see this tag:
# 6. | | | Duplicate the global file to make a temporary file for this tag
# 7. | | Add the line to the temporary file of the tag
# 8. | Also add the line to the global file
# 9. Now turn the temporary files into the final ones and delete the global
#
#
# use v5.0; # or later
use strict;
# use warnings; # Not on <5.6.0. I'm using -w
use Getopt::Long;
use File::Copy;
use File::Basename;
#
# Customizable settings:
#
# Change to 1 to better understand the code:
my $DEBUG=0;
# How to separate the "TAG" from the "file" in "file.TAG.ext"
my $SEP_CHAR='.'; # file.TAG.ext
# my $SEP_CHAR='-'; # file-TAG.ext (eg. index-en.html)
# If you don't like ats (@), you can change the used symbol here.
# Of course, you must put the one you use in your files.
my $AT='@';
# Options from the command line
my (
$just_list,
$only_tags,
$out_base,
$just_stats,
$show_help,
$show_version,
$quiet,
);
GetOptions (
"list" => \$just_list,
"tags=s" => \$only_tags,
"out=s" => \$out_base,
"stats" => \$just_stats,
"quiet" => \$quiet,
"help" => \$show_help,
"version" => \$show_version,
)
or die "Strange parameters passed. Stopping";
&show_help() if $show_help;
&show_version() if $show_version;
my $input_file;
if ($#ARGV ne 0) {
print STDERR "I need the name of the file to process. See help (-h)\n";
exit -1;
}
$input_file = pop @ARGV;
open (INPUT, $input_file) or &system_error("Can't open $input_file");
warn "Be careful: $input_file doesn't seem a text file. Warning"
if not $quiet
and not -T $input_file
and not $input_file eq "-";
# Once opened, we just want its name, to create temporary files
$^O =~ s/^dos$/msdos/i; # dosemu said just "dos"
fileparse_set_fstype($^O); # and this requires "msdos"
$input_file = basename ($input_file);
# If not specified, use the same name as the base for the output files
$out_base ||= $input_file;
# Don't print messages if we're using STDOUT
$quiet=1 if $out_base eq "-";
my %file=(); # For each tag, it has its associated file handler.
# tag "" means common text
my %lines=(); # For stats: How many lines has each file
&do_list() if $just_list;
print "Processing $input_file\n" if not $quiet;
# This block has tools to access each temporary file
{
# Detect if we have File::Temp (implemented in 5.6.1) to create temporary
# files safely. Otherwise, use our own temporary file creation.
#
my $hasTemp=1;
eval 'use File::Temp qw(tempfile tempdir);';
$hasTemp=0 if $@;
# We create a temporary directory, which will be destroyed after exit
my $temp_dir;
eval '$temp_dir = tempdir( CLEANUP => 1 );';
die "Could not create temporary directory: $@" if ($hasTemp and $@);
if ($DEBUG) {print "Our temp directory is $temp_dir\n";}
my $counter="000"; # For our own file name generator
# For each tag, has the name of the temporary file associated
my %names = ();
# Return the filehandle of a newly created temporary file,
# associating it to the specified tag
sub create_temp_file ($) {
my $tag = shift;
my ($fh, $filename);
if ($hasTemp) { # We have File::Temp :-)
eval '($fh, $filename) =
tempfile( "disl_XXXX", DIR => "' . $temp_dir . '");';
die "Could not create temporary file: $@" if $@;
} else {
# Try to create a good temp file
# For instance: tempfile.000, tempfile.001, .002, etc.
$filename = "tempfile.$counter";
$counter++; # from "000" to "001"
# 'autovivify' on open() is just for >=5.6.0
# So is the 3 parameters version...
open ( HANDLE, "+>" . $filename )
or &system_error("Can't create temporary file");
$fh=*HANDLE;
}
$names{$tag}=$filename;
if ($DEBUG) {print ("Created temp file for tag '$tag': $filename\n");}
return $fh;
}
# Returns the name of the temporary file for the given tag
sub temp_file_name ($) {
return $names{shift};
}
# Renames the temporary file associated to the specified tag
# to the final output file (whose name is specified)
sub temp_to_final ($$) {
my $old=shift;
$old=$names{$old};
my $new=shift;
if ($DEBUG) { print "Renaming $old to $new\n";}
move( $old, $new )
or &system_error("Error renaming temporary file");
}
# Remove the temporary file for the given tag
# In fact, File::Temp already removes the files on exit, but we have
# already been called to delete some files, so let's delete them ourselves.
sub delete_temp ($) {
my $tag = shift;
if ($DEBUG) {print "Deleting temporary file for tag '$tag': $names{$tag}\n";}
# Don't stop if there are problems; we want most files removed
unlink $names{$tag} or warn("Could not delete temp. file $names{$tag}: $!.");
delete $names{$tag}; # We need it no more in the hash
}
} # End of the 'temporary files' block
# Here starts the text processing
my $last_tag=""; # Last tag seen ( for @"" )
my $in_block=""; # Are we inside a block? If so, which tags is it for?
# Both variables contain a list of tags, like "en,eo,it"
# Open buffer for common lines
$file{""} = &create_temp_file("");
while ( ) {
my ($command, $rest_of_line);
chomp; # No newlines ( \n ) at the end
if ($DEBUG) {print "$.: $_\n";}
($command, $rest_of_line)= /^${AT}+(.*?)(?: (.*))?$/o
or ($command, $rest_of_line)=("",$_); # Common line (no tag)
# Handle @en\n and others
$rest_of_line="" if not defined $rest_of_line;
# Skip comments: @---
if ($command =~ /^\-+$/) {
$lines{"-"}++ if $just_stats;
next;
}
# Handle @{ and @}
if ($command =~ /^\{(.*)$/ or $command =~ /^(.*)\{$/) {
# Opening a block
&bad_syntax("You can't nest blocks") if $in_block;
&bad_syntax("Block with no tags") if not $1;
$in_block=$1;
if ($DEBUG) {print "Entering block $in_block\n";}
next;
} elsif ($command =~ /^\}(.*)$/ or $command =~ /^(.*)\}$/) {
# Closing a block
&bad_syntax("You closed block $1, but no block was opened")
if (not $in_block);
&bad_syntax("You closed block $1, but I was expecting $in_block")
if ($1 and $1 ne $in_block);
if ($DEBUG) {print "Going out of block $in_block\n";}
$in_block="";
next;
}
# Handle blocks, and @""
if ($in_block) {
&bad_syntax("Use of tag $command not allowed inside a block")
if $command ne "";
# Translate the block @{tag to a simple @tag for each line
$command=$in_block;
} else { # Things to do when you're not into a block
$command=$last_tag if $command =~ /"+/;
}
if ($command eq "") {
# Not tagged. It may be a common line, or rest inside a block
if (not $in_block) { # Common line
# Add it to the global buffer ($file{""})
# and to each known file
foreach my $tag ( keys %file ) {
print { $file{$tag} } "$rest_of_line\n";
$lines{$tag}++ if $just_stats;
}
if($DEBUG) {print "Adding common line: $rest_of_line\n";}
next; # No tags to process
} else {
$command=$in_block;
}
}
# Handle things like @,,,,
&bad_syntax("Don't do that with the commas") if $command =~ /^,+$/;
# Iterate over the t1, t2, t3 of a @t1,t2,t3
foreach my $tag ( split /,/, $command ) {
&bad_syntax("Don't include null tag names")
if $tag eq ""; # as in the case of a @a,,b
&bad_syntax("Illegal tag name $tag (remove the '$&')")
if $tag =~ /[\/\\\@\{\}]/;
# now, tag is correct (it doesn't match /[ ,]/ nor /[\@{}]/ )
if ($tag =~ /^\-*$/) { # it's a comment
$lines{"-"}++ if $just_stats;
next;
}
next if ($only_tags and $only_tags !~ /\b$tag\b/); # Not selected
if (! $file{$tag} ) { # New tag found
# Create temporary file for this tag
local *HANDLE;
*HANDLE=&create_temp_file($tag);
$file{$tag}=*HANDLE;
# Copy all the lines from the global file to this one
seek ($file{""}, 0, 0)
and copy( $file{""} , $file{$tag} )
or &system_error("Problem duplicating file");
$lines{$tag}=$lines{""} if $just_stats;
if ($DEBUG) {print "New tag found: $tag\n";}
}
# Write in each tag's buffer
print { $file{$tag} } "$rest_of_line\n";
$lines{$tag}++ if $just_stats;
}
# Make @"" work
$last_tag=$command if not $in_block;
if ($DEBUG) {print ">>>>> Command: $command\tLine: $rest_of_line\n";}
}
# No more lines to process
&bad_syntax("You forgot to close block $in_block")
if $in_block;
# Just finished processing; if you wanted just the statistics, here are they
&do_stats() if $just_stats;
# Everything ok, so make the temporary files be the final ones
close INPUT;
close $file{""};
delete $file{""};
&delete_temp("");
foreach my $tag (keys %file) {
if ($out_base ne "-") {
my $out_name=&out_file_name($tag);
close $file{$tag};
&temp_to_final( $tag, $out_name );
print "Wrote $out_name\n" if not $quiet;
} else {
# Write all files -concatenated- to STDOUT
# Hope that the user had selected just one with -t
my $handler = $file{$tag};
seek $handler, 0, 0;
print while ( <$handler> );
close $file{$tag};
}
}
&remove_all_temp() if ($out_base eq "-");
exit 0;
# And here ended the program
sub bad_syntax ($) {
my $message = shift;
print STDERR "Error at line $.: $message\n";
&remove_all_temp();
die "Stopping";
}
sub system_error ($) {
my $message = shift;
print STDERR "$message: $!\n";
&remove_all_temp();
die "Stopping";
}
# Called in case of error, or statistics.
# Otherwise, the temp files are converted to final ones.
sub remove_all_temp () {
foreach my $tag (keys %file) {
&delete_temp($tag);
}
}
sub out_file_name ($) {
# From a.html -> a.TAG.html
# a.b.txt -> a.b.TAG.txt
# abc -> abc.TAG
#
my $tag=shift;
my ($name, $dot_ext) = $out_base =~ /^(.*?)(\.[^\.]*)?$/;
$dot_ext="" if not $dot_ext;
# file . TAG .ext
return "${name}${SEP_CHAR}${tag}${dot_ext}";
}
sub do_list () {
# We're not doing a full parse of the file,
# neither check whether it is valid.
# We just search for lines beginning with @ to extract the tags.
my %seen_tags=();
print "List of tags used in $input_file:\n" if not $quiet;
while () {
if ($DEBUG) {print "$. $_\n";}
# Take the command from the @command Rest_of_line
next unless /^${AT}+(?!\@)([^ ]+)[ \n]/o;
if ($DEBUG) {print "The command seems $1\n";}
foreach my $tag (split /[,\{\}]/, $1) {
next if $tag =~ /[\\\/\@]/; # skip the erroneous tags
next if not $tag;
# skip @"" and @--
next if $tag =~ /^\"+$/ or $tag =~ /^\-+$/;
if ($DEBUG) {print "$tag\n";}
$seen_tags{$tag}=1;
}
}
print map { "$_\n" } keys %seen_tags;
exit 0;
}
sub do_stats () {
# We have the line count of each file. We can extract some information.
my $total_input_lines = $.;
my $common_lines = $lines{""} || 0;
my $phra= "$input_file ($total_input_lines lines):";
print $phra, "\n", '-'x(length $phra), "\n";
# Get the maximum number of digits for some values
my ($longest_tag,$longest_lines)=(0,0);
foreach my $tag (keys %file) {
next if $tag eq "";
my $c;
$c=&length_in_chars($tag);
$longest_tag=$c if $c > $longest_tag;
$c=length $lines{$tag};
$longest_lines=$c if $c > $longest_lines;
}
$longest_tag= &length_in_chars(
&out_file_name( 'x' x $longest_tag )
); # Length of the full name, not just the largest tag
foreach my $tag (keys %file) {
close $file{$tag}; # Profit the occasion to close temp files
next if $tag eq ""; # Just real tags; we don't want the global buffer
my $num=$lines{$tag};
my $name=&out_file_name($tag);
my $lin="%${longest_lines}u"; # Format string for the line counts
# Of course, this can be done with "format"s, but wouldn't simplify
# the code since the records aren't fixed-width.
print
sprintf(
' 'x($longest_tag-&length_in_chars($name)) .
"%s: $lin lines = " .
"$lin proper + $lin common\n",
$name, $num,
$num-$common_lines, $common_lines);
}
my $num_tags= scalar(keys %file) -1; # -1 because of the "" (not real)
$lines{'-'} ||=0;
print "Number of tags: $num_tags. Number of comments: $lines{'-'}.\n\n";
&remove_all_temp();
exit 0;
}
sub length_in_chars ($) {
# This is a little hack to get the length of a string
# which we don't know if it's Unicode (UTF-8) or not.
my $a = shift;
# Perl can't know if $a is Unicode or not
my $l = length $a; # Take its length as it if weren't
# This sets the "utf8" bit for $b
my $b;
eval q{
$b = pack "U0C*", unpack "C*", $a;
# We're hiding this from <5.6.0 versions,
# which will get the length as if it weren't UTF-8
};
# Try to take its length as if it were Unicode
eval q{
# If length detects some error (like malformed UTF-8 characters),
# die and don't touch $l. Otherwise, it's UTF-8, so get the real $l
use warnings FATAL => 'all';
$l=length $b;
};
return $l;
}
sub show_version () {
print <<'EOF';
dislines 1.0 < http://www.danielclemente.com/dislines/ >
Follows version 1 of the specification.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 2.
July 2005. Daniel Clemente Laboreo.
EOF
exit 0;
}
sub show_help () {
print <<'EOF';
Usage: dislines [OPTIONS] file.ext
Options:
(nothing) Process the file and create a file.TAG.ext for each tag
-l, --list Just show a list of the tags used in the file
-t, --tags=LIST Only include these tags. Ex: en,it,fr
-o, --out=FILE Use this base name for the output files (file.TAG.ext)
-s, --stats Just print some information about the files to be created
-q, --quiet Write only the necessary information to the screen
-h, --help Show this help (see the manual for the syntax)
-v, --version Print version information
You may put a dash (-) instead of file names to use STDIN or STDOUT.
EOF
exit 0;
}
# The end, really