#!/usr/bin/perl -w
#
# dislines: distribute in several files the lines of a tagged text.
# Version 1.1 (c) 12-7 to 26-7-2005 Daniel Clemente Laboreo
# http://www.danielclemente.com/dislines/
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
#
# Super-quick syntax guide:
# (full syntax at): http://www.danielclemente.com/dislines/syntax.en.html
#
# Simple: @tag
# Multiple: @tag1,tag2,tag3
# Blocks: open with @{tag or @tag{ and close them with @}tag @tag} or @}
# Comments: @----- or in a block. Any number of -
# Repeat last tag used: @""""""" Any number of "
# You can use a lot of @ if you want: @@@@@@@@tag
# Common lines can also have it: @@ This is common
#
#
#
#
# The idea for the program is:
#
# 1. Create a temporary file for the common lines
# 2. For every line of the file:
# 3. | If it's common (no tag), add it to the global file created in (1)
# 4. | If it has a tag or more attached, do for each tag:
# 5. | | If it's the first time we see this tag:
# 6. | | | Duplicate the global file to make a temporary file for this tag
# 7. | | Add the line to the temporary file of the tag
# 8. | Also add the line to the global file
# 9. Now turn the temporary files into the final ones and delete the global
#
#
# use v5.0; # or later
use strict;
# use warnings; # Not on <5.6.0. I'm using -w
use Getopt::Long;
use File::Copy;
use File::Basename;
#
# Customizable settings:
#
# Change to 1 to better understand the code:
my $DEBUG=0;
# How to separate the "TAG" from the "file" in "file.TAG.ext"
my $SEP_CHAR='.'; # file.TAG.ext
# my $sep_char='-'; # file-TAG.ext (eg. index-en.html)
# If you don't like ats (@), you can change the used symbol here.
# Of course, you must put the one you use in your files.
my $AT='@';
# Options from the command line
my (
$just_list,
$only_tags,
$out_base,
$just_stats,
$show_help,
$show_version,
$quiet,
);
GetOptions (
"list" => \$just_list,
"tags=s" => \$only_tags,
"out=s" => \$out_base,
"stats" => \$just_stats,
"quiet" => \$quiet,
"help" => \$show_help,
"version" => \$show_version,
)
or die "Strange parameters passed. Stopping";
&show_help() if $show_help;
&show_version() if $show_version;
my $input_file;
if ($#ARGV ne 0) {
print STDERR "I need the name of the file to process. See help (-h)\n";
exit -1;
}
$input_file = pop @ARGV;
open (INPUT, $input_file) or &system_error("Can't open $input_file");
warn "Be careful: $input_file doesn't seem a text file. Warning"
if not $quiet
and not -T $input_file
and not $input_file eq "-";
# Once opened, we just want its name, to create temporary files
$^O =~ s/^dos$/msdos/i; # dosemu said just "dos"
fileparse_set_fstype($^O); # and this requires "msdos"
$input_file = basename ($input_file);
# If not specified, use the same name as the base for the output files
$out_base ||= $input_file;
# Don't print messages if we're using STDOUT
$quiet=1 if $out_base eq "-";
my %file=(); # For each tag, it has its associated file handler.
# tag "" means common text
my %lines=(); # For stats: How many lines has each file
&do_list() if $just_list;
print "Processing $input_file\n" if not $quiet;
my $last_tag=""; # Last tag seen ( for @"" )
my $in_block=""; # Are we inside a block? If so, which tags is it for?
# Both variables contain a list of tags, like "en,eo,it"
# Open buffer for common lines
# 'autovivify' on open() is just for >=5.6.0
# So is the 3 parameters version...
open ( COMMON, "+>". &temp_file_name("") )
or &system_error("Can't create temporary file");
$file{""}=*COMMON;
while ( ) {
my ($command, $rest_of_line);
chomp; # No newlines ( \n ) at the end
if ($DEBUG) {print "$.: $_\n";}
($command, $rest_of_line)= /^${AT}+(.*?)(?: (.*))?$/o
or ($command, $rest_of_line)=("",$_); # Common line (no tag)
# Handle @en\n and others
$rest_of_line="" if not defined $rest_of_line;
# Skip comments: @---
if ($command =~ /^\-+$/) {
$lines{"-"}++ if $just_stats;
next;
}
# Handle @{ and @}
if ($command =~ /^\{(.*)$/ or $command =~ /^(.*)\{$/) {
# Opening a block
&bad_syntax("You can't nest blocks") if $in_block;
&bad_syntax("Block with no tags") if not $1;
$in_block=$1;
if ($DEBUG) {print "Entering block $in_block\n";}
next;
} elsif ($command =~ /^\}(.*)$/ or $command =~ /^(.*)\}$/) {
# Closing a block
&bad_syntax("You closed block $1, but no block was opened")
if (not $in_block);
&bad_syntax("You closed block $1, but I was expecting $in_block")
if ($1 and $1 ne $in_block);
if ($DEBUG) {print "Going out of block $in_block\n";}
$in_block="";
next;
}
# Handle blocks, and @""
if ($in_block) {
&bad_syntax("Use of tag $command not allowed inside a block")
if $command ne "";
# Translate the block @{tag to a simple @tag for each line
$command=$in_block;
} else { # Things to do when you're not into a block
$command=$last_tag if $command =~ /"+/;
}
if ($command eq "") {
# Not tagged. It may be a common line, or rest inside a block
if (not $in_block) { # Common line
# Add it to the global buffer ($file{""})
# and to each known file
foreach my $tag ( keys %file ) {
print { $file{$tag} } "$rest_of_line\n";
$lines{$tag}++ if $just_stats;
}
if($DEBUG) {print "Adding common line: $rest_of_line\n";}
next; # No tags to process
} else {
$command=$in_block;
}
}
# Handle things like @,,,,
&bad_syntax("Don't do that with the commas") if $command =~ /^,+$/;
# Iterate over the t1, t2, t3 of a @t1,t2,t3
foreach my $tag ( split /,/, $command ) {
&bad_syntax("Don't include null tag names")
if $tag eq ""; # as in the case of a @a,,b
&bad_syntax("Illegal tag name $tag (remove the '$&')")
if $tag =~ /[\/\\\@\{\}]/;
# now, tag is correct (it doesn't match /[ ,]/ nor /[\@{}]/ )
if ($tag =~ /^\-*$/) { # it's a comment
$lines{"-"}++ if $just_stats;
next;
}
next if ($only_tags and $only_tags !~ /\b$tag\b/); # Not selected
if (! $file{$tag} ) { # New tag found
# Create temporary file for this tag
local (*HANDLE);
open ( HANDLE, "+>". &temp_file_name($tag) )
or &system_error("Can't create temporary file");
$file{$tag}=*HANDLE;
# Copy all the lines from the global file to this one
seek $file{""}, 0, 0
and copy( $file{""} , $file{$tag} )
or &system_error("Problem duplicating file");
$lines{$tag}=$lines{""} if $just_stats;
if ($DEBUG) {print "New tag found: $tag\n";}
}
# Write in each tag's buffer
print { $file{$tag} } "$rest_of_line\n";
$lines{$tag}++ if $just_stats;
}
# Make @"" work
$last_tag=$command if not $in_block;
if ($DEBUG) {print ">>>>> Command: $command\tLine: $rest_of_line\n";}
}
# No more lines to process
&bad_syntax("You forgot to close block $in_block")
if $in_block;
# Just finished processing; if you wanted just the statistics, here are they
&do_stats() if $just_stats;
# Everything ok, so make the temporary files be the final ones
close INPUT;
close $file{""};
delete $file{""};
unlink &temp_file_name("")
or &system_error("Error deleting temporary file");
foreach my $tag (keys %file) {
my $temp=&temp_file_name($tag);
my $name=&out_file_name($tag);
if ($out_base ne "-") {
close $file{$tag};
move( $temp, $name )
or &system_error("Error renaming temporary file");
} else {
# Write all files -concatenated- to STDOUT
# Hope that the user had selected just one with -t
my $handler = $file{$tag};
seek $handler, 0, 0;
print while ( <$handler> );
close $file{$tag};
}
print "Wrote $name\n" if not $quiet;
}
&remove_temp() if ($out_base eq "-");
exit 0;
# And here ended the program
sub bad_syntax ($) {
my $message = shift;
print STDERR "Error at line $.: $message\n";
&remove_temp();
die "Stopping";
}
sub system_error ($) {
my $message = shift;
print STDERR "$message: $!\n";
&remove_temp();
die "Stopping";
}
sub remove_temp () {
foreach my $tag (keys %file) {
my $name = &temp_file_name($tag);
if ($DEBUG) {print "Deleting temporary file $name\n";}
# Don't stop if there are problems; we want most files removed
unlink $name or warn("Could not delete $name: $!.");
}
}
sub add_tag ($$) {
# (file_name, $tag_name)
# From a.html -> a.TAG.html
# a.b.txt -> a.b.TAG.txt
# abc -> abc.TAG
#
my $file_name=shift;
my $tag_name=shift;
my ($name, $dot_ext) = $file_name =~ /^(.*?)(\.[^\.]*)?$/;
$dot_ext="" if not $dot_ext;
# file . TAG .ext
return "${name}${SEP_CHAR}${tag_name}${dot_ext}";
}
sub temp_file_name ($) {
# Mmmm... you shouldn't use more than 1 point in your
# file names if you're using FAT16.
return "." . &add_tag($input_file, shift);
}
sub out_file_name ($) {
# $out_base eq $input_file by default (unless you specify --out)
return &add_tag($out_base, shift);
}
sub do_list () {
# We're not doing a full parse of the file,
# neither check whether it is valid.
# We just search for lines beginning with @ to extract the tags.
my %seen_tags=();
print "List of tags used in $input_file:\n" if not $quiet;
while () {
if ($DEBUG) {print "$. $_\n";}
# Take the command from the @command Rest_of_line
next unless /^${AT}+(?!\@)([^ ]+)[ \n]/o;
if ($DEBUG) {print "The command seems $1\n";}
foreach my $tag (split /[,\{\}]/, $1) {
next if $tag =~ /[\\\/\@]/; # skip the erroneous tags
next if not $tag;
# skip @"" and @--
next if $tag =~ /^\"+$/ or $tag =~ /^\-+$/;
if ($DEBUG) {print "$tag\n";}
$seen_tags{$tag}=1;
}
}
print map { "$_\n" } keys %seen_tags;
exit 0;
}
sub do_stats () {
# We have the line count of each file. We can extract some information.
my $total_input_lines = $.;
my $common_lines = $lines{""} || 0;
my $phra= "$input_file ($total_input_lines lines):";
print $phra, "\n", '-'x(length $phra), "\n";
# Get the maximum number of digits for some values
my ($longest_tag,$longest_lines)=(0,0);
foreach my $tag (keys %file) {
next if $tag eq "";
my $c;
$c=&length_in_chars($tag);
$longest_tag=$c if $c > $longest_tag;
$c=length $lines{$tag};
$longest_lines=$c if $c > $longest_lines;
}
$longest_tag= &length_in_chars(
&out_file_name( 'x' x $longest_tag )
); # Length of the full name, not just the largest tag
foreach my $tag (keys %file) {
next if $tag eq "";
my $num=$lines{$tag};
my $name=&out_file_name($tag);
my $lin="%${longest_lines}u"; # Format string for the line counts
print
sprintf(
' 'x($longest_tag-&length_in_chars($name)) .
"%s: $lin lines = " .
"$lin proper + $lin common\n",
$name, $num,
$num-$common_lines, $common_lines);
}
my $num_tags= scalar(keys %file) -1; # -1 because of the "" (not real)
$lines{'-'} ||=0;
print "Number of tags: $num_tags. Number of comments: $lines{'-'}.\n\n";
&remove_temp();
exit 0;
}
sub length_in_chars ($) {
# This is a little hack to get the length of a string
# which we don't know if it's Unicode (UTF-8) or not.
my $a = shift;
# Perl can't know if $a is Unicode or not
my $l = length $a; # Take its length as it if weren't
# This sets the "utf8" bit for $b
my $b;
eval q{
$b = pack "U0C*", unpack "C*", $a;
# We're hiding this from <5.6.0 versions,
# which will get the length as if it weren't UTF-8
};
# Try to take its length as if it were Unicode
eval q{
# If length detects some error (like malformed UTF-8 characters),
# die and don't touch $l. Otherwise, it's UTF-8, so get the real $l
use warnings FATAL => 'all';
$l=length $b;
};
return $l;
}
sub show_version () {
print <<'EOF';
dislines 1.0 < http://www.danielclemente.com/dislines/ >
Follows version 1 of the specification.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 2.
July 2005. Daniel Clemente Laboreo.
EOF
exit 0;
}
sub show_help () {
print <<'EOF';
Usage: dislines [OPTIONS] file.ext
Options:
(nothing) Process the file and create a file.TAG.ext for each tag
-l, --list Just show a list of the tags used in the file
-t, --tags=LIST Only include these tags. Ex: en,it,fr
-o, --out=FILE Use this base name for the output files (file.TAG.ext)
-s, --stats Just print some information about the files to be created
-q, --quiet Write only the necessary information to the screen
-h, --help Show this help (see the manual for the syntax)
-v, --version Print version information
You may put a dash (-) instead of file names to use STDIN or STDOUT.
EOF
exit 0;
}
# The end, really