Programemos Cosas: perl

Mostrando entradas con la etiqueta perl. Mostrar todas las entradas

miércoles, 12 de marzo de 2014

user-list.pl - programa en Perl para leer un formato determinado y crear una lista de usuarios

use strict;
use feature "switch";
# Report file: one "<file> <classes...>" line per classified input file,
# followed by the summary sections written by print_not_collected.
open my $outfh, '>:encoding(utf8)', 'user-list.txt'
    or die "Cannot open user-list.txt for writing: $!";
# name => list of files in which the name was seen without being listed in
# either the pivot or the not-pivot tables.
my %NOT_COLLECTED;
# Files in which no pivot name was found at all.
my @UNCLASSIFIED_FILE;
# Write the end-of-run summary to the report and terminate the program.
#
# Between two dashed separator lines it prints one line per uncollected name
# followed by the distinct files the name was seen in; then it prints the
# unclassified files (newline-separated), closes the report and exits with
# status 0.  It is also registered as the INT/QUIT handler, so it never returns.
sub print_not_collected {
    my $separator = "-----------------------------------------\n";
    print $outfh $separator;
    foreach my $name (keys %NOT_COLLECTED) {
        # Hash keys yield each file once; their order is unspecified.
        my %distinct_files;
        @distinct_files{ @{ $NOT_COLLECTED{$name} } } = ();
        print $outfh $name . ' ' . join(' ', keys %distinct_files) . "\n";
    }
    print $outfh $separator;
    print $outfh join "\n", @UNCLASSIFIED_FILE;
    close($outfh);
    exit(0);
}

use sigtrap 'handler' => \&print_not_collected, qw(INT QUIT);

# ---- Load the pivot tables ---------------------------------------------------
# Each user-pivot-<N>.txt file lists one name per line; the first run of digits
# in the file name is the class assigned to every name in that file.
my @tmp_PIV;    # one { name => class } hash per pivot file
for my $pivot_file (glob 'user-pivot-*.txt') {
    my ($class) = $pivot_file =~ /\d+/g;
    open my $ph, '<:encoding(utf8)', $pivot_file
        or die "Cannot open $pivot_file: $!";
    my %class_of;
    while (my $line = <$ph>) {
        chomp $line;
        next if $line eq '';    # blank lines carry no name
        $class_of{$line} = $class;
    }
    close $ph;
    push @tmp_PIV, \%class_of;
}

# A name may appear in one pivot file only: collect every name found in two
# different files, print the clashes and abort.
my %DUP;    # name => { class in the earlier file => class in the later file }
for my $i (0 .. $#tmp_PIV - 1) {
    for my $j ($i + 1 .. $#tmp_PIV) {
        for my $name (keys %{ $tmp_PIV[$i] }) {
            next unless exists $tmp_PIV[$j]{$name};
            $DUP{$name}{ $tmp_PIV[$i]{$name} } = $tmp_PIV[$j]{$name};
        }
    }
}
for my $name (keys %DUP) {
    for my $class (keys %{ $DUP{$name} }) {
        print "$name $class $DUP{$name}{$class}\n";
    }
}
die "Duplicated class entries." if keys %DUP;

# Merge the per-file tables into one name => class lookup.  A hash slice
# avoids re-copying the accumulated hash for every file.
my %PIV_EQ_CLASS;
for my $table (@tmp_PIV) {
    @PIV_EQ_CLASS{ keys %{$table} } = values %{$table};
}

# Names that are known but must not be classified.  A missing file is treated
# as an empty list (with a warning), as the script previously carried on silently.
my %NOT_PIV;
if (open my $nph, '<:encoding(utf8)', 'user-not-pivot.txt') {
    while (my $line = <$nph>) {
        chomp $line;
        $NOT_PIV{$line} = 0 if $line ne '';
    }
    close $nph;
}
else {
    warn "Cannot open user-not-pivot.txt: $!\n";
}
$NOT_PIV{''} = 0; # we don't want to consider blank pivots

# A name cannot be both a pivot and a not-pivot.
my @ambiguous = grep { exists $NOT_PIV{$_} } keys %PIV_EQ_CLASS;
die 'Ambiguity pivot not pivot: ' . join("\n", @ambiguous) if @ambiguous;
# Slurp mode: every <$fh> read below returns the whole file in one string.
local $/ = undef;
# Classify each entry produced by glob "HTML/**" by the pivot names it mentions.
while(glob "HTML/**") {
print "$_\n";
open my $fh, "<:encoding(utf8)", $_;
my $fname = $_;
my $text = <$fh>;
# NOTE(review): the next three patterns are trivial or empty as they stand (an
# empty pattern makes Perl re-use the last successfully matched one).  They look
# like they lost markup when the page was extracted - restore them from the
# original script before relying on this block.
$text =~ /\n/g;
my @tmp = $text =~ //g;
my $text2 = join '', map {; s///; $_; } @tmp;
# Turn "- " (plus any further spaces) standing before another "-" back into "-".
$text2 =~ s/- ( *)(?=-)/-$1/g;
# Classes of the pivot names found in this file (may contain repeats).
my @FILE_CLASS;
for ($text2 =~ /FriendlyName="(.*?)"/g) {
 # Known pivot: remember its class.
 push @FILE_CLASS, $PIV_EQ_CLASS{$_} if exists $PIV_EQ_CLASS{$_};
 # Neither pivot nor not-pivot: remember where the unknown name was seen.
 push @{$NOT_COLLECTED{$_}}, $fname if not exists $PIV_EQ_CLASS{$_} and not exists $NOT_PIV{$_};
}
# Report the file with its distinct classes (hash keys, so order is unspecified) ...
print $outfh $fname . ' ' . join(' ', keys %{+{map +($_ => undef), @FILE_CLASS}}) . "\n" unless @FILE_CLASS==0;
# ... or queue it for the "unclassified" section of the summary.
push @UNCLASSIFIED_FILE, $fname if @FILE_CLASS==0;
}

# Write the summary and exit (print_not_collected calls exit).
print_not_collected;

martes, 18 de febrero de 2014

Tokens V2

Tokens: Programme to discover tokens, where there are not.

Now available at https://puszcza.gnu.org.ua/projects/tokens/
This is Version 2, for version 1, go here.

Synopsis:
use TokensV2;

sub printFile;
# Table of the record types recognised in the logs.
# Each entry is [ PATTERN, HANDLER ]:
#   PATTERN - regex source for one record type; the patterns are joined with
#             newlines and handed to parse() further down.
#   HANDLER - called by printFile as HANDLER($matched_text, $filehandle); it
#             re-matches the record with capturing groups and prints
#             "(date time) sender text" to the handle.
# In every handler $s receives the optional Style attribute and is not used.
my @FORMAT = (
# Message: may have several senders; their names are joined with spaces.
['<Message Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From>(?:<User FriendlyName=".*?"/>)+</From><To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style=".*?")?>.*?</Text></Message>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Message Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?">(<From>(?:<User FriendlyName=".*?"/>)+</From>)<To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style="(.*?)")?>(.*?)</Text></Message>|;
 my $F = join ' ', $f =~ m|<User FriendlyName="(.*?)"/>|g;
 print $fh "($d $t) $F $T";
}
],
# Invitation carrying a <File> element.
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></Invitation>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
 print $fh "($d $t) $f $T";
}
],
# InvitationResponse carrying a <File> element.
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
 print $fh "($d $t) $f $T";
}
],
# Invitation carrying an <Application> element.
['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></Invitation>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
 print $fh "($d $t) $f $T";
}
],
# InvitationResponse carrying an <Application> element.
['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
 print $fh "($d $t) $f $T";
}
],
# Join record.
['<Join Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Join>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Join Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Join>|;
 print $fh "($d $t) $f $T";
}
],
# Leave record.
['<Leave Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Leave>',
sub {
 my $fh = $_[1];
 my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Leave Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Leave>|;
 print $fh "($d $t) $f $T";
}
]
);

# Write a parse tree to a filehandle, left subtree first.
#
# The arguments are one flattened tree node followed by the output handle:
#   (FH)                                 empty node - nothing is printed
#   (TEXT, FH)                           unparsed text - printed with "<"
#                                        escaped as "&lt;", plus a newline
#   (MATCH, LEFT, RIGHT, REGEX_NUM, FH)  production - LEFT subtree, then MATCH
#                                        rendered by the handler stored in
#                                        $FORMAT[REGEX_NUM][1] and a newline,
#                                        then RIGHT subtree
# LEFT and RIGHT are array references holding nodes of the same shapes.
sub printFile {
    return if @_ == 1;
    if (@_ == 2) {
        my ($text, $fh) = @_;
        # The output is an HTML document, so raw "<" must be escaped.  Work on
        # a copy: @_ aliases the caller's tree, which must stay untouched.
        $text =~ s/</&lt;/g;
        print $fh "$text\n";
        return;
    }
    # @_==5
    my ($match, $left, $right, $regex_num, $fh) = @_;
    printFile(@{$left}, $fh);
    $FORMAT[$regex_num][1]->($match, $fh);
    print $fh "\n";
    printFile(@{$right}, $fh);
    return;
}

# Flatten a parse tree back into a single string, in left-to-right order.
#
# Called with no arguments it returns nothing; with a single argument (a leaf)
# it returns that text; otherwise the arguments are
# (MATCH, LEFT, RIGHT, REGEX_NUM) and the result is LEFT . MATCH . RIGHT with
# both subtrees (array references) flattened recursively.
sub joinParseTree {
    my $arity = scalar @_;
    return unless $arity;
    return $_[0] if $arity == 1;
    # @_==4
    my ($match, $left, $right) = @_;
    return joinParseTree(@{$left}) . $match . joinParseTree(@{$right});
}

# Resume support: tokens-processed.txt lists the inputs handled by earlier
# runs.  The first whitespace-delimited word of every non-empty line becomes a
# key of %HASH, so "NAME timed out" lines mark NAME as handled too.
my %HASH;
my $fH; # H for HASH, not handle
if (open $fH, "<:encoding(utf8)", "tokens-processed.txt") {
    while (my $line = <$fH>) {
        next if $line eq "\n";
        my ($first_word) = $line =~ /([^\s]*)/;
        $HASH{$first_word} = undef;
    }
    close $fH;
}
else {
    # No file yet simply means a first run; any other failure is fatal.
    my $error = $!;
    die "Cannot read tokens-processed.txt: $error" if -e "tokens-processed.txt";
}
# Re-open for appending and mark where this run starts.
open $fH, ">>:encoding(utf8)", "tokens-processed.txt"
    or die "Cannot append to tokens-processed.txt: $!";
print $fH "----------bookmark----------\n";

# Slurp mode: every <$fh> read below returns the whole file in one string.
local $/ = undef;

# One format pattern per line, the form parse() expects.
my $regex = join "\n", map $$_[0], @FORMAT;
# Convert every entry produced by glob "TXT/**" that is not yet recorded in %HASH.
while(glob "TXT/**") {
print "$_\n";
next if exists $HASH{$_};
open my $fh, "<:encoding(utf8)", $_;
my $text = <$fh>;
# Output name: first "TXT" becomes "HTML", first ".txt" becomes ".html".
my $oName = $_;
$oName =~ s/TXT/HTML/;
$oName =~ s/\.txt/.html/;
my @parse_tree;
# Give the parser at most 5 seconds per file.
 eval {
 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
 alarm 5;
 @parse_tree = parse($text, $regex);
 alarm 0;
 };
 if ($@) {
 die unless $@ eq "alarm\n"; # propagate unexpected errors
 # timed out
 # Record the timeout in the processed list and move on to the next file.
 print $fH "$_ timed out\n";
 next;
 }
my $parse_text = joinParseTree @parse_tree;
# NOTE(review): $fhHTML is not declared with "my" here, so it is a package variable.
open $fhHTML, ">:encoding(utf8)", $oName;
print $fhHTML <<ENDDOC
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
</head>
<body>
ENDDOC
;
# Human-readable rendering of the parse tree.
printFile @parse_tree, $fhHTML;
# Insert a space after every "-" that is followed (after optional spaces) by another "-".
$text =~ s/-( *)(?=-)/- \1/g;
# NOTE(review): as written this replaces every run of up to 512 characters by a
# bare newline, discarding the text.  The replacement and the string printed
# next presumably lost markup during extraction - confirm against the original.
$text =~ s/(.{1,512})/\n/g;
print $fhHTML "

$text";
# Same treatment for the text re-assembled from the parse tree.
$parse_text =~ s/-( *)(?=-)/- \1/g;
$parse_text =~ s/(.{1,512})/\n/g;
print $fhHTML "

$parse_text
</body>
</html>";
close $fhHTML;
# Record the file as processed.
print $fH "$_\n";
}

TokensV2.pm file:

=pod
Tokens: Programme to discover tokens, where there are not.

Copyright 2013 Gabriel Czernikier

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
use strict;
#package declarations
# Forward declarations: these subs are called without parentheses below.
sub digest_single;
sub parse_programme;
sub printAll;

# State shared by the subs of this file; (re)initialised by parse and digest_multiple.
my @REGEX;              # one regex per format line, filled by parse
my @MASK;               # the same format lines with non-literal parts removed, filled by parse
my $long_tokens;        # alternation of the words of 3+ characters found in @REGEX
my @DIGEST_SINGLE;      # memo table of digest_single, indexed [eaten_left][eaten_right]
my $literal_char_count; # count of characters of the current regex outside the set .*(?:)+

# Find a match of the regex, or of a shortened version of it, in the target.
#
# Arguments: target string, regex, number of characters already trimmed from
# the left of the regex, number already trimmed from the right.
# Returns (eaten_left, eaten_right, match_start, match_end) for the chosen
# match, or an empty list when nothing acceptable was found.  Results are
# memoised in @DIGEST_SINGLE by the two "eaten" counters.
#
# NOTE(review): the "$" characters inside the patterns of the VISITING and
# VISITING_2 sections look like extraction damage (most likely escaped
# parentheses) - check them against the upstream source.
sub digest_single { # $target, $regex, $eaten_left_literal_chars, $eaten_right_literal_chars
#return unless $_[1] ne '';
#return if length $_[1] < 8; # x>(.*?)<
# Give up once (nearly) all literal characters have been trimmed away.
return unless ($_[2]+$_[3])/$literal_char_count<0.999999;
# Memoised result for this pair of counters.
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
# Restart the search at the beginning of the target.
pos($_[0]) = undef;
goto VISITING unless $_[0] =~ /(${_[1]})/g;
# Walk the capture groups $2, $3, ... until the first false one; reject the
# match when a group contains two or more of the long tokens.
my $digit = 2;
while( my $cg = eval '$'.$digit++ ) {
 my @suspicious_tokens = $cg =~ /$long_tokens/g;
 goto VISITING if @suspicious_tokens>=2;
}
# Accepted: the match ends at pos() and starts length($1) characters earlier.
my $pff = (pos $_[0]);
my $pii = $pff - (length $1) if defined $pff;
$DIGEST_SINGLE[$_[2]][$_[3]] = [$_[2], $_[3], $pii, $pff];
return @{$DIGEST_SINGLE[$_[2]][$_[3]]};

VISITING:
my $re = $_[1];
# strip off left shelter, also understood as walking rightwards across the regex .+? stopping at a hopefully serviceable string
goto VISITING_2 unless $re =~ s/(.+?)(.$\.\*\?$.|$[^.])/\2/; # .+? is also the "shelter"
# Number of characters removed on the left by the substitution above.
my $increment_eaten_literal_chars = $+[1];
$increment_eaten_literal_chars -= 5 if $1 =~ /\(\.\*\?$/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
# expansions
$re =~ s/^$\?:(?:[^)]|$[^?+*])+\)\*//;
$re =~ s/^$\?:((?:[^)]|$[^?+*])+)\)\+/\1$\?:\1$\*/;
$re =~ s/^$\?:((?:[^)]|$[^?+*])+)\)\?/\1/;
# Recurse with the left-trimmed regex.
my ($el, $er, $pi, $pf) = digest_single $_[0], $re, $_[2]+$increment_eaten_literal_chars, $_[3];

VISITING_2:
$re = $_[1];
# strip off right shelter, also understood as walking leftwards across the regex .+? stopping at a hopefully serviceable string
goto SUBDIGEST unless $re =~ s/(.*(?:.$\.\*\?$.|\)[?+*]))(.+?)$/\1/; # .+? is the "shelter"

# Number of characters removed on the right by the substitution above.
$increment_eaten_literal_chars = $+[2] - $-[2];
$increment_eaten_literal_chars -= 5 if $2 =~ /$\.\*\?$/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
# expansions
$re =~ s/$\?:(?:[^)]|$[^?+*])+\)\*$//;
$re =~ s/$\?:((?:[^)]|$[^?+*])+)\)\+$/$\?:\1$\*\1/;
$re =~ s/$\?:(?:[^)]|$[^?+*])+\)\?$//;
# Recurse with the right-trimmed regex.
my ($el2, $er2, $pi2, $pf2) = digest_single $_[0], $re, $_[2], $_[3]+$increment_eaten_literal_chars;

SUBDIGEST:
# Keep whichever recursive result trimmed fewer characters in total; the
# left-trimmed one wins ties.  An empty entry is stored when both failed.
$DIGEST_SINGLE[$_[2]][$_[3]] = [$el, $er, $pi, $pf] if (defined $pi) && ($er+$el<=$er2+$el2 || !defined $pi2);
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
$DIGEST_SINGLE[$_[2]][$_[3]] = [$el2, $er2, $pi2, $pf2] if (defined $pi2) && ($er2+$el2<$er+$el || !defined $pi);
return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
$DIGEST_SINGLE[$_[2]][$_[3]] = [];
return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
}

# Try every regex of @REGEX against the target and choose the best match.
#
# Argument: the target string (passed on by alias, not copied).
# Returns ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num) for the
# chosen regex, or an empty list when no regex produced a match.
# A match that needed less than 20% of its regex's literal characters trimmed
# is accepted immediately; otherwise the candidate with the fewest trimmed
# characters wins, the earliest regex on ties.
sub digest_multiple {
    my @candidates;
    for my $regex_num (0 .. $#REGEX) {
        my $regex = $REGEX[$regex_num];
        @DIGEST_SINGLE = ();    # the memo table is only valid for one regex
        my @literal_chars = $regex =~ /[^.*(?:)+]/g;
        $literal_char_count = scalar @literal_chars;
        # A regex without literal characters cannot be scored (the ratio
        # below would divide by zero), so it is skipped.
        next if $literal_char_count == 0;
        my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = digest_single($_[0], $regex, 0, 0);
        # No match for this regex: the remaining regexes must still be tried.
        next unless defined $pos_ini;
        return ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num)
            if ($eaten_right + $eaten_left) / $literal_char_count < 0.2;
        # Each candidate keeps its own regex number so the selection below
        # reports the regex that actually matched.
        push @candidates, [$eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num]
            if $pos_fin != 0;
    }

    my @best;
    for my $candidate (@candidates) {
        @best = @{$candidate}
            if !@best || $candidate->[0] + $candidate->[1] < $best[0] + $best[1];
    }
    return @best;
}

# Recursively split the target around the best match found by digest_multiple.
#
# Argument: the target string.
# Returns a flattened tree node:
#   ()                                      for an empty target
#   (TEXT)                                  when the target is shorter than 8
#                                           characters or nothing matched
#   (PRODUCTION, [LEFT], [RIGHT], REGEX_NUM) otherwise, where PRODUCTION is the
#                                           matched text completed on both sides
#                                           from the mask of regex REGEX_NUM and
#                                           LEFT/RIGHT are the nodes of the text
#                                           before/after the match
sub parse_programme {
    return if $_[0] eq '';
    return $_[0] if length $_[0] < 8; # <x>y</x>
    my ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num) = digest_multiple($_[0]);
    return $_[0] if not defined $pos_ini;    # nothing recognisable: keep as raw text

    # Last $eaten_right and first $eaten_left characters of the mask.
    my $mask_right = $MASK[$regex_num];
    $mask_right =~ s/.*(.{$eaten_right})/$1/;
    my $mask_left = $MASK[$regex_num];
    $mask_left =~ s/(.{$eaten_left}).*/$1/;

    # Slice by offset: unlike a /.{N}/ pattern this works across newlines and
    # has no limit on N.
    my $target_left  = substr $_[0], 0, $pos_ini;
    my $match        = substr $_[0], $pos_ini, $pos_fin - $pos_ini;
    my $target_right = substr $_[0], $pos_fin;

    return $mask_left . $match . $mask_right,
        [ parse_programme($target_left) ],
        [ parse_programme($target_right) ],
        $regex_num;
}

# Debug dump of a parse tree to the currently selected output handle.
#
# An empty argument list prints nothing, a single argument is printed as a
# non-parsed leaf, and anything longer is a production whose second and third
# arguments are array references holding the left and right subtrees.
sub printAll {
    my ($head, $left, $right) = @_;
    my $arity = scalar @_;
    return if $arity == 0;
    if ($arity == 1) {
        print "[ini-nonparse]" . $head . "[fin-nonparse]\n";
        return;
    }
    printAll(@{$left});
    print "[ini-prod]" . $head . "[fin-prod]\n";
    printAll(@{$right});
}

# Entry point: parse a target string against a list of formats.
#
# Arguments: the target text, and a string holding one format pattern per line.
# Rebuilds @REGEX, @MASK and $long_tokens from the formats, then returns
# whatever parse_programme returns for the target.
#
# NOTE(review): the "$" characters in the substitutions below look like
# extraction damage (most likely escaped parentheses) - check them against the
# upstream source.
# NOTE(review): "my $_" is rejected by perl 5.24 and later.
sub parse {
@REGEX = ();
@MASK = ();
my $target = shift;
my $regex = shift;
# One iteration per non-empty line of the format list.
while($regex =~ /^(.+)$/mg) {
 my $_ = $1;
 my $other = $_;
 # strip out non-literal chars from MASK
 $other =~ s/$\?:(.+?)$\+/\1/g;
 $other =~ s/$\?:(.+?)$\?/\1/g;
 $other =~ s/\.\*\?//g;
 push @MASK, $other;
 # surround wildcards with capturing group for REGEX
 s/(\.\*\?)/$\1$/g;
 push @REGEX, $_;
}

# Alternation of the distinct words of 3+ characters that occur in the
# regexes, each wrapped in \b word boundaries.
$long_tokens = '\b' . join '\b|\b', grep length>=3,keys %{+{ map +($_=>undef), map /\w+/g, @REGEX }};
$long_tokens = $long_tokens . '\b';

parse_programme $target;
}

1;

sábado, 25 de enero de 2014

extract-txt-bin-portions.pl: programa en Perl para encontrar cadenas UTF-8 en archivos binarios sin formato

# Reads a file that mixes UTF-8 and non-UTF-8 content.
# Output:
# one line per run, in order of appearance (natural byte order of the file):
# "bin: " first-byte-of-non-UTF-8-run last-byte-of-non-UTF-8-run
# "txt: " first-byte-of-UTF-8-run last-byte-of-UTF-8-run
# Byte positions are 1-based.

use strict;
use warnings;

die "usage: $0 FILE\n" unless @ARGV;
open my $fh, "<:raw", $ARGV[0] or die "cannot open < $ARGV[0]: $!";

# Bit patterns that can start a character: a single-byte character (0...) or
# the lead byte of a 2- to 6-byte sequence.
my $lead_byte = qr/^(?:0|110|1110|11110|111110|1111110)/;

my $i = 0;              # bytes read so far, i.e. 1-based position of $byte
my $ini_ciclo_bin = 1;  # position where the pending binary run starts

my $length = read $fh, my $byte, 1;
$i += $length;

my $ini_bin;
my $fin_bin;
my $ini_txt;
my $fin_txt;

while ($length) {
    # binary cycle: skip bytes that cannot start a character
    while ($length and unpack("B*", $byte) !~ $lead_byte) {
        $length = read $fh, $byte, 1;
        $i += $length;
    }

    # text cycle
    # initialise the text run
    $ini_txt = $i;
    $fin_txt = 0;     # 0 = no complete character read yet
    my $s_val = 1;    # true while the run is a valid character sequence
    while ($length and unpack("B*", $byte) =~ $lead_byte and $s_val) {
        # initialise one character: the position of the first 0 bit of the
        # lead byte gives the sequence length (0 means a single byte)
        my $char_len = index(unpack("B*", $byte), '0');
        $char_len = 1 if $char_len == 0;
        my $ini_sec_bytes = $i;

        # read one character: every further byte must look like 10xxxxxx
        while ($length and $i - $ini_sec_bytes + 1 < $char_len and $s_val) {
            $length = read $fh, $byte, 1;
            $i += $length;
            $s_val = 0 if unpack("B*", $byte) !~ /^10/;
        }

        if ($s_val and $i - $ini_sec_bytes + 1 == $char_len) {
            $fin_txt = $i;    # the run now ends at this complete character
            $length = read $fh, $byte, 1;
            $i += $length;
        }
    }

    # delimit the runs
    if ($fin_txt) {    # non-empty text run
        if ($ini_txt != $ini_ciclo_bin) {    # non-empty binary run before it
            $ini_bin = $ini_ciclo_bin;
            $fin_bin = $ini_txt - 1;
            print "bin: $ini_bin $fin_bin\n";
        }
        print "txt: $ini_txt $fin_txt\n";
        $ini_ciclo_bin = $fin_txt + 1;    # for the next iteration
    }    # otherwise the state is unchanged and the binary cycle is re-entered
}
# final delimitation
if ($i) {    # non-empty file
    if ($fin_txt and $fin_txt != $i) {
        # the last text run was non-empty and ($i - $fin_txt) binary bytes
        # remained at the end of the file
        $ini_bin = $fin_txt + 1;
        $fin_bin = $i;
        print "bin: $ini_bin $fin_bin\n";
    }
    elsif ($fin_txt == 0) {
        # the last text cycle was empty: everything pending is binary data
        $ini_bin = $ini_ciclo_bin;
        $fin_bin = $i;
        print "bin: $ini_bin $fin_bin\n";
    }
    # otherwise ($fin_txt == $i) the file ends exactly on a text run, which
    # was already printed inside the loop
}

close $fh;

martes, 17 de diciembre de 2013

Derreferenciamiento en Perl

Interpretación provisoria. No está basada en la teoría, sino en el caso empírico dado.
perl -w o bien use strict deberían quejarse de la mayoría de estos casos.

Código:
@ARY = ( [ qw(vilma palma vampiro) ], [ qw(zapallo zanahoria zapallito) ] );
print "@ARY\n"; # 1) Control case
print ARY[1]; # 2) A dereferencing expression needs a sigil at the very beginning. The [] subscript alone is not enough for Perl to know that we mean the array ARY.
print "\n";
print "@{ARY[1]}\n"; # 3) Inside {}: the rule of case (2) does not apply. Outside {}: improper dereferencing of a scalar with @.
print "@ARY[1]\n"; # 4) The sigil binds tighter than the array subscript [] (perldsc for Perl 5.12.5, "Caveat on precedence"), except that rule (2) takes priority. Same mechanism as case (3).
print "@{@ARY[1]}\n"; # 5) Inside {}: if there is an @ at the very beginning the expression is treated as in case (4), otherwise it is exactly case (3). Outside {}: proper dereferencing of an array with @.
print "@{@{ARY[1]}}\n"; # 6) Explicit form of case (5). It is exactly the same case.

Output:
1) ARRAY(0x989c818) ARRAY(0x98b7110)
2)
3) ARRAY(0x98b7110)
4) ARRAY(0x98b7110)
5) zapallo zanahoria zapallito
6) zapallo zanahoria zapallito

Código:
@ARY = (58, 90);
print ARY[1];   # 1) ARY: file handle (usually unopened => the output goes nowhere). [1]: anonymous arrayref holding the element 1. Compare for instance with: print STDOUT"8";
print "\n";
print @{ARY[1]};   # 2) The @{} context applied to a scalar x means the array whose only element is x.
print "\n";
print @ARY[1]; # 3) The sigil binds tighter than the array subscript [] (perldsc for Perl 5.12.5, "Caveat on precedence").
print "\n";
print ( (\@ARY[1]) eq (\$ARY[1]) );   # 4
print "\n";
print ( (\@{ARY[1]}) eq (\@ARY[1]) );   # 5
print "\n";
print ( (\@{ARY[1]}) eq (\$ARY[1]) );   # 6
print "\n";

Output:
1)
2) 90
3) 90
4) 1
5) 1
6) 1

martes, 5 de noviembre de 2013

Tokens - programa en Perl para parsear XML

Sinopsis
use Tokens;

# Usage: <script> regex.txt file.txt
# Parses the content of file.txt against the formats listed (one per line) in
# regex.txt and dumps the resulting parse tree.
die "usage: $0 regex-file target-file\n" unless @ARGV >= 2;
my ($regex_file, $target_file) = @ARGV;

open my $regex_fh, '<', $regex_file      #regex.txt
    or die "Cannot open $regex_file: $!";
open my $target_fh, '<', $target_file    #file.txt
    or die "Cannot open $target_file: $!";

# Slurp the target and put it on a single line.
my $target = do { local $/ = undef; <$target_fh> };    #the content
$target =~ s/\n/ /g;

# Slurp the format list as is.
my $regex = do { local $/ = undef; <$regex_fh> };      #the content

close $target_fh;
close $regex_fh;

printAll(parse($target, $regex));

Código (archivo: Tokens.pm):
=pod
Copyright 2013 Gabriel Czernikier

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
use strict;
#package declarations
# Forward declarations: these subs are called without parentheses below.
sub digest_single;
sub parse_programme;
sub printAll;

# Characters that delimit tokens inside a format pattern.
my @token_separator = ('<', '/', '"', '=', '>', ' ');

# State shared by the subs of this file.
my @REGEX;          # one regex per format line, filled by parse
my @MASK;           # the same format lines with non-literal parts removed, filled by parse
my $long_tokens;    # alternation of the words of 3+ characters found in @REGEX
my @DIGEST_SINGLE;  # memo table of digest_single, indexed [token_desde][token_hasta]

# Locate the first token separator in a pattern string.
#
# Argument: the pattern string.
# Returns (separator, offset), where offset is the 0-based position of the
# first character that belongs to @token_separator, or an empty list when the
# string contains no separator.
sub min_token_separator_left {
    my $regex = shift;
    return unless $regex =~ /([@token_separator])/;
    # $-[1] is the offset at which the captured separator starts.
    return ($1, $-[1]);
}

# Locate the last token separator in a pattern string.
#
# Argument: the pattern string.
# Returns (separator, offset) for the last character that belongs to
# @token_separator; when the string holds no separator the final match fails
# and its false value is what the caller receives.
sub min_token_separator_right {
    my ($pattern) = @_;
    my ($separator, $offset);
    # The greedy .* pushes the capture to the last separator; $-[1] is its offset.
    if ($pattern =~ /.*([@token_separator])/o) {
        ($separator, $offset) = ($1, $-[1]);
    }
    return ($separator, $offset) if $pattern =~ m/([@token_separator])/o;
}

# Rewrite, in place, a group construct standing at the start of the pattern
# passed as first argument (two substitutions, for the "*" and "+" forms).
# NOTE(review): the "$" characters in these patterns look like extraction
# damage (most likely escaped parentheses) - check against the upstream source.
sub expand_left {
$_[0] =~ s/^$\?:(?:[^)]|$[^+*])+\)\*//;
$_[0] =~ s/^$\?:((?:[^)]|$[^+*])+)\)\+/\1$\?:\1$\*/;
}

# Remove, in place, a leading wildcard construct from the pattern passed as
# first argument.
# NOTE(review): the "$" characters in this pattern look like extraction damage
# (most likely escaped parentheses) - check against the upstream source.
sub trim_left {
$_[0] =~ s/^$\.\*\?$//;
}

# Rewrite, in place, a group construct standing at the end of the pattern
# passed as first argument (two substitutions, for the "*" and "+" forms).
# NOTE(review): the "$" characters in these patterns look like extraction
# damage (most likely escaped parentheses) - check against the upstream source.
sub expand_right {
$_[0] =~ s/$\?:(?:[^)]|$[^+*])+\)\*$//;
$_[0] =~ s/$\?:((?:[^)]|$[^+*])+)\)\+$/$\?:\1$\*\1/;
}

# Remove, in place, a trailing wildcard construct from the pattern passed as
# first argument.
# NOTE(review): the "$" characters in this pattern look like extraction damage
# (most likely escaped parentheses) - check against the upstream source.
sub trim_right {
$_[0] =~ s/$\.\*\?$$//;
}

# Find a match of the regex, or of a version of it shortened token by token,
# in the target.
#
# Arguments: target string, regex, index of the first token still present in
# the regex, index of the last one.
# Returns (token_desde, token_hasta, match_start, match_end) for the chosen
# match, or an empty list.  Results are memoised in @DIGEST_SINGLE by the two
# token indexes.
sub digest_single {
my $target = shift;
my $regex = shift;
return unless $regex ne '';
my $token_desde = shift;
my $token_hasta = shift;
# Memoised result for this pair of token indexes.
return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
# $pos stays undefined when the regex holds no separator, i.e. cannot be shortened.
my ($T, $pos)=min_token_separator_left($regex);
return unless $target =~ /$regex/ or defined $pos;
$target =~ /($regex)/g;
# Walk the capture groups $2, $3, ... until the first false one; a group that
# contains one of the long tokens disqualifies the full match.
my $digit = 2;
while( my $cg = eval '$'.$digit++ ) {
goto VISITING if $cg =~ /$long_tokens/;
}
# Full match accepted: it ends at pos() and starts length($1) characters earlier.
my $pff = (pos $target);
my $pii = $pff - (length $1) if defined $pff;
VISITING:
return ($token_desde, $token_hasta, $pii, $pff) unless defined $pos;
# Left branch: drop everything up to and including the first separator.
my $re = $regex;
expand_left $re;
($T, $pos)=min_token_separator_left($re);
my $pos_aux = $pos+length($T);
$re =~ s/.{$pos_aux}//;
trim_left $re;
my ($td, $th, $pi, $pf) = digest_single $target, $re, $token_desde+1, $token_hasta;
# Right branch: drop everything from the last separator onwards.
$re = $regex;
expand_right $re;
($T, $pos) = min_token_separator_right($re);
$re =~ s/(.{$pos}).*/\1/;
trim_right $re;
my ($td2, $th2, $pi2, $pf2) = digest_single $target, $re, $token_desde, $token_hasta-1;
# Keep the longest of the three matches: the full regex wins ties, then the
# left branch, then the right branch.  An empty entry is stored when all failed.
$DIGEST_SINGLE[$token_desde][$token_hasta] = [$token_desde, $token_hasta, $pii, $pff] if (defined $pii) && ($pff-$pii>=$pf-$pi || !defined $pi) && ($pff-$pii>=$pf2-$pi2 || !defined $pi2);
return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
$DIGEST_SINGLE[$token_desde][$token_hasta] = [$td, $th, $pi, $pf] if (defined $pi) && ($pf-$pi>$pff-$pii || !defined $pii) && ($pf-$pi>=$pf2-$pi2 || !defined $pi2);
return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
$DIGEST_SINGLE[$token_desde][$token_hasta] = [$td2, $th2, $pi2, $pf2] if (defined $pi2) && ($pf2-$pi2>$pff-$pii || !defined $pii) && ($pf2-$pi2>$pf-$pi || !defined $pi);
return @{$DIGEST_SINGLE[$token_desde][$token_hasta]} if defined $DIGEST_SINGLE[$token_desde][$token_hasta];
$DIGEST_SINGLE[$token_desde][$token_hasta] = [];
return @{$DIGEST_SINGLE[$token_desde][$token_hasta]};
}

# Count the characters of a pattern string that belong to @token_separator.
sub token_count {
    my ($pattern) = @_;
    # Assigning the match list to () in scalar context yields the number of matches.
    my $total = () = $pattern =~ /[@token_separator]/og;
    return $total;
}

# Try every regex of @REGEX against the target and choose the best match.
#
# Argument: the target string.
# Returns ($token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num) for the
# chosen regex, or an empty list when no regex produced a usable match.
# A regex that matched with all of its tokens is accepted immediately;
# otherwise the candidate whose match starts leftmost wins, the earliest regex
# on ties.
sub digest_multiple {
    my $target = shift;
    my @candidates;
    my $regex_num = 0;
    for my $regex (@REGEX) {
        @DIGEST_SINGLE = ();    # the memo table is only valid for one regex
        my $token_total = token_count($regex);
        my ($token_desde, $token_hasta, $pos_ini, $pos_fin) = digest_single($target, $regex, 0, $token_total);
        # "defined" keeps an empty result from passing as "no token trimmed".
        return ($token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num)
            if defined $token_desde && $token_desde == 0 && $token_hasta == $token_total;
        # Each candidate keeps its own regex number so the selection below
        # reports the regex that actually matched.
        push @candidates, [$token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num]
            if defined $pos_ini;
        $regex_num++;
    }

    my @best;
    for my $candidate (@candidates) {
        my ($pos_ini, $pos_fin) = @{$candidate}[2, 3];
        next if $pos_fin == 0;
        @best = @{$candidate} if !@best || $pos_ini < $best[2];
    }
    return @best;
}

# Recursively split the target around the best match found by digest_multiple.
#
# Argument: the target string.
# Returns a flattened tree node:
#   ()                             for an empty target
#   (TEXT)                         when nothing matched
#   (PRODUCTION, [LEFT], [RIGHT])  otherwise, where PRODUCTION is the matched
#                                  text completed on both sides from the mask
#                                  of the matching regex and LEFT/RIGHT are the
#                                  nodes of the text before/after the match
sub parse_programme {
    my $target = shift;
    return if $target eq '';
    my ($token_desde, $token_hasta, $pos_ini, $pos_fin, $regex_num) = digest_multiple($target);
    if (not defined $pos_ini) {
        #my $oldfh = select;
        #select STDERR;
        #print "$ARGV[1], [ini-no-parseable]${target}[fin-no-parseable]\n";
        #select $oldfh;
        return $target;
    }

    # Mask part after the last matched token: the trailing "separator plus
    # non-separators" groups that the match did not cover.
    my $mask_right = $MASK[$regex_num];
    my $repetitions = token_count($mask_right) - $token_hasta;
    $mask_right =~ s/.*((?:[@token_separator][^@token_separator]*){$repetitions})$/$1/;
    # Mask part before the first matched token.
    my $mask_left = $MASK[$regex_num];
    $mask_left =~ s/^((?:[^@token_separator]*[@token_separator]){$token_desde}).*/$1/;

    # Slice by offset: unlike a /.{N}/ pattern this works across newlines and
    # has no limit on N.
    my $match        = substr $target, $pos_ini, $pos_fin - $pos_ini;
    my $target_left  = substr $target, 0, $pos_ini;
    my $target_right = substr $target, $pos_fin;

    my $produccion = $mask_left . $match . $mask_right;
    return $produccion, [ parse_programme($target_left) ], [ parse_programme($target_right) ];
}

# Debug dump of a parse tree to the currently selected output handle.
#
# No arguments: nothing is printed.  One argument: it is printed as a
# non-parsed leaf.  More: the first argument is a production, the second and
# third are array references holding its left and right subtrees, which are
# dumped before and after it.
sub printAll {
    my @node = @_;
    return unless @node;
    my $text = $node[0];
    if (@node == 1) {
        print "[ini-nonparse]" . $text . "[fin-nonparse]\n";
        return;
    }
    printAll(@{ $node[1] });
    print "[ini-prod]" . $text . "[fin-prod]\n";
    printAll(@{ $node[2] });
}

# Entry point: parse a target string against a list of formats.
#
# Arguments: the target text, and a string holding one format pattern per line.
# Appends one entry per format to @REGEX and @MASK, rebuilds $long_tokens and
# returns whatever parse_programme returns for the target.
#
# NOTE(review): the "$" characters in the substitutions below look like
# extraction damage (most likely escaped parentheses) - check them against the
# upstream source.
# NOTE(review): @REGEX and @MASK are not emptied first, so calling parse more
# than once accumulates entries.
# NOTE(review): "my $_" is rejected by perl 5.24 and later.
sub parse {
my $target = shift;
my $regex = shift;
# One iteration per non-empty line of the format list.
while($regex =~ /^(.+)$/mg) {
my $_ = $1;
my $other = $_;
# MASK: the format line with its non-literal parts removed.
$other =~ s/$\?:(.+?)$\+/\1/g;
$other =~ s/\.\*\?//g;
push @MASK, $other;
# REGEX: the format line with every .*? wildcard wrapped.
s/(\.\*\?)/$\1$/g;
push @REGEX, $_;
}

# Alternation of the distinct words of 3+ characters that occur in the regexes.
$long_tokens = join '|', grep length>=3,keys %{+{ map +($_=>undef), map /\w+/g, @REGEX }};

parse_programme $target;
}

1;

Input:

regex2-group.txt
<MyTag1 myAttr1=".*?" myAttr2=".*?">(?:<MyTag2 myAttr3=".*?" myAttr4=".*?">.*?</MyTag2>)+</MyTag1>

regex3-multiple.txt
<MyTag1 myAttr1=".*?" myAttr2=".*?">(?:<MyTag2 myAttr3=".*?" myAttr4=".*?">.*?</MyTag2>)+</MyTag1>
<Foo myFoo=".*?" myFoo2=".*?"><Bar1 myBar1=".*?" myBar1b=".*?" myBar1c=".*?"></Bar1><Bar2 myBar2=".*?" myBar2b=".*?"><Bar2Sub1>.*?</Bar2Sub1></Bar2></Foo>
<MyA myA=".*?"><MyAB>(?:<MyB myB=".*?"></MyB>)+</MyAB><MyAC>(?:<MyC myC=".*?"></MyC>)+</MyAC></MyA>
<TagT attrT=".*?">.*?</TagT>

file2-broken-left-broken-right-repetitions.txt
Tag2 myAttr3="myVal3Pre" myAttr4="myVal4Pre">MyText1Pre</MyTag2><MyTag2 myAttr3="myVal3" myAttr4="myVal4">MyText1</MyTag2></MyTag1><MyTag1 myAttr1="myVal1b" myAttr2="myVal2b"><MyTag2 myAttr3="myVal3b" myAttr4="myVal4b">MyText1b</MyTag2><MyTag2 myAttr3="myVal3bPos" myAttr4="myVal4bPos">MyText1bPos</MyTag2></MyTag1><MyTag1 myAttr1="myVal1c" myAttr2="myVal2c"><MyTag2 myAttr3="myVal3c" myAttr4="myVal4c">MyText1c</MyTag2><MyTag2 myAttr3="myVal3cPos" myAtt

file3-broken-left-broken-right-repetitions-multiple.txt
Tag2 myAttr3="myVal3Pre" myAttr4="myVal4Pre">MyText1Pre</MyTag2><MyTag2 myAttr3="myVal3" myAttr4="myVal4">MyText1</MyTag2></MyTag1><Foo myFoo="foo-123" myFoo2="foo-456"><Bar1 myBar1="bar-123" myBar1b="bar-456" myBar1c="bar-789"></Bar1><Bar2 myBar2="bar-135" myBar2b="bar-790"><Bar2Sub1>BarSubText</Bar2Sub1></Bar2></Foo><MyTag1 myAttr1="myVal1b" myAttr2="myVal2b"><MyTag2 myAttr3="myVal3b" myAttr4="myVal4b">MyText1b</MyTag2><MyTag2 myAttr3="myVal3bPos" myAttr4="myVal4bPos">MyText1bPos</MyTag2></MyTag1><MyA myA="MYa-123"><MyAB><MyB myB="MYb-123"></MyB><MyB myB="MYb-456"></MyB><MyB myB="MYb-123-bis"></MyB></MyAB><MyAC><MyC myC="mycR"></My M&; <MyTag1 myAttr1="myVal1c" myAttr2="myVal2c"><MyTag2 myAttr3="myVal3c" myAttr4="myVal4c">MyText1c</MyTag2><MyTag2 myAttr3="myVal3cPos" myAtt <TagT attrT="valueT">TextT-pr

miércoles, 14 de agosto de 2013

Perl Net::LDAP::SimpleServer

Adaptaciones sobre el módulo LDAP Server para Windows (Strawberry Perl)

Lista de adaptaciones (continúa más abajo):
- Relajación de condiciones de bind:
    - Cuenta principal (principal account)
    - Validación de contraseñas

Ubicación del archivo:
%Strawberry_Perl%\site\lib\net\ldap\SimpleServer\ProtocolHandler.pm

CPAN: http://search.cpan.org/~russoz/Net-LDAP-SimpleServer-0.0.17/lib/Net/LDAP/SimpleServer.pm

Código:

package Net::LDAP::SimpleServer::ProtocolHandler;

use strict;
use warnings;

# ABSTRACT: LDAP protocol handler used with Net::LDAP::SimpleServer

our $VERSION = '0.0.17';    # VERSION

use Net::LDAP::Server;
use base 'Net::LDAP::Server';
use fields qw(store root_dn root_pw allow_anon);

use Carp;
use Net::LDAP::LDIF;
use Net::LDAP::Util qw{canonical_dn};
use Net::LDAP::FilterMatch;

use Net::LDAP::Constant (
    qw/LDAP_SUCCESS LDAP_AUTH_UNKNOWN LDAP_INVALID_CREDENTIALS/,
    qw/LDAP_AUTH_METHOD_NOT_SUPPORTED/ );

use Scalar::Util qw{reftype};
use UNIVERSAL::isa;

use Data::Dumper;

# Build the result structure returned by the protocol handlers.
#
# Arguments: result code, matched DN (optional), error message (optional).
# A false DN or message is replaced by the empty string.
# Returns a hash reference with the keys matchedDN, errorMessage and resultCode.
sub _make_result {
    my ( $code, $dn, $msg ) = @_;

    my %result = (
        resultCode   => $code,
        matchedDN    => $dn  || '',
        errorMessage => $msg || '',
    );
    return \%result;
}

# Constructor.
#
# $params is a hash reference with:
#   store      - a Net::LDAP::SimpleServer::LDIFStore object (required)
#   root_dn    - DN of the root account (required, must be canonicalisable)
#   root_pw    - password of the root account; a trailing newline is removed
#   allow_anon - true to accept anonymous binds
#   input, output - passed on to the parent class constructor
# Croaks with a descriptive message when a parameter is missing or invalid.
sub new {
    my $class = shift;
    my $params = shift || croak 'Must pass parameters!';

    # Validate before $params is dereferenced, so that a wrong argument is
    # reported with the messages below rather than a generic strict-refs error.
    croak 'Parameter must be a HASHREF'
      unless defined reftype($params) and reftype($params) eq 'HASH';
    croak 'Must pass option {store}' unless exists $params->{store};
    croak 'Not a LDIFStore'
      unless $params->{store}->isa('Net::LDAP::SimpleServer::LDIFStore');

    croak 'Must pass option {root_dn}' unless exists $params->{root_dn};
    croak 'Option {root_dn} can not be empty' unless $params->{root_dn};
    croak 'Invalid root DN'
      unless my $canon_dn = canonical_dn( $params->{root_dn} );

    my $self = $class->SUPER::new( $params->{input}, $params->{output} );

    $self->{store}      = $params->{store};
    $self->{root_dn}    = $canon_dn;
    $self->{root_pw}    = $params->{root_pw};
    $self->{allow_anon} = $params->{allow_anon};
    chomp( $self->{root_pw} ) if defined $self->{root_pw};
    print STDERR "*** new ***\n";    # debug trace
    return $self;
}

# Handle an unbind: forget the store and the root credentials held by this
# handler and report success.
sub unbind {
    my ($self) = @_;

    $self->{$_} = undef for qw(store root_dn root_pw);

    return _make_result(LDAP_SUCCESS);
}

# Handle a bind request.
#
# Returns a success result for an anonymous simple bind when anonymous access
# is allowed, LDAP_AUTH_METHOD_NOT_SUPPORTED when the request is not a simple
# bind, LDAP_INVALID_CREDENTIALS when the bind DN cannot be canonicalised, and
# success otherwise.
#
# NOTE: in this adaptation the comparisons against the configured root DN and
# root password are commented out, so any well-formed DN binds successfully
# whatever password is sent.  The request is also dumped to STDERR, and STDERR
# is left as the selected, unbuffered output handle.
sub bind {    ## no critic (ProhibitBuiltinHomonyms)
    my $self    = shift;
    my $request = shift;

    # Debug tracing.
    select(STDERR);
    $| = 1;
    print STDERR '===== bind =====' . "\n";
    #print STDERR Dumper($self);
    print STDERR Dumper($request);

    my $success    = _make_result(LDAP_SUCCESS);
    my $has_simple = exists $request->{authentication}->{simple};

    # Anonymous bind: no name, simple authentication, anonymous access allowed.
    my $anonymous = !$request->{name} && $has_simple && $self->{allow_anon};
    return $success if $anonymous;

    print STDERR qq{not anonymous\n};
    # As of now, accepts only simple authentication
    return _make_result(LDAP_AUTH_METHOD_NOT_SUPPORTED) unless $has_simple;

    print STDERR qq{is simple authentication\n};
    my $binddn = canonical_dn( $request->{name} );
    return _make_result(LDAP_INVALID_CREDENTIALS) unless $binddn;

    print STDERR qq#binddn is ok ($request->{name}) => ($binddn)\n#;
    # Disabled check: bind DN against the root DN.
    #print STDERR qq#handler dn is $self->{root_dn}\n#;
    #return _make_result(LDAP_INVALID_CREDENTIALS)
    # unless uc($binddn) eq uc( $self->{root_dn} );

    print STDERR qq{binddn is good\n};
    # Only the disabled password check below would use $bindpw.
    my $bindpw = $request->{authentication}->{simple};
    chomp($bindpw);

    # Disabled check: password against the root password.
    #print STDERR qq|comparing ($bindpw) eq ($self->{root_pw})\n|;
    #return _make_result(LDAP_INVALID_CREDENTIALS)
    # unless $bindpw eq $self->{root_pw};

    return $success;
}

# Select the elements accepted by a filter.
#
# Arguments: a filter structure, which is blessed in place into
# Net::LDAP::Filter, and an array reference of elements.
# Returns a new array reference holding, in the original order, the elements
# for which the filter's match method returns true.
sub _match {
    my ( $filter_spec, $elems ) = @_;

    my $filter = bless $filter_spec, 'Net::LDAP::Filter';
    my @accepted;
    for my $elem ( @{$elems} ) {
        push @accepted, $elem if $filter->match($elem);
    }
    return \@accepted;
}

# Handle a search request.
#
# Fetches the entries of the store, keeps those accepted by the request's
# filter and returns a success result followed by the matching entries.
# The request is dumped to STDERR, and STDERR is left as the selected,
# unbuffered output handle.
sub search {
    my $self    = shift;
    my $request = shift;

    my $entries = $self->{store}->list;

    #my $basedn = $request->{baseObject};
    # Debug tracing.
    select(STDERR);
    $| = 1;
    #print STDERR '=' x 50 . "\n";
    print STDERR '===== search =====' . "\n";
    print STDERR Dumper($request);
    #print STDERR Dumper($entries);

    my $matching = _match( $request->{filter}, $entries );

    #print STDERR Dumper($matching);

    my $status = _make_result(LDAP_SUCCESS);
    return ( $status, @{$matching} );
}

1;    # Magic true value required at end of module

=pod

=encoding utf-8

=head1 NAME

Net::LDAP::SimpleServer::ProtocolHandler - LDAP protocol handler used with Net::LDAP::SimpleServer

=head1 VERSION

version 0.0.17

=head1 SYNOPSIS

    use Net::LDAP::SimpleServer::ProtocolHandler;

    my $store = Net::LDAP::SimpleServer::LDIFStore->new($datafile);
    my $handler =
      Net::LDAP::SimpleServer::ProtocolHandler->new({
          store   => $store,
          root_dn => 'cn=root',
          root_pw => 'somepassword'
      }, $socket );

=head1 DESCRIPTION

This module provides an interface between Net::LDAP::SimpleServer and the
underlying data store. Currently only L<Net::LDAP::SimpleServer::LDIFStore>
is available.

=head1 METHODS

=head2 new( OPTIONS, IOHANDLES )

Creates a new handler for the LDAP protocol, using STORE as the backend
where the directory data is stored. The rest of the IOHANDLES are the same
as in the L<Net::LDAP::Server> module.

=head2 bind( REQUEST )

Handles a bind REQUEST from the LDAP client.

=head2 unbind()

Unbinds the connection to the server.

=head2 search( REQUEST )

Performs a search in the data store.

=head1 SEE ALSO

Please see those modules/websites for more information related to this module.

=over 4

=item *

L<Net::LDAP::SimpleServer>

=back

=head1 AUTHOR

Alexei Znamensky

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2012 by Alexei Znamensky.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=head1 BUGS AND LIMITATIONS

You can make new bug reports, and view existing ones, through the
web interface at L.

=head1 DISCLAIMER OF WARRANTY

BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT
WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER
PARTIES PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND,
EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION.

IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.

=cut

__END__

Lista de adaptaciones (parte 2):
- Corrección del bug TAG 13 ASN

Net::LDAP::Server versión 0.43, solamente cambia el método "new"

# Net::LDAP::Server

# Constructor of Net::LDAP::Server, adapted to switch its handles to raw mode.
#
# Arguments: class name or object, input handle, optional output handle (the
# input handle is used for output when it is missing).
# Returns the new object.
sub new {
    my ($proto, $input, $output) = @_;
    my $class = ref($proto) || $proto;
    my $self = fields::new($class);

    #print STDERR Dumper($input);
    #print STDERR Dumper($output);

    # Raw mode on both handles is the adaptation itself.  $output is optional
    # (see the fallback below), so binmode must not be called on a missing handle.
    # NOTE(review): presumably this stops newline translation on Windows from
    # altering the binary protocol data - confirm.
    binmode($output, ':raw') if $output;
    binmode($input, ':raw') if $input;
    $self->{in} = $input;
    $self->{out} = $output || $input;
    return $self;
}

Alternativamente, se puede crear un .bat con lo siguiente:

set PERLIO=raw
C:\strawberry\perl\site\bin\ldapd.bat