Test script using an external tokenizer with Marpa
This is a proof-of-concept script illustrating how to use the Marpa parser with an external
tokenizer. For some background and analysis of how it works, see the
main article. A downloadable text version can be found
here.
01 use strict; use warnings; 02 use Lingua::Tok::Document::TTX; 03 use Lingua::Tok; 04 use 5.10.0; 05 use Marpa::R2; 06 07 use Data::Dumper; 08 use utf8; 09 binmode (STDOUT, ":utf8"); 10 11 my $preamble = <<'EOF'; 12 :default ::= action => [name,values] 13 lexeme default = latm => 1 14 :start ::= text 15 EOF 16 17 my @tokens = ('n', 'aa', 'prep', 'unk'); 18 my $tlist = join (' | ', @tokens); 19 20 my $rules = <<"EOF"; 21 text ::= heading | salad 22 heading ::= np 23 np ::= n | adjp n | np pp 24 pp ::= prep np 25 adjp ::= aa+ 26 salad ::= lex+ 27 lex ::= np | $tlist 28 EOF 29 30 31 my %types; 32 my $tokens = ''; 33 foreach my $token (@tokens) { 34 $types{$token} = '_'.$token unless $token eq 'unk'; 35 $tokens .= "$token ::= _$token\n"; 36 $tokens .= "_$token ~ 'a'\n"; 37 } 38 39 my $grammar = Marpa::R2::Scanless::G->new({source => \($preamble . $rules . $tokens)}); 40 41 my $extra = Lingua::Lex->new (db => 'extra.lex'); 42 $extra->reload('extra_words.txt'); 43 44 my $tok = Lingua::Tok->new (Lingua::Tok::Document::TTX->load('test.ttx'), 45 [{rec => ['NUM']}, 46 $extra, 47 {lang => 'DE'}, 48 {splitter => 'y'}, 49 ]); 50 51 my $recce; 52 my @cursent; 53 sub out; 54 while (1) { 55 my $token = $tok->token(); 56 last if not defined $token; 57 if (ref $token eq 'ARRAY') { 58 next if $token->[0] eq 'S'; 59 if ($token->[0] eq 'FSB') { 60 $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } ); 61 my $input = ' ' x 100; 62 $recce->read(\$input, 0, 0); 63 @cursent = (); 64 } elsif ($token->[0] eq 'FSE') { 65 my $v = $recce->value(); 66 out ($$v, @cursent); 67 #while ($v = $recce->value()) { 68 # print " --- OR ---\n"; 69 # out ($$v, @cursent); 70 #} 71 } else { 72 my $type = $types{$token->[0]} || '_unk'; 73 $recce->lexeme_read ($type, 1, 1, scalar @cursent); 74 push @cursent, $token; 75 } 76 } else { 77 die "Hit a non-array token $token"; 78 } 79 } 80 81 sub out { 82 my $v = shift; 83 my $type = shift @$v; 84 my @phrase = (); 85 if (ref $v->[0]) { 86 push @phrase, out (shift @$v, @_) while @$v; 87 my $p = join ' ', @phrase; 88 print "$type\t--> $p\n" unless $type eq 'lex' or $type eq 'text'; 89 } else { 90 my $num = $v->[0]; 91 my $t = $_[$num]; 92 push @phrase, $t->[1]; 93 printf "%s\t%d\t%s\t%s\t%s\t%s\n", $type, $num, $t->[0], $t->[1], $t->[2] || '', $t->[3] || ''; 94 } 95 @phrase; 96 }