人工無脳をつくってみようとする7
http://d.hatena.ne.jp/hogelog/20061228#p1
以前書いた方向性で、やってみた。「中継点」こと「注目単語」は一つしか選択できない、そもそも注目単語の抽出が乱数まかせ、経路のコスト計算を単語2-gramだけでおこなっている、というまったくあれな感じのだけど。MeCabは分かち書きにしか使ってない。次の段階ではMeCabから品詞情報を貰って、注目単語の抽出に役立てようと思っている。最短経路の発見はダイクストラ法じゃなくて、ワーシャル・フロイド法が適当なのかも。
まあそんな感じのコード。
meztalk.pl
#!/usr/bin/perl -w use Mezassi; my $mezassi = Mezassi->new; $mezassi->read_dic("dumpdic"); while(print "# " and my $sentence = <>){ $mezassi->read_sentence($sentence); print "> ", $mezassi->gen_sentence, "\n"; } $mezassi->write_dic("dumpdic");
Mezassi.pm
# Chatbot "Mezassi" Engine # use 2 word gram model package Mezassi; use strict; use warnings; use Storable qw(nstore retrieve); use Data::Dumper; $Data::Dumper::Indent=1; use MeCab; sub new{ my ($pkg) = @_; bless { chisiki => {}, kibun => {}, dic_type => 'dump', debug => 1, }, $pkg; } BEGIN{ # 毎回MeCabオブジェクトを生成すると大変なので # コンパイル時に初期化 my $mecab = new MeCab::Tagger("-Owakati"); sub wordsplit{ my ($self, $sentence) = @_; return split(" ", $mecab->parse($sentence)); } } sub read_sentence{ my ($self, $sentence) = @_; my $chisiki = $self->{'chisiki'}; my $kibun = $self->{'kibun'}; chomp($sentence); return if($sentence eq ""); my @wordlist = $self->wordsplit($sentence); push @{$kibun->{'resentlog'}}, "@wordlist"; my $prevword = '&bos;'; foreach(@wordlist){ $_ = escape($_); ++$kibun->{'begin2end'}{$prevword}{'sum'}; ++$kibun->{'begin2end'}{$prevword}{$_}; $prevword = $_; } ++$kibun->{'begin2end'}{$prevword}{'sum'}; ++$kibun->{'begin2end'}{$prevword}{'&eos;'}; my $nextword = '&eos;'; foreach(reverse @wordlist){ $_ = escape($_); ++$kibun->{'end2begin'}{$nextword}{'sum'}; ++$kibun->{'end2begin'}{$nextword}{$_}; $nextword = $_; } ++$kibun->{'end2begin'}{$nextword}{'sum'}; ++$kibun->{'end2begin'}{$nextword}{'&bos;'}; } sub gen_sentence{ my ($self) = @_; my $chisiki = $self->{'chisiki'}; my $kibun = $self->{'kibun'}; # ここで最近の会話ログから注目単語を拾ってくる my @resentlog = @{$kibun->{'resentlog'}}; my @impwords = split ' ', $resentlog[-1]; my $impword = $impwords[rand @impwords]; print "impword: $impword\n" if $self->{'debug'}; my $sentence = $impword; my $chain; # gen_begin2end my $prevword = $impword; B2E: while(1){ if(defined $chisiki->{'begin2end'}{$prevword}){ $chain = $chisiki->{'begin2end'}{$prevword}; } elsif(defined $kibun->{'begin2end'}{$prevword}){ $chain = $kibun->{'begin2end'}{$prevword}; } else{ print"unknown word $prevword\n"; exit 1; } my @next_list = keys %$chain; my $next_sum = $chain->{'sum'}; foreach(@next_list){ next if($_ eq 'sum'); if(rand($next_sum) <= $chain->{$_}){ last B2E if($_ eq '&eos;'); $sentence .= $_; $prevword = $_; last; } $next_sum -= $chain->{$_}; } } # gen_end2begin $prevword = $impword; E2B: while(1){ if(defined $chisiki->{'end2begin'}{$prevword}){ $chain = $chisiki->{'end2begin'}{$prevword}; } elsif(defined $kibun->{'end2begin'}{$prevword}){ $chain = $kibun->{'end2begin'}{$prevword}; } else{ print"unknown word $prevword\n"; exit 1; } my @next_list = keys %$chain; my $next_sum = $chain->{'sum'}; foreach(@next_list){ next if($_ eq 'sum'); if(rand($next_sum) <= $chain->{$_}){ last E2B if($_ eq '&bos;'); $sentence = $_ . $sentence; $prevword = $_; last; } $next_sum -= $chain->{$_}; } } $chain = $chisiki->{'end2begin'}; return descape($sentence); } sub read_dic{ my ($self, $dic_name) = @_; if(-e $dic_name){ if($self->{'dic_type'} eq 'store'){ $self->{'chisiki'} = retrieve($dic_name); } elsif($self->{'dic_type'} eq 'dump'){ $self->{'chisiki'} = require $dic_name; } } } sub write_dic{ my ($self, $dic_name) = @_; my $chisiki = $self->{'chisiki'}; my $kibun = $self->{'kibun'}; while(my ($prevword, $nextword) = each(%{$kibun->{'begin2end'}})){ while(my ($surface, $count) = each(%{$nextword})){ $chisiki->{'begin2end'}{$prevword}{$surface} += $count; } } while(my ($prevword, $nextword) = each(%{$kibun->{'end2begin'}})){ while(my ($surface, $count) = each(%{$nextword})){ $chisiki->{'end2begin'}{$prevword}{$surface} += $count; } } if($self->{'dic_type'} eq 'store'){ nstore($chisiki, $dic_name); } elsif($self->{'dic_type'} eq 'dump'){ open(DIC, ">$dic_name"); print DIC Dumper($chisiki); close(DIC); } } sub escape{ my ($unit) = @_; $unit =~ s/&/&/g; return $unit; } sub descape{ my ($unit) = @_; $unit =~ s/&/&/g; return $unit; }
なんか無駄に長いなあ。Mezassi.pmを同ディレクトリに置いてmeztalk.plを実行。このmeztalk.plだとdumpdicという辞書を作るね。