#!/usr/bin/perl -T
use strict;
use warnings;
use Encode;
use CGI;
use HTML::Template;
BEGIN {unshift @INC, "/home/jaja/lib/perl5/site_perl/5.8.8/mach/", "/home/jaja/lib/perl/"};
use MeCab;
use JSON;
# Parse the incoming CGI request. $q is used by the rest of the script.
# Direct method-call syntax is used instead of the indirect-object form
# ("new CGI"), which Perl can mis-parse when a sub named "new" is in scope.
my $q = CGI->new;

# No request parameters at all: print the usage page and stop.
if (not $q->param) {
    print $q->header(-charset => 'UTF-8');
    my $url = $q->self_url();    # interpolated under "Request URL" below
    print <<"END_HTML";
MECAPI - MeCab Web Service (MeCab API)
MECAPI - MeCab Web Service (MeCab API)
Use "MeCab", the Japanese morphological analyzer, via Web service !!!
[SENTENCE] ==> MeCab Web Service ==> [Results of Morphological Analysis in XML]
Request URL
$url
Request parameters
| Parameter | Value | Description |
| sentence | string (required) | The sentence to be analyzed (Japanse, UTF-8) |
| response |
surface, feature, pos, inflection, baseform, pronounciation
|
Controls the data returned by the operation.
surface : Surface string of words.
feature : Various information. Contains pos, inflection, baseform and pronounciation
pos : Part-Of-Speech of the word.
inflection : Type and form of inflection.
baseform : Baseform of the word.
pronounciation : Pronounciation of the word.
Default: "surface,feature"
|
| filter |
noun, uniq |
Filters the words in the result of MeCab by the operation.
noun : ignores the words whose part-of-speech is not noun.
uniq : removes duplicate words and count them.
Default: ""
|
| format |
xml, json |
Specifies the output format.
Default: "xml"
|
| callback |
string |
The name of the callback function to wrap around the JSON data.
If format=json has not been requested, this parameter is ignored.
|
Sample Request Url:
- .../mecab.cgi?sentence=%E6%B8%8B%E8%B0%B7%E5%8C%BA%E3%81%AE&response=surface&filter=noun,uniq
Response fields
| Field | Description |
| MecabResult | Contains all of the results. |
| word | Contains each individual word. |
| surface | Surface string of a word. |
| feature | Contains pos, inflection, baseform and pronounciation. |
| pos | Part-Of-Speech of the word. |
| inflection | Type and form of inflection. |
| baseform | Baseform of the word. |
| pronounciation | Pronounciation of the word. |
| count | The frequency of words in the result. (for filter=uniq) |
Sample response
Source Code
You can get SOURCE CODE FROM HERE.
original version
by Tatsuo Yamashita, since 2006.9.18.
modified by YOU.
END_HTML
    exit;
}
# mode=code: serve this script's own source as plain text and stop.
my $mode = $q->param('mode') || "";
if ($mode eq "code") {
    # Three-argument open pins the mode to read-only, so characters in $0
    # (leading '>', '|', trailing whitespace, ...) can never be interpreted
    # as an open mode. The failure message carries $! for the error log.
    open my $fh, '<', $0 or die "Cannot open $0 for reading: $!";
    my $code = join("", <$fh>);
    close $fh;
    print "Content-Type: text/plain; charset=UTF-8\n\n$code\n";
    exit;
}
# ---- Request parameters -------------------------------------------------
# $res and $filter are file-scoped on purpose: do_macab_euc() reads them.
my $sentence = $q->param('sentence');
$sentence = "" unless defined $sentence;    # keep a literal "0" (|| would drop it)
my $res = $q->param('response') || "surface,feature"; # surface, feature
my $filter = $q->param('filter') || ""; # noun, uniq
my $format = $q->param('format') || "xml"; # xml, json
my $callback = $q->param('callback') || "";

# The callback name is copied verbatim into the JavaScript response, so it
# must be restricted to a (possibly dotted) identifier; anything else would
# let a caller inject arbitrary script. Only checked for format=json, since
# the parameter is ignored otherwise.
if ($format eq "json" and $callback ne ""
    and $callback !~ /\A [A-Za-z_\$] [A-Za-z0-9_\$]*
                         (?: \. [A-Za-z_\$] [A-Za-z0-9_\$]* )* \z/x) {
    print "Status: 400 Bad Request\n",
          "Content-Type: text/plain; charset=UTF-8\n\n",
          "Invalid callback parameter\n";
    exit;
}

# ---- Analysis -----------------------------------------------------------
# The request is UTF-8; the analysis is run on EUC-JP bytes and every
# response is converted back to UTF-8 just before printing.
Encode::from_to($sentence, 'utf-8', 'euc-jp');
my $words_ref = do_macab_euc($sentence);

# ---- JSON / JSONP output ------------------------------------------------
if ($format eq "json") {
    # NOTE(review): objToJson is the JSON 1.x interface; confirm the
    # installed JSON module still exports it before upgrading the module.
    my $json = objToJson($words_ref);
    if ($callback ne "") {
        $json = qq{$callback($json);};
    }
    Encode::from_to($json, 'euc-jp', 'utf-8');
    print "Content-Type: application/x-javascript; charset=UTF-8\n\n$json\n";
    exit;
}

# ---- XML output (default) -----------------------------------------------
# The template text lives after __DATA__ at the end of this file. The read
# from the DATA handle was missing, which left the template (and therefore
# the whole XML response body) empty.
# NOTE(review): whether values are XML-escaped depends on the template
# itself, which is not visible here - confirm it escapes its variables.
my $template = join("", <DATA>);
my $t = HTML::Template->new(scalarref => \$template,
                            die_on_bad_params => 0);
$t->param(results => $words_ref);
my $out = $t->output();
Encode::from_to($out, 'euc-jp', 'utf-8');
print "Content-Type: text/xml; charset=UTF-8\n\n$out";
# do_macab_euc($str)
#
# Runs MeCab over $str and returns a reference to an array of hash
# references, one per word kept, in sentence order.
#
# Reads two file-level settings:
#   $filter - "noun" keeps only words whose feature string starts with the
#             EUC-JP bytes for "Meishi" (noun); "uniq" keeps only the first
#             occurrence of each surface and adds a "count" key holding the
#             number of occurrences.
#   $res    - selects which keys each word hash gets: surface, feature,
#             pos, inflection, baseform, pronounciation.
#
# The feature string is split on commas: fields 0-3 form "pos", 4-5 form
# "inflection", 6 is "baseform" and 7 is "pronounciation". Fields equal to
# "*" (or missing altogether) are left out of pos/inflection.
sub do_macab_euc {
    my ($str) = @_;

    # The request options do not change while we walk the nodes, so
    # evaluate them once instead of re-running the regexes per word.
    my $only_nouns = ($filter =~ /\b noun \b/x) ? 1 : 0;
    my $uniq       = ($filter =~ /\b uniq \b/x) ? 1 : 0;
    my %want;
    for my $field (qw(surface feature pos inflection baseform pronounciation)) {
        $want{$field} = ($res =~ /\b $field \b/x) ? 1 : 0;
    }

    my $tagger = MeCab::Tagger->new("");
    my $node   = $tagger->parseToNode($str);

    my @words;
    my @surfaces;    # surface of $words[$i], kept even if not in the output
    my %count;       # occurrences per surface (after the noun filter)

    # Advancing before the first use skips the node parseToNode returns.
    while ($node = $node->{next}) {
        my $surface = $node->{surface};
        next if !defined $surface or $surface eq "";

        my $feature = $node->{feature};
        next if ($only_nouns and $feature !~ /^\xcc\xbe\xbb\xec/); # Meishi (noun)

        $count{$surface}++;
        next if ($uniq and $count{$surface} >= 2);

        my @ft = split(",", $feature);
        my %hash;
        $hash{surface} = $surface if $want{surface};
        $hash{feature} = $feature if $want{feature};
        # A short feature string leaves some slice elements undefined;
        # drop them so they neither warn nor add empty fields to the join.
        $hash{pos} = join(",", grep { defined($_) && !/^\*/ } @ft[0,1,2,3])
            if $want{pos};
        $hash{inflection} = join(",", grep { defined($_) && !/^\*/ } @ft[4,5])
            if $want{inflection};
        $hash{baseform}       = $ft[6] if $want{baseform};
        $hash{pronounciation} = $ft[7] if $want{pronounciation};

        push @words,    \%hash;
        push @surfaces, $surface;
    }

    if ($uniq) {
        # Look the count up by the remembered surface: the word hash only
        # has a "surface" key when the caller asked for it in "response".
        for my $i (0 .. $#words) {
            $words[$i]{count} = $count{ $surfaces[$i] };
        }
    }

    return \@words;
}
__DATA__