~ubuntu-branches/ubuntu/vivid/chasen/vivid

« back to all changes in this revision

Viewing changes to doc/manual.tex

Committer: Bazaar Package Importer
Author(s): NOKUBI Takatsugu
Date: 2004-07-12 17:04:30 UTC
mfrom: (2.1.1 warty)
Revision ID: james.westby@ubuntu.com-20040712170430-qd9g2og0261n6h8j

Tags: 2.3.3-5

http://bugs.debian.org/258568

Fixed non-ISO C++ compliant code, closes: #258568.

files added:
acconfig.h

debian/NEWS.Debian

debian/chasen.dirs

debian/darts.h

depcomp

lib/block.c

lib/dartsdic.cpp

lib/dartsdic.h

lib/literal.c

lib/literal.h

mkchadic/dumpdic.c

mkchadic/translate.c

tests

tests/Makefile.am

tests/Makefile.in

tests/cforms.cha

tests/chasenrc

tests/connect.cha

tests/ctypes.cha

tests/grammar.cha

tests/test-chasen.sh

tests/test-dic.sh

tests/test.dic

tests/test.txt

tests/test.vch

tests/wrong.dic

files removed:
INSTALL-ja

NEWS-ja

README-ja

chasen/chasmpl.c

chasen/client.c

chasen/server.c

debian/dirs

debian/docs

debian/libchasen0.postinst

doc/manual.pdf

doc/manual.tex

lib/chfile.c

lib/dic.c

lib/htobe.c

lib/htobe.h

lib/pat.c

lib/pat.h

lib/patfile.c

lib/select.c

lib/sufary.h

lib/zentohan.c

mkchadic/convary.c

mkchadic/convdic.c

mkchadic/interface.c

mkchadic/makeint.c

mkchadic/mkary.c

mkchadic/sortdic.c

mkchadic/trans.c

prolog

prolog/Makefile.am

prolog/Makefile.in

prolog/README.prolog

prolog/chasen.pl

prolog/chasen_user.pl

prolog/jinput.pl

prolog/juman.pl

prolog/utils.pl

stamp-h.in

files modified:
AUTHORS

COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

README

aclocal.m4

chasen-config.in

chasen.spec

chasen.spec.in

chasen/Makefile.am

chasen/Makefile.in

chasen/chasen.c

config.guess

config.h.in

config.sub

configure

configure.in

debian/changelog

debian/chasen.docs

debian/control

debian/rules

doc/Makefile.am

doc/Makefile.in

doc/manual-j.pdf

doc/manual-j.tex

install-sh

lib/Makefile.am

lib/Makefile.in

lib/chadic.h

lib/chalib.c

lib/chalib.h

lib/chasen.h

lib/connect.c

lib/getid.c

lib/getopt.c

lib/grammar.c

lib/init.c

lib/iotool.c

lib/jfgets.c

lib/katuyou.c

lib/lisp.c

lib/mmap.c

lib/parse.c

lib/print.c

lib/tokenizer.c

lib/tokenizer.h

ltmain.sh

missing

mkchadic/Makefile.am

mkchadic/Makefile.in

mkchadic/makemat.c

mkinstalldirs

perl/ChaSen.pm

perl/ChaSen.xs

perl/README

Show diffs side-by-side

added added

removed removed

doc/manual.tex

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%

%%% Japanese Morphological Analysis

%%% System ChaSen Manual

%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%% This file is for ASCII jLaTeX. %%%

%%% Use `j-article' for NTT jLaTeX. %%%

\documentstyle[ascmac,titlepage]{article}

%\documentstyle[11pt,epsf,titlepage]{article}

\topmargin -10mm

\textheight 240mm

\oddsidemargin -5mm

\textwidth 165mm

%\input a4

\title{\vspace{-3cm}

%{\small \sf \hfill NAIST Technical Report\\

%\vspace{-3mm}\hfill NAIST-IS-TR99009}\\

\vspace{3cm}

\bf Morphological Analysis System\\

ChaSen version 2.2.9 Manual}

\author{Yuji Matsumoto, Akira Kitauchi, Tatsuo Yamashita, Yoshitaka Hirano,\\

Hiroshi Matsuda, Kazuma Takaoka and Masayuki Asahara}

\date{February 2002

\rule{0mm}{110mm}Copyright \copyright\ 2002

Nara Institute of Science and Technology.}

\def\|{\verb|}

\def\*{\verb**}

\begin{document}

\maketitle

\thispagestyle{empty}

\vspace*{\fill}

\begin{footnotesize}

\noindent

Morphological Analysis System ChaSen Manual

\noindent

Yuji Matsumoto, Akira Kitauchi, Tatsuo Yamashita, Yoshitaka Hirano, Hiroshi Matsuda, Kazuma Takaoka and Masayuki Asahara

\noindent

\vspace{.5em}

Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions

are met:

\begin{enumerate}

\item Redistributions of source code must retain the above copyright

notice, this list of conditions and the following disclaimer.

\item Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

\item All advertising materials mentioning features or use of this software

must display the following acknowledgement:

This product includes software developed by Nara Institute of

Science and Technology.

\item The name Nara Institute of Science and Technology may not be used to

endorse or promote products derived from this software without specific

prior written permission.

\end{enumerate}

\vspace{.5em}

THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology

``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A

PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute

of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED

TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

\vspace*{1em}

\noindent

\begin{tabular}{ll}

JUMAN\\

~ ~ version 0.6 & 17 February 1992\\

~ ~ version 0.8 & 14 April 1992\\

~ ~ version 1.0 & 25 February 1993\\

~ ~ version 2.0 & 11 July 1994\\

ChaSen\\

% ~ ~ version 1.0b1 & 14 October 1996\\

% ~ ~ version 1.0b2 & 15 October 1996\\

100

% ~ ~ version 1.0b3 & 15 October 1996\\

101

% ~ ~ version 1.0b4 & 17 October 1996\\

102

% ~ ~ version 1.0b5 & 15 November 1996\\

103

% ~ ~ version 1.0b6 & 16 January 1997\\

104

% ~ ~ version 1.0b7 & 21 January 1997\\

105

~ ~ version 1.0 & 19 February 1997\\

106

~ ~ version 1.5 & 7 July 1997\\

107

~ ~ version 1.51 & 29 July 1997\\

108

%~ ~ version 2.0b6 & 20 April 1999\\

109

~ ~ version 2.0 & 15 December 1999\\

110

%~ ~ version 2.01 & 20 December 1999\\

111

%~ ~ version 2.02 & 29 December 1999\\

112

~ ~ version 2.2.0 & 06 December 2000\\

113

~ ~ version 2.2.1 & 20 December 2000\\

114

~ ~ version 2.2.2 & 22 February 2001\\

115

~ ~ version 2.2.3 & 24 February 2001\\

116

~ ~ version 2.2.4 & 18 March 2001\\

117

~ ~ version 2.2.5 & 24 March 2001\\

118

~ ~ version 2.2.6 & 17 April 2001\\

119

~ ~ version 2.2.7 & 22 June 2001\\

120

~ ~ version 2.2.8 & 23 July 2001\\

121

~ ~ version 2.2.9 & 08 February 2002\\

122

ChaSen for Windows\\

123

~ ~ version 1.0 & 29 March 1997\\

124

~ ~ version 2.0 & 15 December 1999\\

125

NAIST Technical Report (NAIST-IS-TR99009)\\

126

~ ~ 1st edition & 20 April 1999\\

127

~ ~ 2nd edition & 15 December 1999\\

128

\end{tabular}

129

\end{footnotesize}

130

131

\clearpage

132

133

\pagenumbering{roman}

134

\tableofcontents

135

136

\clearpage

137

\pagenumbering{arabic}

138

%----------------------------------------%

139

% body %

140

%----------------------------------------%

141

142

%----------------------------------------%

143

\section{Introduction}

144

145

ChaSen is a morphological analysis system which basically has

146

the following facilities and features.

147

148

\begin{itemize}

149

\item It segments Japanese text (sentences) string into morphemes

150

and tags those morphemes with their parts of speech and pronunciations.

151

It also tokenizes conjugative morphemes,

152

i.e., it tags the conjugative morphemes with

153

their base forms and conjugation types/forms.

154

155

\item In its grammar and dictionaries,

156

morphemes as well as connectivity of two morphemes

157

/ parts of speech

158

are defined, where some costs are assigned to their definition.

159

160

\item In its morphological analysis process,

161

ChaSen sums up those costs of morphemes

162

and their connectivities,

163

then outputs results with the minimum cost.

164

165

\item Basically, connectivity of two morphemes / parts of speech is defined

166

in the form of their bi-grams.

167

In the case of the current dictionary (ipadic1.0),

168

connectivity of two morphemes / parts of speech and its costs

169

are automatically extracted from a parts-of-speech tagged Japanese newspaper

170

article corpus.

171

In order to tune its costs,

172

part of speech bi-gram Markov model is employed,

173

and the probability parameters of maximum likelihood estimate (MLE) model

174

are transformed into its connectivity costs.

175

Similarly, costs of morphemes are also obtained from the MLE model.

176

177

\end{itemize}

178

179

180

%----------------------------------------%

181

\section{Grammar and Dictionaries}

182

\label{sec:grdic}

183

184

\begin{table}[htbp]

185

\begin{center}

186

% \leavevmode

187

\begin{tabular}{|l||l|l|}

188

\hline

189

& \multicolumn{1}{|c|}{Morpheme Files}

190

& \multicolumn{1}{|c|}{Grammar Files} \\

191

\hline

192

\hline

193

Definition Files & Morpheme Definition Files & Grammar Definition Files \\

194

& \hspace{5mm}Morpheme Dictionaries

195

& \hspace{5mm}Parts of speech File \\

196

& & \hspace{5mm}Conjugation Types File \\

197

& & \hspace{5mm}Conjugation Forms File \\

198

& & \hspace{5mm}Connectivity Rules File \\

199

\hline

200

System Files & System Dictionaries & Connectivity Table \\

201

& Index Files & Connectivity Matrix \\

202

\hline

203

\end{tabular}

204

\end{center}

205

\caption{Grammar/Dictionary Files}

206

\label{tab:files}

207

\end{table}

208

209

As shown in Table~\ref{tab:files},

210

grammar and dictionary files of ChaSen system

211

can be classified using two dimensions.

212

According to the first dimension,

213

they can be classified into

214

{\it Definition Files} and {\it System Files}.

215

Definition Files include definitions of the grammar and the morphemes

216

of ChaSen, and are automatically compiled into System Files which are used

217

in the morphological analysis.

218

Using the second dimension,

219

they can be classified into {\it Morpheme Files} and {\it Grammar Files}

220

according to the linguistic type of the contents of the files.

221

222

The description of those grammar and dictionary files is

223

summarized below.

224

225

\begin{enumerate}

226

\item {\bf Definition Files}

227

228

\begin{enumerate}

229

\item Morpheme Definition Files

230

231

\begin{itemize}

232

\item {\it Morpheme Dictionaries} ({\tt Noun.dic}, etc.)

233

234

define morphemes of each part of speech.

235

A morpheme is defined as a list

236

of its surface form (or its base form if conjugative),

237

pronunciation, conjugation type if conjugative,

238

and semantic information.

239

A surface form cost (to be used in the morphological analysis)

240

can be assigned to each morpheme definition.

241

\end{itemize}

242

243

\item Grammar Definition Files

244

\begin{itemize}

245

\item {\it Parts of speech File} ({\tt grammar.cha})

246

247

defines the set of parts of speech.

248

249

\item {\it Conjugation Types File} ({\tt ctypes.cha})

250

251

defines the set of conjugation types for each

252

conjugative part of speech.

253

254

\item {\it Conjugation Forms File} ({\tt cforms.cha})

255

256

defines possible conjugation forms

257

for each conjugation type.

258

259

\item {\it Connectivity Rules File}

260

({\tt connect.cha})

261

262

defines connectivity of two morphemes / parts of speech

263

in the form of their bi-grams.

264

A connectivity cost has to be assigned to each bi-gram

265

of morphemes / parts of speech.

266

267

\end{itemize}

268

269

\end{enumerate}

270

271

\item {\bf System Files}

272

273

\begin{enumerate}

274

\item Morpheme Files

275

276

\begin{itemize}

277

\item {\it System Dictionaries} (\|*.int|)

278

279

is obtained by compiling morpheme dictionaries

280

and encoding morpheme information.

281

282

\item {\it Index Files} (\|*.pat|)

283

284

include Patricia tree indices of system dictionaries.

285

286

\end{itemize}

287

288

\item Grammar Files

289

290

\begin{itemize}

291

\item {\it Connectivity Table} ({\tt table.cha})

292

293

defines the correspondence between the rows/columns

294

of the connectivity matrix and

295

the morphemes / parts of speech listed in the connectivity rules file.

296

297

\item {\it Connectivity Matrix} ({\tt matrix.cha})

298

299

defines connectivity of two morphemes / parts of speech

300

in the form of a matrix.

301

302

\end{itemize}

303

304

305

\end{enumerate}

306

\end{enumerate}

307

308

309

%----------------------------------------%

310

\section{Morphological Analysis}

311

312

%----------------------------------------%

313

\subsection{Algorithm}

314

\label{subsec:algo}

315

316

For the string of the input Japanese sentence,

317

ChaSen consults its morpheme dictionaries and records

318

all the possible morphemes that are any sub-strings of the input string.

319

Next, ChaSen calculates the following two types of costs.

320

\begin{description}

321

\item[Morpheme Cost]

322

A cost that is assigned to each morpheme,

323

and is calculated as the product of

324

\begin{itemize}

325

\item the cost of the corresponding part of speech

326

(defined in the {\tt chasenrc} resource file),

327

\item relative weight of morpheme costs

328

(defined in the {\tt chasenrc} resource file),

329

\item and the surface form cost

330

(defined in the morpheme dictionaries).

331

\end{itemize}

332

333

\item[Connectivity Cost]

334

A cost that is assigned to each bi-gram of morphemes,

335

and is calculated as the product of

336

\begin{itemize}

337

\item the connectivity cost defined in the connectivity rules file,

338

\item and the relative weight of connectivity costs

339

(defined in the {\tt chasenrc} resource file).

340

\end{itemize}

341

\end{description}

342

For the string of the input Japanese sentence,

343

every possible segmentation into morpheme sequences

344

and their parts of speech tagging is considered

345

and the sum of the above morpheme costs and their connectivity costs

346

is calculated.

347

Then, the results with the minimum cost are returned.

348

Some cost width of beam search is defined in the {\tt chasenrc} resource file,

349

and at every position in the input string,

350

morphological analysis results are pruned using this cost width of beam search.

351

352

%----------------------------------------%

353

\subsection{Coping with Unknown Words}

354

\label{subsec:unk}

355

356

When ChaSen consults its morpheme dictionaries with some sub-string

357

of the input string and can not find any morphemes,

358

it assumes that the sub-string should be considered as a morpheme

359

and behaves as if the sub-string were contained in its morpheme dictionaries,

360

although the sub-string is assigned an extremely high cost compared with

361

those morphemes existing in its morpheme dictionaries.

362

Details of this facility of coping with unknown words

363

are as follows:

364

\begin{itemize}

365

\item For hiragana (Japanese), kanji (Chinese), numbers, and symbols

366

character types,

367

ChaSen assumes each one character as a possible unknown

368

morpheme

369

that is not contained in its morpheme dictionaries.

370

On the other hand, for other character types (katakana

371

(foreign),

372

(English) alphabet, etc.),

373

ChaSen assumes the longest string each character of which

374

is of the same character type as a possible unknown morpheme

375

that is not contained in its morpheme dictionaries.

376

377

\item Those morphemes that are not contained in the morpheme dictionaries

378

are considered as having the {\it part of speech for unknown words},

379

which is defined in the {\tt chasenrc} resource file.

380

381

\item Those morphemes that are not contained in the morpheme dictionaries

382

are assigned the {\it cost for unknown words},

383

which is defined in the {\tt chasenrc} resource file.

384

\end{itemize}

385

386

%----------------------------------------%

387

\subsection{Unknown Connectivity Cost}

388

\label{subsec:unkc}

389

390

Basically, bi-grams of morphemes that do not

391

match any rules listed in the connectivity rules file

392

are not allowed in the morphological analysis results.

393

However, users can allow those prohibited bi-grams

394

in the morphological analysis by giving them an extremely high cost.

395

This can be done by defining {\it unknown connectivity cost}

396

in the {\tt chasenrc} resource file

397

(how to define the unknown connectivity cost

398

is described in section~\ref{sec:chasenrc}).

399

400

401

%----------------------------------------%

402

\section{Installation}

403

404

\begin{enumerate}

405

\item Type `{\tt ./configure}' to configure the package for your system.

406

407

\begin{screen}\begin{verbatim}

408

% ./configure

409

\end{verbatim}\end{screen}

410

411

You can give `{\tt configure}' initial values for variables by setting

412

them in the environment.

413

414

\begin{screen}\begin{verbatim}

415

% env CC=cc CFLAGS="-O2 -Wall" ./configure

416

\end{verbatim}\end{screen}

417

418

See the file `{\tt INSTALL}' for detailed usage of configure.

419

420

\item Type `{\tt make}'.

421

422

\begin{screen}\begin{verbatim}

423

% make

424

\end{verbatim}\end{screen}

425

426

This produces the system. You might have to use GNU make.

427

428

\item Type `{\tt make install}' to install programs.

429

430

\begin{screen}\begin{verbatim}

431

# make install

432

\end{verbatim}\end{screen}

433

434

This will install the following files.

435

{\tt PREFIX} is defined by {\tt ./configure --prefix}.

436

(default setting is {\tt /usr/local}).

437

438

\begin{tabular}{ll}

439

{\tt PREFIX/bin/chasen} & ChaSen command \\

440

{\tt PREFIX/libexec/chasen/} & programs for building dictionaries \\

441

{\tt PREFIX/lib/libchasen.*} & ChaSen libraries \\

442

{\tt PREFIX/include/chasen.h} & header files \\

443

{\tt PREFIX/share/chasen/doc/} & manuals \\

444

{\tt PREFIX/share/chasen/prolog/} & Prolog programs to use ChaSen \\

445

\end{tabular}

446

447

The following files will not be installed.

448

\begin{tabular}{ll}

449

{\tt chasen/chasen.el} & Emacs lisp to use ChaSen \\

450

{\tt perl/ChaSen.pm} & Perl modules to use ChaSen \\

451

\end{tabular}

452

453

To remove an old version of ChaSen programs, type the command below:

454

455

\begin{screen}\begin{verbatim}

456

# rm -rf PREFIX/lib/chasen

457

\end{verbatim}\end{screen}

458

459

{\tt chasenrc} is not installed when the system is installed.

460

You need to put the {\tt chasenrc} file in {\tt PREFIX/etc} when you install a dictionary

461

package.

462

463

464

\end{enumerate}

465

466

%----------------------------------------%

467

\section{How to Use ChaSen System}

468

469

%----------------------------------------%

470

\subsection{Running ChaSen Program}

471

472

Suppose a Japanese text file ``{\sl nihongo}'', which should be encoded in

473

Japanese EUC (Extended UNIX Code) or JIS (ISO-2022-JP). Issue the

474

following command:

475

\begin{quote}

476

\|chasen| {\sl nihongo}

477

\end{quote}

478

The result of the morphological analysis is shown on the standard

479

output. If your terminal has a direct input facility of Japanese

480

characters, simply type

481

\begin{quote}

482

\|chasen|

483

\end{quote}

484

then input a Japanese sentence followed by a carriage return.

485

486

487

%----------------------------------------%

488

\subsection{Options}

489

490

There are several options:

491

\begin{itemize}

492

493

\item how to run

494

\begin{quote}

495

\begin{tabular}{ll}

496

{\tt -s} & start ChaSen server \\

497

{\tt -P} {\sl port} & specify ChaSen server's port number

498

(use with {\tt -s}, the default is 31000)\\

499

{\tt -D} {\sl host[:port]} & connect to ChaSen server \\

500

{\tt -R} & with {\tt -D}, do not read chasenrc file, \\

501

& without {\tt -D}, read the default chasenrc file \\

502

{\tt -a} & run standalone even if environment variable {\tt CHASENSERVER} is set

503

\end{tabular}

504

\end{quote}

505

506

\item how to print ambiguous results

507

\begin{quote}

508

\begin{tabular}{ll}

509

{\tt -b} & print one result with the least cost (default) \\

510

{\tt -m} & print ambiguous parts explicitly \\

511

{\tt -p} & print all possible results independently

512

\end{tabular}

513

\end{quote}

514

515

\item output format

516

\begin{quote}

517

\begin{tabular}{ll}

518

{\tt -f} & print the result in a table like format (default)\\

519

{\tt -e} & print all information of each morpheme separated by a blank \\

520

{\tt -c} & print all information of each morpheme in internal codes\\

521

{\tt -d} & print detailed morpheme data for Prolog.\\

522

{\tt -v} & print detailed morpheme data for ViCha.\\

523

{\tt -F} {\sl format} & print morpheme data with formatted output \\

524

{\tt -Fh} & print help of the format of {\tt -F} option \\

525

\end{tabular}

526

\end{quote}

527

528

\item miscellaneous

529

\begin{quote}

530

\begin{tabular}{ll}

531

{\tt -j} & Japanese sentence mode \\

532

& (assume a punctuation mark as a sentence delimiter) \\

533

{\tt -o} {\sl file} & write output to {\sl file} \\

534

{\tt -w} {\sl width} & specify the cost width\\

535

{\tt -C} & use command mode\\

536

{\tt -r} {\sl rc\_file} & use {\sl rc\_file} as a chasenrc file other than the default\\

537

{\tt -L} {\sl lang} & specify the language of the input text \\

538

{\tt -lp} & print the list of parts of speech \\

539

{\tt -lt} & print the list of conjugation types \\

540

{\tt -lf} & print the list of conjugation forms \\

541

{\tt -h} & print the help message \\

542

{\tt -V} & print ChaSen version number

543

\end{tabular}

544

\end{quote}

545

546

\end{itemize}

547

548

For example, compare the default output with the results of the following.

549

\begin{quote}

550

{\tt chasen -m -e} {\sl nihongo}

551

\end{quote}

552

553

554

%----------------------------------------%

555

\subsection{ChaSen Server and Client}

556

557

You can use ChaSen server and its client. First, type

558

\begin{quote}

559

\|chasen -s|

560

\end{quote}

561

to start ChaSen server.

562

563

\noindent

564

Type

565

\begin{quote}

566

\|chasen| -D{\sl host} {\sl nihongo}

567

\end{quote}

568

(`{\sl host}' should be the hostname of ChaSen server) to run ChaSen

569

client.

570

571

572

%----------------------------------------%

573

\subsection{Output Format}

574

575

Notes about {\tt -F} option.\\

576

577

\noindent

578

format characters:

579

\begin{quote}

580

\begin{tabular}{ll}

581

\|%m| & surface form (conjugated form) \\

582

\|%M| & surface form (base form) \\

583

\|%y| & first candidate of reading (conjugated form) \\

584

\|%Y| & first candidate of reading (base form) \\

585

\|%y0| & reading (conjugated form) \\

586

\|%Y0| & reading (base form) \\

587

\|%a| & first candidate of pronunciation (conjugated form) \\

588

\|%A| & first candidate of pronunciation (base form) \\

589

\|%a0| & pronunciation (conjugated form) \\

590

\|%A0| & pronunciation (base form) \\

591

\|%rABC| & surface form with ruby \\

592

\|%i| & first candidate of semantic information \\

593

\|%i0| & semantic information \\

594

\|%Ic| & semantic information (if NIL, print character 'c'.) \\

595

\|%Pc| & parts of speech (name) of all the layers of the parts of

596

speech hierarchy, \\

597

& concatenated with the character 'c' \\

598

\|%Pnc| & parts of speech (name) of the layers 1--n of the parts of

599

speech hierarchy, \\

600

& concatenated with the character 'c' \\

601

\|%h| & part of speech (code) \\

602

\|%H| & part of speech (name) \\

603

\|%Hn| & the part of speech (name) at the n-th layer (if NIL,

604

the part of speech \\

605

& at the most specific layer) \\

606

\|%b| & 0 (only for the backward compatibility) \\

607

\|%BB| & sub-part of speech (name) (if NIL, print part of speech) \\

608

\|%Bc| & sub-part of speech (name) (if NIL, print character 'c') \\

609

\|%t| & conjugation type (code) \\

610

\|%Tc| & conjugation type (name) (if NIL, print character 'c') \\

611

\|%f| & conjugated form (code) \\

612

\|%Fc| & conjugated form (name) (if NIL, print character 'c')\\

613

\|%c| & cost value of the morpheme \\

614

\|%S| & the input sentence \\

615

\|%pb| & if the best path, ``\|*|'', otherwise ``\* *'' \\

616

\|%pi| & the index of the path of the output lattice \\

617

\|%ps| & the starting position of the morpheme

618

at the path of the output lattice \\

619

\|%pe| & the ending position of the morpheme

620

at the path of the output lattice \\

621

\|%pc| & the cost of the path of the output lattice \\

622

\|%ppiC| & the indices of the preceding paths,

623

concatenated with the character 'C' \\

624

\|%ppcC| & the costs of the preceding paths,

625

concatenated with the character 'C' \\

626

\|%?B/STR1/STR2/| &

627

if sub-part of speech exists, STR1, otherwise, STR2 \\

628

\|%?I/STR1/STR2/| &

629

unless the semantic information is NIL and "", STR1,

630

otherwise, STR2 \\

631

\|%?T/STR1/STR2/| &

632

if conjugative, STR1, otherwise, STR2 \\

633

\|%?F/STR1/STR2/| &

634

same as \|%?T/STR1/STR2/| \\

635

\|%?U/STR1/STR2/| &

636

if unknown word, STR1, otherwise, STR2 \\

637

\|%U/STR/| &

638

if unknown word, "未知語", otherwise, STR \\

639

\|%%| & '\%' \\

640

\|.| & specify the field width \\

641

\|-| & specify the field width \\

642

\|1-9| & specify the field width\\

643

\|\n| & carriage return\\

644

\|\t| & tab \\

645

\|\\| & back slash \\

646

\|\'| & single quotation mark \\

647

\|\"| & double quotation mark\\

648

\end{tabular}

649

\end{quote}

650

651

example:

652

653

\begin{itemize}

654

\item same as the default output ({\tt -f} option)

655

656

\*"%m\t%y\t%M\t%U(%P-)\t%T \t%F \n"* or \*"-f"*

657

658

\item surface forms, readings, and parts of speech separated by TAB characters

659

660

\*"%m\t%y\t%P-\n"*

661

662

\item surface forms only

663

664

\*"%m\n"*

665

666

\item surface forms separated by space characters

667

668

\*"%m "*

669

670

\item kanji to kana conversion

671

672

\*"%y"*

673

674

\item surface forms with ruby

675

676

\*"%r ()"*

677

678

\end{itemize}

679

680

%----------------------------------------%

681

\section{{\tt chasenrc} Resource File}

682

\label{sec:chasenrc}

683

684

The {\tt chasenrc} resource file is used for defining various

685

options necessary for running ChaSen morphological analysis program.

686

As for which {\tt chasenrc} resource file to be used in the morphological

687

analysis process, the following preference order holds.

688

\begin{enumerate}

689

\item the one given with {\tt -r} option when running ChaSen program.

690

\item the one given as the environmental variable {\tt CHASENRC}.

691

\item {\tt .chasenrc} file at the user's home directory.

692

\item the default file given as the variable {\tt RCPATH}

693

in the top level {\tt Makefile}.

694

Usually {\tt PREFIX/etc/chasenrc}.

695

\end{enumerate}

696

697

The following gives options that are defined in the {\tt chasenrc} resource file,

698

as well as their examples.

699

\begin{enumerate}

700

\item Directory of grammar files (section~\ref{sec:grdic}).

701

702

\begin{screen}\begin{verbatim}

703

(GRAMMAR /usr/local/lib/chasen/dic/ipadic)

704

\end{verbatim}\end{screen}

705

706

When this option is omitted,

707

the directory which contains {\tt chasenrc} file

708

is used as the directory of grammar files.

709

710

\item System dictionaries (section~\ref{sec:grdic}).

711

The suffix `{\tt .int}' has to be omitted.

712

More than one system dictionary can be used.

713

714

\begin{screen}\begin{verbatim}

715

(PATDIC chadic

716

/home/rikyu/mydic/chadic)

717

\end{verbatim}\end{screen}

718

719

In the description above, the following two directories will be read.

720

721

\begin{enumerate}

722

\item {\tt chadic.int} and {\tt chadic.pat} in the same directory as grammar files.

723

\item {\tt chadic.int} and {\tt chadic.pat} in {\tt /home/rikyu/mydic/}.

724

\end{enumerate}

725

726

To use a package for string search, SUFARY, use {\tt SUFDIC} option.

727

728

\begin{screen}\begin{verbatim}

729

(SUFDIC chadic)

730

\end{verbatim}\end{screen}

731

732

In the description above, {\tt chadic.int} and {\tt chadic.ary} in the

733

same directory as grammar files will be read.

734

735

{\tt chadic.ary} is not created by default.

736

To create it, type `{\tt make ary}' in the same directory as

737

dictionary.

738

739

The index file will be read for a shorter time with SUFDIC than PATDIC,

740

but the time for search is longer.

741

You had better use SUFDIC for small sentences,

742

PATDIC for large sentences.

743

744

The maximum number of directories is 5 for both PATDIC and SUFDIC.

745

To change this number, edit the value {\tt MAX\_DIC\_NUMBER} in

746

{\tt chasen/pat.h} and re-compile.

747

748

\item Part of speech for unknown words (section~\ref{subsec:unk}).

749

750

\begin{screen}\begin{verbatim}

751

(UNKNOWN_POS (名詞 サ変接続)) ; a part of speech

752

(UNKNOWN_POS (名詞 サ変接続) (名詞 一般)) ; two parts of speech

753

\end{verbatim}\end{screen}

754

755

\item Cost of each part of speech (section~\ref{subsec:algo}).

756

757

\begin{screen}\begin{verbatim}

758

(POS_COST

759

((*) 1)

760

((未知語) 500)

761

((名詞) 2)

762

((名詞 固有名詞) 3)

763

)

764

\end{verbatim}\end{screen}

765

766

\item Relative weights of connectivity and morpheme costs

767

(section~\ref{subsec:algo}).

768

769

\begin{screen}\begin{verbatim}

770

(CONN_WEIGHT 1) ; default value

771

(MORPH_WEIGHT 1) ; default value

772

\end{verbatim}\end{screen}

773

774

\item Cost width of beam search

775

(section~\ref{subsec:algo}).

776

777

\begin{screen}\begin{verbatim}

778

(COST_WIDTH 0) ; default value

779

\end{verbatim}\end{screen}

780

781

\item Unknown connectivity cost.

782

(section~\ref{subsec:unkc}).

783

784

\begin{screen}\begin{verbatim}

785

(DEF_CONN_COST 500)

786

\end{verbatim}\end{screen}

787

788

\item Output format.

789

790

Users can specify the output format.

791

For example, if there is the following line in

792

{\tt .chasenrc}, the surface form, the reading,

793

and the part of speech will be printed.

794

795

\begin{verbatim}

796

(OUTPUT_FORMAT "%m\t%y\t%P-\n")

797

\end{verbatim}

798

799

Note that {\tt -f}, {\tt -e}, {\tt -c}, {\tt -d} and {\tt -F}

800

command line options override the format defined in {\tt .chasenrc}.

801

802

\item String for the beginning of the sentence.

803

804

\begin{screen}\begin{verbatim}

805

(BOS_STRING "sentence: [%S]\n")

806

\end{verbatim}\end{screen}

807

808

\item String for the end of the sentence.

809

810

\begin{screen}\begin{verbatim}

811

(EOS_STRING "end_of_sentence\n")

812

\end{verbatim}\end{screen}

813

814

\item Parts of speech for space and tab characters.

815

816

ChaSen ignores space (ASCII code is 32) and tab (ASCII code is 9)

817

characters in the analysis process, whose information is not

818

output by default.

819

Set `{\tt SPACE\_POS}' to output that information.

820

821

\begin{screen}\begin{verbatim}

822

(SPACE_POS (記号 空白))

823

\end{verbatim}\end{screen}

824

825

\item Annotations.

826

827

ChaSen can analyze sentences ignoring certain strings such as

828

annotations, and output the information about each string as one

829

morpheme.

830

831

\begin{screen}\begin{verbatim}

832

(ANNOTATION (("<" ">") "%m\n")

833

(("��") (�� ))

834

(("��") (�� ))

835

(("\"" "\"") (名詞 引用文字列))

836

(("［" "］"))

837

)

838

\end{verbatim}\end{screen}

839

840

In the description above, ChaSen will analyze and output in the

841

following way.

842

843

\begin{itemize}

844

\item Output, as they are, the strings which begin with ``\|<|'' and end

845

with ``\|>|'' such as {\tt <img src="cha.gif">}.

846

\item Output the strings ``��'' or ``��'' as {\tt ��-��}

847

\item Output the strings surrounded by double quotations such as

848

``{\tt "hello(again)"}'' as {\tt 名詞-引用文字列}.

849

\item Analyze ignoring the strings which begin with ``［''

850

and end with ``］'' such as ``{\tt ［ちゃせん］}'' and

851

output no information about the strings.

852

\end{itemize}

853

854

\item Parts of speech for concatenated morphemes output.

855

856

ChaSen concatenates morphemes of the same part of speech

857

if the part of speech is among those specified

858

for concatenated morphemes output.

859

860

\begin{screen}\begin{verbatim}

861

(COMPOSIT_POS (̾�� ) (��))

862

\end{verbatim}\end{screen}

863

864

\item Sentence delimiter characters.

865

866

Users can define sentence delimiter characters

867

that are used when the ChaSen program is called

868

with {\tt -j} option.

869

870

\begin{screen}\begin{verbatim*}

871

(DELIMITER "��.,!? ")

872

\end{verbatim*}\end{screen}

873

874

\end{enumerate}

875

876

%----------------------------------------%

877

\section{ChaSen Library}

878

879

You can use ChaSen library

880

{\tt \$CHASEN/lib/libchasen.a} to put ChaSen's module into other

881

programs.

882

883

%----------------------------------------%

884

\section{Calling ChaSen from Other Languages}

885

886

% %----------------------------------------%

887

% \subsection{Calling ChaSen from Prolog}

888

889

%----------------------------------------%

890

\subsection{Emacs Lisp Version of ChaSen Client}

891

892

Copy \|$CHASEN/chasen/chasen.el| to the Emacs Lisp directory to

893

install. Specify hostname and port number of ChaSen server, and

894

describe autoloaded functions in your .emacs.

895

896

\begin{verbatim}

897

(setq chasen-server-host "kyusu")

898

(setq chasen-server-port 31234) ; the default is 31000

899

900

(autoload 'chasen-region "chasen" "ChaSen client" t)

901

(autoload 'chasen-line "chasen" "ChaSen client" t)

902

(autoload 'chasen-highlight-class-region "chasen" "ChaSen client" t)

903

(autoload 'chasen-property-class-region "chasen" "ChaSen client" t)

904

\end{verbatim}

905

906

907

%----------------------------------------%

908

\section{Contact}

909

910

For further information, send an email to:

911

\begin{quote}

912

{\sf chasen@is.aist-nara.ac.jp}

913

\end{quote}

914

915

\end{document}

916

% Local Variables:

917

% mode: latex

918

% TeX-master: t

919

% End:

Older »