@Preamble{
"\hyphenation{ }"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
@String{j-TALIP = "ACM Transactions on Asian Language
Information Processing"}
@Article{Wong:2002:P,
author = "Kam-Fai Wong and Jun'ichi Tsujii",
title = "Prologue",
journal = j-TALIP,
volume = "1",
number = "1",
pages = "1--2",
month = mar,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:34 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Gao:2002:TUA,
author = "Jianfeng Gao and Joshua Goodman and Mingjing Li and
Kai-Fu Lee",
title = "Toward a unified approach to statistical language
modeling for {Chinese}",
journal = j-TALIP,
volume = "1",
number = "1",
pages = "3--33",
month = mar,
year = "2002",
CODEN = "????",
DOI = "10.1145/509900.509903",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:34 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lai:2002:MTE,
author = "Yu-Sheng Lai and Chung-Hsien Wu",
title = "Meaningful term extraction and discriminative term
selection in text categorization via unknown-word
methodology",
journal = j-TALIP,
volume = "1",
number = "1",
pages = "34--64",
month = mar,
year = "2002",
CODEN = "????",
DOI = "10.1145/509900.509904",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:34 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2002:MBG,
author = "Byeongchang Kim and Gary Geunbae Lee and Jong-Hyeok
Lee",
title = "Morpheme-based grapheme to phoneme conversion using
phonetic patterns and morphophonemic connectivity
information",
journal = j-TALIP,
volume = "1",
number = "1",
pages = "65--82",
month = mar,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:34 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lee:2002:UTI,
author = "Tan Lee and Wai Lau and Y. W. Wong and P. C. Ching",
title = "Using tone information in {Cantonese} continuous
speech recognition",
journal = j-TALIP,
volume = "1",
number = "1",
pages = "83--102",
month = mar,
year = "2002",
CODEN = "????",
DOI = "10.1145/509900.509906",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:34 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chen:2002:BCE,
author = "Hsin-Hsi Chen and Chi-Ching Lin and Wen-Cheng Lin",
title = "Building a {Chinese--English} wordnet for translingual
applications",
journal = j-TALIP,
volume = "1",
number = "2",
pages = "103--122",
month = jun,
year = "2002",
CODEN = "????",
DOI = "10.1145/568954.568955",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:36 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Meng:2002:GPM,
author = "Helen Meng and Po-Chui Luk and Kui Xu and Fuliang
Weng",
title = "{GLR} parsing with multiple grammars for natural
language queries",
journal = j-TALIP,
volume = "1",
number = "2",
pages = "123--144",
month = jun,
year = "2002",
CODEN = "????",
DOI = "10.1145/568954.568956",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:36 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Murata:2002:CTM,
author = "Masaki Murata and Qing Ma and Hitoshi Isahara",
title = "Comparison of three machine-learning methods for
{Thai} part-of-speech tagging",
journal = j-TALIP,
volume = "1",
number = "2",
pages = "145--158",
month = jun,
year = "2002",
CODEN = "????",
DOI = "10.1145/568954.568957",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:36 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lu:2002:TWQ,
author = "Wen-Hsiang Lu and Lee-Feng Chien and Hsi-Jian Lee",
title = "Translation of {Web} queries using anchor text
mining",
journal = j-TALIP,
volume = "1",
number = "2",
pages = "159--172",
month = jun,
year = "2002",
CODEN = "????",
DOI = "10.1145/568954.568958",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Nov 5 23:44:36 MST 2002",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2002:WBA,
author = "Wenjie Li and Kam-Fai Wong",
title = "A word-based approach for modeling and discovering
temporal relations embedded in {Chinese} sentences",
journal = j-TALIP,
volume = "1",
number = "3",
pages = "173--206",
month = sep,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:00 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lee:2002:ACB,
author = "Jin-Seok Lee and Byeongchang Kim and Gary Geunbae
Lee",
title = "Automatic corpus-based tone and break-index prediction
using {K-ToBI} representation",
journal = j-TALIP,
volume = "1",
number = "3",
pages = "207--224",
month = sep,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:00 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Luk:2002:CCD,
author = "Robert W. P. Luk and K. L. Kwok",
title = "A comparison of {Chinese} document indexing strategies
and retrieval models",
journal = j-TALIP,
volume = "1",
number = "3",
pages = "225--268",
month = sep,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:00 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Suzuki:2002:LCS,
author = "Izumi Suzuki and Yoshiki Mikami and Ario Ohsato and
Yoshihide Chubachi",
title = "A language and character set determination method
based on {N}-gram statistics",
journal = j-TALIP,
volume = "1",
number = "3",
pages = "269--278",
month = sep,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:00 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Jin:2002:CDC,
author = "Honglan Jin and Kam-Fai Wong",
title = "A {Chinese} dictionary construction algorithm for
information retrieval",
journal = j-TALIP,
volume = "1",
number = "4",
pages = "281--296",
month = dec,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:01 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2002:CCB,
author = "Yuanxiang Li and Xiaoqing Ding and Chew Lim Tan",
title = "Combining character-based bigrams with word-based
bigrams in contextual postprocessing for {Chinese}
script recognition",
journal = j-TALIP,
volume = "1",
number = "4",
pages = "297--309",
month = dec,
year = "2002",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Aug 7 08:49:01 MDT 2003",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lo:2003:CLS,
author = "Wai-Kit Lo and Helen Meng and P. C. Ching",
title = "Cross-language spoken document retrieval using
{HMM}-based retrieval model with multi-scale fusion",
journal = j-TALIP,
volume = "2",
number = "1",
pages = "1--26",
month = mar,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sun Jan 11 10:17:38 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Shi:2003:OHC,
author = "Daming Shi and Robert I. Damper and Steve R. Gunn",
title = "Offline handwritten {Chinese} character recognition by
radical decomposition",
journal = j-TALIP,
volume = "2",
number = "1",
pages = "27--48",
month = mar,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sun Jan 11 10:17:38 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lee:2003:TAS,
author = "Yue-Shi Lee",
title = "Task adaptation in stochastic language model for
{Chinese} homophone disambiguation",
journal = j-TALIP,
volume = "2",
number = "1",
pages = "49--62",
month = mar,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sun Jan 11 10:17:38 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Shieh:2003:EAT,
author = "Jiann-Cherng Shieh",
title = "An efficient accessing technique for {Taiwanese}
phonetic transcriptions",
journal = j-TALIP,
volume = "2",
number = "1",
pages = "63--77",
month = mar,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sun Jan 11 10:17:38 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Oard:2003:SLE,
author = "Douglas W. Oard",
title = "The surprise language exercises",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "79--84",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Allan:2003:MTD,
author = "James Allan and Victor Lavrenko and Margaret E.
Connell",
title = "A month to topic detection and tracking in {Hindi}",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "85--100",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Strassel:2003:LRC,
author = "Stephanie Strassel and Mike Maxwell and Christopher
Cieri",
title = "Linguistic resource creation for research and
technology development: a recent experiment",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "101--117",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Dorr:2003:RPD,
author = "Bonnie J. Dorr and Necip Fazil Ayan and Nizar Habash
and Nitin Madnani and Rebecca Hwa",
title = "Rapid porting of {DUSTer} to {Hindi}",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "118--123",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Huang:2003:ENE,
author = "Fei Huang and Stephan Vogel and Alex Waibel",
title = "Extracting named entity translingual equivalence with
limited resources",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "124--129",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Larkey:2003:HCT,
author = "Leah S. Larkey and Margaret E. Connell and Nasreen
Abduljaleel",
title = "{Hindi CLIR} in thirty days",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "130--142",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lavie:2003:EHE,
author = "Alon Lavie and Stephan Vogel and Lori Levin and Erik
Peterson and Katharina Probst and Ariadna Font
Llitj{\'o}s and Rachel Reynolds and Jaime Carbonell and
Richard Cohen",
title = "Experiments with a {Hindi-to-English} transfer-based
{MT} system under a miserly data scenario",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "143--163",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Xu:2003:CLR,
author = "Jinxi Xu and Ralph Weischedel",
title = "Cross-lingual retrieval for {Hindi}",
journal = j-TALIP,
volume = "2",
number = "2",
pages = "164--168",
month = jun,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:35 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{May:2003:SWC,
author = "Jonathan May and Ada Brunstein and Prem Natarajan and
Ralph Weischedel",
title = "Surprise! {What}'s in a {Cebuano} or {Hindi Name?}",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "169--180",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sekine:2003:HEC,
author = "Satoshi Sekine and Ralph Grishman",
title = "{Hindi-English} cross-lingual question-answering
system",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "181--192",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Ma:2003:AHO,
author = "Huanfeng Ma and David Doermann",
title = "Adaptive {Hindi OCR} using generalized {Hausdorff}
image comparison",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "193--218",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{He:2003:MMI,
author = "Daqing He and Douglas W. Oard and Jianqiang Wang and
Jun Luo and Dina Demner-Fushman and Kareem Darwish and
Philip Resnik and Sanjeev Khudanpur and Michael Nossal
and Michael Subotin and Anton Leuski",
title = "Making {MIRACLEs}: {Interactive} translingual search
for {Cebuano} and {Hindi}",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "219--244",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Leuski:2003:CLC,
author = "Anton Leuski and Chin-Yew Lin and Liang Zhou and
Ulrich Germann and Franz Josef Och and Eduard Hovy",
title = "Cross-lingual {C*ST*RD}: {English} access to {Hindi}
information",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "245--269",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Dorr:2003:CLH,
author = "Bonnie Dorr and David Zajic and Richard Schwartz",
title = "Cross-language headline generation for {Hindi}",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "270--289",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2003:RDH,
author = "Wei Li and Andrew McCallum",
title = "Rapid development of {Hindi} named entity recognition
using conditional random fields and feature induction",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "290--294",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Maynard:2003:RCI,
author = "Diana Maynard and Valentin Tablan and Kalina Bontcheva
and Hamish Cunningham",
title = "Rapid customization of an information extraction
system for a surprise language",
journal = j-TALIP,
volume = "2",
number = "3",
pages = "295--300",
month = sep,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kang:2003:IPP,
author = "Mi-Young Kang and Aesun Yoon and Hyuk-Chul Kwon",
title = "Improving partial parsing based on error-pattern
analysis for a {Korean} grammar-checker",
journal = j-TALIP,
volume = "2",
number = "4",
pages = "301--323",
month = dec,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2003:RRE,
author = "Harksoo Kim and Jungyun Seo",
title = "Resolution of referring expressions in a {Korean}
multimodal dialogue system",
journal = j-TALIP,
volume = "2",
number = "4",
pages = "324--337",
month = dec,
year = "2003",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mani:2004:ISI,
author = "Inderjeet Mani and James Pustejovsky and Beth
Sundheim",
title = "Introduction to the special issue on temporal
information processing",
journal = j-TALIP,
volume = "3",
number = "1",
pages = "1--10",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Han:2004:FRT,
author = "Benjamin Han and Alon Lavie",
title = "A framework for resolution of time in natural
language",
journal = j-TALIP,
volume = "3",
number = "1",
pages = "11--32",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Schilder:2004:EMT,
author = "Frank Schilder",
title = "Extracting meaning from temporal nouns and temporal
prepositions",
journal = j-TALIP,
volume = "3",
number = "1",
pages = "33--50",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Jang:2004:ATT,
author = "Seok Bae Jang and Jennifer Baldwin and Inderjeet
Mani",
title = "Automatic {TIMEX2} tagging of {Korean} news",
journal = j-TALIP,
volume = "3",
number = "1",
pages = "51--65",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Hobbs:2004:OTS,
author = "Jerry R. Hobbs and Feng Pan",
title = "An ontology of time for the {Semantic Web}",
journal = j-TALIP,
volume = "3",
number = "1",
pages = "66--85",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Nov 4 08:37:36 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Gao:2004:ISI,
author = "Jianfeng Gao and Chin-Yew Lin",
title = "Introduction to the special issue on statistical
language modeling",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "87--93",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2004:LTL,
author = "Woosung Kim and Sanjeev Khudanpur",
title = "Lexical triggers and latent semantic analysis for
cross-lingual language model adaptation",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "94--112",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Linares:2004:HLM,
author = "Diego Linares and Jos{\'e}-Miguel Bened{\'\i} and
Joan-Andreu S{\'a}nchez",
title = "A hybrid language model based on a combination of
{$N$}-grams and stochastic context-free grammars",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "113--127",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chen:2004:DHG,
author = "Berlin Chen and Hsin-Min Wang and Lin-Shan Lee",
title = "A discriminative {HMM\slash N}-gram-based retrieval
approach for {Mandarin} spoken documents",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "128--145",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Nguyen:2004:EBS,
author = "Minh Le Nguyen and Susumu Horiguchi and Akira Shimazu
and Bao Tu Ho",
title = "Example-based sentence reduction using the hidden
{Markov} model",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "146--158",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Fung:2004:MEC,
author = "Pascale Fung and Grace Ngai and Yongsheng Yang and
Benfeng Chen",
title = "A maximum-entropy {Chinese} parser augmented by
transformation-based learning",
journal = j-TALIP,
volume = "3",
number = "2",
pages = "159--168",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Nov 22 06:20:04 MST 2004",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2004:AMF,
author = "Yujia Li and Tan Lee and Yao Qian",
title = "Analysis and modeling of {F0} contours for {Cantonese}
text-to-speech",
journal = j-TALIP,
volume = "3",
number = "3",
pages = "169--180",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Huang:2004:UWB,
author = "Chien-Chung Huang and Shui-Lung Chuang and Lee-Feng
Chien",
title = "Using a {Web}-based categorization approach to
generate thematic metadata from texts",
journal = j-TALIP,
volume = "3",
number = "3",
pages = "190--212",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Myaeng:2004:ISI,
author = "Sung Hyon Myaeng",
title = "Introduction to the special issue on computer
processing of oriental languages",
journal = j-TALIP,
volume = "3",
number = "4",
pages = "213--213",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Baoli:2004:ANN,
author = "Li Baoli and Lu Qin and Yu Shiwen",
title = "An adaptive $k$-nearest neighbor text categorization
strategy",
journal = j-TALIP,
volume = "3",
number = "4",
pages = "215--226",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2004:UTI,
author = "Pyung Kim and Sung Hyon Myaeng",
title = "Usefulness of temporal information automatically
extracted from news articles for topic tracking",
journal = j-TALIP,
volume = "3",
number = "4",
pages = "227--242",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhang:2004:ESS,
author = "Le Zhang and Jingbo Zhu and Tianshun Yao",
title = "An evaluation of statistical spam filtering
techniques",
journal = j-TALIP,
volume = "3",
number = "4",
pages = "243--269",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Apr 14 12:20:22 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wu:2005:DSF,
author = "Chung-Hsien Wu and Jui-Feng Yeh and Ming-Jun Chen",
title = "Domain-specific {FAQ} retrieval using independent
aspects",
journal = j-TALIP,
volume = "4",
number = "1",
pages = "1--17",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jul 7 13:48:21 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Murata:2005:CEV,
author = "Masaki Murata and Masao Utiyama and Kiyotaka Uchimoto
and Hitoshi Isahara and Qing Ma",
title = "Correction of errors in a verb modality corpus for
machine translation with a machine-learning method",
journal = j-TALIP,
volume = "4",
number = "1",
pages = "18--37",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jul 7 13:48:21 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Hendessi:2005:SSP,
author = "F. Hendessi and A. Ghayoori and T. A. Gulliver",
title = "A speech synthesizer for {Persian} text using a neural
network with a smooth ergodic {HMM}",
journal = j-TALIP,
volume = "4",
number = "1",
pages = "38--52",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jul 7 13:48:21 MDT 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhang:2005:COT,
author = "Ying Zhang and Phil Vines and Justin Zobel",
title = "{Chinese} {OOV} translation and post-translation query
expansion in {Chinese--English} cross-lingual
information retrieval",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "57--77",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Qu:2005:TES,
author = "Yan Qu and David A. Hull and Gregory Grefenstette and
David A. Evans and Motoko Ishikawa and Setsuko Nara and
Toshiya Ueda and Daisuke Noda and Kousaku Arita and
Yuki Funakoshi and Hiroshi Matsuda",
title = "Towards effective strategies for monolingual and
bilingual information retrieval: {Lessons} learned from
{NTCIR-4}",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "78--110",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sakai:2005:FPR,
author = "Tetsuya Sakai and Toshihiko Manabe and Makoto
Koyama",
title = "Flexible pseudo-relevance feedback via selective
sampling",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "111--135",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kwok:2005:RRP,
author = "Kui Lam Kwok and Sora Choi and Norbert Dinstl",
title = "Rich results from poor resources: {NTCIR-4}
monolingual and cross-lingual retrieval of {Korean}
texts using {Chinese} and {English}",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "136--158",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Savoy:2005:CSM,
author = "Jacques Savoy",
title = "Comparative study of monolingual and multilingual
search models for use with {Asian} languages",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "159--185",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mase:2005:PTS,
author = "Hisao Mase and Tadataka Matsubayashi and Yuichi Ogawa
and Makoto Iwayama and Tadaaki Oshio",
title = "Proposal of two-stage patent retrieval method
considering the claim structure",
journal = j-TALIP,
volume = "4",
number = "2",
pages = "186--202",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Dec 17 08:07:33 MST 2005",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Nakagawa:2005:PSI,
author = "Hiroshi Nakagawa and Tatsunori Mori and Noriko
Kando",
title = "Preface to the special issues on {NTCIR-4}",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "237--242",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kato:2005:ODQ,
author = "Tsuneaki Kato and Jun'ichi Fukumoto and Fumito Masui
and Noriko Kando",
title = "Are open-domain question answering technologies useful
for information access dialogues?---an empirical study
and a proposal of a novel challenge",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "243--262",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Isozaki:2005:AHP,
author = "Hideki Isozaki",
title = "An analysis of a high-performance {Japanese} question
answering system",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "263--279",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mori:2005:JQA,
author = "Tatsunori Mori",
title = "{Japanese} question-answering system using {A*} search
and its improvement",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "280--304",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mori:2005:MAF,
author = "Tatsunori Mori and Masanori Nozawa and Yoshiaki
Asada",
title = "Multi-answer-focused multi-document summarization
using a question-answering engine",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "305--320",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Okazaki:2005:ICO,
author = "Naoaki Okazaki and Yutaka Matsuo and Mitsuru
Ishizuka",
title = "Improving chronological ordering of sentences
extracted from multiple newspaper articles",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "321--339",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Yoshioka:2005:CPB,
author = "Masaharu Yoshioka and Makoto Haraguchi",
title = "On a combination of probabilistic and {Boolean} {IR}
models for {WWW} document retrieval",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "340--356",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lingpeng:2005:CIR,
author = "Yang Lingpeng and Ji Donghong and Tang Li and Niu
Zhengyu",
title = "{Chinese} information retrieval based on terms and
relevant terms",
journal = j-TALIP,
volume = "4",
number = "3",
pages = "357--374",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jan 26 08:28:41 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sakai:2006:ISI,
author = "Tetsuya Sakai and Yuji Matsumoto",
title = "Introduction to the special issue: {Recent} advances
in information processing and access for {Japanese}",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "375--376",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Doi:2006:EBM,
author = "Takao Doi and Hirofumi Yamamoto and Eiichiro Sumita",
title = "Example-based machine translation using efficient
sentence retrieval based on edit-distance",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "377--399",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Tomiura:2006:ESS,
author = "Yoichi Tomiura and Shosaku Tanaka and Toru Hitaka",
title = "Estimating satisfactoriness of selectional restriction
from corpus without a thesaurus",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "400--416",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Iida:2006:ARA,
author = "Ryu Iida and Kentaro Inui and Yuji Matsumoto",
title = "Anaphora resolution by antecedent identification
followed by anaphoricity determination",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "417--434",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Inui:2006:ACK,
author = "Takashi Inui and Kentaro Inui and Yuji Matsumoto",
title = "Acquiring causal knowledge from text using the
connective marker {\em tame\/}",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "435--474",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Ma:2006:TSB,
author = "Qiang Ma and Katsumi Tanaka",
title = "Topic-structure-based complementary information
retrieval and its application",
journal = j-TALIP,
volume = "4",
number = "4",
pages = "475--503",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 16 10:54:02 MST 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Park:2006:ATM,
author = "Jong C. Park and Gary Geunbae Lee and Limsoon Wong",
title = "{AUTHOR}: {Text} mining and management in
biomedicine",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "1--3",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Park:2006:MBB,
author = "Kyung-Mi Park and Seon-Ho Kim and Hae-Chang Rim and
Young-Sook Hwang",
title = "{ME}-based biomedical named entity recognition using
lexical knowledge",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "4--21",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Nenadic:2006:MSR,
author = "Goran Nenadi{\'c} and Sophia Ananiadou",
title = "Mining semantically related terms from biomedical
literature",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "22--43",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2006:ECI,
author = "Jung-Jae Kim and Jong C. Park",
title = "Extracting contrastive information from negation
patterns in biomedical literature",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "44--60",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2006:TPL,
author = "Eunju Kim and Yu Song and Cheongjae Lee and Kyoungduk
Kim and Gary Geunbae Lee and Byoung-Kee Yi and Jeongwon
Cha",
title = "Two-phase learning for biological event extraction and
verification",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "61--73",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mima:2006:TBK,
author = "Hideki Mima and Sophia Ananiadou and Katsumori
Matsushima",
title = "Terminology-based knowledge mining for new knowledge
discovery",
journal = j-TALIP,
volume = "5",
number = "1",
pages = "74--88",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu May 11 11:29:25 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Carpuat:2006:AWS,
author = "Marine Carpuat and Pascale Fung and Grace Ngai",
title = "Aligning word senses using bilingual corpora",
journal = j-TALIP,
volume = "5",
number = "2",
pages = "89--120",
month = jun,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1165255.1165256",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Oct 5 07:00:29 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The growing importance of multilingual information
retrieval and machine translation has made multilingual
ontologies extremely valuable resources. Since the
construction of an ontology from scratch is a very
expensive and time-consuming undertaking, it is
attractive to consider ways of automatically aligning
monolingual ontologies, which already exist for many of
the world's major languages. Previous research
exploited similarity in the structure of the ontologies
to align, or manually created bilingual resources.
These approaches cannot be used to align ontologies
with vastly different structures and can only be
applied to much studied language pairs for which
expensive resources are already available. In this
paper, we propose a novel approach to align the
ontologies at the node level: Given a concept
represented by a particular word sense in one ontology,
our task is to find the best corresponding word sense
in the second language ontology. To this end, we
present a language-independent, corpus-based method
that borrows from techniques used in information
retrieval and machine translation. We show its
efficiency by applying it to two very different
ontologies in very different languages: the Mandarin
Chinese HowNet and the American English WordNet.
Moreover, we propose a methodology to measure bilingual
corpora comparability and show that our method is
robust enough to use noisy nonparallel bilingual
corpora efficiently, when clean parallel corpora are
not available.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lee:2006:ABN,
author = "Chun-Jen Lee and Jason S. Chang and Jyh-Shing R.
Jang",
title = "Alignment of bilingual named entities in parallel
corpora using statistical models and multiple knowledge
sources",
journal = j-TALIP,
volume = "5",
number = "2",
pages = "121--145",
month = jun,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1165255.1165257",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Oct 5 07:00:29 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Named entity (NE) extraction is one of the fundamental
tasks in natural language processing (NLP). Although
many studies have focused on identifying NEs within
monolingual documents, aligning NEs in bilingual
documents has not been investigated extensively due to
the complexity of the task. In this article we
introduce a new approach to aligning bilingual NEs in
parallel corpora by incorporating statistical models
with multiple knowledge sources. In our approach, we
model the process of translating an English NE phrase
into a Chinese equivalent using lexical
translation\slash transliteration probabilities for
word translation and alignment probabilities for word
reordering. The method involves automatically learning
phrase alignment and acquiring word translations from a
bilingual phrase dictionary and parallel corpora, and
automatically discovering transliteration
transformations from a training set of
name-transliteration pairs. The method also involves
language-specific knowledge functions, including
handling abbreviations, recognizing Chinese personal
names, and expanding acronyms. At runtime, the proposed
models are applied to each source NE in a pair of
bilingual sentences to generate and evaluate the target
NE candidates; the source and target NEs are then
aligned based on the computed probabilities.
Experimental results demonstrate that the proposed
approach, which integrates statistical models with
extra knowledge sources, is highly feasible and offers
significant improvement in performance compared to our
previous work, as well as the traditional approach of
IBM Model 4.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Shirado:2006:UJH,
author = "Tamotsu Shirado and Satoko Marumoto and Masaki Murata
and Hitoshi Isahara",
title = "Using {Japanese} honorific expressions: a
psychological study",
journal = j-TALIP,
volume = "5",
number = "2",
pages = "146--164",
month = jun,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1165255.1165258",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Oct 5 07:00:29 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We investigated, via experiment, knowledge of
normative honorific expressions as used in textbooks
and in practice by people. Forty subjects divided into
four groups according to age (younger\slash older) and
gender (male\slash female) participated in the
experiments. The results show that knowledge about the
use of normative honorific expressions in textbooks is
similar to that demonstrated by the younger subject
groups, but differed from that of the older subject
groups. The knowledge of the older subjects was more
complex than that shown in textbooks or demonstrated by
the younger subjects. A model that can identify misuse
of honorific expressions in sentences is the framework
for this investigation. The model is minimal, but could
represent 76\% to 92\% of the subjects' knowledge
regarding each honorific element. This model will be
useful in the development of computer-aided systems to
help teach how honorific expressions should be used.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wu:2006:ERT,
author = "Chung-Hsien Wu and Ze-Jing Chuang and Yu-Chung Lin",
title = "Emotion recognition from text using semantic labels
and separable mixture models",
journal = j-TALIP,
volume = "5",
number = "2",
pages = "165--183",
month = jun,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1165255.1165259",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Oct 5 07:00:29 MDT 2006",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This study presents a novel approach to automatic
emotion recognition from text. First, emotion
generation rules (EGRs) are manually deduced from
psychology to represent the conditions for generating
emotion. Based on the EGRs, the emotional state of each
sentence can be represented as a sequence of semantic
labels (SLs) and attributes (ATTs); SLs are defined as
the domain-independent features, while ATTs are
domain-dependent. The emotion association rules (EARs)
represented by SLs and ATTs for each emotion are
automatically derived from the sentences in an
emotional text corpus using the a priori algorithm.
Finally, a separable mixture model (SMM) is adopted to
estimate the similarity between an input sentence and
the EARs of each emotional state. Since some features
defined in this approach are domain-dependent, a dialog
system focusing on the students' daily expressions is
constructed, and only three emotional states, happy,
unhappy, and neutral, are considered for performance
evaluation. According to the results of the
experiments, given the domain corpus, the proposed
approach is promising, and easily ported into other
domains.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Dale:2006:ISS,
author = "Robert Dale",
title = "Introduction to the {Special} section: {Extended} best
papers from {IJCNLP 2005}",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "183--184",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Oh:2006:MTM,
author = "Jong-Hoon Oh and Key-Sun Choi and Hitoshi Isahara",
title = "A machine transliteration model based on
correspondence between graphemes and phonemes",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "185--208",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Gao:2006:ESL,
author = "Jianfeng Gao and Hisami Suzuki and Wei Yuan",
title = "An empirical study on language model adaptation",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "209--227",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Ye:2006:SRL,
author = "Patrick Ye and Timothy Baldwin",
title = "Semantic role labeling of prepositional phrases",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "228--244",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chung:2006:APD,
author = "Tze Leung Chung and Robert Wing Pong Luk and Kam Fai
Wong and Kui Lam Kwok and Dik Lun Lee",
title = "Adapting pivoted document-length normalization for
query size: {Experiments} in {Chinese} and {English}",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "245--263",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Matsumura:2006:ERB,
author = "Atsushi Matsumura and Atsuhiro Takasu and Jun
Adachi",
title = "Effect of relationships between words on {Japanese}
information retrieval",
journal = j-TALIP,
volume = "5",
number = "3",
pages = "264--289",
month = sep,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:36 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Song:2006:ISI,
author = "Dawei Song and Jian-Yun Nie",
title = "Introduction to special issue on reasoning in natural
language information processing",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "291--295",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Nie:2006:ILM,
author = "Jian-Yun Nie and Guihong Cao and Jing Bai",
title = "Inferential language models for information
retrieval",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "296--322",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Gao:2006:SQT,
author = "Jianfeng Gao and Jian-Yun Nie and Ming Zhou",
title = "Statistical query translation models for
cross-language information retrieval",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "323--359",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Liu:2006:SFQ,
author = "Yi Liu and Rong Jin and Joyce Y. Chai",
title = "A statistical framework for query translation
disambiguation",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "360--387",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2006:TTT,
author = "Baoli Li and Wenjie Li and Qin Lu",
title = "Topic tracking with time granularity reasoning",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "388--412",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Phan:2006:IDS,
author = "Xuan-Hieu Phan and Le-Minh Nguyen and Yasushi Inoguchi
and Tu-Bao Ho and Susumu Horiguchi",
title = "Improving discriminative sequential learning by
discovering important association of statistics",
journal = j-TALIP,
volume = "5",
number = "4",
pages = "413--438",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chen:2007:UDM,
author = "Yong Chen and Kwok-Ping Chan",
title = "Using data mining techniques and rough set theory for
language modeling",
journal = j-TALIP,
volume = "6",
number = "1",
pages = "??--??",
month = apr,
year = "2007",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Hsu:2007:MSB,
author = "Chung-Chian Hsu and Chien-Hsing Chen and Tien-Teng
Shih and Chun-Kai Chen",
title = "Measuring similarity between transliterations against
noise data",
journal = j-TALIP,
volume = "6",
number = "1",
pages = "??--??",
month = apr,
year = "2007",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sakai:2007:RFQ,
author = "Tetsuya Sakai",
title = "On the reliability of factoid question answering
evaluation",
journal = j-TALIP,
volume = "6",
number = "1",
pages = "??--??",
month = apr,
year = "2007",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wiseman:2007:CBC,
author = "Yair Wiseman and Irit Gefner",
title = "Conjugation-based compression for {Hebrew} texts",
journal = j-TALIP,
volume = "6",
number = "1",
pages = "??--??",
month = apr,
year = "2007",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wu:2007:TBS,
author = "Chung-Hsien Wu and Hung-Yu Su and Yu-Hsien Chiu and
Chia-Hung Lin",
title = "Transfer-based statistical translation of {Taiwanese}
sign language using {PCFG}",
journal = j-TALIP,
volume = "6",
number = "1",
pages = "??--??",
month = apr,
year = "2007",
CODEN = "????",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Apr 14 10:21:37 MDT 2007",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kuo:2007:PSM,
author = "Jin-Shea Kuo and Haizhou Li and Ying-Kuei Yang",
title = "A phonetic similarity model for automatic extraction
of transliteration pairs",
journal = j-TALIP,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1282080.1282081",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:28 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article proposes an approach for the automatic
extraction of transliteration pairs from Chinese Web
corpora. In this approach, we formulate the machine
transliteration process using a syllable-based phonetic
similarity model which consists of phonetic confusion
matrices and a Chinese character $n$-gram language
model. With the phonetic similarity model, the
extraction of transliteration pairs becomes a two-step
process of recognition followed by validation: First,
in the recognition process, we identify the most
probable transliteration in the $k$-neighborhood of a
recognized English word. Then, in the validation
process, we qualify the transliteration pair candidates
with a hypothesis test. We carry out an analytical
study on the statistics of several key factors in
English--Chinese transliteration to help formulate
phonetic similarity modeling. We then conduct both
supervised and unsupervised learning of a phonetic
similarity model on a development database. The
experimental results validate the effectiveness of the
phonetic similarity model by achieving an $F$-measure
of 0.739 in supervised learning. The unsupervised
learning approach works almost as well as the
supervised one, thus allowing us to deploy automatic
extraction of transliteration pairs in the Web space.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "extraction of transliteration pairs; machine
translation; machine transliteration; phonetic
confusion probability; phonetic similarity modeling",
}
@Article{Xiao:2007:SNM,
author = "Jinghui Xiao and Xiaolong Wang and Bingquan Liu",
title = "The study of a nonstationary maximum entropy {Markov}
model and its application on the pos-tagging task",
journal = j-TALIP,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1282080.1282082",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:28 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Sequence labeling is a core task in natural language
processing. The maximum entropy Markov model (MEMM) is
a powerful tool in performing this task. This article
enhances the traditional MEMM by exploiting the
positional information of language elements. The
stationary hypothesis is relaxed in MEMM, and the
nonstationary MEMM (NS-MEMM) is proposed. Several
related issues are discussed in detail, including the
representation of positional information, NS-MEMM
implementation, smoothing techniques, and the space
complexity issue. Furthermore, the asymmetric NS-MEMM
presents a more flexible way to exploit positional
information. In the experiments, NS-MEMM is evaluated
on both the Chinese and the English pos-tagging tasks.
According to the experimental results, NS-MEMM yields
effective improvements over MEMM by exploiting
positional information. The smoothing techniques in
this article effectively solve the NS-MEMM
data-sparseness problem; the asymmetric NS-MEMM is also
an improvement by exploiting positional information in
a more flexible way.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "data sparseness problem; Markov property; MEMM;
pos-tagging; stationary hypothesis",
}
@Article{Zhuang:2007:IHD,
author = "Yi Zhuang and Yueting Zhuang and Qing Li and Lei
Chen",
title = "Interactive high-dimensional index for large {Chinese}
calligraphic character databases",
journal = j-TALIP,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1282080.1282083",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:28 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The large numbers of Chinese calligraphic scripts in
existence are valuable part of the Chinese cultural
heritage. However, due to the shape complexity of these
characters, it is hard to employ existing techniques to
effectively retrieve and efficiently index them. In
this article, using a novel shape-similarity-based
retrieval method in which shapes of calligraphic
characters are represented by their contour points
extracted from the character images, we propose an
interactive partial-distance-map (PDM)-based
high-dimensional indexing scheme which is designed
specifically to speed up the retrieval performance of
the large Chinese calligraphic character databases
effectively. Specifically, we use the approximate
minimal bounding sphere of a query character and
utilize users' relevance feedback to refine the query
gradually. Comprehensive experiments are conducted to
testify the efficiency and effectiveness of this
method. In addition, a new $k$-NN search called Pseudo
$k$-NN (P $k$-NN) search is presented to better
facilitate the PDM-based character retrieval.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Chinese calligraphic character; hyper-centre
relocation; Pseudo k-NN",
}
@Article{Saraswathi:2007:CPE,
author = "S. Saraswathi and T. V. Geetha",
title = "Comparison of performance of enhanced morpheme-based
language model with different word-based language
models for improving the performance of {Tamil} speech
recognition system",
journal = j-TALIP,
volume = "6",
number = "3",
pages = "9:1--9:??",
month = nov,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1290002.1290003",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:45 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This paper describes a new technique of language
modeling for a highly inflectional Dravidian language,
Tamil. It aims to alleviate the main problems
encountered in processing of Tamil language, like
enormous vocabulary growth caused by the large number
of different forms derived from one word. The size of
the vocabulary was reduced by decomposing the words
into stems and endings and storing these sub word units
(morphemes) in the vocabulary separately. An enhanced
morpheme-based language model was designed for the
inflectional language Tamil. The enhanced
morpheme-based language model was trained on the
decomposed corpus. The perplexity and Word Error Rate
(WER) were obtained to check the efficiency of the
model for Tamil speech recognition system. The results
were compared with word-based bigram and trigram
language models, distance based language model,
dependency based language model and class based
language model. From the results it was analyzed that
the enhanced morpheme-based trigram model with Katz
back-off smoothing effect improved the performance of
the Tamil speech recognition system when compared to
the word-based language models.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "language model; morphemes; perplexity; word error rate
and speech recognition",
}
@Article{Hussain:2007:DLS,
author = "Sarmad Hussain and Sana Gul and Afifah Waseem",
title = "Developing lexicographic sorting: {An} example for
{Urdu}",
journal = j-TALIP,
volume = "6",
number = "3",
pages = "10:1--10:??",
month = nov,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1290002.1290004",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:45 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Collation or lexicographic sorting is essential to
develop multilingual computing. This paper presents the
challenges faced in developing collation sequence for a
language. The paper discusses both theoretical
linguistic and practical standardization and encoding
related considerations that need to be addressed for
languages for which relevant standards and/or solutions
have not been defined. The paper also defines the
process, by giving the details of the procedure
followed for Urdu language, which is the national
language of Pakistan and is spoken by more than 100
million people across the world. The paper is oriented
towards organizations involved in developing and using
collation standards and the localization industry, and
not focused on theoretical issues.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "text processing; Urdu",
}
@Article{Fukumoto:2007:TTB,
author = "Fumiyo Fukumoto and Yoshimi Suzuki",
title = "Topic tracking based on bilingual comparable corpora
and semisupervised clustering",
journal = j-TALIP,
volume = "6",
number = "3",
pages = "11:1--11:??",
month = nov,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1290002.1290005",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:45 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this paper, we address the problem of skewed data
in topic tracking: the small number of stories labeled
positive as compared to negative stories and propose a
method for estimating effective training stories for
the topic-tracking task. For a small number of labeled
positive stories, we use bilingual comparable, i.e.,
English, and Japanese corpora, together with the EDR
bilingual dictionary, and extract story pairs
consisting of positive and associated stories. To
overcome the problem of a large number of labeled
negative stories, we classified them into clusters.
This is done using a semisupervised clustering
algorithm, combining $k$-means with EM. The method was
tested on the TDT English corpus and the results showed
that the system works well when the topic under
tracking is talking about an event originating in the
source language country, even for a small number of
initial positive training stories.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "bilingual comparable corpora; clustering; EM
algorithm; N-gram model; topic detection and tracking",
}
@Article{Iida:2007:ZAR,
author = "Ryu Iida and Kentaro Inui and Yuji Matsumoto",
title = "Zero-anaphora resolution by learning rich syntactic
pattern features",
journal = j-TALIP,
volume = "6",
number = "4",
pages = "1:1--1:22",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1316457.1316458",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:55 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We approach the zero-anaphora resolution problem by
decomposing it into intrasentential and intersentential
zero-anaphora resolution tasks. For the former task,
syntactic patterns of zero-pronouns and their
antecedents are useful clues. Taking Japanese as a
target language, we empirically demonstrate that
incorporating rich syntactic pattern features in a
state-of-the-art learning-based anaphora resolution
model dramatically improves the accuracy of
intrasentential zero-anaphora, which consequently
improves the overall performance of zero-anaphora
resolution.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Adriani:2007:SIC,
author = "Mirna Adriani and Jelita Asian and Bobby Nazief and S.
M. M. Tahaghoghi and Hugh E. Williams",
title = "Stemming {Indonesian}: a confix-stripping approach",
journal = j-TALIP,
volume = "6",
number = "4",
pages = "2:1--2:33",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1316457.1316459",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:55 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Stemming words to (usually) remove suffixes has
applications in text search, machine translation,
document summarization, and text classification. For
example, English stemming reduces the words 'computer,'
'computing,' 'computation,' and 'computability' to
their common morphological root, 'comput-.' In text
search, this permits a search for 'computers' to find
documents containing all words with the stem 'comput-.'
In the Indonesian language, stemming is of crucial
importance: words have prefixes, suffixes, infixes, and
confixes that make matching related words
difficult.\par
This work surveys existing techniques for stemming
Indonesian words to their morphological roots, presents
our novel and highly accurate CS algorithm, and
explores the effectiveness of stemming in the context
of general-purpose text information retrieval through
ad hoc queries.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Indonesian; information retrieval; stemming",
}
@Article{Thao:2007:NER,
author = "Pham Thi Xuan Thao and Tran Quoc Tri and Dinh Dien and
Nigel Collier",
title = "Named entity recognition in {Vietnamese} using
classifier voting",
journal = j-TALIP,
volume = "6",
number = "4",
pages = "3:1--3:18",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1316457.1316460",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:11:55 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Named entity recognition (NER) is one of the
fundamental tasks in natural-language processing (NLP).
Though the combination of different classifiers has
been widely applied in several well-studied languages,
this is the first time this method has been applied to
Vietnamese. In this article, we describe how voting
techniques can improve the performance of Vietnamese
NER. By combining several state-of-the-art
machine-learning algorithms using voting strategies,
our final result outperforms individual algorithms and
gained an $F$-measure of 89.12. A detailed discussion
about the challenges of NER in Vietnamese is also
presented.",
acknowledgement = ack-nhfb,
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "C4.5; Conditional Random Fields; Na{\"\i}ve Bayes;
named entity recognition; support vector machines;
transformation based learning; Vietnamese; voting",
}
@Article{Chen:2008:SBM,
author = "Yufeng Chen and Chengqing Zong",
title = "A Structure-Based Model for {Chinese} Organization
Name Translation",
journal = j-TALIP,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1330291.1330292",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:10 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Named entity (NE) translation is a fundamental task in
multilingual natural language processing. The
performance of a machine translation system depends
heavily on precise translation of the inclusive NEs.
Furthermore, organization name (ON) is the most complex
NE for translation among all the NEs. In this article,
the structure formulation of ONs is investigated and a
hierarchical structure-based ON translation model for
Chinese-to-English translation system is
presented.\par
First, the model performs ON chunking; then both the
translation of words within chunks and the process of
chunk-reordering are achieved by synchronous
context-free grammar (CFG). The CFG rules are extracted
from bilingual ON pairs in a training program.\par
The main contributions of this article are: (1)
defining appropriate chunk-units for analyzing the
internal structure of Chinese ONs; (2) making the
chunk-based ON translation feasible and flexible via a
hierarchical CFG derivation; and (3) proposing a
training architecture to automatically learn the
synchronous CFG for constructing ONs with chunk-units
from aligned bilingual ON pairs. The experiments show
that the proposed approach translates the Chinese ONs
into English with an accuracy of 93.75\% and
significantly improves the performance of a baseline
statistical machine translation (SMT) system.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "alignment; chunk; hierarchical derivation; machine
translation; named entity; organization name; rules
extraction; structural analysis; synchronous
context-free grammar",
}
@Article{Jeong:2008:ISR,
author = "Minwoo Jeong and Gary Geunbae Lee",
title = "Improving Speech Recognition and Understanding using
Error-Corrective Reranking",
journal = j-TALIP,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1330291.1330293",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:10 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The main issues of practical spoken-language
applications for human-computer interface are how to
overcome speech recognition errors and guarantee the
reasonable end-performance of spoken-language
applications. Therefore, handling the erroneously
recognized outputs is a key in developing robust
spoken-language systems. To address this problem, we
present a method to improve the accuracy of speech
recognition and performance of spoken-language
applications. The proposed error corrective reranking
approach exploits recognition environment
characteristics and domain-specific semantic
information to provide robustness and adaptability for
a spoken-language system. We demonstrate some
experiments of spoken dialogue tasks and empirical
results that show an improvement in accuracy for both
speech recognition and spoken-language understanding.
In our experiment, we show an error reduction of up to
9.7\% and 16.8\% of word error rate, and 5.5\% and
7.9\% of understanding error for the air travel and
telebanking service domains.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "automatic speech recognition; error-corrective
reranking; improving spoken dialogue system;
spoken-language understanding",
}
@Article{Kuo:2008:MSG,
author = "June-Jei Kuo and Hsin-Hsi Chen",
title = "Multidocument Summary Generation: Using Informative
and Event Words",
journal = j-TALIP,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1330291.1330294",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:10 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Summary generation for multiple documents poses a
number of issues including sentence selection, sentence
ordering, and sentence reduction over single-document
summarization. In addition, the temporal resolution
among extracted sentences is also important. This
article considers informative words and event words to
deal with multidocument summarization. These words
indicate the important concepts and relationships in a
document or among a set of documents, and can be used
to select salient sentences. We present a temporal
resolution algorithm, using focusing time and
coreference chains, to convert Chinese temporal
expressions in a document into calendrical forms.
Moreover, we consider the last calendrical form of a
sentence as a sentence time stamp to address sentence
ordering. Informative words, event words, and temporal
words are introduced to a sentence reduction algorithm,
which deals with both length constraints and
information coverage. Experiments on Chinese-news data
sets show significant improvements of both information
coverage and readability.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "latent semantic analysis; multidocument summary
generation; sentence ordering; sentence reduction;
sentence selection; temporal processing",
}
@Article{Kando:2008:INS,
author = "Noriko Kando and Teruko Mitamura and Tetsuya Sakai",
title = "Introduction to the {NTCIR-6 Special Issue}",
journal = j-TALIP,
volume = "7",
number = "2",
pages = "4:1--4:??",
month = jun,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1362782.1362783",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:23 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhou:2008:HTE,
author = "Dong Zhou and Mark Truran and Tim Brailsford and Helen
Ashman",
title = "A Hybrid Technique for {English--Chinese} Cross
Language Information Retrieval",
journal = j-TALIP,
volume = "7",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1362782.1362784",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:23 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this article we describe a hybrid technique for
dictionary-based query translation suitable for
English--Chinese cross language information retrieval.
This technique marries a graph-based model for the
resolution of candidate term ambiguity with a
pattern-based method for the translation of
out-of-vocabulary (OOV) terms. We evaluate the
performance of this hybrid technique in an experiment
using several NTCIR test collections. Experimental
results indicate a substantial increase in retrieval
effectiveness over various baseline systems
incorporating machine- and dictionary-based
translation.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "cross language information retrieval; disambiguation;
graph-based analysis; patterns; unknown term
translation",
}
@Article{Higashinaka:2008:AAC,
author = "Ryuichiro Higashinaka and Hideki Isozaki",
title = "Automatically Acquiring Causal Expression Patterns
from Relation-annotated Corpora to Improve Question
Answering for why-Questions",
journal = j-TALIP,
volume = "7",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1362782.1362785",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:23 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article describes our approach for answering
why-questions that we initially introduced at NTCIR-6
QAC-4. The approach automatically acquires causal
expression patterns from relation-annotated corpora by
abstracting text spans annotated with a causal relation
and by mining syntactic patterns that are useful for
distinguishing sentences annotated with a causal
relation from those annotated with other relations. We
use these automatically acquired causal expression
patterns to create features to represent answer
candidates, and use these features together with other
possible features related to causality to train an
answer candidate ranker that maximizes the QA
performance with regards to the corpus of why-questions
and answers. NAZEQA, a Japanese why-QA system based on
our approach, clearly outperforms baselines with a Mean
Reciprocal Rank (top-5) of 0.223 when sentences are
used as answers and with a MRR (top-5) of 0.326 when
paragraphs are used as answers, making it presumably
the best-performing fully implemented why-QA system.
Experimental results also verified the usefulness of
the automatically acquired causal expression
patterns.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "causal expression; pattern mining; question answering;
relation-annotated corpus",
}
@Article{Li:2008:ASV,
author = "Yaoyong Li and Kalina Bontcheva",
title = "Adapting Support Vector Machines for ${F}$-term-based
Classification of Patents",
journal = j-TALIP,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1362782.1362786",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 16 17:12:23 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Support Vector Machines (SVM) have obtained
state-of-the-art results on many applications including
document classification. However, previous works on
applying SVMs to the $F$-term patent classification
task did not obtain as good results as other learning
algorithms such as k-NN. This is due to the fact that
$F$-term patent classification is different from
conventional document classification in several
aspects, mainly because it is a multiclass, multilabel
classification problem with semi-structured documents
and multi-faceted hierarchical categories.\par
This article describes our SVM-based system and several
techniques we developed successfully to adapt SVM for
the specific features of the $F$-term patent
classification task. We evaluate the techniques using
the NTCIR-6 $F$-term classification terms assigned to
Japanese patents. Moreover, our system participated in
the NTCIR-6 patent classification evaluation and
obtained the best results according to two of the three
metrics used for task performance evaluation. Following
the NTCIR-6 participation, we developed two new
techniques, which achieved even better scores using all
three NTCIR-6 metrics, effectively outperforming all
participating systems. This article presents this new
work and the experimental results that demonstrate the
benefits of the latest approach.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "F-term classification; patent processing; support
vector machines",
}
@Article{Fukumoto:2008:ICL,
author = "Fumiyo Fukumoto and Yoshimi Suzuki",
title = "Integrating Cross-Language Hierarchies and Its
Application to Retrieving Relevant Documents",
journal = j-TALIP,
volume = "7",
number = "3",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386869.1386870",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Aug 22 13:11:51 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Internet directories such as Yahoo! are an approach to
improve the efficacy and efficiency of Information
Retrieval (IR) on the Web, as pages (documents) are
organized into hierarchical categories, and similar
pages are grouped together. Most of the search engines
on the Web service find documents that are assigned to
a single classification hierarchy. Categories in the
hierarchy are carefully defined by human experts and
documents are well organized. However, a single
hierarchy in one language is often insufficient to find
all relevant material, as each hierarchy tends to have
some bias in both defining hierarchical structure and
classifying documents. Moreover, documents written in a
language other than the user's native language often
include large amounts of information related to the
user's request. In this article, we propose a method of
integrating cross-language (CL) category hierarchies,
that is, Reuters '96 hierarchy and UDC code hierarchy
of Japanese by estimating category similarities. The
method does not simply merge two different hierarchies
into one large hierarchy but instead extracts sets of
similar categories, where each element of the sets is
relevant with each other. It consists of three steps.
First, we classify documents from one hierarchy into
categories with another hierarchy using a
cross-language text classification (CLTC) technique,
and extract category pairs of two hierarchies. Next, we
apply $\chi^2$ statistics to these pairs to
obtain similar category pairs, and finally we apply the
generating function of the Apriori algorithm
(Apriori-Gen) to the category pairs, and find sets of
similar categories. Moreover, we examined whether
integrating hierarchies helps to support retrieval of
documents with similar contents. The retrieval results
showed a 42.7\% improvement over the baseline
nonhierarchy model, and a 21.6\% improvement over a
single hierarchy.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "cross-language hierarchies; information integration;
retrieval of relevant documents; text classification",
}
@Article{Sharma:2008:AMI,
author = "Utpal Sharma and Jugal K. Kalita and Rajib K. Das",
title = "Acquisition of Morphology of an {Indic} Language from
Text Corpus",
journal = j-TALIP,
volume = "7",
number = "3",
pages = "9:1--9:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386869.1386871",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Aug 22 13:11:51 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article describes an approach to unsupervised
learning of morphology from an unannotated corpus for a
highly inflectional Indo-European language called
Assamese spoken by about 30 million people. Although
Assamese is one of India's national languages, it
utterly lacks computational linguistic resources. There
exists no prior computational work on this language
spoken widely in northeast India. The work presented is
pioneering in this respect. In this article, we discuss
salient issues in Assamese morphology where the
presence of a large number of suffixal determiners,
sandhi, samas, and the propensity to use suffix
sequences make approximately 50\% of the words used in
written and spoken text inflected. We implement methods
proposed by Gaussier and Goldsmith on acquisition of
morphological knowledge, and obtain F-measure
performance below 60\%. This motivates us to present a
method more suitable for handling suffix sequences,
enabling us to increase the F-measure performance of
morphology acquisition to almost 70\%. We describe how
we build a morphological dictionary for Assamese from
the text corpus. Using the morphological knowledge
acquired and the morphological dictionary, we are able
to process small chunks of data at a time as well as a
large corpus. We achieve approximately 85\% precision
and recall during the analysis of small chunks of
coherent text.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Assamese; Indo-European languages; machine learning;
morphology",
}
@Article{Chen:2008:TTR,
author = "Jiang-Chun Chen and Jyh-Shing Roger Jang",
title = "{TRUES}: {Tone Recognition Using Extended Segments}",
journal = j-TALIP,
volume = "7",
number = "3",
pages = "10:1--10:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386869.1386872",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Aug 22 13:11:51 MDT 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Tone recognition has been a basic but important task
for speech recognition and assessment of tonal
languages, such as Mandarin Chinese. Most previously
proposed approaches adopt a two-step approach where
syllables within an utterance are identified via forced
alignment first, and tone recognition using a variety
of classifiers---such as neural networks, Gaussian
mixture models (GMM), hidden Markov models (HMM),
support vector machines (SVM)---is then performed on
each segmented syllable to predict its tone. However,
forced alignment does not always generate accurate
syllable boundaries, leading to unstable
voiced-unvoiced detection and deteriorating performance
in tone recognition. Aiming to alleviate this problem,
we propose a robust approach called Tone Recognition
Using Extended Segments (TRUES) for HMM-based
continuous tone recognition. The proposed approach
extracts an unbroken pitch contour from a given
utterance based on dynamic programming over time-domain
acoustic features of average magnitude difference
function (AMDF). The pitch contour of each syllable is
then extended for tri-tone HMM modeling, such that the
influence from inaccurate syllable boundaries is
lessened. Our experimental results demonstrate that the
proposed TRUES achieves 49.13\% relative error rate
reduction over that of the recently proposed supratone
modeling, which is deemed the state of the art of tone
recognition that outperforms several previously
proposed approaches. The encouraging improvement
demonstrates the effectiveness and robustness of the
proposed TRUES, as well as the corresponding pitch
determination algorithm which produces unbroken pitch
contours.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "context-dependent tone modeling; continuous tone
recognition; extended segment for tone recognition;
HMM; Mandarin Chinese; supratone modeling",
}
@Article{Lin:2008:VCD,
author = "Jeng-Wei Lin and Jan-Ming Ho and Li-Ming Tseng and
Feipei Lai",
title = "Variant {Chinese} Domain Name Resolution",
journal = j-TALIP,
volume = "7",
number = "4",
pages = "11:1--11:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1450295.1450296",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Dec 8 13:56:10 MST 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Many efforts in past years have been made to lower the
linguistic barriers for non-native English speakers to
access the Internet. Internet standard RFC 3490,
referred to as IDNA (Internationalizing Domain Names in
Applications), focuses on access to IDNs
(Internationalized Domain Names) in a range of scripts
that is broader in scope than the original ASCII.
However, the use of character variants that have
similar appearances and/or interpretations could create
confusion. A variant IDL (Internationalized Domain
Label), derived from an IDL by replacing some
characters with their variants, should match the
original IDL; and thus a variant IDN does. In RFC 3743,
referred to as JET (Joint Engineering Team) Guidelines,
it is suggested that zone administrators model this
concept of equivalence as an atomic IDL package. When
an IDL is registered, an IDL package is created that
contains its variant IDLs generated according to the
zone-specific Language Variant Tables (LVTs). In
addition to the registered IDL, the name holder can
request the domain registry to activate some of the
variant IDLs, free or by an extra fee. The activated
variant IDLs are stored in the zone files, and thus
become resolvable. However, an issue of scalability
arises when there is a large number of variant IDLs to
be activated.\par
In this article, the authors present a resolution
protocol that resolves the variant IDLs into the
registered IDL, specifically for Han character
variants. Two Han characters are said to be variants of
each other if they have the same meaning and are
pronounced the same. Furthermore, Han character
variants usually have similar appearances. It is not
uncommon that a Chinese IDL has a large number of
variant IDLs. The proposed protocol introduces a new RR
(resource record) type, denoted as VarIdx RR, to
associate a variant expression of the variant IDLs with
the registered IDL. The label of the VarIdx RR, denoted
as the variant index, is assigned by an indexing
function that is designed to give the same value to all
of the variant IDLs enumerated by the variant
expression. When one of the variant IDLs is accessed,
Internet applications can compute the variant index,
look up the VarIdx RRs, and resolve the variant IDL
into the registered IDL.\par
The authors examine two sets of Chinese IDLs registered
in TWNIC and CNNIC, respectively. The results show that
for a registered Chinese IDL, a very small number of
VarIdx RRs, usually one or two, are sufficient to
activate all of its variant IDLs. The authors also
represent a Web redirection service that employs the
proposed resolution protocol to redirect a URL
addressed by a variant IDN to the URL addressed by the
registered IDN. The experiment results show that the
proposed protocol successfully resolves the variant
IDNs into the registered IDNs.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "conversion between traditional Chinese and simplified
Chinese; Han character folding; Han character variant;
IDN spoof; internationalized domain name;
localization",
}
@Article{Lee:2008:BCQ,
author = "Cheng-Wei Lee and Min-Yuh Day and Cheng-Lung Sung and
Yi-Hsun Lee and Tian-Jian Jiang and Chia-Wei Wu and
Cheng-Wei Shih and Yu-Ren Chen and Wen-Lian Hsu",
title = "Boosting {Chinese} Question Answering with Two
Lightweight Methods: {ABSPs} and {SCO-QAT}",
journal = j-TALIP,
volume = "7",
number = "4",
pages = "12:1--12:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1450295.1450297",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Dec 8 13:56:10 MST 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Question Answering (QA) research has been conducted in
many languages. Nearly all the top performing systems
use heavy methods that require sophisticated
techniques, such as parsers or logic provers. However,
such techniques are usually unavailable or unaffordable
for under-resourced languages or in resource-limited
situations. In this article, we describe how a
top-performing Chinese QA system can be designed by
using lightweight methods effectively. We propose two
lightweight methods, namely the Sum of Co-occurrences
of Question and Answer Terms (SCO-QAT) and
Alignment-based Surface Patterns (ABSPs). SCO-QAT is a
co-occurrence-based answer-ranking method that does not
need extra knowledge, word-ignoring heuristic rules, or
tools. It calculates co-occurrence scores based on the
passage retrieval results. ABSPs are syntactic patterns
trained from question-answer pairs with a multiple
alignment algorithm. They are used to capture the
relations between terms and then use the relations to
filter answers. We attribute the success of the ABSPs
and SCO-QAT methods to the effective use of local
syntactic information and global co-occurrence
information.\par
By using SCO-QAT and ABSPs, we improved the RU-Accuracy
of our testbed QA system, ASQA, from 0.445 to 0.535 on
the NTCIR-5 dataset. It also achieved the top 0.5
RU-Accuracy on the NTCIR-6 dataset. The result shows
that lightweight methods are not only cheaper to
implement, but also have the potential to achieve
state-of-the-art performances.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "answer filtering; answer ranking; Chinese question
answering; co-occurrence; lightweight method; surface
pattern",
}
@Article{Che:2008:UHC,
  author =       "Wanxiang Che and Min Zhang and Ai Ti Aw and Chew Lim
                 Tan and Ting Liu and Sheng Li",
title = "Using a Hybrid Convolution Tree Kernel for Semantic
Role Labeling",
journal = j-TALIP,
volume = "7",
number = "4",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1450295.1450298",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Dec 8 13:56:10 MST 2008",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "As a kind of Shallow Semantic Parsing, Semantic Role
Labeling (SRL) is gaining more attention as it benefits
a wide range of natural language processing
applications. Given a sentence, the task of SRL is to
recognize semantic arguments (roles) for each predicate
(target verb or noun). Feature-based methods have
achieved much success in SRL and are regarded as the
state-of-the-art methods for SRL. However, these
methods are less effective in modeling structured
features. As an extension of feature-based methods,
kernel-based methods are able to capture structured
features more efficiently in a much higher dimension.
Application of kernel methods to SRL has been achieved
by selecting the tree portion of a predicate and one of
its arguments as feature space, which is named as
predicate-argument feature (PAF) kernel. The PAF kernel
captures the syntactic tree structure features using
convolution tree kernel, however, it does not
distinguish between the path structure and the
constituent structure. In this article, a hybrid
convolution tree kernel is proposed to model different
linguistic objects. The hybrid convolution tree kernel
consists of two individual convolution tree kernels.
They are a Path kernel, which captures
predicate-argument link features, and a Constituent
Structure kernel, which captures the syntactic
structure features of arguments. Evaluations on the
data sets of the CoNLL-2005 SRL shared task and the
Chinese PropBank (CPB) show that our proposed hybrid
convolution tree kernel statistically significantly
outperforms the previous tree kernels. Moreover, in
order to maximize the system performance, we present a
composite kernel through combining our hybrid
convolution tree kernel method with a feature-based
method extended by the polynomial kernel. The
experimental results show that the composite kernel
achieves better performance than each of the individual
methods and outperforms the best reported system on the
CoNLL-2005 corpus when only one syntactic parser is
used and on the CPB corpus when automated syntactic
parse results and correct syntactic parse results are
used respectively.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "hybrid convolution tree kernel; semantic role
labeling",
}
@Article{Wu:2009:ISI,
author = "Chung-Hsien Wu and Haizhou Li",
title = "Introduction to the Special Issue on Recent Advances
in {Asian} Language Spoken Document Retrieval",
journal = j-TALIP,
volume = "8",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482343.1482344",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 23 16:32:22 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chen:2009:WTM,
author = "Berlin Chen",
title = "Word Topic Models for Spoken Document Retrieval and
Transcription",
journal = j-TALIP,
volume = "8",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482343.1482345",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 23 16:32:22 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Statistical language modeling (LM), which aims to
capture the regularities in human natural language and
quantify the acceptability of a given word sequence,
has long been an interesting yet challenging research
topic in the speech and language processing community.
It also has been introduced to information retrieval
(IR) problems, and provided an effective and
theoretically attractive probabilistic framework for
building IR systems. In this article, we propose a word
topic model (WTM) to explore the co-occurrence
relationship between words, as well as the long-span
latent topical information, for language modeling in
spoken document retrieval and transcription. The
document or the search history as a whole is modeled as
a composite WTM model for generating a newly observed
word. The underlying characteristics and different
kinds of model structures are extensively investigated,
while the performance of WTM is thoroughly analyzed and
verified by comparison with the well-known
probabilistic latent semantic analysis (PLSA) model as
well as the other models. The IR experiments are
performed on the TDT Chinese collections (TDT-2 and
TDT-3), while the large vocabulary continuous speech
recognition (LVCSR) experiments are conducted on the
Mandarin broadcast news collected in Taiwan.
Experimental results seem to indicate that WTM is a
promising alternative to the existing models.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "adaptation; information retrieval; language model;
speech recognition; word topic model",
}
@Article{Lin:2009:CSP,
author = "Shih-Hsiang Lin and Berlin Chen and Hsin-Min Wang",
title = "A Comparative Study of Probabilistic Ranking Models
for {Chinese} Spoken Document Summarization",
journal = j-TALIP,
volume = "8",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482343.1482346",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 23 16:32:22 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Extractive document summarization automatically
selects a number of indicative sentences, passages, or
paragraphs from an original document according to a
target summarization ratio, and sequences them to form
a concise summary. In this article, we present a
comparative study of various probabilistic ranking
models for spoken document summarization, including
supervised classification-based summarizers and
unsupervised probabilistic generative summarizers. We
also investigate the use of unsupervised summarizers to
improve the performance of supervised summarizers when
manual labels are not available for training the
latter. A novel training data selection approach that
leverages the relevance information of spoken sentences
to select reliable document-summary pairs derived by
the probabilistic generative summarizers is explored
for training the classification-based summarizers.
Encouraging initial results on Mandarin Chinese
broadcast news data are demonstrated.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "extractive summarization; probabilistic ranking
models; relevance information; spoken document
summarization",
}
@Article{Chen:2009:TSH,
author = "Boxing Chen and Min Zhang and Ai Ti Aw",
title = "Two-Stage Hypotheses Generation for Spoken Language
Translation",
journal = j-TALIP,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482343.1482347",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 23 16:32:22 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Spoken Language Translation (SLT) is the research area
that focuses on the translation of speech or text
between two spoken languages. Phrase-based and
syntax-based methods represent the state-of-the-art for
statistical machine translation (SMT). The phrase-based
method specializes in modeling local reorderings and
translations of multiword expressions. The syntax-based
method is enhanced by using syntactic knowledge, which
can better model long word reorderings, discontinuous
phrases, and syntactic structure. In this article, we
leverage on the strength of these two methods and
propose a strategy based on multiple hypotheses
generation in a two-stage framework for spoken language
translation. The hypotheses are generated in two
stages, namely, decoding and regeneration. In the
decoding stage, we apply state-of-the-art,
phrase-based, and syntax-based methods to generate
basic translation hypotheses. Then in the regeneration
stage, much more hypotheses that cannot be captured by
the decoding algorithms are produced from the basic
hypotheses. We study three regeneration methods:
redecoding, n-gram expansion, and confusion network in
the second stage. Finally, an additional reranking pass
is introduced to select the translation outputs by a
linear combination of rescoring models. Experimental
results on the Chinese-to-English IWSLT-2006 challenge
task of translating the transcription of spontaneous
speech show that the proposed mechanism achieves
significant improvements over the baseline of about
2.80 BLEU-score.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "hypotheses generation; spoken language translation;
statistical machine translation",
}
@Article{Chiang:2009:ISI,
author = "David Chiang and Philipp Koehn",
  title =        "Introduction to the Special Issue on Machine
                 Translation of {Asian} Languages",
journal = j-TALIP,
volume = "8",
number = "2",
pages = "5:1--5:??",
month = may,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1526252.1526253",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jun 3 16:13:52 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{He:2009:IMH,
author = "Xiaodong He and Mei Yang and Jianfeng Gao and Patrick
Nguyen and Robert Moore",
title = "Improved Monolingual Hypothesis Alignment for Machine
Translation System Combination",
journal = j-TALIP,
volume = "8",
number = "2",
pages = "6:1--6:??",
month = may,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1526252.1526254",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jun 3 16:13:52 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a new hypothesis alignment
method for combining outputs of multiple machine
translation (MT) systems. An indirect hidden Markov
model (IHMM) is proposed to address the synonym
matching and word ordering issues in hypothesis
alignment. Unlike traditional HMMs whose parameters are
trained via maximum likelihood estimation (MLE), the
parameters of the IHMM are estimated indirectly from a
variety of sources including word semantic similarity,
word surface similarity, and a distance-based
distortion penalty. The IHMM-based method significantly
outperforms the state-of-the-art, TER-based alignment
model in our experiments on NIST benchmark datasets.
Our combined SMT system using the proposed method
achieved the best Chinese-to-English translation result
in the constrained training track of the 2008 NIST Open
MT Evaluation.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "hidden Markov model; statistical machine translation;
system combination; word alignment",
}
@Article{Ma:2009:BMW,
author = "Yanjun Ma and Andy Way",
title = "Bilingually Motivated Word Segmentation for
Statistical Machine Translation",
journal = j-TALIP,
volume = "8",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1526252.1526255",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jun 3 16:13:52 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We introduce a bilingually motivated word segmentation
approach to languages where word boundaries are not
orthographically marked, with application to
Phrase-Based Statistical Machine Translation (PB-SMT).
Our approach is motivated from the insight that PB-SMT
systems can be improved by optimizing the input
representation to reduce the predictive power of
translation models. We firstly present an approach to
optimize the existing segmentation of both source and
target languages for PB-SMT and demonstrate the
effectiveness of this approach using a Chinese--English
MT task, that is, to measure the influence of the
segmentation on the performance of PB-SMT systems. We
report a 5.44\% relative increase in Bleu score and a
consistent increase according to other metrics. We then
generalize this method for Chinese word segmentation
without relying on any segmenters and show that using
our segmentation PB-SMT can achieve more consistent
state-of-the-art performance across two domains. There
are two main advantages of our approach. First of all,
it is adapted to the specific translation task at hand
by taking the corresponding source (target) language
into account. Second, this approach does not rely on
manually segmented training data so that it can be
automatically adapted for different domains.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "alignment; bilingually motivated; phrase-based
statistical machine translation; word segmentation",
}
@Article{Venkatapathy:2009:DMT,
author = "Sriram Venkatapathy and Srinivas Bangalore",
title = "Discriminative Machine Translation Using Global
Lexical Selection",
journal = j-TALIP,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1526252.1526256",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jun 3 16:13:52 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Statistical phrase-based machine translation models
crucially rely on word alignments. The search for
word-alignments assumes a model of word locality
between source and target languages that is violated in
starkly different word-order languages such as
English-Hindi. In this article, we present models that
decouple the steps of lexical selection and lexical
reordering with the aim of minimizing the role of
word-alignment in machine translation. Indian languages
are morphologically rich and have relatively free-word
order where the grammatical role of content words is
largely determined by their case markers and not just
by their positions in the sentence. Hence, lexical
selection plays a far greater role than lexical
reordering. For lexical selection, we investigate
models that take the entire source sentence into
account and evaluate their performance for
English-Hindi translation in a tourism domain.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "global lexical selection; machine translation",
}
@Article{Tsunakawa:2009:CJL,
author = "Takashi Tsunakawa and Naoaki Okazaki and Xiao Liu and
Jun'ichi Tsujii",
title = "A {Chinese--Japanese} Lexical Machine Translation
through a Pivot Language",
journal = j-TALIP,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1526252.1526257",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jun 3 16:13:52 MDT 2009",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The bilingual lexicon is an expensive but critical
resource for multilingual applications in natural
language processing. This article proposes an
integrated framework for building a bilingual lexicon
between the Chinese and Japanese languages. Since the
language pair Chinese--Japanese does not include
English, which is a central language of the world, few
large-scale bilingual resources between Chinese and
Japanese have been constructed. One solution to
alleviate this problem is to build a Chinese--Japanese
bilingual lexicon through English as the pivot
language. In addition to the pivotal approach, we can
make use of the characteristics of Chinese and Japanese
languages that use Han characters. We incorporate a
translation model obtained from a small
Chinese--Japanese lexicon and use the similarity of the
hanzi and kanji characters by using the log-linear
model. Our experimental results show that the use of
the pivotal approach can improve the translation
performance over the translation model built from a
small Chinese--Japanese lexicon. The results also
demonstrate that the similarity between the hanzi and
kanji characters provides a positive effect for
translating technical terms.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "bilingual lexicon; Han characters; hanzi; kanji; pivot
language; statistical machine translation",
}
@Article{Chen:2009:USD,
author = "Wenliang Chen and Daisuke Kawahara and Kiyotaka
Uchimoto and Yujie Zhang and Hitoshi Isahara",
title = "Using Short Dependency Relations from Auto-Parsed Data
for {Chinese} Dependency Parsing",
journal = j-TALIP,
volume = "8",
number = "3",
pages = "10:1--10:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568292.1568293",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:08 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Dependency parsing has attracted a surge of interest
lately for applications such as
machine translation and question answering. Currently,
several supervised learning methods can be used for
training high-performance dependency parsers if
sufficient labeled data are available.\par
However, currently used statistical dependency parsers
provide poor results for words separated by long
distances. In order to solve this problem, this article
presents an effective dependency parsing approach of
incorporating short dependency information from
unlabeled data. The unlabeled data is automatically
parsed by using a deterministic dependency parser,
which exhibits a relatively high performance for short
dependencies between words. We then train another
parser that uses the information on short dependency
relations extracted from the output of the first
parser. The proposed approach achieves an unlabeled
attachment score of 86.52\%, an absolute 1.24\%
improvement over the baseline system on the Chinese
Treebank data set. The results indicate that the
proposed approach improves the parsing performance for
longer distance words.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Chinese dependency parsing; semi-supervised learning;
unlabeled data",
}
@Article{Chanda:2009:WWT,
author = "Sukalpa Chanda and Umapada Pal and Oriol Ramos
Terrades",
title = "Word-Wise {Thai} and {Roman} Script Identification",
journal = j-TALIP,
volume = "8",
number = "3",
pages = "11:1--11:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568292.1568294",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:08 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In some Thai documents, a single text line of a
printed document page may contain words of both Thai
and Roman scripts. For the Optical Character
Recognition (OCR) of such a document page it is better
to identify, at first, Thai and Roman script portions
and then to use individual OCR systems of the
respective scripts on these identified portions. In
this article, an SVM-based method is proposed for
identification of word-wise printed Roman and Thai
scripts from a single line of a document page. Here, at
first, the document is segmented into lines and then
lines are segmented into character groups (words). In
the proposed scheme, we identify the script of a
character group combining different character features
obtained from structural shape, profile behavior,
component overlapping information, topological
properties, and water reservoir concept, etc. Based on
the experiment on 10,000 data (words) we obtained
99.62\% script identification accuracy from the
proposed scheme.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Multi-script OCR; script identification; SVM; Thai
Script",
}
@Article{Nguyen:2009:WSC,
author = "Cam-Tu Nguyen and Xuan-Hieu Phan and Susumu Horiguchi
and Thu-Trang Nguyen and Quang-Thuy Ha",
title = "{Web} Search Clustering and Labeling with Hidden
Topics",
journal = j-TALIP,
volume = "8",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568292.1568295",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:08 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Web search clustering is a solution to reorganize
search results (also called ``snippets'') in a more
convenient way for browsing. There are three key
requirements for such post-retrieval clustering
systems: (1) the clustering algorithm should group
similar documents together; (2) clusters should be
labeled with descriptive phrases; and (3) the
clustering system should provide high-quality
clustering without downloading the whole Web
page.\par
This article introduces a novel framework for
clustering Web search results in Vietnamese which
targets the three above issues. The main motivation is
that by enriching short snippets with hidden topics
from huge resources of documents on the Internet, it is
able to cluster and label such snippets effectively in
a topic-oriented manner without concerning whole Web
pages. Our approach is based on recent successful topic
analysis models, such as Probabilistic-Latent Semantic
Analysis, or Latent Dirichlet Allocation. The
underlying idea of the framework is that we collect a
very large external data collection called ``universal
dataset,'' and then build a clustering system on both
the original snippets and a rich set of hidden topics
discovered from the universal data collection. This can
be seen as a richer representation of snippets to be
clustered. We carry out careful evaluation of our
method and show that our method can yield impressive
clustering quality.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "cluster labeling; collocation; hidden topics analysis;
Hierarchical Agglomerative Clustering; Latent Dirichlet
allocation; Vietnamese; Web search clustering",
}
@Article{Shaalan:2009:ISI,
author = "K. Shaalan and A. Farghaly",
title = "Introduction to the Special Issue on {Arabic} Natural
Language Processing",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "13:1--13:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644880",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Farghaly:2009:ANL,
author = "Ali Farghaly and Khaled Shaalan",
title = "{Arabic} Natural Language Processing: Challenges and
Solutions",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "14:1--14:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644881",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Arabic language presents researchers and
developers of natural language processing (NLP)
applications for Arabic text and speech with serious
challenges. The purpose of this article is to describe
some of these challenges and to present some solutions
that would guide current and future practitioners in
the field of Arabic natural language processing (ANLP).
We begin with general features of the Arabic language
in Sections 1, 2, and 3 and then we move to more
specific properties of the language in the rest of the
article. In Section 1 of this article we highlight the
significance of the Arabic language today and describe
its general properties. Section 2 presents the feature
of Arabic Diglossia showing how the sociolinguistic
aspects of the Arabic language differ from other
languages. The stability of Arabic Diglossia and its
implications for ANLP applications are discussed and
ways to deal with this problematic property are
proposed. Section 3 deals with the properties of the
Arabic script and the explosion of ambiguity that
results from the absence of short vowel representations
and overt case markers in contemporary Arabic texts. We
present in Section 4 specific features of the Arabic
language such as the nonconcatenative property of
Arabic morphology, Arabic as an agglutinative language,
Arabic as a pro-drop language, and the challenge these
properties pose to ANLP. We also present solutions that
have already been adopted by some pioneering
researchers in the field. In Section 5 we point out to
the lack of formal and explicit grammars of Modern
Standard Arabic which impedes the progress of more
advanced ANLP systems. In Section 6 we draw our
conclusion.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic dialects; Arabic script; Modern Standard
Arabic",
}
@Article{Espana-Bonet:2009:DPB,
author = "Cristina Espa{\~n}a-Bonet and Jes{\'u}s Gim{\'e}nez
and Llu{\'\i}s M{\`a}rquez",
title = "Discriminative Phrase-Based Models for {Arabic}
Machine Translation",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "15:1--15:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644882",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "A design for an Arabic-to-English translation system
is presented. The core of the system implements a
standard phrase-based statistical machine translation
architecture, but it is extended by incorporating a
local discriminative phrase selection model to address
the semantic ambiguity of Arabic. Local classifiers are
trained using linguistic information and context to
translate a phrase, and this significantly increases
the accuracy in phrase selection with respect to the
most frequent translation traditionally considered.
These classifiers are integrated into the translation
system so that the global task gets benefits from the
discriminative learning. As a result, we obtain
significant improvements in the full translation task
at the lexical, syntactic, and semantic levels as
measured by an heterogeneous set of automatic
evaluation metrics.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic; discriminative learning; English; statistical
machine translation",
}
@Article{Benajiba:2009:MBS,
author = "Yassine Benajiba and Imed Zitouni",
title = "Morphology-Based Segmentation Combination for {Arabic}
Mention Detection",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "16:1--16:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644883",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Arabic language has a very rich/complex
morphology. Each Arabic word is composed of zero or
more {\em prefixes}, one {\em stem\/} and zero or more
{\em suffixes}. Consequently, the Arabic data is sparse
compared to other languages such as English, and it is
necessary to conduct word segmentation before any
natural language processing task. Therefore, the
word-segmentation step is worth a deeper study since it
is a preprocessing step which shall have a significant
impact on all the steps coming afterward. In this
article, we present an Arabic mention detection system
that has very competitive results in the recent
Automatic Content Extraction (ACE) evaluation campaign.
We investigate the impact of different segmentation
schemes on Arabic mention detection systems and we show
how these systems may benefit from more than one
segmentation scheme. We report the performance of
several mention detection models using different kinds
of possible and known segmentation schemes for Arabic
text: punctuation separation, Arabic Treebank, and
morphological and character-level segmentations. We
show that the combination of competitive segmentation
styles leads to a better performance. Results indicate
a statistically significant improvement when Arabic
Treebank and morphological segmentations are
combined.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic information extraction; Arabic mention
detection; Arabic segmentation",
}
@Article{Zitouni:2009:CLI,
author = "Imed Zitouni and Radu Florian",
title = "Cross-Language Information Propagation for {Arabic}
Mention Detection",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644884",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In the last two decades, significant effort has been
put into annotating linguistic resources in several
languages. Despite this valiant effort, there are still
many languages left that have only small amounts of
such resources. The goal of this article is to present
and investigate a method of propagating information
(specifically mention detection) from a resource-rich
language into a relatively resource-poor language such
as Arabic. Part of the investigation is to quantify the
contribution of propagating information in different
conditions based on the availability of resources in
the target language. Experiments on the language pair
Arabic-English show that one can achieve relatively
decent performance by propagating information from a
language with richer resources such as English into
Arabic alone (no resources or models in the source
language Arabic). Furthermore, results show that
propagated features from English do help improve the
Arabic system performance even when used in conjunction
with all feature types built from the source language.
Experiments also show that using propagated features in
conjunction with lexically derived features only (as
can be obtained directly from a mention annotated
corpus) brings the system performance at the one
obtained in the target language by using feature
derived from many linguistic resources, therefore
improving the system when such resources are not
available. In addition to Arabic-English language pair,
we investigate the effectiveness of our approach on
other language pairs such as Chinese--English and
Spanish--English.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic information extraction; Arabic mention
detection",
}
@Article{Lamel:2009:AST,
author = "Lori Lamel and Abdelkhalek Messaoudi and Jean-Luc
Gauvain",
title = "Automatic Speech-to-Text Transcription in {Arabic}",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644885",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Arabic language presents a number of challenges
for speech recognition, arising in part from the
significant differences in the spoken and written
forms, in particular the conventional form of texts
being non-vowelized. Being a highly inflected language,
the Arabic language has a very large lexical variety
and typically with several possible (generally
semantically linked) vowelizations for each written
form. This article summarizes research carried out over
the last few years on speech-to-text transcription of
broadcast data in Arabic. The initial research was
oriented toward processing of broadcast news data in
Modern Standard Arabic, and has since been extended to
address a larger variety of broadcast data, which as a
consequence results in the need to also be able to
handle dialectal speech. While standard techniques in
speech recognition have been shown to apply well to the
Arabic language, taking into account language
specificities help to significantly improve system
performance.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic language processing; automatic speech
recognition; morphological decomposition; speech
processing; speech-to-text transcription",
}
@Article{Moisl:2009:SLL,
author = "Hermann Moisl",
title = "Sura Length and Lexical Probability Estimation in
Cluster Analysis of the {Qur'an}",
journal = j-TALIP,
volume = "8",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1644879.1644886",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:37:17 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Thabet [2005] applied cluster analysis to the Qur'an
in the hope of generating a classification of the
suras that is useful for understanding of its
thematic structure. The result was positive, but
variation in sura length was a problem because
clustering of the shorter suras was found to be unreliable.
The present discussion addresses this problem in four
parts. The first part summarizes Thabet's work. The
second part argues that unreliable clustering of the
shorter suras is a consequence of poor estimation of
lexical population probabilities in those suras.
proposes a solution to the problem based on calculation
of a minimum length threshold using concepts from
statistical sampling theory, followed by selection of
suras and lexical variables based on that threshold.
fourth part applies the proposed solution to a
reanalysis of the Qur'an.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Arabic natural language processing; cluster analysis;
document length normalization; lexical probability
estimation; Qur'an sampling",
}
@Article{Hsu:2010:MST,
author = "Chung-Chian Hsu and Chien-Hsing Chen",
title = "Mining Synonymous Transliterations from the {World
Wide Web}",
journal = j-TALIP,
volume = "9",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1731035.1731036",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:34:01 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The World Wide Web has been considered one of the
important sources for information. Using search engines
to retrieve Web pages can gather lots of information,
including foreign information. However, to be better
understood by local readers, proper names in a foreign
language, such as English, are often transliterated to
a local language such as Chinese. Due to different
translators and the lack of translation standard,
translating foreign proper nouns may result in
different transliterations and pose a notorious
headache. In particular, it may cause incomplete search
results. Using one transliteration as a query keyword
will fail to retrieve the Web pages which use a
different word as the transliteration. Consequently,
important information may be missed. We present a
framework for mining synonymous transliterations as
many as possible from the Web for a given
transliteration. The results can be used to construct a
database of synonymous transliterations which can be
utilized for query expansion so as to alleviate the
incomplete search problem. Experimental results show
that the proposed framework can effectively retrieve
the set of snippets which may contain synonymous
transliterations and then extract the target terms.
Most of the extracted synonymous transliterations have
higher rank of similarity to the input transliteration
compared to other noise terms.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Chinese transliteration; cross-lingual information
retrieval; synonymous transliteration; text mining; Web
mining",
}
@Article{Liu:2010:ISS,
author = "Feifan Liu and Yang Liu",
title = "Identification of Soundbite and Its Speaker Name Using
Transcripts of Broadcast News Speech",
journal = j-TALIP,
volume = "9",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1731035.1731037",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:34:01 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a pipeline framework for
identifying soundbite and its speaker name from
Mandarin broadcast news transcripts. Both of the two
modules, soundbite segment detection and soundbite
speaker name recognition, are based on a supervised
classification approach using multiple linguistic
features. We systematically evaluated performance for
each module as well as the entire system, and
investigated the effect of using speech recognition
(ASR) output and automatic sentence segmentation. We
found that both of the two components impact the
pipeline system, with more degradation in the entire
system performance due to automatic speaker name
recognition errors than soundbite segment detection. In
addition, our experimental results show that using ASR
output degrades the system performance significantly,
and that using automatic sentence segmentation greatly
impacts soundbite detection, but has much less effect
on speaker name recognition.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "automatic speech recognition; sentence segmentation;
Soundbite detection; speaker name recognition",
}
@Article{Tepper:2010:IMU,
author = "Michael Tepper and Fei Xia",
title = "Inducing Morphemes Using Light Knowledge",
journal = j-TALIP,
volume = "9",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1731035.1731038",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:34:01 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Allomorphic variation, or form variation among morphs
with the same meaning, is a stumbling block to
morphological induction (MI). To address this problem,
we present a hybrid approach that uses a small amount
of linguistic knowledge in the form of orthographic
rewrite rules to help refine an existing MI-produced
segmentation. Using rules, we derive underlying
analyses of morphs---generalized with respect to
contextual spelling differences---from an existing
surface morph segmentation, and from these we learn a
morpheme-level segmentation. To learn morphemes, we
have extended the Morfessor segmentation algorithm
[Creutz and Lagus 2004; 2005; 2006] by using rules to
infer possible underlying analyses from surface
segmentations. A segmentation produced by Morfessor
Categories-MAP Software v. 0.9.2 is used as input to
our procedure and as a baseline that we evaluate
against. To suggest analyses for our procedure, a set
of language-specific orthographic rules is needed. Our
procedure has yielded promising improvements for
English and Turkish over the baseline approach when
tested on the Morpho Challenge 2005 and 2007 style
evaluations. On the Morpho Challenge 2007 test
evaluation, we report gains over the current best
unsupervised contestant for Turkish, where our
technique shows a 2.5\% absolute {\em F\/} -score
improvement.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "allomorphy; computational linguistics; machine
learning; Morphological induction",
}
@Article{Baldwin:2010:RMB,
author = "Timothy Baldwin and Su Nam Kim and Francis Bond and
Sanae Fujita and David Martinez and Takaaki Tanaka",
title = "A Reexamination of {MRD}-Based Word Sense
Disambiguation",
journal = j-TALIP,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1731035.1731039",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Mar 29 15:34:01 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article reconsiders the task of MRD-based word
sense disambiguation, in extending the basic Lesk
algorithm to investigate the impact on WSD performance
of different tokenization schemes and methods of
definition extension. In experimentation over the
Hinoki Sensebank and the Japanese Senseval-2 dictionary
task, we demonstrate that sense-sensitive definition
extension over hyponyms, hypernyms, and synonyms,
combined with definition extension and word
tokenization leads to WSD accuracy above both
unsupervised and supervised baselines. In doing so, we
demonstrate the utility of ontology induction and
establish new opportunities for the development of
baseline unsupervised WSD methods.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Japanese; machine-readable dictionary; word sense
disambiguation",
}
@Article{Zhao:2010:UCB,
author = "Hai Zhao and Chang-Ning Huang and Mu Li and Bao-Liang
Lu",
title = "A Unified Character-Based Tagging Framework for
{Chinese} Word Segmentation",
journal = j-TALIP,
volume = "9",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1781134.1781135",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 21 18:03:02 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Chinese word segmentation is an active area in Chinese
language processing though it is suffering from the
argument about what precisely is a word in Chinese.
Based on corpus-based segmentation standard, we
launched this study. In detail, we regard Chinese word
segmentation as a character-based tagging problem. We
show that there has been a potent trend of using a
character-based tagging approach in this field. In
particular, learning from segmented corpus with or
without additional linguistic resources is treated in a
unified way in which the only difference depends on how
the feature template set is selected. It differs from
existing work in that both feature template selection
and tag set selection are considered in our approach,
instead of the previous feature template focus only
technique. We show that there is a significant
performance difference as different tag sets are
selected. This is especially applied to a six-tag set,
which is good enough for most current segmented
corpora. The linguistic meaning of a tag set is also
discussed. Our results show that a simple learning
system with six $n$-gram feature templates and a
six-tag set can obtain competitive performance in the
cases of learning only from a training corpus. In cases
when additional linguistic resources are available, an
ensemble learning technique, assistant segmenter, is
proposed and its effectiveness is verified. Assistant
segmenter is also proven to be an effective method as
segmentation standard adaptation that outperforms
existing ones. Based on the proposed approach, our
system provides state-of-the-art performance in all 12
corpora of three international Chinese word
segmentation bakeoffs.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "assistant segmenter; character-based tagging method;
Chinese word segmentation; conditional random field;
tag set selection",
}
@Article{Guo:2010:LIS,
author = "Yuqing Guo and Haifeng Wang and Josef van Genabith",
title = "A Linguistically Inspired Statistical Model for
{Chinese} Punctuation Generation",
journal = j-TALIP,
volume = "9",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1781134.1781136",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 21 18:03:02 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article investigates a relatively underdeveloped
subject in natural language processing---the generation
of punctuation marks. From a theoretical perspective,
we study 16 Chinese punctuation marks as defined in the
Chinese national standard of punctuation usage, and
categorize these punctuation marks into three different
types according to their syntactic properties. We
implement a three-tier maximum entropy model
incorporating linguistically-motivated features for
generating the commonly used Chinese punctuation marks
in unpunctuated sentences output by a surface realizer.
Furthermore, we present a method to automatically
extract cue words indicating sentence-final punctuation
marks as a specialized feature to construct a more
precise model. Evaluating on the Penn Chinese Treebank
                 data, the MaxEnt model achieves an {\em f\/}-score of
79.83\% for punctuation insertion and 74.61\% for
punctuation restoration using gold data input, 79.50\%
for insertion and 73.32\% for restoration using
parser-based imperfect input. The experiments show that
the MaxEnt model significantly outperforms a baseline
5-gram language model that scores 54.99\% for
punctuation insertion and 52.01\% for restoration. We
show that our results are not far from human
                 performance on the same task with human insertion {\em
                 f\/}-scores in the range of 81--87\% and human
                 restoration in the range of 71--82\%. Finally, a manual
error analysis of the generation output shows that
close to 40\% of the mismatched punctuation marks do in
fact result in acceptable choices, a fact obscured in
the automatic string-matching based evaluation
scores.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Chinese punctuation marks; maximum entropy model;
sentence realization",
}
@Article{Naptali:2010:TDL,
author = "Welly Naptali and Masatoshi Tsuchiya and Seiichi
Nakagawa",
title = "Topic-Dependent Language Model with Voting on Noun
History",
journal = j-TALIP,
volume = "9",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1781134.1781137",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Jun 21 18:03:02 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Language models (LMs) are an important field of study
in automatic speech recognition (ASR) systems. LM helps
acoustic models find the corresponding word sequence of
a given speech signal. Without it, ASR systems would
not understand the language and it would be hard to
find the correct word sequence. During the past few
years, researchers have tried to incorporate long-range
dependencies into statistical word-based $n$-gram LMs.
One of these long-range dependencies is topic. Unlike
words, topic is unobservable. Thus, it is required to
find the meanings behind the words to get into the
topic. This research is based on the belief that nouns
contain topic information. We propose a new approach
for a topic-dependent LM, where the topic is decided in
an unsupervised manner. Latent Semantic Analysis (LSA)
is employed to reveal hidden (latent) relations among
nouns in the context words. To decide the topic of an
event, a fixed size word history sequence (window) is
observed, and voting is then carried out based on noun
class occurrences weighted by a confidence measure.
Experiments were conducted on an English corpus and a
Japanese corpus: {\em The Wall Street Journal\/} corpus
and {\em Mainichi Shimbun\/} (Japanese newspaper)
corpus. The results show that our proposed method gives
better perplexity than the comparative baselines,
including a word-based/class-based $n$-gram LM, their
interpolated LM, a cache-based LM, a topic-dependent LM
based on $n$-gram, and a topic-dependent LM based on
                 Latent Dirichlet Allocation (LDA). The {\em n\/}-best
list rescoring was conducted to validate its
application in ASR systems.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Language model; latent semantic analysis; perplexity;
speech recognition; topic dependent",
}
@Article{Ng:2010:SJ,
author = "Hwee Tou Ng",
title = "The State of the Journal",
journal = j-TALIP,
volume = "9",
number = "3",
pages = "8:1--8:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838745.1838750",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Sep 18 15:58:58 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Harman:2010:ISI,
author = "Donna Harman and Noriko Kando and Prasenjit Majumder
and Mandar Mitra and Carol Peters",
title = "Introduction to the {Special Issue on Indian Language
Information Retrieval Part I}",
journal = j-TALIP,
volume = "9",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838745.1838746",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Sep 18 15:58:58 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Majumder:2010:FEE,
author = "Prasenjit Majumder and Mandar Mitra and Dipasree Pal
and Ayan Bandyopadhyay and Samaresh Maiti and Sukomal
Pal and Deboshree Modak and Sucharita Sanyal",
title = "The {FIRE 2008} Evaluation Exercise",
journal = j-TALIP,
volume = "9",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838745.1838747",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Sep 18 15:58:58 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The aim of the Forum for Information Retrieval
Evaluation (FIRE) is to create an evaluation framework
in the spirit of TREC (Text REtrieval Conference), CLEF
(Cross-Language Evaluation Forum), and NTCIR (NII Test
Collection for IR Systems), for Indian language
Information Retrieval. The first evaluation exercise
conducted by FIRE was completed in 2008. This article
describes the test collections used at FIRE 2008,
summarizes the approaches adopted by various
participants, discusses the limitations of the
datasets, and outlines the tasks planned for the next
iteration of FIRE.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "evaluation; Indian languages; information retrieval",
}
@Article{Dolamic:2010:CSI,
author = "Ljiljana Dolamic and Jacques Savoy",
title = "Comparative Study of Indexing and Search Strategies
for the {Hindi}, {Marathi}, and {Bengali} Languages",
journal = j-TALIP,
volume = "9",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838745.1838748",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Sep 18 15:58:58 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The main goal of this article is to describe and
evaluate various indexing and search strategies for the
Hindi, Bengali, and Marathi languages. These three
languages are ranked among the world's 20 most spoken
languages and they share similar syntax, morphology,
and writing systems. In this article we examine these
languages from an Information Retrieval (IR)
perspective through describing the key elements of
their inflectional and derivational morphologies, and
suggest a light and more aggressive stemming approach
based on them.\par
In our evaluation of these stemming strategies we make
use of the FIRE 2008 test collections, and then to
broaden our comparisons we implement and evaluate two
language independent indexing methods: the $n$-gram and
trunc-$n$ (truncation of the first $n$ letters). We
evaluate these solutions by applying our various IR
models, including the Okapi, Divergence from Randomness
(DFR) and statistical language models (LM) together
with two classical vector-space approaches: {\em tf
idf\/} and {\em Lnu-ltc}.\par
Experiments performed with all three languages
                 demonstrate that the I($n_e$)C2 model derived from the
Divergence from Randomness paradigm tends to provide
the best mean average precision (MAP). Our own tests
suggest that improved retrieval effectiveness would be
obtained by applying more aggressive stemmers,
especially those accounting for certain derivational
suffixes, compared to those involving a light stemmer
or ignoring this type of word normalization procedure.
Comparisons between no stemming and stemming indexing
schemes shows that performance differences are almost
always statistically significant. When, for example, an
aggressive stemmer is applied, the relative
improvements obtained are $\approx$28\% for the Hindi
language, $\approx$42\% for Marathi, and $\approx$18\%
for Bengali, as compared to a no-stemming approach.
Based on a comparison of word-based and
language-independent approaches we find that the
trunc-4 indexing scheme tends to result in performance
levels statistically similar to those of an aggressive
stemmer, yet better than the 4-gram indexing scheme. A
query-by-query analysis reveals the reasons for this,
and also demonstrates the advantage of applying a
stemming or a trunc-4 indexing scheme.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "Bengali language; Hindi language; Indic languages;
Marathi language; natural language processing with
Indo-European languages; search engines for Asian
languages; stemmer",
}
@Article{Leveling:2010:SWI,
author = "Johannes Leveling and Gareth J. F. Jones",
title = "Sub-Word Indexing and Blind Relevance Feedback for
{English}, {Bengali}, {Hindi}, and {Marathi} {IR}",
journal = j-TALIP,
volume = "9",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838745.1838749",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Sep 18 15:58:58 MDT 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Forum for Information Retrieval Evaluation (FIRE)
provides document collections, topics, and relevance
assessments for information retrieval (IR) experiments
on Indian languages. Several research questions are
explored in this article: (1) How to create a simple,
language-independent corpus-based stemmer, (2) How to
identify sub-words and which types of sub-words are
suitable as indexing units, and (3) How to apply blind
relevance feedback on sub-words and how feedback term
selection is affected by the type of the indexing unit.
More than 140 IR experiments are conducted using the
BM25 retrieval model on the topic titles and
descriptions (TD) for the FIRE 2008 English, Bengali,
Hindi, and Marathi document collections.\par
The major findings are: The corpus-based stemming
approach is effective as a knowledge-light term
conflation step and useful in the case of few
language-specific resources. For English, the
corpus-based stemmer performs nearly as well as the
Porter stemmer and significantly better than the
baseline of indexing words when combined with query
expansion. In combination with blind relevance
feedback, it also performs significantly better than
the baseline for Bengali and Marathi IR.\par
Sub-words such as consonant-vowel sequences and word
prefixes can yield similar or better performance in
comparison to word indexing. There is no best
performing method for all languages. For English,
indexing using the Porter stemmer performs best, for
Bengali and Marathi, overlapping 3-grams obtain the
best result, and for Hindi, 4-prefixes yield the
highest MAP. However, in combination with blind
relevance feedback using 10 documents and 20 terms,
6-prefixes for English and 4-prefixes for Bengali,
Hindi, and Marathi IR yield the highest
MAP.\par
Sub-word identification is a general case of
decompounding. It results in one or more index terms
for a single word form and increases the number of
index terms but decreases their average length. The
corresponding retrieval experiments show that relevance
feedback on sub-words benefits from selecting a larger
number of index terms in comparison with retrieval on
word forms. Similarly, selecting the number of
relevance feedback terms depending on the ratio of word
vocabulary size to sub-word vocabulary size almost
always slightly increases information retrieval
effectiveness compared to using a fixed number of terms
for different languages.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
keywords = "blind relevance feedback; evaluation; FIRE;
Information retrieval; stemming; sub-word indexing",
}
@Article{Kumaran:2010:CMT,
author = "A. Kumaran and Mitesh M. Khapra and Pushpak
Bhattacharyya",
title = "Compositional Machine Transliteration",
journal = j-TALIP,
volume = "9",
number = "4",
pages = "13:1--13:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838751.1838752",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Dec 15 10:47:09 MST 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Machine transliteration is an important problem in an
increasingly multilingual world, as it plays a critical
role in many downstream applications, such as machine
translation or crosslingual information retrieval
systems. In this article, we propose compositional
machine transliteration systems, where multiple
transliteration components may be composed either to
improve existing transliteration quality, or to enable
transliteration functionality between languages even
when no direct parallel names corpora exist between
them. Specifically, we propose two distinct forms of
composition: serial and parallel. Serial compositional
system chains individual transliteration components,
say, $X \rightarrow Y$ and $Y \rightarrow Z$ systems,
to provide transliteration functionality, $X
\rightarrow Z$.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chinnakotla:2010:TRS,
author = "Manoj K. Chinnakotla and Om P. Damani and Avijit
Satoskar",
title = "Transliteration for Resource-Scarce Languages",
journal = j-TALIP,
volume = "9",
number = "4",
pages = "14:1--14:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838751.1838753",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Dec 15 10:47:09 MST 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Today, parallel corpus-based systems dominate the
transliteration landscape. But the resource-scarce
languages do not enjoy the luxury of large parallel
transliteration corpus. For these languages, rule-based
transliteration is the only viable option. In this
article, we show that by properly harnessing the
monolingual resources in conjunction with manually
created rule base, one can achieve reasonable
transliteration performance. We achieve this
performance by exploiting the power of Character
Sequence Modeling (CSM), which requires only
monolingual resources. We present the results of our
rule-based system for Hindi to English, English to
Hindi, and Persian to English transliteration tasks. We
also perform extrinsic evaluation of transliteration
systems in the context of Cross Lingual Information
Retrieval.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mukund:2010:IES,
author = "Smruthi Mukund and Rohini Srihari and Erik Peterson",
title = "An Information-Extraction System for {Urdu}---{A}
Resource-Poor Language",
journal = j-TALIP,
volume = "9",
number = "4",
pages = "15:1--15:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1838751.1838754",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Dec 15 10:47:09 MST 2010",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "There has been an increase in the amount of
multilingual text on the Internet due to the
proliferation of news sources and blogs. The Urdu
language, in particular, has experienced explosive
growth on the Web. Text mining for information
discovery, which includes tasks such as identifying
topics, relationships and events, and sentiment
analysis, requires sophisticated natural language
processing (NLP). NLP systems begin with modules such
as word segmentation, part-of-speech tagging, and
morphological analysis and progress to modules such as
shallow parsing and named entity tagging. While there
have been considerable advances in developing such
comprehensive NLP systems for English, the work for
Urdu is still in its infancy.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Katz:2011:ISI,
author = "Graham Katz and Mona Diab",
title = "Introduction to the Special Issue on {Arabic}
Computational Linguistics",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929909",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Condon:2011:MTE,
author = "S. Condon and D. Parvaz and J. Aberdeen and C. Doran
and A. Freeman and M. Awad",
title = "Machine Translation Errors: {English} and {Iraqi
Arabic}",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929910",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Errors in machine translations of English-Iraqi Arabic
dialogues were analyzed using the methods developed for
the Human Translation Error Rate measure (HTER). Human
annotations were used to refine the Translation Error
Rate (TER) annotations. The analyses were performed on
approximately 100 translations into each language from
four translation systems. Results include high
frequencies of pronoun errors and errors involving the
copula in translations to English. High frequencies of
errors in subject/person inflection and closed-word
classes characterized translations to Iraqi Arabic.
There were similar frequencies of word order errors in
both translation directions and low frequencies of
polarity errors. The problems associated with many
errors can be predicted from structural differences
between the two languages.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Rytting:2011:SCD,
author = "C. Anton Rytting and David M. Zajic and Paul Rodrigues
and Sarah C. Wayland and Christian Hettick and Tim
Buckwalter and Charles C. Blake",
title = "Spelling Correction for Dialectal {Arabic} Dictionary
Lookup",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929911",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The ``Did You Mean\ldots{}?'' system, described in this
article, is a spelling corrector for Arabic that is
designed specifically for L2 learners of dialectal
Arabic in the context of dictionary lookup. The authors
use an orthographic density metric to motivate the need
for a finer-grained ranking method for candidate words
than unweighted Levenshtein edit distance. The Did You
Mean\ldots{}? architecture is described, and the authors
show that mean reciprocal rank can be improved by
tuning operation weights according to sound confusions,
and by anticipating likely spelling variants.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kulick:2011:ESC,
author = "Seth Kulick",
title = "Exploiting Separation of Closed-Class Categories for
{Arabic} Tokenization and Part-of-Speech Tagging",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929912",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Research on the problem of morphological
disambiguation of Arabic has noted that techniques
developed for lexical disambiguation in English do not
easily transfer over, since the affixation present in
Arabic creates a very different tag set than for
English, encoding both inflectional morphology and more
complex tokenization sequences. This work takes a new
approach to this problem based on a distinction between
the open-class and closed-class categories of tokens,
which differ both in their frequencies and in their
possible morphological affixations. This separation
simplifies the morphological analysis problem
considerably, making it possible to use a Conditional
Random Field model for joint tokenization and ``core''
part-of-speech tagging of the open-class items, while
the closed-class items are handled by regular
expressions.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Abdul-Mageed:2011:ADA,
author = "Muhammad Abdul-Mageed",
title = "Automatic Detection of {Arabic} Non-Anaphoric Pronouns
for Improving Anaphora Resolution",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929913",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Anaphora resolution is one of the most difficult tasks
in NLP. The ability to identify non-referential
pronouns before attempting an anaphora resolution task
would be significant, since the system would not have
to attempt resolving such pronouns and hence end up
with fewer errors. In addition, the number of
non-referential pronouns has been found to be
non-trivial in many domains. The task of detecting
non-referential pronouns could also be incorporated
into a part-of-speech tagger or a parser, or treated as
an initial step in semantic interpretation. In this
article, I describe a machine learning method for
identifying non-referential pronouns in an annotated
subsegment of the Penn Arabic Treebank using three
different feature settings.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wu:2011:IPD,
author = "Chung-Hsien Wu and Wei-Bin Liang and Jui-Feng Yeh",
title = "Interruption Point Detection of Spontaneous Speech
Using Inter-Syllable Boundary-Based Prosodic Features",
journal = j-TALIP,
volume = "10",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1929908.1929914",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Mar 16 18:07:50 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a probabilistic scheme for
detecting the interruption point (IP) in spontaneous
speech based on inter-syllable boundary-based prosodic
features. Because of the high error rate in spontaneous
speech recognition, a combined acoustic model
considering both syllable and subsyllable recognition
units, is firstly used to determine the inter-syllable
boundaries and output the recognition confidence of the
input speech. Based on the finding that IPs always
occur at inter-syllable boundaries, a probability
distribution of the prosodic features at the current
potential IP is estimated. The Conditional Random Field
(CRF) model, which employs the clustered prosodic
features of the current potential IP and its preceding
and succeeding inter-syllable boundaries, is employed
to output the IP likelihood measure.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wu:2011:ADS,
author = "Chung-Hsien Wu and Hung-Yu Su and Han-Ping Shen",
title = "Articulation-Disordered Speech Recognition Using
Speaker-Adaptive Acoustic Models and Personalized
Articulation Patterns",
journal = j-TALIP,
volume = "10",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1967293.1967294",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 28 18:29:03 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a novel approach to
speaker-adaptive recognition of speech from
articulation-disordered speakers without a large amount
of adaptation data. An unsupervised, incremental
adaptation method is adopted for personalized model
adaptation based on the recognized syllables with high
recognition confidence from an automatic speech
recognition (ASR) system. For articulation pattern
discovery, the manually transcribed syllables and the
corresponding recognized syllables are associated with
each other using articulatory features. The Apriori
algorithm is applied to discover the articulation
patterns in the corpus, which are then used to
construct a personalized pronunciation dictionary to
improve the recognition accuracy of the ASR. The
experimental results indicate that the proposed
adaptation method achieves a syllable error rate
reduction of 6.1\%, outperforming the conventional
adaptation methods that have a syllable error rate
reduction of 3.8\%.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Paik:2011:FCB,
author = "Jiaul H. Paik and Swapan K. Parui",
title = "A Fast Corpus-Based Stemmer",
journal = j-TALIP,
volume = "10",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1967293.1967295",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 28 18:29:03 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Stemming is a mechanism of word form normalization
that transforms the variant word forms to their common
root. In an Information Retrieval system, it is used to
increase the system's performance, specifically the
recall and desirably the precision. Although its
usefulness is shown to be mixed in languages such as
English, in morphologically complex languages
stemming produces a significant performance
improvement. A number of linguistic rule-based stemmers
are available for most European languages which employ
a set of rules to get back the root word from its
variants. But for Indian languages which are highly
inflectional in nature, devising a linguistic
rule-based stemmer needs some additional resources
which are not available.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Ekbal:2011:WVB,
author = "Asif Ekbal and Sriparna Saha",
title = "Weighted Vote-Based Classifier Ensemble for Named
Entity Recognition: a Genetic Algorithm-Based
Approach",
journal = j-TALIP,
volume = "10",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1967293.1967296",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 28 18:29:03 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this article, we report the search capability of
Genetic Algorithm (GA) to construct a weighted
vote-based classifier ensemble for Named Entity
Recognition (NER). Our underlying assumption is that
the reliability of predictions of each classifier
differs among the various named entity (NE) classes.
Thus, it is necessary to quantify the amount of voting
of a particular classifier for a particular output
class. Here, an attempt is made to determine the
appropriate weights of voting for each class in each
classifier using GA. The proposed technique is
evaluated for four leading Indian languages, namely
Bengali, Hindi, Telugu, and Oriya, which are all
resource-poor in nature.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Liu:2011:VPS,
author = "C.-L. Liu and M.-H. Lai and K.-W. Tien and Y.-H.
Chuang and S.-H. Wu and C.-Y. Lee",
title = "Visually and Phonologically Similar Characters in
Incorrect {Chinese} Words: Analyses, Identification,
and Applications",
journal = j-TALIP,
volume = "10",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1967293.1967297",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 28 18:29:03 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Information about students' mistakes opens a window to
an understanding of their learning processes, and helps
us design effective course work to help students avoid
replication of the same errors. Learning from mistakes
is important not just in human learning activities; it
is also a crucial ingredient in techniques for the
developments of student models. In this article, we
report findings of our study on 4,100 erroneous Chinese
words. Seventy-six percent of these errors were related
to the phonological similarity between the correct and
the incorrect characters, 46\% were due to visual
similarity, and 29\% involved both factors. We propose
a computing algorithm that aims at replication of
incorrect Chinese words.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chen:2011:ISI,
author = "Keh-Jiann Chen and Qun Liu and Nianwen Xue and Le
Sun",
title = "Introduction to the Special Issue on {Chinese}
Language Processing",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002981",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhu:2011:ATC,
author = "Muhua Zhu and Jingbo Zhu and Tong Xiao",
title = "Automatic Treebank Conversion via Informed Decoding
--- {A} Case Study on {Chinese} Treebanks",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002982",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Treebanks are valuable resources for syntactic
parsing. For some languages such as Chinese, we can
obtain multiple constituency treebanks which are
developed by different organizations. However, due to
discrepancies of underlying annotation standards, such
treebanks in general cannot be used together through
direct data combination. To enlarge training data for
syntactic parsing, we focus in this article on the
challenge of unifying standards of disparate treebanks
by automatically converting one treebank (source
treebank) to fit a different standard which is
exhibited by another treebank (target treebank). We
propose to convert a treebank in two sequential steps
which correspond to the part-of-speech level and
syntactic structure level (including tree structures
and grammar labels), respectively.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2011:USR,
author = "Junhui Li and Guodong Zhou",
title = "Unified Semantic Role Labeling for Verbal and Nominal
Predicates in the {Chinese} Language",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002983",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article explores unified semantic role labeling
(SRL) for both verbal and nominal predicates in the
Chinese language. This is done by considering SRL for
both verbal and nominal predicates in a unified
framework. First, we systematically examine various
kinds of features for verbal SRL and nominal SRL,
respectively, besides those widely used ones. Then we
further improve the performance of nominal SRL with
various kinds of verbal evidence, that is, merging the
training instances from verbal predicates and
integrating various kinds of features derived from SRL
for verbal predicates. Finally, we address the issue of
automatic predicate recognition, which is essential for
nominal SRL.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhang:2011:DPS,
author = "Peng Zhang and Wenjie Li and Yuexian Hou and Dawei
Song",
title = "Developing Position Structure-Based Framework for
{Chinese} Entity Relation Extraction",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002984",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Relation extraction is the task of finding semantic
relations between two entities in text, and is often
cast as a classification problem. In contrast to the
significant achievements on English language, research
progress in Chinese relation extraction is relatively
limited. In this article, we present a novel Chinese
relation extraction framework, which is mainly based on
a 9-position structure. The design of this proposed
structure is motivated by the fact that there are some
obvious connections between relation types/subtypes and
position structures of two entities. The 9-position
structure can be captured with less effort than
applying deep natural language processing, and is
effective to relieve the class imbalance problem which
often hurts the classification performance.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Qian:2011:ECD,
author = "Longhua Qian and Guodong Zhou and Qiaoming Zhu",
title = "Employing Constituent Dependency Information for Tree
Kernel-Based Semantic Relation Extraction between Named
Entities",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002985",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article proposes a new approach to dynamically
determine the tree span for tree kernel-based semantic
relation extraction between named entities. The basic
idea is to employ constituent dependency information in
keeping the necessary nodes and their head children
along the path connecting the two entities in the
syntactic parse tree, while removing the noisy
information from the tree, eventually leading to a
dynamic syntactic parse tree. This article also
explores various entity features and their possible
combinations via a unified syntactic and semantic tree
framework, which integrates both structural syntactic
parse information and entity-related semantic
information. Evaluation on the ACE RDC 2004 English and
2005 Chinese benchmark corpora shows that our dynamic
syntactic parse tree much outperforms all previous tree
spans, indicating its effectiveness in well
representing the structural nature of relation
instances while removing redundant information.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Huang:2011:UST,
author = "Chung-Chi Huang and Ho-Ching Yen and Ping-Che Yang and
Shih-Ting Huang and Jason S. Chang",
title = "Using Sublexical Translations to Handle the {OOV}
Problem in Machine Translation",
journal = j-TALIP,
volume = "10",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2002980.2002986",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Sep 9 15:01:12 MDT 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We introduce a method for learning to translate
out-of-vocabulary (OOV) words. The method focuses on
combining sublexical/constituent translations of an OOV
to generate its translation candidates. In our
approach, wildcard searches are formulated based on our
OOV analysis, aimed at maximizing the probability of
retrieving OOVs' sublexical translations from existing
resources of Machine Translation (MT) systems. At
run-time, translation candidates of the unknown words
are generated from their suitable sublexical
translations and ranked based on monolingual and
bilingual information.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Du:2011:ICE,
author = "Jinhua Du and Andy Way",
title = "Improved {Chinese--English} {SMT} with {Chinese}
{``DE''} Construction Classification and Reordering",
journal = j-TALIP,
volume = "10",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2025384.2025385",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 15 09:23:26 MST 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Syntactic reordering on the source side has been
demonstrated to be helpful and effective for handling
different word orders between source and target
languages in SMT. In this article, we focus on the
Chinese (DE) construction which is flexible and
ubiquitous in Chinese and has many different ways to be
translated into English so that it is a major source of
word order differences in terms of translation quality.
This article carries out the Chinese ``DE''
construction study for Chinese--English SMT in which we
propose a new classifier model---discriminative latent
variable model (DPLVM)---with new features to improve
the classification accuracy and indirectly improve the
translation quality compared to a log-linear
classifier.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Xiao:2011:LMS,
author = "Tong Xiao and Jingbo Zhu and Muhua Zhu",
title = "Language Modeling for {Syntax-Based} Machine
Translation Using Tree Substitution Grammars: a Case
Study on {Chinese-English} Translation",
journal = j-TALIP,
volume = "10",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2025384.2025386",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 15 09:23:26 MST 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The poor grammatical output of Machine Translation
(MT) systems appeals syntax-based approaches within
language modeling. However, previous studies showed
that syntax-based language modeling using
(Context-Free) Treebank Grammars was not very helpful
in improving BLEU scores for Chinese-English machine
translation. In this article we further study this
issue in the context of Chinese-English syntax-based
Statistical Machine Translation (SMT) where Synchronous
Tree Substitution Grammars (STSGs) are utilized to
model the translation process. In particular, we
develop a Tree Substitution Grammar-based language
model for syntax-based MT, and present three methods to
efficiently integrate the proposed language model into
MT decoding. In addition, we design a simple and
effective method to adapt syntax-based language models
for MT tasks.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Li:2011:MEC,
author = "Lishuang Li and Peng Wang and Degen Huang and Lian
Zhao",
title = "Mining {English--Chinese} Named Entity Pairs from
Comparable Corpora",
journal = j-TALIP,
volume = "10",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2025384.2025387",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 15 09:23:26 MST 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Bilingual Named Entity (NE) pairs are valuable
resources for many NLP applications. Since comparable
corpora are more accessible, abundant and up-to-date,
recent researches have concentrated on mining bilingual
lexicons using comparable corpora. Leveraging
comparable corpora, this research presents a novel
approach to mining English-Chinese NE translations by
combining multi-dimension features from various
information sources for every possible NE pair, which
include the transliteration model, English-Chinese
matching, Chinese-English matching, translation model,
length, and context vector. These features are
integrated into one model with linear combination and
minimum sample risk (MSR) algorithm. As for the high
type-dependence of NE translation, we integrate
different features according to different NE types.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Liu:2011:UBR,
author = "Zhiyuan Liu and Yabin Zheng and Lixing Xie and Maosong
Sun and Liyun Ru and Yang Zhang",
title = "User Behaviors in Related Word Retrieval and New Word
Detection: a Collaborative Perspective",
journal = j-TALIP,
volume = "10",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2025384.2025388",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 15 09:23:26 MST 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Nowadays, user behavior analysis and collaborative
filtering have drawn a large body of research in the
machine learning community. The goal is either to
enhance the user experience or discover useful
information hidden in the data. In this article, we
conduct extensive experiments on a Chinese input method
data set, which keeps the word lists that users have
used. Then, from the collaborative perspective, we aim
to solve two tasks in natural language processing, that
is, related word retrieval and new word detection.
Motivated by the observation that two words are usually
highly related to each other if they co-occur
frequently in users' records, we propose a novel
semantic relatedness measure between words that takes
both user behaviors and collaborative filtering into
consideration.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wang:2011:DLA,
author = "Baoxun Wang and Bingquan Liu and Xiaolong Wang and
Chengjie Sun and Deyuan Zhang",
title = "Deep Learning Approaches to Semantic Relevance
Modeling for {Chinese} Question--Answer Pairs",
journal = j-TALIP,
volume = "10",
number = "4",
pages = "21:1--21:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2025384.2025389",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 15 09:23:26 MST 2011",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The human-generated question-answer pairs in the Web
social communities are of great value for the research
of automatic question-answering technique. Due to the
large amount of noise information involved in such
corpora, it is still a problem to detect the answers
even though the questions are exactly located.
Quantifying the semantic relevance between questions
and their candidate answers is essential to answer
detection in social media corpora. Since both the
questions and their answers usually contain a small
number of sentences, the relevance modeling methods
have to overcome the problem of word feature sparsity.
In this article, the deep learning principle is
introduced to address the semantic relevance modeling
task.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Pal:2012:HRI,
author = "Umapada Pal and Ramachandran Jayadevan and Nabin
Sharma",
title = "Handwriting Recognition in {Indian} Regional Scripts:
a Survey of Offline Techniques",
journal = j-TALIP,
volume = "11",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2090176.2090177",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Mar 1 16:54:10 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Offline handwriting recognition in Indian regional
scripts is an interesting area of research as almost
460 million people in India use regional scripts. The
nine major Indian regional scripts are Bangla (for
Bengali and Assamese languages), Gujarati, Kannada,
Malayalam, Oriya, Gurumukhi (for Punjabi language),
Tamil, Telugu, and Nastaliq (for Urdu language). A
state-of-the-art survey about the techniques available
in the area of offline handwriting recognition (OHR) in
Indian regional scripts will be of a great aid to the
researchers in the subcontinent and hence a sincere
attempt is made in this article to discuss the
advancements reported in this regard during the last
few decades.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zaghouani:2012:RRB,
author = "Wajdi Zaghouani",
title = "{RENAR}: a Rule-Based {Arabic} Named Entity
Recognition System",
journal = j-TALIP,
volume = "11",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2090176.2090178",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Mar 1 16:54:10 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Named entity recognition has served many natural
language processing tasks such as information
retrieval, machine translation, and question answering
systems. Many researchers have addressed the name
identification issue in a variety of languages and
recently some research efforts have started to focus on
named entity recognition for the Arabic language. We
present a working Arabic information extraction (IE)
system that is used to analyze large volumes of news
texts every day to extract the named entity (NE) types
person, organization, location, date, and number, as
well as quotations (direct reported speech) by and
about people. The named entity recognition (NER) system
was not developed for Arabic, but instead a
multilingual NER system was adapted to also cover
Arabic.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chang:2012:EDC,
author = "Ru-Yng Chang and Chung-Hsien Wu and Philips Kokoh
Prasetyo",
title = "Error Diagnosis of {Chinese} Sentences Using Inductive
Learning Algorithm and Decomposition-Based Testing
Mechanism",
journal = j-TALIP,
volume = "11",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2090176.2090179",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Mar 1 16:54:10 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This study presents a novel approach to error
diagnosis of Chinese sentences for Chinese as second
language (CSL) learners. A penalized probabilistic
First-Order Inductive Learning (pFOIL) algorithm is
presented for error diagnosis of Chinese sentences. The
pFOIL algorithm integrates inductive logic programming
(ILP), First-Order Inductive Learning (FOIL), and a
penalized log-likelihood function for error diagnosis.
This algorithm considers the uncertain, imperfect, and
conflicting characteristics of Chinese sentences to
infer error types and produce human-interpretable rules
for further error correction. In a pFOIL algorithm,
relation pattern background knowledge and quantized
t-score background knowledge are proposed to
characterize a sentence and then used for likelihood
estimation.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{He:2012:ISP,
author = "Yulan He",
title = "Incorporating Sentiment Prior Knowledge for Weakly
Supervised Sentiment Analysis",
journal = j-TALIP,
volume = "11",
number = "2",
pages = "4:1--4:??",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2184436.2184437",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 12 11:20:16 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents two novel approaches for
incorporating sentiment prior knowledge into the topic
model for weakly supervised sentiment analysis where
sentiment labels are considered as topics. One is by
modifying the Dirichlet prior for topic-word
distribution (LDA-DP), the other is by augmenting the
model objective function through adding terms that
express preferences on expectations of sentiment labels
of the lexicon words using generalized expectation
criteria (LDA-GE). We conducted extensive experiments
on English movie review data and multi-domain sentiment
dataset as well as Chinese product reviews about mobile
phones, digital cameras, MP3 players, and monitors. The
results show that while both LDA-DP and LDA-GE perform
comparably to existing weakly supervised sentiment
classification algorithms, they are much simpler and
computationally efficient, rendering them more suitable
for online and real-time sentiment classification on
the Web. We observed that LDA-GE is more effective than
LDA-DP, suggesting that it should be preferred when
considering employing the topic model for sentiment
analysis. Moreover, both models are able to extract
highly domain-salient polarity words from text.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing (TALIP)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wang:2012:TUF,
author = "Hongling Wang and Guodong Zhou",
title = "Toward a Unified Framework for Standard and Update
Multi-Document Summarization",
journal = j-TALIP,
volume = "11",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2184436.2184438",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 12 11:20:16 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a unified framework for
extracting standard and update summaries from a set of
documents. In particular, a topic modeling approach is
employed for salience determination and a dynamic
modeling approach is proposed for redundancy control.
In the topic modeling approach for salience
determination, we represent various kinds of text
units, such as word, sentence, document, documents, and
summary, using a single vector space model via their
corresponding probability distributions over the
inherent topics of given documents or a related corpus.
Therefore, we are able to calculate the similarity
between any two text units via their topic probability
distributions. In the dynamic modeling approach for
redundancy control, we consider the similarity between
the summary and the given documents, and the similarity
between the sentence and the summary, besides the
similarity between the sentence and the given
documents, for standard summarization while for update
summarization, we also consider the similarity between
the sentence and the history documents or summary.
Evaluation on TAC 2008 and 2009 in English language
shows encouraging results, especially the dynamic
modeling approach in removing the redundancy in the
given documents. Finally, we extend the framework to
Chinese multi-document summarization and experiments
show the effectiveness of our framework.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing (TALIP)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Andrade:2012:SEC,
author = "Daniel Andrade and Takuya Matsuzaki and Jun'ichi
Tsujii",
title = "Statistical Extraction and Comparison of Pivot Words
for Bilingual Lexicon Extension",
journal = j-TALIP,
volume = "11",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2012",
DOI = "https://doi.org/10.1145/2184436.2184439",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 12 11:20:16 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Bilingual dictionaries can be automatically extended
by new translations using comparable corpora. The
general idea is based on the assumption that similar
words have similar contexts across languages. However,
previous studies have mainly focused on Indo-European
languages, or use only a bag-of-words model to describe
the context. Furthermore, we argue that it is helpful
to extract only the statistically significant context,
instead of using all context. The present approach
addresses these issues in the following manner. First,
based on the context of a word with an unknown
translation (query word), we extract salient pivot
words. Pivot words are words for which a translation is
already available in a bilingual dictionary. For the
extraction of salient pivot words, we use a Bayesian
estimation of the point-wise mutual information to
measure statistical significance. In the second step,
we match these pivot words across languages to identify
translation candidates for the query word. We therefore
calculate a similarity score between the query word and
a translation candidate using the probability that the
same pivots will be extracted for both the query word
and the translation candidate. The proposed method uses
several context positions, namely, a bag-of-words of
one sentence, and the successors, predecessors, and
siblings with respect to the dependency parse tree of
the sentence. In order to make these context positions
comparable across Japanese and English, which are
unrelated languages, we use several heuristics to
adjust the dependency trees appropriately. We
demonstrate that the proposed method significantly
increases the accuracy of word translations, as
compared to previous methods.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing (TALIP)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Wang:2012:IGD,
author = "Kun Wang and Chengqing Zong and Keh-Yih Su",
title = "Integrating Generative and Discriminative
Character-Based Models for {Chinese} Word
Segmentation",
journal = j-TALIP,
volume = "11",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2184436.2184440",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Jun 12 11:20:16 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
  abstract =     "Among statistical approaches to Chinese word
                 segmentation, the word-based $n$-gram (generative)
                 model and the character-based tagging (discriminative)
                 model are two dominant approaches in the literature.
The former gives excellent performance for the
in-vocabulary (IV) words; however, it handles
out-of-vocabulary (OOV) words poorly. On the other
hand, though the latter is more robust for OOV words,
it fails to deliver satisfactory performance for IV
words. These two approaches behave differently due to
the unit they use (word vs. character) and the model
form they adopt (generative vs. discriminative). In
general, character-based approaches are more robust
than word-based ones, as the vocabulary of characters
is a closed set; and discriminative models are more
robust than generative ones, since they can flexibly
include all kinds of available information, such as
                 future context. This article first proposes a
                 character-based $n$-gram model to enhance the robustness
of the generative approach. Then the proposed
generative model is further integrated with the
character-based discriminative model to take advantage
of both approaches. Our experiments show that this
integrated approach outperforms all the existing
approaches reported in the literature. Afterwards, a
complete and detailed error analysis is conducted.
Since a significant portion of the critical errors is
related to numerical/foreign strings, character-type
information is then incorporated into the model to
further improve its performance. Last, the proposed
integrated approach is tested on cross-domain corpora,
and a semi-supervised domain adaptation algorithm is
proposed and shown to be effective in our
experiments.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing (TALIP)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Isozaki:2012:HBP,
author = "Hideki Isozaki and Katsuhito Sudoh and Hajime Tsukada
and Kevin Duh",
title = "{HPSG}-Based Preprocessing for {English-to-Japanese}
Translation",
journal = j-TALIP,
volume = "11",
number = "3",
pages = "8:1--8:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2334801.2334802",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Sep 11 14:17:04 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Japanese sentences have completely different word
orders from corresponding English sentences. Typical
phrase-based statistical machine translation (SMT)
systems such as Moses search for the best word
permutation within a given distance limit (distortion
limit). For English-to-Japanese translation, we need a
large distance limit to obtain acceptable translations,
and the number of translation candidates is extremely
large. Therefore, SMT systems often fail to find
acceptable translations within a limited time. To solve
this problem, some researchers use rule-based
preprocessing approaches, which reorder English words
just like Japanese by using dozens of rules. Our idea
is based on the following two observations: (1)
Japanese is a typical head-final language, and (2) we
can detect heads of English sentences by a head-driven
phrase structure grammar (HPSG) parser. The main
contributions of this article are twofold: First, we
                 demonstrate how an off-the-shelf, state-of-the-art HPSG
parser enables us to write the reordering rules in an
abstract level and can easily improve the quality of
English-to-Japanese translation. Second, we also show
that syntactic heads achieve better results than
semantic heads. The proposed method outperforms the
best system of NTCIR-7 PATMT EJ task.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhang:2012:ABH,
  author =       "Lidan Zhang and Kwok-Ping Chan",
title = "Adaptive {Bayesian HMM} for Fully Unsupervised
{Chinese} Part-of-Speech Induction",
journal = j-TALIP,
volume = "11",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2334801.2334803",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Sep 11 14:17:04 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We propose an adaptive Bayesian hidden Markov model
for fully unsupervised part-of-speech (POS) induction.
The proposed model with its inference algorithm has two
extensions to the first-order Bayesian HMM with
Dirichlet priors. First our algorithm infers the
optimal number of hidden states from the training
corpus rather than fixes the dimensionality of state
space beforehand. The second extension studies the
Chinese unknown word processing module which measures
similarities from both morphological properties and
context distribution. Experimental results showed that
both of these two extensions can help to find the
optimal categories for Chinese in terms of both
unsupervised clustering metrics and grammar induction
accuracies on the Chinese Treebank.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Lee:2012:SMB,
author = "Jinsik Lee and Sungjin Lee and Jonghoon Lee and
Byeongchang Kim and Gary Geunbae Lee",
title = "Stacking Model-Based {Korean} Prosodic Phrasing Using
Speaker Variability Reduction and Linguistic Feature
Engineering",
journal = j-TALIP,
volume = "11",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2334801.2334804",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Sep 11 14:17:04 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents a prosodic phrasing model for a
general purpose Korean speech synthesis system. To
reflect the factors affecting prosodic phrasing in the
model, linguistically motivated machine-learning
features were investigated. These features were
effectively incorporated using a stacking model. The
phrasing performance was also improved through feature
engineering. The corpus used in the experiment is a
4,392-sentence corpus (55,015 words with an average of
13 words per sentence). Because the corpus contains
speaker-dependent variability and such variability is
not appropriately reflected in a general purpose speech
synthesis system, a method to reduce such variability
is proposed. In addition, the entire set of data used
in the experiment is provided to the public for future
use in comparative research.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Duc:2012:CLL,
author = "Nguyen Tuan Duc and Danushka Bollegala and Mitsuru
Ishizuka",
title = "Cross-Language Latent Relational Search between
{Japanese} and {English} Languages Using a {Web}
Corpus",
journal = j-TALIP,
volume = "11",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2334801.2334805",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Tue Sep 11 14:17:04 MDT 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Latent relational search is a novel entity retrieval
paradigm based on the proportional analogy between two
entity pairs. Given a latent relational search query
{(Japan, Tokyo), (France, ?)}, a latent relational
search engine is expected to retrieve and rank the
entity ``Paris'' as the first answer in the result
list. A latent relational search engine extracts
entities and relations between those entities from a
corpus, such as the Web. Moreover, from some supporting
sentences in the corpus, (e.g., ``Tokyo is the capital
of Japan'' and ``Paris is the capital and biggest city
of France''), the search engine must recognize the
relational similarity between the two entity pairs. In
cross-language latent relational search, the entity
pairs as well as the supporting sentences of the first
entity pair and of the second entity pair are in
different languages. Therefore, the search engine must
recognize similar semantic relations across languages.
In this article, we study the problem of cross-language
latent relational search between Japanese and English
using Web data. To perform cross-language latent
relational search in high speed, we propose a
multi-lingual indexing method for storing entities and
lexical patterns that represent the semantic relations
extracted from Web corpora. We then propose a hybrid
lexical pattern clustering algorithm to capture the
semantic similarity between lexical patterns across
languages. Using this algorithm, we can precisely
measure the relational similarity between entity pairs
across languages, thereby achieving high precision in
the task of cross-language latent relational search.
Experiments show that the proposed method achieves an
MRR of 0.605 on Japanese-English cross-language latent
relational search query sets and it also achieves a
reasonable performance on the INEX Entity Ranking
task.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Mitamura:2012:ISI,
author = "Teruko Mitamura and Noriko Kando and Koichi Takeda",
title = "Introduction to the Special Issue on {RITE}",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "12:1--12:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382594",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Miyao:2012:ETE,
author = "Yusuke Miyao and Hideki Shima and Hiroshi Kanayama and
Teruko Mitamura",
title = "Evaluating Textual Entailment Recognition for
University Entrance Examinations",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "13:1--13:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382595",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The present article addresses an attempt to apply
questions in university entrance examinations to the
evaluation of textual entailment recognition. Questions
in several fields, such as history and politics,
primarily test the examinee's knowledge in the form of
choosing true statements from multiple choices.
Answering such questions can be regarded as equivalent
to finding evidential texts from a textbase such as
textbooks and Wikipedia. Therefore, this task can be
recast as recognizing textual entailment between a
description in a textbase and a statement given in a
question. We focused on the National Center Test for
University Admission in Japan and converted questions
into the evaluation data for textual entailment
recognition by using Wikipedia as a textbase.
Consequently, it is revealed that nearly half of the
questions can be mapped into textual entailment
recognition; 941 text pairs were created from 404
questions from six subjects. This data set is provided
for a subtask of NTCIR RITE (Recognizing Inference in
Text), and 16 systems from six teams used the data set
for evaluation. The evaluation results revealed that
the best system achieved a correct answer ratio of
56\%, which is significantly better than a random
choice baseline.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Pham:2012:LRT,
author = "Minh Quang Nhat Pham and Minh Le Nguyen and Akira
Shimazu",
title = "Learning to Recognize Textual Entailment in {Japanese}
Texts with the Utilization of Machine Translation",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "14:1--14:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382596",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Recognizing Textual Entailment (RTE) is a fundamental
task in Natural Language Understanding. The task is to
decide whether the meaning of a text can be inferred
from the meaning of another one. In this article, we
conduct an empirical study of recognizing textual
entailment in Japanese texts, in which we adopt a
machine learning-based approach to the task. We
quantitatively analyze the effects of various
entailment features, machine learning algorithms, and
the impact of RTE resources on the performance of an
RTE system. This article also investigates the use of
machine translation for the RTE task and determines
whether machine translation can be used to improve the
performance of our RTE system. Experimental results
achieved on benchmark data sets show that our machine
learning-based RTE system outperforms the baseline
methods based on lexical matching and syntactic
matching. The results also suggest that the machine
translation component can be utilized to improve the
performance of the RTE system.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Qiu:2012:RIT,
author = "Xipeng Qiu and Ling Cao and Zhao Liu and Xuanjing
Huang",
title = "Recognizing Inference in Texts with {Markov} Logic
Networks",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "15:1--15:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382597",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Recognizing inference in texts (RITE) attracts growing
attention of natural language processing (NLP)
researchers in recent years. In this article, we
propose a novel approach to recognize inference with
probabilistic logical reasoning. Our approach is built
on Markov logic networks (MLNs) framework, which is a
probabilistic extension of first-order logic. We design
specific semantic rules based on the surface,
syntactic, and semantic representations of texts, and
map these rules to logical representations. We also
extract information from some knowledge bases as common
sense logic rules. Then we utilize MLNs framework to
make predictions with combining statistical and logical
                 reasoning. Experiment results show that our system can
achieve better performance than state-of-the-art RITE
systems.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Shibata:2012:PAS,
author = "Tomohide Shibata and Sadao Kurohashi",
title = "Predicate-Argument Structure-Based Textual Entailment
Recognition System Exploiting Wide-Coverage Lexical
Knowledge",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "16:1--16:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382598",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article proposes a predicate-argument structure
based Textual Entailment Recognition system exploiting
wide-coverage lexical knowledge. Different from
conventional machine learning approaches where several
features obtained from linguistic analysis and
resources are utilized, our proposed method regards a
predicate-argument structure as a basic unit, and
performs the matching/alignment between a text and
hypothesis. In matching between predicate-arguments,
wide-coverage relations between words/phrases such as
synonym and is-a are utilized, which are automatically
acquired from a dictionary, Web corpus, and
Wikipedia.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Shih:2012:VCT,
  author =       "Cheng-Wei Shih and Cheng-Wei Lee and Richard Tzong-Han
                 Tsai and Wen-Lian Hsu",
title = "Validating Contradiction in Texts Using Online
Co-Mention Pattern Checking",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382599",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Detecting contradictive statements is a foundational
and challenging task for text understanding
applications such as textual entailment. In this
article, we aim to address the problem of the shortage
of specific background knowledge in contradiction
detection. A novel contradiction detecting approach
based on the distribution of the query composed of
critical mismatch combinations on the Internet is
proposed to tackle the problem. By measuring the
availability of mismatch conjunction phrases (MCPs),
the background knowledge about two target statements
can be implicitly obtained for identifying
contradictions. Experiments on three different
configurations show that the MCP-based approach
achieves remarkable improvement on contradiction
detection and can significantly improve the performance
of textual entailment recognition.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Watanabe:2012:LDL,
author = "Yotaro Watanabe and Junta Mizuno and Eric Nichols and
Katsuma Narisawa and Keita Nabeshima and Naoaki Okazaki
and Kentaro Inui",
title = "Leveraging Diverse Lexical Resources for Textual
Entailment Recognition",
journal = j-TALIP,
volume = "11",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2382593.2382600",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Dec 6 07:40:55 MST 2012",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Since the problem of textual entailment recognition
requires capturing semantic relations between diverse
expressions of language, linguistic and world knowledge
play an important role. In this article, we explore the
effectiveness of different types of currently available
resources including synonyms, antonyms,
hypernym-hyponym relations, and lexical entailment
relations for the task of textual entailment
recognition. In order to do so, we develop an
entailment relation recognition system which utilizes
diverse linguistic analyses and resources to align the
linguistic units in a pair of texts and identifies
entailment relations based on these alignments. We use
the Japanese subset of the NTCIR-9 RITE-1 dataset for
evaluation and error analysis, conducting ablation
testing and evaluation on hand-crafted alignment gold
standard data to evaluate the contribution of
individual resources. Error analysis shows that
existing knowledge sources are effective for RTE, but
that their coverage is limited, especially for
domain-specific and other low-frequency expressions. To
increase alignment coverage on such expressions, we
propose a method of alignment inference that uses
syntactic and semantic dependency information to
identify likely alignments without relying on external
resources. Evaluation adding alignment inference to a
system using all available knowledge sources shows
improvements in both precision and recall of entailment
relation recognition.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Hao:2013:TPP,
author = "Tianyong Hao and Chunshen Zhu",
title = "Toward a Professional Platform for {Chinese} Character
Conversion",
journal = j-TALIP,
volume = "12",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2425327.2425328",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Mar 2 09:25:42 MST 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Increasing communication among Chinese-speaking
regions using respectively traditional and simplified
Chinese character systems has highlighted the
subtle-yet-extensive differences between the two
systems, which can lead to unexpected hindrance in
converting characters from one to the other. This
article proposes a new priority-based multi-data
resources management model, with a new algorithm called
Fused Conversion algorithm from Multi-Data resources
(FCMD), to ensure more context-sensitive, human
controllable, and thus more reliable conversions, by
                 drawing on reverse maximum matching, $n$-gram-based
statistical model and pattern-based learning and
matching. After parameter training on the Tagged
Chinese Gigaword corpus, its conversion precision
reaches 91.5\% in context-sensitive cases, the most
difficult part in the conversion, with an overall
precision rate at 99.8\%, a significant improvement
over the state-of-the-art models. The conversion
platform based on the model has extra features such as
data resource selection and $n$-grams self-learning
ability, providing a more sophisticated tool good
especially for high-end professional uses.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Jiang:2013:LRC,
author = "Mike Tian-Jian Jiang and Tsung-Hsien Lee and Wen-Lian
Hsu",
title = "The Left and Right Context of a Word: Overlapping
{Chinese} Syllable Word Segmentation with Minimal
Context",
journal = j-TALIP,
volume = "12",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2425327.2425329",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Mar 2 09:25:42 MST 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Since a Chinese syllable can correspond to many
characters (homophones), the syllable-to-character
conversion task is quite challenging for Chinese
phonetic input methods (CPIM). There are usually two
stages in a CPIM: 1. segment the syllable sequence into
syllable words, and 2. select the most likely character
words for each syllable word. A CPIM usually assumes
that the input is a complete sentence, and evaluates
the performance based on a well-formed corpus. However,
in practice, most Pinyin users prefer progressive text
entry in several short chunks, mainly in one or two
words each (most Chinese words consist of two or more
characters). Short chunks do not provide enough
contexts to perform the best possible
syllable-to-character conversion, especially when a
chunk consists of overlapping syllable words. In such
cases, a conversion system often selects the boundary
of a word with the highest frequency. Short chunk input
is even more popular on platforms with limited
computing power, such as mobile phones. Based on the
observation that the relative strength of a word can be
quite different when calculated leftwards or
rightwards, we propose a simple division of the word
context into the left context and the right context.
Furthermore, we design a double ranking strategy for
each word to reduce the number of errors in Step 1. Our
strategy is modeled as the minimum feedback arc set
problem on bipartite tournament with approximate
solutions derived from genetic algorithm. Experiments
show that, compared to the frequency-based method (FBM)
(low memory and fast) and the conditional random fields
(CRF) model (larger memory and slower), our double
ranking strategy has the benefits of less memory and
low power requirement with competitive performance. We
believe a similar strategy could also be adopted to
disambiguate conflicting linguistic patterns
effectively.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Bach:2013:TPF,
author = "Ngo Xuan Bach and Nguyen Le Minh and Tran Thi Oanh and
Akira Shimazu",
title = "A Two-Phase Framework for Learning Logical Structures
of Paragraphs in Legal Articles",
journal = j-TALIP,
volume = "12",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2425327.2425330",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Mar 2 09:25:42 MST 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Analyzing logical structures of texts is important to
understanding natural language, especially in the legal
domain, where legal texts have their own specific
characteristics. Recognizing logical structures in
legal texts does not only help people in understanding
legal documents, but also in supporting other tasks in
legal text processing. In this article, we present a
new task, learning logical structures of paragraphs in
legal articles, which is studied in research on Legal
Engineering. The goals of this task are recognizing
logical parts of law sentences in a paragraph, and then
grouping related logical parts into some logical
structures of formulas, which describe logical
relations between logical parts. We present a two-phase
framework to learn logical structures of paragraphs in
legal articles. In the first phase, we model the
problem of recognizing logical parts in law sentences
as a multi-layer sequence learning problem, and present
a CRF-based model to recognize them. In the second
phase, we propose a graph-based method to group logical
parts into logical structures. We consider the problem
of finding a subset of complete subgraphs in a
weighted-edge complete graph, where each node
corresponds to a logical part, and a complete subgraph
corresponds to a logical structure. We also present an
integer linear programming formulation for this
optimization problem. Our models achieve 74.37\% in
recognizing logical parts, 80.08\% in recognizing
logical structures, and 58.36\% in the whole task on
the Japanese National Pension Law corpus. Our work
provides promising results for further research on this
interesting task.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sundaram:2013:AFB,
author = "Suresh Sundaram and A. G. Ramakrishnan",
title = "Attention-Feedback Based Robust Segmentation of Online
Handwritten Isolated {Tamil} Words",
journal = j-TALIP,
volume = "12",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2425327.2425331",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Mar 2 09:25:42 MST 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this article, we propose a lexicon-free,
script-dependent approach to segment online handwritten
isolated Tamil words into its constituent symbols. Our
proposed segmentation strategy comprises two modules,
namely the (1) Dominant Overlap Criterion Segmentation
(DOCS) module and (2) Attention Feedback Segmentation
(AFS) module. Based on a bounding box overlap criterion
in the DOCS module, the input word is first segmented
into stroke groups. A stroke group may at times
correspond to a part of a valid symbol
(over-segmentation) or a merger of valid symbols
(under-segmentation). Attention on specific features in
the AFS module serve in detecting possibly
over-segmented or under-segmented stroke groups.
Thereafter, feedbacks from the SVM classifier
likelihoods and stroke-group based features are
considered in modifying the suspected stroke groups to
form valid symbols. The proposed scheme is tested on a
set of 10000 isolated handwritten words (containing
53,246 Tamil symbols). The results show that the DOCS
module achieves a symbol-level segmentation accuracy of
98.1\%, which improves to as high as 99.7\% after the
AFS strategy. This in turn entails a symbol recognition
rate of 83.9\% (at the DOCS module) and 88.4\% (after
the AFS module). The resulting word recognition rates
at the DOCS and AFS modules are found to be, 50.9\% and
64.9\% respectively, without any postprocessing.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sun:2013:LAC,
author = "Xu Sun and Naoaki Okazaki and Jun'ichi Tsujii and
Houfeng Wang",
title = "Learning Abbreviations from {Chinese} and {English}
Terms by Modeling Non-Local Information",
journal = j-TALIP,
volume = "12",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2461316.2461317",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jun 6 06:48:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The present article describes a robust approach for
abbreviating terms. First, in order to incorporate
non-local information into abbreviation generation
tasks, we present both implicit and explicit solutions:
the latent variable model and the label encoding with
global information. Although the two approaches compete
with one another, we find they are also highly
complementary. We propose a combination of the two
approaches, and we will show the proposed method
outperforms all of the existing methods on abbreviation
generation datasets. In order to reduce computational
complexity of learning non-local information, we
further present an online training method, which can
arrive the objective optimum with accelerated training
                 speed. We used a Chinese newswire dataset and an English
biomedical dataset for experiments. Experiments
revealed that the proposed abbreviation generator with
non-local information achieved the best results for
both the Chinese and English languages.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Hinkle:2013:DES,
author = "Lauren Hinkle and Albert Brouillette and Sujay Jayakar
and Leigh Gathings and Miguel Lezcano and Jugal
Kalita",
title = "Design and Evaluation of Soft Keyboards for {Brahmic}
Scripts",
journal = j-TALIP,
volume = "12",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2461316.2461318",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jun 6 06:48:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Despite being spoken by a large percentage of the
world, Indic languages in general lack user-friendly
and efficient methods for text input. These languages
have poor or no support for typing. Soft keyboards,
because of their ease of installation and lack of
reliance on specific hardware, are a promising solution
as an input device for many languages. Developing an
acceptable soft keyboard requires the frequency
analysis of characters in order to design a layout that
minimizes text-input time. This article proposes the
use of various development techniques, layout
variations, and evaluation methods for the creation of
soft keyboards for Brahmic scripts. We propose that
using optimization techniques such as genetic
algorithms and multi-objective Pareto optimization to
develop multi-layer keyboards will increase the speed
at which text can be entered.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Fujita:2013:WSD,
author = "Sanae Fujita and Akinori Fujino",
title = "Word Sense Disambiguation by Combining Labeled Data
Expansion and Semi-Supervised Learning Method",
journal = j-TALIP,
volume = "12",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2461316.2461319",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Jun 6 06:48:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Lack of labeled data is one of the severest problems
facing word sense disambiguation (WSD). We overcome the
problem by proposing a method that combines automatic
labeled data expansion (Step 1) and semi-supervised
learning (Step 2). The Step 1 and 2 methods are both
effective, but their combination yields a synergistic
effect. In this article, in Step 1, we automatically
extract reliable labeled data from raw corpora using
dictionary example sentences, even the infrequent and
unseen senses (which are not likely to appear in
labeled data). Next, in Step 2, we apply a
semi-supervised classifier and achieve an improvement
using easy-to-get unlabeled data. In this step, we also
show that we can guess even unseen senses. We target a
SemEval-2010 Japanese WSD task, which is a lexical
sample task. Both Step 1 and Step 2 methods performed
better than the best published result (76.4 \%).
Furthermore, the combined method achieved much higher
accuracy (84.2 \%). In this experiment, up to 50 \% of
unseen senses are classified correctly. However, the
number of unseen senses are small, therefore, we delete
one senses per word and apply our proposed method; the
results show that the method is effective and robust
even for unseen senses.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sproat:2013:EGN,
author = "Richard Sproat",
title = "Editorial Greetings from the new {Editor-in-Chief}",
journal = j-TALIP,
volume = "12",
number = "3",
pages = "8:1--8:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499955.2499956",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Aug 19 18:39:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Fukunishi:2013:BAA,
author = "Takaaki Fukunishi and Andrew Finch and Seiichi
Yamamoto and Eiichiro Sumita",
title = "A {Bayesian} Alignment Approach to Transliteration
Mining",
journal = j-TALIP,
volume = "12",
number = "3",
pages = "9:1--9:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499955.2499957",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Aug 19 18:39:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this article we present a technique for mining
transliteration pairs using a set of simple features
derived from a many-to-many bilingual forced-alignment
at the grapheme level to classify candidate
transliteration word pairs as correct transliterations
or not. We use a nonparametric Bayesian method for the
alignment process, as this process rewards the reuse of
parameters, resulting in compact models that align in a
consistent manner and tend not to over-fit. Our
approach uses the generative model resulting from
aligning the training data to force-align the test
data. We rely on the simple assumption that correct
transliteration pairs would be well modeled and
generated easily, whereas incorrect pairs---being more
random in character---would be more costly to model and
generate. Our generative model generates by
concatenating bilingual grapheme sequence pairs. The
many-to-many generation process is essential for
handling many languages with non-Roman scripts, and it
is hard to train well using a maximum likelihood
techniques, as these tend to over-fit the data. Our
approach works on the principle that generation using
only grapheme sequence pairs that are in the model
results in a high probability derivation, whereas if
the model is forced to introduce a new parameter in
order to explain part of the candidate pair, the
derivation probability is substantially reduced and
severely reduced if the new parameter corresponds to a
sequence pair composed of a large number of graphemes.
The features we extract from the alignment of the test
data are not only based on the scores from the
generative model, but also on the relative proportions
of each sequence that are hard to generate. The
features are used in conjunction with a support vector
machine classifier trained on known positive examples
together with synthetic negative examples to determine
whether a candidate word pair is a correct
transliteration pair. In our experiments, we used all
data tracks from the 2010 Named-Entity Workshop
(NEWS'10) and use the performance of the best system
for each language pair as a reference point. Our
results show that the new features we propose are
powerfully predictive, enabling our approach to achieve
levels of performance on this task that are comparable
to the state of the art.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Iwakura:2013:NER,
author = "Tomoya Iwakura and Hiroya Takamura and Manabu
Okumura",
title = "A Named Entity Recognition Method Based on
Decomposition and Concatenation of Word Chunks",
journal = j-TALIP,
volume = "12",
number = "3",
pages = "10:1--10:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499955.2499958",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Aug 19 18:39:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We propose a named entity (NE) recognition method in
which word chunks are repeatedly decomposed and
concatenated. Our method identifies word chunks with a
base chunker, such as a noun phrase chunker, and then
recognizes NEs from the recognized word chunk
sequences. By using word chunks, we can obtain features
that cannot be obtained in word-sequence-based
recognition methods, such as the first word of a word
chunk, the last word of a word chunk, and so on.
However, each word chunk may include a part of an NE or
multiple NEs. To solve this problem, we use the
following operators: SHIFT for separating the first
word from a word chunk, POP for separating the last
word from a word chunk, JOIN for concatenating two word
chunks, and REDUCE for assigning an NE label to a word
chunk. We evaluate our method on a Japanese NE
recognition dataset that includes about 200,000
annotations of 191 types of NEs from over 8,500 news
articles. The experimental results show that the
training and processing speeds of our method are faster
than those of a linear-chain structured perceptron and
a semi-Markov perceptron, while maintaining high
accuracy.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Izumi:2013:NCF,
author = "Tomoko Izumi and Kenji Imamura and Taichi Asami and
Kuniko Saito and Genichiro Kikui and Satoshi Sato",
title = "Normalizing Complex Functional Expressions in
{Japanese} Predicates: Linguistically-Directed
Rule-Based Paraphrasing and Its Application",
journal = j-TALIP,
volume = "12",
number = "3",
pages = "11:1--11:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499955.2499959",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Aug 19 18:39:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The growing need for text mining systems, such as
opinion mining, requires a deep semantic understanding
of the target language. In order to accomplish this,
extracting the semantic information of functional
expressions plays a crucial role, because functional
expressions such as would like to and can't are key
expressions to detecting customers' needs and wants.
However, in Japanese, functional expressions appear in
the form of suffixes, and two different types of
functional expressions are merged into one predicate:
one influences the factual meaning of the predicate
while the other is merely used for discourse purposes.
This triggers an increase in surface forms, which
hinders information extraction systems. In this
article, we present a novel normalization technique
that paraphrases complex functional expressions into
simplified forms that retain only the crucial meaning
of the predicate. We construct paraphrasing rules based
on linguistic theories in syntax and semantics. The
results of experiments indicate that our system
achieves a high accuracy of 79.7\%, while it reduces
the differences in functional expressions by up to
66.7\%. The results also show an improvement in the
performance of predicate extraction, providing
encouraging evidence of the usability of paraphrasing
as a means of normalizing different language
expressions.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sudoh:2013:SBP,
author = "Katsuhito Sudoh and Xianchao Wu and Kevin Duh and
Hajime Tsukada and Masaaki Nagata",
title = "Syntax-Based Post-Ordering for Efficient
{Japanese-to-English} Translation",
journal = j-TALIP,
volume = "12",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499955.2499960",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Mon Aug 19 18:39:55 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article proposes a novel reordering method for
efficient two-step Japanese-to-English statistical
machine translation (SMT) that isolates reordering from
SMT and solves it after lexical translation. This
reordering problem, called post-ordering, is solved as
an SMT problem from Head-Final English (HFE) to
English. HFE is syntax-based reordered English that is
very successfully used for reordering with
English-to-Japanese SMT. The proposed method
incorporates its advantage into the reverse direction,
Japanese-to-English, and solves the post-ordering
problem by accurate syntax-based SMT with target
language syntax. Two-step SMT with the proposed
post-ordering empirically reduces the decoding time of
the accurate but slow syntax-based SMT by its good
approximation using intermediate HFE. The proposed
method improves the decoding speed of syntax-based SMT
decoding by about six times with comparable translation
accuracy in Japanese-to-English patent translation
experiments.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sproat:2013:TP,
author = "Richard Sproat",
title = "{TALIP} Perspectives",
journal = j-TALIP,
volume = "12",
number = "4",
pages = "13:1--13:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523057.2523058",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Oct 30 12:33:24 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Paul:2013:HCB,
author = "Michael Paul and Andrew Finch and Eiichiro Sumita",
title = "How to Choose the Best Pivot Language for Automatic
Translation of Low-Resource Languages",
journal = j-TALIP,
volume = "12",
number = "4",
pages = "14:1--14:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2505126",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Oct 30 12:33:24 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Recent research on multilingual statistical machine
translation focuses on the usage of pivot languages in
order to overcome language resource limitations for
certain language pairs. Due to the richness of
available language resources, English is, in general,
the pivot language of choice. However, factors like
language relatedness can also effect the choice of the
pivot language for a given language pair, especially
for Asian languages, where language resources are
currently quite limited. In this article, we provide
new insights into what factors make a pivot language
effective and investigate the impact of these factors
on the overall pivot translation performance for
translation between 22 Indo-European and Asian
languages. Experimental results using state-of-the-art
statistical machine translation techniques revealed
that the translation quality of 54.8\% of the language
pairs improved when a non-English pivot language was
chosen. Moreover, 81.0\% of system performance
variations can be explained by a combination of factors
such as language family, vocabulary, sentence length,
language perplexity, translation model entropy,
reordering, monotonicity, and engine performance.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Huang:2013:CAT,
author = "Chung-Chi Huang and Mei-Hua Chen and Ping-Che Yang and
Jason S. Chang",
title = "A Computer-Assisted Translation and Writing System",
journal = j-TALIP,
volume = "12",
number = "4",
pages = "15:1--15:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2505984",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Oct 30 12:33:24 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "We introduce a method for learning to predict text and
grammatical construction in a computer-assisted
translation and writing framework. In our approach,
predictions are offered on the fly to help the user
make appropriate lexical and grammar choices during the
translation of a source text, thus improving
translation quality and productivity. The method
involves automatically generating general-to-specific
word usage summaries (i.e., writing suggestion module),
and automatically learning high-confidence word- or
phrase-level translation equivalents (i.e., translation
suggestion module). At runtime, the source text and its
translation prefix entered by the user are broken down
into $n$-grams to generate grammar and translation
predictions, which are further combined and ranked via
translation and language models. These ranked
prediction candidates are iteratively and interactively
displayed to the user in a pop-up menu as translation
or writing hints. We present a prototype writing
assistant, TransAhead, that applies the method to a
human-computer collaborative environment. Automatic and
human evaluations show that novice translators or
language learners substantially benefit from our system
in terms of translation performance (i.e., translation
accuracy and productivity) and language learning (i.e.,
collocation usage and grammar). In general, our
methodology of inline grammar and text predictions or
suggestions has great potential in the field of
computer-assisted translation, writing, or even
language learning.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Chu:2013:CJM,
author = "Chenhui Chu and Toshiaki Nakazawa and Daisuke Kawahara
and Sadao Kurohashi",
title = "{Chinese--Japanese} Machine Translation Exploiting
{Chinese} Characters",
journal = j-TALIP,
volume = "12",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523057.2523059",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Oct 30 12:33:24 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Chinese and Japanese languages share Chinese
characters. Since the Chinese characters in Japanese
originated from ancient China, many common Chinese
characters exist between these two languages. Since
Chinese characters contain significant semantic
information and common Chinese characters share the
same meaning in the two languages, they can be quite
useful in Chinese--Japanese machine translation (MT).
We therefore propose a method for creating a Chinese
character mapping table for Japanese, traditional
Chinese, and simplified Chinese, with the aim of
constructing a complete resource of common Chinese
characters. Furthermore, we point out two main problems
in Chinese word segmentation for Chinese--Japanese MT,
namely, unknown words and word segmentation
granularity, and propose an approach exploiting common
Chinese characters to solve these problems. We also
propose a statistical method for detecting other
semantically equivalent Chinese characters other than
the common ones and a method for exploiting shared
Chinese characters in phrase alignment. Results of the
experiments carried out on a state-of-the-art
phrase-based statistical MT system and an example-based
MT system show that our proposed approaches can improve
MT performance significantly, thereby verifying the
effectiveness of shared Chinese characters for
Chinese--Japanese MT.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Goto:2013:POP,
author = "Isao Goto and Masao Utiyama and Eiichiro Sumita",
title = "Post-Ordering by Parsing with {ITG} for
{Japanese--English} Statistical Machine Translation",
journal = j-TALIP,
volume = "12",
number = "4",
pages = "17:1--17:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2518100",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Oct 30 12:33:24 MDT 2013",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Word reordering is a difficult task for translation
between languages with widely different word orders,
such as Japanese and English. A previously proposed
post-ordering method for Japanese-to-English
translation first translates a Japanese sentence into a
sequence of English words in a word order similar to
that of Japanese, then reorders the sequence into an
English word order. We employed this post-ordering
framework and improved upon its reordering method. The
existing post-ordering method reorders the sequence of
English words via SMT, whereas our method reorders the
sequence by (1) parsing the sequence using ITG to
obtain syntactic structures which are similar to
Japanese syntactic structures, and (2) transferring the
obtained syntactic structures into English syntactic
structures according to the ITG. The experiments using
Japanese-to-English patent translation demonstrated the
effectiveness of our method and showed that both the
RIBES and BLEU scores were improved over compared
methods.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Strotgen:2014:TML,
author = "Jannik Str{\"o}tgen and Ayser Armiti and Tran Van Canh
and Julian Zell and Michael Gertz",
title = "Time for More Languages: Temporal Tagging of {Arabic},
{Italian}, {Spanish}, and {Vietnamese}",
journal = j-TALIP,
volume = "13",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2540989",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 27 12:18:55 MST 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Most of the research on temporal tagging so far is
done for processing English text documents. There are
hardly any multilingual temporal taggers supporting
more than two languages. Recently, the temporal tagger
HeidelTime has been made publicly available, supporting
the integration of new languages by developing
language-dependent resources without modifying the
source code. In this article, we describe our work on
developing such resources for two Asian and two Romance
languages: Arabic, Vietnamese, Spanish, and Italian.
While temporal tagging of the two Romance languages has
been addressed before, there has been almost no
research on Arabic and Vietnamese temporal tagging so
far. Furthermore, we analyze language-dependent
challenges for temporal tagging and explain the
strategies we followed to address them. Our evaluation
results on publicly available and newly annotated
corpora demonstrate the high quality of our new
resources for the four languages, which we make
publicly available to the research community.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Goto:2014:DMB,
author = "Isao Goto and Masao Utiyama and Eiichiro Sumita and
Akihiro Tamura and Sadao Kurohashi",
title = "Distortion Model Based on Word Sequence Labeling for
Statistical Machine Translation",
journal = j-TALIP,
volume = "13",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2537128",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 27 12:18:55 MST 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article proposes a new distortion model for
phrase-based statistical machine translation. In
decoding, a distortion model estimates the source word
position to be translated next (subsequent position;
SP) given the last translated source word position
(current position; CP). We propose a distortion model
that can simultaneously consider the word at the CP,
the word at an SP candidate, the context of the CP and
an SP candidate, relative word order among the SP
candidates, and the words between the CP and an SP
candidate. These considered elements are called rich
context. Our model considers rich context by
discriminating label sequences that specify spans from
the CP to each SP candidate. It enables our model to
learn the effect of relative word order among SP
candidates as well as to learn the effect of distances
from the training data. In contrast to the learning
strategy of existing methods, our learning strategy is
that the model learns preference relations among SP
candidates in each sentence of the training data. This
leaning strategy enables consideration of all of the
rich context simultaneously. In our experiments, our
model had higher BLEU and RIBES scores for
Japanese-English, Chinese--English, and German-English
translation compared to the lexical reordering
models.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Kim:2014:CLA,
author = "Seokhwan Kim and Minwoo Jeong and Jonghoon Lee and
Gary Geunbae Lee",
title = "Cross-Lingual Annotation Projection for
Weakly-Supervised Relation Extraction",
journal = j-TALIP,
volume = "13",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2529994",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 27 12:18:55 MST 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Although researchers have conducted extensive studies
on relation extraction in the last decade, statistical
systems based on supervised learning are still limited,
because they require large amounts of training data to
achieve high performance level. In this article, we
propose cross-lingual annotation projection methods
that leverage parallel corpora to build a relation
extraction system for a resource-poor language without
significant annotation efforts. To make our method more
reliable, we introduce two types of projection
approaches with noise reduction strategies. We
demonstrate the merit of our method using a Korean
relation extraction system trained on projected
examples from an English-Korean parallel corpus.
Experiments show the feasibility of our approaches
through comparison to other systems based on
monolingual resources.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Yahya:2014:ATC,
author = "Adnan Yahya and Ali Salhi",
title = "{Arabic} Text Categorization Based on {Arabic
Wikipedia}",
journal = j-TALIP,
volume = "13",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2537129",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 27 12:18:55 MST 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article describes an algorithm for categorizing
Arabic text, relying on highly categorized corpus-based
datasets obtained from the Arabic Wikipedia by using
manual and automated processes to build and customize
categories. The categorization algorithm was built by
adopting a simple categorization idea then moving
forward to more complex ones. We applied tests and
filtration criteria to reach the best and most
efficient results that our algorithm can achieve. The
categorization depends on the statistical relations
between the input (test) text and the reference
(training) data supported by well-defined
Wikipedia-based categories. Our algorithm supports two
levels for categorizing Arabic text; categories are
grouped into a hierarchy of main categories and
subcategories. This introduces a challenge due to the
correlation between certain subcategories and overlap
between main categories. We argue that our algorithm
achieved good performance compared to other methods
reported in the literature.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Church:2014:TPG,
author = "Kenneth Church",
title = "{TALIP} Perspectives, Guest Editorial Commentary: What
Counts (and What Ought to Count)?",
journal = j-TALIP,
volume = "13",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2559789",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Thu Feb 27 12:18:55 MST 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sulaiman:2014:EJS,
author = "Suliana Sulaiman and Khairuddin Omar and Nazlia Omar
and Mohd Zamri Murah and Hamdan Abdul Rahman",
title = "The Effectiveness of a {Jawi} Stemmer for Retrieving
Relevant {Malay} Documents in {Jawi} Characters",
journal = j-TALIP,
volume = "13",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2540988",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Jun 20 18:22:19 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Malay language has two types of writing script,
known as Rumi and Jawi. Most previous stemmer results
have reported on Malay Rumi characters and only a few
have tested Jawi characters. In this article, a new
Jawi stemmer has been proposed and tested for document
retrieval. A total of 36 queries and datasets from the
transliterated Jawi Quran were used. The experiment
shows that the mean average precision for a ``stemmed
Jawi'' document is 8.43\%. At the same time, the mean
average precision for a ``nonstemmed Jawi'' document is
5.14\%. The result from a paired sample t-test showed
that the use of a ``stemmed Jawi'' document increased
the precision in document retrieval. Further
experiments were performed to examine the precision of
the relevant documents that were retrieved at various
cutoff points for all 36 queries. The results for the
``stemmed Jawi'' document showed a significantly
different start, at a cutoff of 40, compared with the
``nonstemmed Jawi'' documents. This result shows the
usefulness of a Jawi stemmer for retrieving relevant
documents in the Jawi script.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Esmaili:2014:TKI,
author = "Kyumars Sheykh Esmaili and Shahin Salavati and
Anwitaman Datta",
title = "Towards {Kurdish} Information Retrieval",
journal = j-TALIP,
volume = "13",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2556948",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Jun 20 18:22:19 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Kurdish language is an Indo-European language
spoken in Kurdistan, a large geographical region in the
Middle East. Despite having a large number of speakers,
Kurdish is among the less-resourced languages and has
not seen much attention from the IR and NLP research
communities. This article reports on the outcomes of a
project aimed at providing essential resources for
processing Kurdish texts. A principal output of this
project is Pewan, the first standard Test Collection to
evaluate Kurdish Information Retrieval systems. The
other language resources that we have built include a
lightweight stemmer and a list of stopwords. Our second
principal contribution is using these newly-built
resources to conduct a thorough experimental study on
Kurdish documents. Our experimental results show that
normalization, and to a lesser extent, stemming, can
greatly improve the performance of Kurdish IR
systems.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sharma:2014:WPS,
author = "Manoj Kumar Sharma and Debasis Samanta",
title = "Word Prediction System for Text Entry in {Hindi}",
journal = j-TALIP,
volume = "13",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617590",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Jun 20 18:22:19 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/spell.bib;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Word prediction is treated as an efficient technique
to enhance text entry rate. Existing word prediction
systems predict a word when a user correctly enters the
initial few characters of the word. In fact, a word
prediction system fails if the user makes errors in the
initial input. Therefore, there is a need to develop a
word prediction system that predicts desired words
while coping with errors in initial entries. This
requirement is more relevant in the case of text entry
in Indian languages, which are involved with a large
set of alphabets, words with complex characters and
inflections, phonetically similar sets of characters,
etc. In fact, text composition in Indian languages
involves frequent spelling errors, which presents a
challenge to develop an efficient word prediction
system. In this article, we address this problem and
propose a novel word prediction system. Our proposed
approach has been tried with Hindi, the national
language of India. Experiments with users substantiate
43.77\% keystroke savings, 92.49\% hit rate, and
95.82\% of prediction utilization with the proposed
word prediction system. Our system also reduces the
spelling error by 89.75\%.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Keskes:2014:SAT,
author = "Iskandar Keskes and Farah Benamara Zitoune and Lamia
Hadrich Belguith",
title = "Splitting {Arabic} Texts into Elementary Discourse
Units",
journal = j-TALIP,
volume = "13",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2601401",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Jun 20 18:22:19 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In this article, we propose the first work that
investigates the feasibility of Arabic discourse
segmentation into elementary discourse units within the
segmented discourse representation theory framework. We
first describe our annotation scheme that defines a set
of principles to guide the segmentation process. Two
corpora have been annotated according to this scheme:
elementary school textbooks and newspaper documents
extracted from the syntactically annotated Arabic
Treebank. Then, we propose a multiclass supervised
learning approach that predicts nested units. Our
approach uses a combination of punctuation,
morphological, lexical, and shallow syntactic features.
We investigate how each feature contributes to the
learning process. We show that an extensive
morphological analysis is crucial to achieve good
results in both corpora. In addition, we show that
adding chunks does not boost the performance of our
system.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Rubin:2014:TPG,
author = "Victoria L. Rubin",
title = "{TALIP} Perspectives, Guest Editorial Commentary:
Pragmatic and Cultural Considerations for Deception
Detection in {Asian} Languages",
journal = j-TALIP,
volume = "13",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2605292",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Fri Jun 20 18:22:19 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In hopes of sparking a discussion, I argue for much
needed research on automated deception detection in
Asian languages. The task of discerning truthful texts
from deceptive ones is challenging, but a logical
sequel to opinion mining. I suggest that applied
computational linguists pursue broader
interdisciplinary research on cultural differences and
pragmatic use of language in Asian cultures, before
turning to detection methods based on a primarily
Western (English-centric) worldview. Deception is
fundamentally human, but how do various cultures
interpret and judge deceptive behavior?",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Na:2014:LAN,
author = "Hwidong Na and Jong-Hyeok Lee",
title = "Linguistic analysis of non-{ITG} word reordering
between language pairs with different word order
typologies",
journal = j-TALIP,
volume = "13",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2644810",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Oct 4 06:09:41 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "The Inversion Transduction Grammar (ITG) constraints
have been widely used for word reordering in machine
translation studies. They are, however, so restricted
that some types of word reordering cannot be handled
properly. We analyze three corpora between SVO and SOV
languages: Chinese--Korean, English--Japanese, and
English--Korean. In our analysis, sentences that require
non-ITG word reordering are manually categorized. We
also report the results for two quantitative measures
that reveal the significance of non-ITG word
reordering. In conclusion, we suggest that ITG
constraints are insufficient to deal with word
reordering in real situations.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{A:2014:AMO,
author = "Bharath A. and Sriganesh Madhvanath",
title = "Allograph modeling for online handwritten characters
in {Devanagari} using constrained stroke clustering",
journal = j-TALIP,
volume = "13",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629622",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Oct 4 06:09:41 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Writer-specific character writing variations such as
those of stroke order and stroke number are an
important source of variability in the input when
handwriting is captured ``online'' via a stylus and a
challenge for robust online recognition of handwritten
characters and words. It has been shown by several
studies that explicit modeling of character allographs
is important for achieving high recognition accuracies
in a writer-independent recognition system. While
previous approaches have relied on unsupervised
clustering at the character or stroke level to find the
allographs of a character, in this article we propose
the use of constrained clustering using automatically
derived domain constraints to find a minimal set of
stroke clusters. The allographs identified have been
applied to Devanagari character recognition using
Hidden Markov Models and Nearest Neighbor classifiers,
and the results indicate substantial improvement in
recognition accuracy and/or reduction in memory and
computation time when compared to alternate modeling
techniques.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Paik:2014:IBF,
author = "Jiaul H. Paik and Dipasree Pal and Swapan K. Parui",
title = "Incremental blind feedback: an effective approach to
automatic query expansion",
journal = j-TALIP,
volume = "13",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2611521",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Oct 4 06:09:41 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Automatic query expansion (AQE) is a useful technique
for enhancing the effectiveness of information
retrieval systems. In this article, we propose a novel
AQE algorithm which first adopts a systematic
incremental approach to choose feedback documents from
the top retrieved set and then selects the expansion
terms aggregating the scores from each feedback set. We
also devise a term selection measure and a number of
weighting schemes based on easily computable features.
A set of experiments with a large number of standard
test collections reveals that the proposed incremental
blind feedback algorithm outperforms a number of
state-of-the-art query expansion methods with
remarkable significance and consistency.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Saharia:2014:SRP,
author = "Navanath Saharia and Utpal Sharma and Jugal Kalita",
title = "Stemming resource-poor {Indian} languages",
journal = j-TALIP,
volume = "13",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629670",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Oct 4 06:09:41 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Stemming is a basic method for morphological
normalization of natural language texts. In this study,
we focus on the problem of stemming several
resource-poor languages from Eastern India, viz.,
Assamese, Bengali, Bishnupriya Manipuri and Bodo. While
Assamese, Bengali and Bishnupriya Manipuri are
Indo-Aryan, Bodo is a Tibeto-Burman language. We design
a rule-based approach to remove suffixes from words. To
reduce over-stemming and under-stemming errors, we
introduce a dictionary of frequent words. We observe
that, for these languages a dominant amount of suffixes
are single letters creating problems during suffix
stripping. As a result, we introduce an HMM-based
hybrid approach to classify the mis-matched last
character. For each word, the stem is extracted by
calculating the most probable path in four HMM states.
At each step we measure the stemming accuracy for each
language. We obtain 94\% accuracy for Assamese and
Bengali and 87\%, and 82\% for Bishnupriya Manipuri and
Bodo, respectively, using the hybrid approach. We
compare our work with Morfessor [Creutz and Lagus
2005]. As of now, there is no reported work on stemming
for Bishnupriya Manipuri and Bodo. Our results on
Assamese and Bengali show significant improvement over
prior published work [Sarkar and Bandyopadhyay 2008;
Sharma et al. 2002, 2003].",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Sproat:2014:SJ,
author = "Richard Sproat",
title = "The state of the journal",
journal = j-TALIP,
volume = "13",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656620",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Sat Oct 4 06:09:41 MDT 2014",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Bang:2014:PVP,
author = "Jeesoo Bang and Jonghoon Lee and Gary Geunbae Lee and
Minhwa Chung",
title = "Pronunciation Variants Prediction Method to Detect
Mispronunciations by {Korean} Learners of {English}",
journal = j-TALIP,
volume = "13",
number = "4",
pages = "16:1--16:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629545",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jan 7 15:23:49 MST 2015",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "This article presents an approach to nonnative
pronunciation variants modeling and prediction. The
pronunciation variants prediction method was developed
by generalized transformation-based error-driven
learning (GTBL). The modified goodness of pronunciation
(GOP) score was applied to effective mispronunciation
detection using logistic regression machine learning
under the pronunciation variants prediction.
English-read speech data uttered by Korean-speaking
learners of English were collected, then pronunciation
variation knowledge was extracted from the differences
between the canonical phonemes and the actual phonemes
of the speech data. With this knowledge, an
error-driven learning approach was designed that
automatically learns phoneme variation rules from
phoneme-level transcriptions. The learned rules
generate an extended recognition network to detect
mispronunciations. Three different mispronunciation
detection methods were tested including our logistic
regression machine learning method with modified GOP
scores and mispronunciation preference features; all
three methods yielded significant improvement in
predictions of pronunciation variants, and our logistic
regression method showed the best performance.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Liu:2014:DTL,
author = "Lemao Liu and Tiejun Zhao and Taro Watanabe and
Hailong Cao and Conghui Zhu",
title = "Discriminative Training for Log-Linear Based {SMT}:
Global or Local Methods",
journal = j-TALIP,
volume = "13",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637478",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jan 7 15:23:49 MST 2015",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "In statistical machine translation, the standard
methods such as MERT tune a single weight with regard
to a given development data. However, these methods
suffer from two problems due to the diversity and
uneven distribution of source sentences. First, their
performance is highly dependent on the choice of a
development set, which may lead to an unstable
performance for testing. Second, the sentence level
translation quality is not assured since tuning is
performed on the document level rather than on sentence
level. In contrast with the standard global training in
which a single weight is learned, we propose novel
local training methods to address these two problems.
We perform training and testing in one step by locally
learning the sentence-wise weight for each input
sentence. Since the time of each tuning step is
unnegligible and learning sentence-wise weights for the
entire test set means many passes of tuning, it is a
great challenge for the efficiency of local training.
We propose an efficient two-phase method to put the
local training into practice by employing the
ultraconservative update. On NIST Chinese-to-English
translation tasks with both medium and large scales of
training data, our local training methods significantly
outperform standard methods with the maximal
improvements up to 2.0 BLEU points, meanwhile their
efficiency is comparable to that of the standard
methods.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}
@Article{Zhuang:2014:EPP,
author = "Yi Zhuang and Qing Li and Dickson K. W. Chiu and
Zhiang Wu and Haiyang Hu",
title = "Efficient Personalized Probabilistic Retrieval of
{Chinese} Calligraphic Manuscript Images in Mobile
Cloud Environment",
journal = j-TALIP,
volume = "13",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629575",
ISSN = "1530-0226 (print), 1558-3430 (electronic)",
ISSN-L = "1530-0226",
bibdate = "Wed Jan 7 15:23:49 MST 2015",
bibsource = "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/talip.bib",
abstract = "Ancient language manuscripts constitute a key part of
the cultural heritage of mankind. As one of the most
important languages, Chinese historical calligraphy
work has contributed to not only the Chinese cultural
heritage but also the world civilization at large,
especially for Asia. To support deeper and more
convenient appreciation of Chinese calligraphy works,
based on our previous work on the probabilistic
retrieval of historical Chinese calligraphic character
manuscripts repositories, we propose a system framework
of the multi-feature-based Chinese calligraphic
character images probabilistic retrieval in the mobile
cloud network environment, which is called the DPRC. To
ensure retrieval efficiency, we further propose four
enabling techniques: (1) DRL-based probability
propagation, (2) optimal data placement scheme, (3)
adaptive data robust transmission algorithm, and (4)
index support filtering scheme. Comprehensive
experiments are conducted to testify the effectiveness
and efficiency of our proposed DPRC method.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Asian Language Information
Processing",
journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}