@Preamble{"\input bibnames.sty" #
"\ifx \Thorn \undefined \def \Thorn {T}\fi" #
"\hyphenation{
}"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-TOMCCAP = "ACM Transactions on Multimedia Computing,
Communications, and Applications"}
@String{j-TOMM = "ACM Transactions on Multimedia Computing,
Communications, and Applications"}
@Article{Georganas:2005:EBA,
author = "Nicolas D. Georganas",
title = "{Editorial}: {The} birth of the {ACM Transactions on
Multimedia Computing, Communications and Applications}
{(TOMCCAP)}",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "1--2",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rowe:2005:ASR,
author = "Lawrence A. Rowe and Ramesh Jain",
title = "{ACM SIGMM Retreat} report on future directions in
multimedia research",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "3--13",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jain:2005:GEI,
author = "Ramesh Jain and Thomas Plagemann and Ralf Steinmetz",
title = "Guest editorial: {The International ACM Multimedia
Conference 1993} --- ten years after",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "14--15",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Teodosio:2005:SS,
author = "Laura Teodosio and Walter Bender",
title = "Salient stills",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "16--36",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Reddy:2005:DSM,
author = "A. L. N. Reddy and Jim Wyllie and K. B. R.
Wijayaratne",
title = "Disk scheduling in a multimedia {I/O} system",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "37--59",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Buchanan:2005:ATL,
author = "M. Cecelia Buchanan and Polle T. Zellweger",
title = "Automatic temporal layout mechanisms revisited",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "60--88",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bulterman:2005:SMA,
author = "Dick C. A. Bulterman and Lynda Hardman",
title = "Structured multimedia authoring",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "89--109",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mayer-Patel:2005:BSM,
author = "Ketan Mayer-Patel and Brian C. Smith and Lawrence A.
Rowe",
title = "The {Berkeley} software {MPEG-1} video decoder",
journal = j-TOMCCAP,
volume = "1",
number = "1",
pages = "110--125",
month = feb,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Apr 14 11:01:03 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Plagemann:2005:SPA,
author = "Thomas Plagemann and Prashant Shenoy and John R.
Smith",
title = "Selected papers from the {ACM Multimedia Conference
2003}",
journal = j-TOMCCAP,
volume = "1",
number = "2",
pages = "127--127",
month = may,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Jul 7 13:52:13 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kum:2005:RTM,
author = "Sang-Uok Kum and Ketan Mayer-Patel",
title = "Real-time multidepth stream compression",
journal = j-TOMCCAP,
volume = "1",
number = "2",
pages = "128--150",
month = may,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Jul 7 13:52:13 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Feng:2005:PSL,
author = "Wu-Chi Feng and Ed Kaiser and Wu Chang Feng and Mikael
Le Baillif",
title = "{Panoptes}: scalable low-power video sensor networking
technologies",
journal = j-TOMCCAP,
volume = "1",
number = "2",
pages = "151--167",
month = may,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Jul 7 13:52:13 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Goh:2005:SFD,
author = "Kingshy Goh and Beitao Li and Edward Y. Chang",
title = "Semantics and feature discovery via confidence-based
ensemble",
journal = j-TOMCCAP,
volume = "1",
number = "2",
pages = "168--189",
month = may,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Jul 7 13:52:13 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Baker:2005:UPC,
author = "H. Harlyn Baker and Nina Bhatti and Donald Tanguay and
Irwin Sobel and Dan Gelb and Michael E. Goss and W.
Bruce Culbertson and Thomas Malzbender",
title = "Understanding performance in {Coliseum}, an immersive
videoconferencing system",
journal = j-TOMCCAP,
volume = "1",
number = "2",
pages = "190--210",
month = may,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Jul 7 13:52:13 MDT 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Adams:2005:IIM,
author = "Brett Adams and Svetha Venkatesh and Ramesh Jain",
title = "{IMCE}: {Integrated} media creation environment",
journal = j-TOMCCAP,
volume = "1",
number = "3",
pages = "211--247",
month = aug,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Fri Nov 18 08:30:19 MST 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Poellabauer:2005:FCD,
author = "Christian Poellabauer and Karsten Schwan",
title = "Flexible cross-domain event delivery for
quality-managed multimedia applications",
journal = j-TOMCCAP,
volume = "1",
number = "3",
pages = "248--268",
month = aug,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Fri Nov 18 08:30:19 MST 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cooper:2005:TEC,
author = "Matthew Cooper and Jonathan Foote and Andreas
Girgensohn and Lynn Wilcox",
title = "Temporal event clustering for digital photo
collections",
journal = j-TOMCCAP,
volume = "1",
number = "3",
pages = "269--288",
month = aug,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Fri Nov 18 08:30:19 MST 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2005:CEM,
author = "Keqiu Li and Hong Shen",
title = "Coordinated enroute multimedia object caching in
transcoding proxies for tree networks",
journal = j-TOMCCAP,
volume = "1",
number = "3",
pages = "289--314",
month = aug,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Fri Nov 18 08:30:19 MST 2005",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2005:AFE,
author = "Huahui Wu and Mark Claypool and Robert Kinicki",
title = "Adjusting forward error correction with temporal
scaling for {TCP}-friendly streaming {MPEG}",
journal = j-TOMCCAP,
volume = "1",
number = "4",
pages = "315--337",
month = nov,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cai:2005:LUL,
author = "Jianfei Cai and Xiangjun Li and Chang Wen Chen",
title = "Layered unequal loss protection with pre-interleaving
for fast progressive image transmission over
packet-loss channels",
journal = j-TOMCCAP,
volume = "1",
number = "4",
pages = "338--353",
month = nov,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tu:2005:ASP,
author = "Yi-Cheng Tu and Jianzhong Sun and Mohamed Hefeeda and
Sunil Prabhakar",
title = "An analytical study of peer-to-peer media streaming
systems",
journal = j-TOMCCAP,
volume = "1",
number = "4",
pages = "354--376",
month = nov,
year = "2005",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lew:2006:CBM,
author = "Michael S. Lew and Nicu Sebe and Chabane Djeraba and
Ramesh Jain",
title = "Content-based multimedia information retrieval:
{State} of the art and challenges",
journal = j-TOMCCAP,
volume = "2",
number = "1",
pages = "1--19",
month = feb,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{DelBimbo:2006:CBR,
author = "Alberto {Del Bimbo} and Pietro Pala",
title = "Content-based retrieval of {$3$D} models",
journal = j-TOMCCAP,
volume = "2",
number = "1",
pages = "20--43",
month = feb,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Xu:2006:FAF,
author = "Huaxin Xu and Tat-Seng Chua",
title = "Fusion of {AV} features and external information
sources for event detection in team sports video",
journal = j-TOMCCAP,
volume = "2",
number = "1",
pages = "44--67",
month = feb,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Joshi:2006:SPE,
author = "Dhiraj Joshi and James Z. Wang and Jia Li",
title = "The {Story Picturing Engine}---a system for automatic
text illustration",
journal = j-TOMCCAP,
volume = "2",
number = "1",
pages = "68--89",
month = feb,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Snoek:2006:LRS,
author = "Cees G. M. Snoek and Marcel Worring and Alexander G.
Hauptmann",
title = "Learning rich semantics from news video archives by
style analysis",
journal = j-TOMCCAP,
volume = "2",
number = "2",
pages = "91--108",
month = may,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2006:SER,
author = "Guang Yang and Tony Sun and Mario Gerla and M. Y.
Sanadidi and Ling-Jyh Chen",
title = "Smooth and efficient real-time video transport in the
presence of wireless errors",
journal = j-TOMCCAP,
volume = "2",
number = "2",
pages = "109--126",
month = may,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shao:2006:ASM,
author = "Xi Shao and Changsheng Xu and Namunu C. Maddage and Qi
Tian and Mohan S. Kankanhalli and Jesse S. Jin",
title = "Automatic summarization of music videos",
journal = j-TOMCCAP,
volume = "2",
number = "2",
pages = "127--148",
month = may,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Eide:2006:RTV,
author = "Viktor S. Wold Eide and Ole-Christoffer Granmo and
Frank Eliassen and J{\o}rgen Andreas Michaelsen",
title = "Real-time video content analysis: {QoS}-aware
application composition and parallel processing",
journal = j-TOMCCAP,
volume = "2",
number = "2",
pages = "149--172",
month = may,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Candan:2006:ISI,
author = "K. Sel{\c{c}}uk Candan and Augusto Celentano and
Wolfgang Klas",
title = "Introduction to special issue on the use of context in
multimedia information systems",
journal = j-TOMCCAP,
volume = "2",
number = "3",
pages = "173--176",
month = aug,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ferrara:2006:SWO,
author = "Alfio Ferrara and Luca A. Ludovico and Stefano
Montanelli and Silvana Castano and Goffredo Haus",
title = "A {Semantic Web} ontology for context-based
classification and retrieval of music resources",
journal = j-TOMCCAP,
volume = "2",
number = "3",
pages = "177--198",
month = aug,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Arigon:2006:HMP,
author = "Anne-Muriel Arigon and Anne Tchounikine and Maryvonne
Miquel",
title = "Handling multiple points of view in a multimedia data
warehouse",
journal = j-TOMCCAP,
volume = "2",
number = "3",
pages = "199--218",
month = aug,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kahol:2006:MCH,
author = "Kanav Kahol and Priyamvada Tripathi and Troy Mcdaniel
and Laura Bratton and Sethuraman Panchanathan",
title = "Modeling context in haptic perception, rendering, and
visualization",
journal = j-TOMCCAP,
volume = "2",
number = "3",
pages = "219--240",
month = aug,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Thu Sep 7 16:13:26 MDT 2006",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gulliver:2006:DUP,
author = "Stephen R. Gulliver and Gheorghita Ghinea",
title = "Defining user perception of distributed multimedia
quality",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "241--257",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gopalan:2006:SAC,
author = "Kartik Gopalan and Lan Huang and Gang Peng and
Tzi-Cker Chiueh and Yow-Jian Lin",
title = "Statistical admission control using delay distribution
measurements",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "258--281",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2006:MSP,
author = "H. Li and M. Li and B. Prabhakaran",
title = "Middleware for streaming {$3$D} progressive meshes
over lossy networks",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "282--317",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Etsion:2006:PPU,
author = "Yoav Etsion and Dan Tsafrir and Dror G. Feitelson",
title = "Process prioritization using output production:
{Scheduling} for multimedia",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "318--342",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cesar:2006:GAH,
author = "Pablo Cesar and Petri Vuorimaa and Juha Vierinen",
title = "A graphics architecture for high-end interactive
television terminals",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "343--357",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Madhwacharyula:2006:MHV,
author = "Chitra L. Madhwacharyula and Marc Davis and Philippe
Mulhem and Mohan S. Kankanhalli",
title = "Metadata handling: a video perspective",
journal = j-TOMCCAP,
volume = "2",
number = "4",
pages = "358--388",
month = nov,
year = "2006",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Atrey:2007:GOO,
author = "Pradeep K. Atrey and Mohan S. Kankanhalli and John B.
Oommen",
title = "Goal-oriented optimal subset selection of correlated
multimedia streams",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2007:DSI,
author = "Datong Chen and Jie Yang and Robert Malkin and Howard
D. Wactlar",
title = "Detecting social interactions of the elderly in a
nursing home environment",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Heck:2007:VV,
author = "Rachel Heck and Michael Wallick and Michael Gleicher",
title = "Virtual videography",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Truong:2007:VAS,
author = "Ba Tu Truong and Svetha Venkatesh",
title = "Video abstraction: a systematic review and
classification",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Xu:2007:CAD,
author = "Changsheng Xu and Namunu C. Maddage and Xi Shao and Qi
Tian",
title = "Content-adaptive digital music watermarking based on
music structure analysis",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yan:2007:MSO,
author = "Wei-Qi Yan and Mohan S. Kankanhalli",
title = "Multimedia simplification for optimized {MMS}
synthesis",
journal = j-TOMCCAP,
volume = "3",
number = "1",
pages = "??--??",
month = feb,
year = "2007",
CODEN = "????",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Apr 14 11:19:17 MDT 2007",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2007:CAT,
author = "Tiecheng Liu and John R. Kender",
title = "Computational approaches to temporal sampling of video
sequences",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230813",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video key frame extraction is one of the most
important research problems for video summarization,
indexing, and retrieval. For a variety of applications
such as ubiquitous media access and video streaming,
the temporal boundaries between video key frames are
required for synchronizing visual content with audio.
In this article, we define temporal video sampling as a
unified process of extracting video key frames and
computing their temporal boundaries, and formulate it
as an optimization problem. We first provide an optimal
approach that minimizes temporal video sampling error
using a dynamic programming process. The optimal
approach retrieves a key frame hierarchy and all
temporal boundaries in $ O(n^4) $ time and $ O(n^2) $
space. To further reduce computational complexity, we
also provide a suboptimal greedy algorithm that
exploits the data structure of a binary heap and uses a
novel ``look-ahead'' computational technique, enabling
all levels of key frames to be extracted with an
average-case computational time of $ O(n \log n) $ and
memory usage of $ O(n) $. Both the optimal and the
greedy methods are free of parameters, thus avoiding
the threshold-selection problem that exists in other
approaches. We empirically compare the proposed optimal
and greedy methods with several existing methods in
terms of video sampling error, computational cost, and
subjective quality. An evaluation of eight videos of
different genres shows that the greedy approach
achieves performance very close to that of the optimal
approach while drastically reducing computational cost,
making it suitable for processing long video sequences
in large video databases.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "key frame selection; temporal video sampling;
ubiquitous media access; video content analysis; video
summarization",
}
@Article{Moncrieff:2007:OAB,
author = "Simon Moncrieff and Svetha Venkatesh and Geoff West",
title = "Online audio background determination for complex
audio environments",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230814",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We present a method for foreground/background
separation of audio using a background modelling
technique. The technique models the background in an
online, unsupervised, and adaptive fashion, and is
designed for application to long-term surveillance and
monitoring problems. The background is determined using
a statistical method to model the states of the audio
over time. In addition, three methods are used to
increase the accuracy of background modelling in
complex audio environments. Such environments can cause
the failure of the statistical model to accurately
capture the background states. An entropy-based
approach is used to unify background representations
fragmented over multiple states of the statistical
model. The approach successfully unifies such
background states, resulting in a more robust
background model. We adaptively adjust the number of
states considered background according to background
complexity, resulting in the more accurate
classification of background models. Finally, we use an
auxiliary model cache to retain potential background
states in the system. This prevents the deletion of
such states due to a rapid influx of observed states
that can occur for highly dynamic sections of the audio
signal. The separation algorithm was successfully
applied to a number of audio environments representing
monitoring applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "audio analysis; online background modelling;
surveillance and monitoring",
}
@Article{Oshima:2007:PDS,
author = "Chika Oshima and Kazushi Nishimoto and Norihiro
Hagita",
title = "A piano duo support system for parents to lead
children to practice musical performances",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230815",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose ``Family Ensemble,'' a
piano duo support system for a musically inept parent
and his/her child who is a beginner at playing the
piano. The system makes it easier for parents to
correctly reproduce a given sequence of pitches along
with the child's performance by using score tracking
and note-replacement functions. The experiments with
this support system showed that the parents can
immediately participate in the piano duo. Furthermore,
we found that during joint practices using Family
Ensemble some subjects discussed musical ideas that
they would not have talked about without using the
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "entertainment; musical expression; piano duo; score
tracking; support system",
}
@Article{He:2007:CSW,
author = "Xiaofei He and Deng Cai and Ji-Rong Wen and Wei-Ying
Ma and Hong-Jiang Zhang",
title = "Clustering and searching {WWW} images using link and
page layout analysis",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "10:1--10:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230816",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Due to the rapid growth of the number of digital
images on the Web, there is an increasing demand for an
effective and efficient method for organizing and
retrieving the available images. This article describes
iFind, a system for clustering and searching WWW
images. By using a vision-based page segmentation
algorithm, a Web page is partitioned into blocks, and
the textual and link information of an image can be
accurately extracted from the block containing that
image. The textual information is used for image
indexing. By extracting the page-to-block,
block-to-image, block-to-page relationships through
link structure and page layout analysis, we construct
an image graph. Our method is less sensitive to noisy
links than previous methods like PageRank, HITS, and
PicASHOW, and hence the image graph can better reflect
the semantic relationship between images. Using the
notion of a Markov chain, we can compute the limiting
probability distributions of the images, ImageRanks,
which characterize the importance of the images. The
ImageRanks are combined with the relevance scores to
produce the final ranking for image search. With the
graph models, we can also use techniques from spectral
graph theory for image clustering and embedding, or 2-D
visualization. Some experimental results on 11.6
million images downloaded from the Web are provided in
the article.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "image clustering; image search; link analysis; Web
mining",
}
@Article{Jung:2007:NBA,
author = "Byunghee Jung and Junehwa Song and Yoonjoon Lee",
title = "A narrative-based abstraction framework for
story-oriented video",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "11:1--11:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230817",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes a novel video abstraction
framework for online review services of story-oriented
videos such as dramas. Among the many genres of TV
programs, a drama is one of the most popularly watched
on the Web. The abstracts generated by the proposed
framework not only give a summary of a video but also
effectively help viewers understand the overall story.
In addition, our method is duration-flexible. We get
clues about human understanding of a story from
scenario writing rules and editorial techniques that
are popularly used in the process of video production
to explicitly express a narrative, and propose a new
video abstraction model, called a Narrative Abstraction
Model. The model effectively captures the narrative
structure embedded in a story-oriented video and
articulates the progress of the story in a weighted
directed graph, called a Narrative Structure Graph
(NSG). The model provides a basis for a flexible
framework for abstract generation using the NSG as the
intermediary representation of a video. Different
abstracts can be appropriately generated based upon
different user requirements. To show the effectiveness
of the proposed model and method, we developed a video
abstraction system realizing the framework, and
successfully applied it to large volumes of TV dramas.
The evaluation results show that the proposed framework
is a feasible solution for online review services.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "film; narrative structure; online review services;
story understanding; story-oriented; video abstraction;
video abstraction system",
}
@Article{Shacham:2007:UDP,
author = "Ron Shacham and Henning Schulzrinne and Srisakul
Thakolsri and Wolfgang Kellerer",
title = "Ubiquitous device personalization and use: {The} next
generation of {IP} multimedia communications",
journal = j-TOMCCAP,
volume = "3",
number = "2",
pages = "12:1--12:??",
month = may,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1230812.1230818",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:04 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Service usage in emerging ubiquitous environments
includes seamless and personalized usage of public and
private devices discovered in the vicinity of a user.
In our work, we describe an architecture for device
discovery, device configuration, and the transfer of
active sessions between devices. The presented
architecture uses the Session Initiation Protocol (SIP)
as a standardized, widely used signaling protocol for
IP-based multimedia services. Our solution includes
support of simple existing devices, split of sessions
between devices, user-control of location-based
behavior, and handling of security and privacy
concerns. We present the implementation and show the
feasibility of our work with analytical evaluation and
measurements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Internet multimedia; location-based services; mobile
communications; ubiquitous computing",
}
@Article{Chen:2007:EMO,
author = "Herng-Yow Chen and Sheng-Wei Li",
title = "Exploring many-to-one speech-to-text correlation for
{Web}-based language learning",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236472",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article investigates the correlations between
multimedia objects (particularly speech and text)
involved in language lectures in order to design an
effective presentation mechanism for web-based
learning. The cross-media correlations are classified
into implicit relations (retrieved by computing) and
explicit relations (recorded during the preprocessing
stage). The implicit temporal correlation between
speech and text primarily helps to support
supplementary lecture navigation features like tele-pointer
movement, lips-sync movement, and content scrolling. We
propose a speech-text alignment framework, using an
iterative algorithm based on local alignment, to probe
many-to-one temporal correlations rather than only
one-to-one ones. The proposed framework is a more
practical method for analyzing general language
lectures, and the algorithm's time complexity conforms
to the best-possible computation cost, O(nm), without
introducing additional computation. In addition, we
have shown the feasibility of creating vivid
presentations by exploiting implicit relations and
artificially simulating some explicit media. To
facilitate the navigation of integrated multimedia
documents, we develop several visualization techniques
for describing media correlations, including guidelines
for speech-text correlations, visible-automatic
scrolling, and levels of detail of timeline, to provide
intuitive and easy-to-use random access mechanisms. We
evaluated the performance of the analysis method and
human perceptions of the synchronized presentation. The
overall performance of the analysis method is that
about 99.5\% of the words analyzed have a temporal
error within 0.5 sec, and the subjective evaluation
result shows that the synchronized presentation is
highly acceptable to human beings.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "analysis and presentation; computed synchronization;
cross-media correlation; lips sync; speech-to-text
alignment",
}
@Article{Wang:2007:EST,
author = "Surong Wang and Manoranjan Dash and Liang-Tien Chia
and Min Xu",
title = "Efficient sampling of training set in large and noisy
multimedia data",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236473",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "As the amount of multimedia data is increasing
day-by-day thanks to less expensive storage devices and
increasing numbers of information sources, machine
learning algorithms are faced with large-sized and
noisy datasets. Fortunately, the use of a good sampling
set for training influences the final results
significantly. But using a simple random sample (SRS)
may not obtain satisfactory results because such a
sample may not adequately represent the large and noisy
dataset due to its blind approach in selecting samples.
The difficulty is particularly apparent for huge
datasets where, due to memory constraints, only very
small sample sizes are used. This is typically the case
for multimedia applications, where data size is usually
very large. In this article we propose a new and
efficient method to sample large and noisy
multimedia data. The proposed method is based on a
simple distance measure that compares the histograms of
the sample set and the whole set in order to estimate
the representativeness of the sample. The proposed
method deals with noise in an elegant manner, which SRS
and other methods are unable to do. We
experiment on image and audio datasets. Comparison with
SRS and other methods shows that the proposed method is
vastly superior in terms of sample representativeness,
particularly for small sample sizes, although time-wise
it is comparable to SRS, the least expensive method in
terms of time.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "audio event identification; histogram; image
classification; noise; sampling",
}
@Article{Zhou:2007:CCO,
author = "Suiping Zhou and Wentong Cai and Stephen J. Turner and
Bu-Sung Lee and Junhu Wei",
title = "Critical causal order of events in distributed virtual
environments",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "15:1--15:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236474",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We investigate the causal order of events in
distributed virtual environments (DVEs). We first
define the critical causal order relation among the
events. Then, we propose some mechanisms to enhance the
prevalent RO (receive order delivery) mechanism in DVEs
so that the real-time property of DVEs is preserved
while the critical causal order violations are reduced.
These mechanisms are implemented as a middleware.
Experimental results show that the middleware performs
well in reducing the critical causality violations in
simulation and incurs little processing overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "causal order; distributed simulation; virtual
environments",
}
@Article{Li:2007:SRM,
author = "Chuanjun Li and S. Q. Zheng and B. Prabhakaran",
title = "Segmentation and recognition of motion streams by
similarity search",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "16:1--16:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236475",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Fast and accurate recognition of motion data streams
from gesture sensing and motion capture devices has
many applications and is the focus of this article.
Based on the analysis of the geometric structures
revealed by singular value decompositions (SVD) of
motion data, a similarity measure is proposed for
simultaneously segmenting and recognizing motion
streams. A direction identification approach is
explored to further differentiate motions with similar
data geometric structures. Experiments show that the
proposed similarity measure can segment and recognize
motion streams of variable lengths with high accuracy,
without knowing beforehand the number of motions in a
stream.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "gesture recognition; motion capture; pattern analysis;
principal component analysis; segmentation; similarity
measures; singular value decomposition",
}
@Article{Ott:2007:OAT,
author = "David E. Ott and Ketan Mayer-Patel",
title = "An open architecture for transport-level protocol
coordination in distributed multimedia applications",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236476",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We consider the problem of flow coordination in
distributed multimedia applications. Most
transport-level protocols are designed to operate
independently and lack mechanisms for sharing
information with other flows and coordinating data
transport in various ways. This limitation becomes
problematic in distributed applications that employ
numerous flows between two computing clusters sharing
the same intermediary forwarding path across the
Internet. In this article, we propose an open
architecture that supports the sharing of network state
information, peer flow information, and
application-specific information. Called simply the
coordination protocol (CP), the scheme facilitates
coordination of network resource usage across flows
belonging to the same application, as well as aiding
other types of coordination. The effectiveness of our
approach is illustrated in the context of
multistreaming in 3D tele-immersion where consistency
of network information across flows both greatly
improves frame transport synchrony and minimizes
buffering delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "distributed applications; flow coordination; network
protocols",
}
@Article{Sakr:2007:RCB,
author = "Ziad Sakr and Nicolas D. Georganas",
title = "Robust content-based {MPEG}-4 {XMT} scene structure
authentication and multimedia content location",
journal = j-TOMCCAP,
volume = "3",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1236471.1236477",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:10:32 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For the past decade, there have been numerous research
works focusing on the protection of digital images,
audio, video, 3D virtual scenes, and software data from
unauthorized use and distribution. With the emerging
technology of the MPEG-4 standard, MPEG-4 scenes that
may include images, video, audio, and 3D objects can
easily be built using the text-based MPEG-4 XMT
standard. XMT allows content authors to exchange their
content with other authors, tools, or service providers
and facilitates interoperability with MPEG-4, X3D, and
SMIL. In order for owners and designers to protect
and/or authenticate their work, some form of security
needs to be applied to the MPEG-4 XMT structure and
its media content. Unlike images or videos,
watermarking an XMT structure is not an easy task,
since the structure contains no noise components to
embed the watermark. This article is the first to
propose a novel robust algorithm for the
authentication of a given MPEG-4 XMT structured scene
and the location of its multimedia content.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "MPEG-4; multimedia; polynomial; pseudorandom
sequences; steganography; VRML; watermarking; XML;
XMT",
}
@Article{Ghinea:2007:ISI,
author = "Gheorghita Ghinea and Chabane Djeraba and Stephen
Gulliver and Kara Pernice Coyne",
title = "Introduction to special issue on eye-tracking
applications in multimedia systems",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "1:1--1:4",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314304",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Colombo:2007:RTR,
author = "Carlo Colombo and Dario Comanducci and Alberto {Del
Bimbo}",
title = "Robust tracking and remapping of eye appearance with
passive computer vision",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "2:1--2:20",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314305",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A single-camera iris-tracking and remapping approach
based on passive computer vision is presented. Tracking
is aimed at obtaining accurate and robust measurements
of the iris/pupil position. To this purpose, a robust
method for ellipse fitting is used, employing search
constraints so as to achieve better performance with
respect to the standard RANSAC algorithm. Tracking also
embeds an iris localization algorithm (working as a
bootstrap multiple-hypotheses generation step), and a
blink detector that can detect voluntary eye blinks in
human-computer interaction applications. On-screen
remapping incorporates a head-tracking method capable
of compensating for small user-head movements. The
approach operates in real time under different light
conditions and in the presence of distractors. An
extensive set of experiments is presented and
discussed. In particular, an evaluation method for the
choice of layout of both hardware components and
calibration points is described. Experiments also
investigate the importance of providing a visual
feedback to the user, and the benefits gained from
performing head compensation, especially during
image-to-screen map calibration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "eye blink detection; eye tracking and remapping;
eye-driven human-computer interaction; robust fitting",
}
@Article{Wang:2007:UGP,
author = "Jun Wang and Lijun Yin and Jason Moore",
title = "Using geometric properties of topographic manifold to
detect and track eyes for human-computer interaction",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "3:1--3:20",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314306",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Automatic eye detection and tracking is an important
component for advanced human-computer interface design.
Accurate eye localization can help develop a successful
system for face recognition and emotion identification.
In this article, we propose a novel approach to detect
and track eyes using geometric surface features on
topographic manifold of eye images. First, in the joint
spatial-intensity domain, a facial image is treated as
a 3D terrain surface or image topographic manifold. In
particular, eye regions exhibit certain intrinsic
geometric traits on this topographic manifold, namely,
the pit-labeled center and hillside-like surround
regions. Applying a terrain classification procedure on
the topographic manifold of facial images, each
location of the manifold can be labeled to generate a
terrain map. We use the distribution of terrain labels
to represent the eye terrain pattern. The Bhattacharyya
affinity is employed to measure the distribution
similarity between two topographic manifolds. Based on
the Bhattacharyya kernel, a support vector machine is
applied for selecting proper eye pairs from the
pit-labeled candidates. Second, given detected eyes on
the first frame of a video sequence, a
mutual-information-based fitting function is defined to
describe the similarity between two terrain surfaces of
neighboring frames. By optimizing the fitting function,
eye locations are updated for subsequent frames. The
distinction of the proposed approach lies in that both
eye detection and eye tracking are performed on the
derived topographic manifold, rather than on an
original-intensity image domain. The robustness of the
approach is demonstrated under various imaging
conditions and with different facial appearances, using
both static images and video sequences without
background constraints.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Bhattacharyya affinity; eye detection; eye tracking;
mutual information; topographic manifold",
}
@Article{Agrafiotis:2007:TEC,
author = "D. Agrafiotis and S. J. C. Davies and N. Canagarajah
and D. R. Bull",
title = "Towards efficient context-specific video coding based
on gaze-tracking analysis",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "4:1--4:15",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314307",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article discusses a framework for model-based,
context-dependent video coding based on exploitation of
characteristics of the human visual system. The system
utilizes variable-quality coding based on priority maps
which are created using mostly context-dependent rules.
The technique is demonstrated through two case studies
of specific video context, namely open signed content
and football sequences. Eye-tracking analysis is
employed for identifying the characteristics of each
context, which are subsequently exploited for coding
purposes, either directly or through a gaze prediction
model. The framework is shown to achieve a considerable
improvement in coding efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "applications; context-based video coding; eye
tracking; multimedia perceptual quality; subjective
video quality; transformation of eye movements into
useful knowledge",
}
@Article{Urruty:2007:DEF,
author = "Thierry Urruty and Stanislas Lew and Nacim Ihadaddene
and Dan A. Simovici",
title = "Detecting eye fixations by projection clustering",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "5:1--5:20",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314308",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Eye movements are certainly the most natural and
repetitive movement of a human being. The most mundane
activity, such as watching television or reading a
newspaper, involves this automatic activity which
consists of shifting our gaze from one point to
another.\par
Identification of the components of eye movements
(fixations and saccades) is an essential part in the
analysis of visual behavior because these types of
movements provide the basic elements used by further
investigations of human vision.\par
However, many of the algorithms that detect fixations
present a number of problems. In this article, we
present a new fixation identification technique that is
based on clustering of eye positions, using projections
and projection aggregation applied to static pictures.
We also present a new method that computes dispersion
of eye fixations in videos considering a multiuser
environment.\par
To demonstrate the performance and usefulness of our
approach, we discuss our experimental work with two
different applications: fixed images and video.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "eye fixations; interaction modeling; projected
clustering; static pictures; videos",
}
@Article{Duchowski:2007:FGC,
author = "Andrew T. Duchowski and Arzu {\c{C}}{\"o}ltekin",
title = "Foveated gaze-contingent displays for peripheral {LOD}
management, {$3$D} visualization, and stereo imaging",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "6:1--6:18",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314309",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Advancements in graphics hardware have allowed
development of hardware-accelerated imaging displays.
This article reviews techniques for real-time
simulation of arbitrary visual fields over still images
and video. The goal is to provide the vision sciences
and perceptual graphics communities with techniques for the
investigation of fundamental processes of visual
perception. Classic gaze-contingent displays used for
these purposes are reviewed, and for the first time a
pixel shader is introduced for display of a
high-resolution window over a peripherally degraded
stimulus. The pixel shader advances the current
state of the art by allowing real-time processing of
still or streamed images, obviating the need for
preprocessing or storage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "eye tracking; foveation; gaze-contingent displays;
level-of-detail",
}
@Article{Loschky:2007:HLC,
author = "Lester C. Loschky and Gary S. Wolverton",
title = "How late can you update gaze-contingent
multiresolutional displays without detection?",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "7:1--7:10",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314310",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This study investigated perceptual disruptions in
gaze-contingent multiresolutional displays (GCMRDs) due
to delays in updating the center of highest resolution
after an eye movement. GCMRDs can be used to save
processing resources and transmission bandwidth in many
types of single-user display applications, such as
virtual reality, video-telephony, simulators, and
remote piloting. The current study found that image
update delays as late as 60 ms after an eye movement
did not significantly increase the detectability of
image blur and/or motion transients due to the update.
This is good news for designers of GCMRDs, since 60 ms
is ample time to update many GCMRDs after an eye
movement without disrupting perception. The study also
found that longer eye movements led to greater blur
and/or transient detection due to moving the eyes
further into the low-resolution periphery, effectively
reducing the image resolution at fixation prior to the
update. In GCMRD applications where longer saccades are
more likely (e.g., displays with relatively large
distances between objects), this problem could be
overcome by increasing the size of the region of
highest resolution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "area of interest; bandwidth; blur detection; contrast
thresholds; display updates; eye movements; eye
tracking; foveated; foveation; gaze-contingent;
level-of-detail; multiresolution; perceptual
compression; peripheral vision; saccades; saccadic
suppression; visual perception",
}
@Article{Murray:2007:AEG,
author = "Norman Murray and Dave Roberts and Anthony Steed and
Paul Sharkey and Paul Dickerson and John Rae",
title = "An assessment of eye-gaze potential within immersive
virtual environments",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "8:1--8:17",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314311",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In collaborative situations, eye gaze is a critical
element of behavior which supports and fulfills many
activities and roles. In current computer-supported
collaboration systems, eye gaze is poorly supported.
Even in a state-of-the-art video conferencing system
such as the access grid, although one can see the face
of the user, much of the communicative power of eye
gaze is lost. This article gives an overview of some
preliminary work that looks towards integrating eye
gaze into an immersive collaborative virtual
environment and assessing the impact that this would
have on interaction between the users of such a system.
Three experiments were conducted to assess the efficacy
of eye gaze within immersive virtual environments. In
each experiment, subjects observed on a large screen
the eye-gaze behavior of an avatar. The eye-gaze
behavior of that avatar had previously been recorded
from a user with the use of a head-mounted eye tracker.
The first experiment was conducted to assess the
difference between users' abilities to judge what
objects an avatar is looking at with only head gaze
being viewed and also with eye- and head-gaze data
being displayed. The results from the experiment show
that eye gaze is of vital importance for subjects to
correctly identify what a person is looking at in an
immersive virtual environment. The second experiment
examined whether a monocular or binocular eye-tracker
would be required. This was examined by testing
subjects' ability to identify where an avatar was
looking from their eye direction alone, or by eye
direction combined with convergence. This experiment
showed that convergence had a significant impact on the
subjects' ability to identify where the avatar was
looking. The final experiment looked at the effects of
stereo and mono-viewing of the scene, with the subjects
being asked to identify where the avatar was looking.
This experiment showed that there was no difference in
the subjects' ability to detect where the avatar was
gazing. This is followed by a description of how the
eye-tracking system has been integrated into an
immersive collaborative virtual environment and some
preliminary results from the use of such a system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "eye gaze; immersive virtual environments",
}
@Article{Rachovides:2007:CIM,
author = "Dorothy Rachovides and James Walkerdine and Peter
Phillips",
title = "The conductor interaction method",
journal = j-TOMCCAP,
volume = "3",
number = "4",
pages = "9:1--9:23",
month = dec,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1314303.1314312",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:11:20 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Computers have increasingly become part of our
everyday lives, with many activities either involving
their direct use or being supported by one. This has
prompted research into developing methods and
mechanisms to assist humans in interacting with
computers (human-computer interaction, or HCI). A
number of HCI techniques have been developed over the
years, some of which are quite old but continue to be
used, and some more recent and still evolving. Many of
these interaction techniques, however, are not natural
in their use and typically require the user to learn a
new means of interaction. Inconsistencies within these
techniques and the restrictions they impose on user
creativity can also make such interaction techniques
difficult to use, especially for novice users.\par
This article proposes an alternative interaction
method, the conductor interaction method (CIM), which
aims to provide a more natural and easier-to-learn
interaction technique. This novel interaction method
extends existing HCI methods by drawing upon techniques
found in human-human interaction. It is argued that the
use of a two-phased multimodal interaction mechanism,
using gaze for selection and gesture for manipulation,
incorporated within a metaphor-based environment, can
provide a viable alternative for interacting with a
computer (especially for novice users). Both the model
and an implementation of the CIM within a system are
presented in this article. This system formed the basis
of a number of user studies that have been performed to
assess the effectiveness of the CIM, the findings of
which are discussed in this work.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "gaze- and gesture-based interfaces; human-computer
interaction",
}
@Article{Luo:2008:IFH,
author = "Hangzai Luo and Yuli Gao and Xiangyang Xue and Jinye
Peng and Jianping Fan",
title = "Incorporating feature hierarchy and boosting to
achieve more effective classifier training and
concept-oriented video summarization and skimming",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324288",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For online medical education purposes, we have
developed a novel scheme to incorporate the results of
semantic video classification to select the most
representative video shots for generating
concept-oriented summarization and skimming of surgery
education videos. First, salient objects are used as
the video patterns for feature extraction to achieve a
good representation of the intermediate video
semantics. The salient objects are defined as the
salient video compounds that can be used to
characterize the most significant perceptual properties
of the corresponding real world physical objects in a
video, and thus the appearances of such salient objects
can be used to predict the appearances of the relevant
semantic video concepts in a specific video domain.
Second, a novel multi-modal boosting algorithm is
developed to achieve more reliable video classifier
training by incorporating feature hierarchy and
boosting to dramatically reduce both the training cost
and the size of training samples, thus significantly
speeding up SVM (support vector machine)
classifier training. In addition, unlabeled samples
are integrated to reduce the human effort of labeling
large amounts of training samples. Finally, the results
of semantic video classification are incorporated to
enable concept-oriented video summarization and
skimming. Experimental results in a specific domain of
surgery education videos are provided.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "concept-oriented video skimming; feature hierarchy;
multi-modal boosting; salient objects; semantic video
classification; unlabeled samples",
}
@Article{Hefeeda:2008:RDO,
author = "Mohamed Hefeeda and Cheng-Hsin Hsu",
title = "Rate-distortion optimized streaming of fine-grained
scalable video sequences",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324289",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We present optimal schemes for allocating bits of
fine-grained scalable video sequences among multiple
senders streaming to a single receiver. This allocation
problem is critical in optimizing the perceived quality
in peer-to-peer and distributed multi-server streaming
environments. Senders in such environments are
heterogeneous in their outgoing bandwidth and they hold
different portions of the video stream. We first
formulate and optimally solve the problem for
individual frames, then we generalize to the multiple
frame case. Specifically, we formulate the allocation
problem as an optimization problem, which is nonlinear
in general. We use rate-distortion models in the
formulation to achieve the minimum distortion in the
rendered video, constrained by the outgoing bandwidth
of senders, availability of video data at senders, and
incoming bandwidth of receiver. We show how the adopted
rate-distortion models transform the nonlinear problem
to an integer linear programming (ILP) problem. We then
design a simple rounding scheme that transforms the ILP
problem to a linear programming (LP) one, which can be
solved efficiently using common optimization techniques
such as the Simplex method. We prove that our rounding
scheme always produces a feasible solution, and the
solution is within a negligible margin from the optimal
solution. We also propose a new algorithm (FGSAssign)
for the single-frame allocation problem that runs in $
O(n \log n) $ steps, where n is the number of senders.
We prove that FGSAssign is optimal. Furthermore, we
propose a heuristic algorithm (mFGSAssign) that
produces near-optimal solutions for the multiple-frame
case, and runs an order of magnitude faster than the
optimal one. Because of its short running time,
mFGSAssign can be used in real time. Our experimental
study validates our analysis and shows the
effectiveness of our allocation algorithms in improving
the video quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "distributed streaming; FGS; fine-grained scalable
streaming; peer-to-peer streaming; rate-distortion
models; rate-distortion optimized streaming; video
streaming",
}
@Article{Babich:2008:VQE,
author = "Fulvio Babich and Marco D'orlando and Francesca
Vatta",
title = "Video quality estimation in wireless {IP} networks:
{Algorithms} and applications",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324290",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes three methods to estimate the
distortion deriving from packet losses in wireless
video communication. The proposed methods take into
account the short-term properties of the encoded video
sequences. A suitable set of functions is adopted to
model the distortion envelope resulting from multiple
losses. The estimated performance is compared with the
actual distortion, evaluated by decoding the received
sequence with a properly designed decoder. Numerical
results confirm the accuracy of the proposed models in
approximating the actual Mean Square Error (MSE) for a
wide range of loss rates. Some applications of the
proposed algorithms are presented.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "distortion estimation; error-concealment;
error-resilience; H.264; packet loss rate; real time
video; wireless networks",
}
@Article{Kotharu:2008:PQR,
author = "Phani S. Kotharu and B. Prabhakaran",
title = "Partial query resolution for animation authoring",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324291",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Animations are a part of multimedia and techniques
such as motion mapping and inverse kinematics aid in
reusing models and motion sequences to create new
animations. This reuse approach is facilitated by the
use of content-based retrieval techniques that often
require fuzzy query resolution. Most fuzzy query
resolution approaches work on all the attributes of the
query to minimize the database access cost, thus
resulting in an unsatisfactory result set. It turns out
that query resolution can be carried out in a
partial manner to achieve user-satisfactory results and
aid in easy authoring. In this article, we present two
partial fuzzy query resolution approaches, one that
results in high-quality animations and the other that
produces results with a decreasing number of satisfied
conditions in the query.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "aggregation function; animation toolkit; fuzzy query;
multimedia authoring; partial ordering; top-k query",
}
@Article{Ip:2008:RRS,
author = "Alan T. S. Ip and John C. S. Lui and Jiangchuan Liu",
title = "A revenue-rewarding scheme of providing incentive for
cooperative proxy caching for media streaming systems",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324292",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Network entities cooperating together can improve
system performance of media streaming. In this paper,
we address the ``incentive issue'' of a cooperative
proxy caching system and how to motivate each proxy to
provide cache space to the system. To encourage proxies
to participate, we propose a ``revenue-rewarding
scheme'' to credit the cooperative proxies according to
the resources they contribute. A game-theoretic model
is used to analyze the interactions among proxies under
the revenue-rewarding scheme. We propose two
cooperative game settings that lead to optimal
situations. In particular, (1) We propose a distributed
incentive framework for peers to participate in
resource contribution for media streaming; (2) Proxies
are encouraged to cooperate under the revenue-rewarding
scheme; (3) Profit and social welfare are maximized in
these cooperative games; and (4) Cost-effective
resource allocation is achieved in these cooperative
games. Large-scale simulation is carried out to
validate and verify the merits of our proposed
incentive schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "game-theoretic analysis; incentive mechanism; Nash
equilibrium; pricing; resource allocation",
}
@Article{Zhang:2008:AEE,
author = "Cha Zhang and Yong Rui and Jim Crawford and Li-Wei
He",
title = "An automated end-to-end lecture capture and
broadcasting system",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324293",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Remote viewing of lectures presented to a live
audience is becoming increasingly popular. At the same
time, the lectures can be recorded for subsequent
on-demand viewing over the Internet. Providing such
services, however, is often prohibitive due to the
labor-intensive cost of capturing and
pre/post-processing. This article presents a complete
automated end-to-end system that supports capturing,
broadcasting, viewing, archiving and searching of
presentations. Specifically, we describe a system
architecture that minimizes the pre- and
post-production time, and a fully automated lecture
capture system called iCam2 that synchronously captures
all contents of the lecture, including audio, video,
and presentation material. No staff is needed during
lecture capture and broadcasting, so the operational
cost of the system is negligible. The system has been
used on a daily basis for more than 4 years, during
which 522 lectures have been captured. These lectures
have been viewed over 20,000 times.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "automated lecture capture; lecture broadcasting;
live/on-demand broadcasting",
}
@Article{Nguyen:2008:OIV,
author = "Giang Phuong Nguyen and Marcel Worring",
title = "Optimization of interactive visual-similarity-based
search",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324294",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "At one end of the spectrum, research in interactive
content-based retrieval concentrates on machine
learning methods for effective use of relevance
feedback. On the other end, the information
visualization community focuses on effective methods
for conveying information to the user. What is lacking
is research considering the information visualization
and interactive retrieval as truly integrated parts of
one content-based search system. In such an integrated
system, there are many degrees of freedom like the
similarity function, the number of images to display,
the image size, different visualization modes, and
possible feedback modes. To base the optimal values for
all of those on user studies is unfeasible. We
therefore develop search scenarios in which tasks and
user actions are simulated. From there, the proposed
scheme is optimized based on objective constraints and
evaluation criteria. In such a manner, the degrees of
freedom are reduced and the remaining degrees can be
evaluated in user studies. In this article, we present
a system that integrates advanced similarity based
visualization with active learning. We have performed
extensive experimentation on interactive category
search with different image collections. The results
using the proposed simulation scheme show that indeed
the use of advanced visualization and active learning
pays off in all of these datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "active learning; interactive search; similarity based
visualization",
}
@Article{Hlavacs:2008:HVP,
author = "Helmut Hlavacs and Shelley Buchinger",
title = "Hierarchical video patching with optimal server
bandwidth",
journal = j-TOMCCAP,
volume = "4",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324287.1324295",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:06 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video patching is a way for transporting true
video-on-demand, that is, instantaneous without any
delay, from a video server to several clients. Instead
of sending a unique stream to each newly arriving
client, clients share as many multicast transmissions
as possible, and are served only those parts of the
video that they have missed.\par
We present a novel video patching scheme using
hierarchies of patches. Our scheme minimizes the
bandwidth needed by the video server, and may result in
the fact that clients receive several streams in
parallel. We show analytically that, for Poisson arrivals,
our algorithm achieves the lowest possible server
bandwidth among all schemes where clients share multicast
transmissions.\par
We also show how our approach can be combined with
batching. This combination requires less server
bandwidth than all fixed start point periodic broadcast
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "batching; server bandwidth; true video-on-demand;
video patching",
}
@Article{Chen:2008:ASD,
author = "Songqing Chen and Shiping Chen and Huiping Guo and Bo
Shen and Sushil Jajodia",
title = "Achieving simultaneous distribution control and
privacy protection for {Internet} media delivery",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352013",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Massive Internet media distribution demands prolonged
continuous consumption of networking and disk
bandwidths in large capacity. Many proxy-based Internet
media distribution algorithms and systems have been
proposed, implemented, and evaluated to address the
scalability and performance issue. However, few of them
have been used in practice, since two important issues
are not satisfactorily addressed. First, existing
proxy-based media distribution architectures lack an
efficient media distribution control mechanism. Without
copyright protection, content providers are hesitant to
use proxy-based fast distribution techniques. Second,
little has been done to protect client privacy during
content accesses on the Internet. Straightforward
solutions to address these two issues independently
lead to conflicts. For example, to enforce distribution
control, only legitimate users should be granted access
rights. However, this normally discloses more
information (such as which object the client is
accessing) beyond just the client identity, which
conflicts with the client's desire for privacy
protection. In this article, we propose a unified
proxy-based media distribution protocol to effectively
address these two problems simultaneously. We further
design a set of new algorithms in a cooperative proxy
environment where our proposed scheme works efficiently
and practically. Simulation-based experiments are
conducted to extensively evaluate the proposed system.
Preliminary results demonstrate the effectiveness of
our proposed strategy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "cooperative proxy; distribution control; media
delivery; privacy; proxy caching",
}
@Article{Li:2008:FSE,
author = "Rui Li and Bir Bhanu and Anlei Dong",
title = "Feature synthesized {EM} algorithm for image
retrieval",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "10:1--10:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352014",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "As a commonly used unsupervised learning algorithm in
Content-Based Image Retrieval (CBIR),
Expectation-Maximization (EM) algorithm has several
limitations, including the curse of dimensionality and
the convergence at a local maximum. In this article, we
propose a novel learning approach, namely
Coevolutionary Feature Synthesized
Expectation-Maximization (CFS-EM), to address the above
problems. The CFS-EM is a hybrid of coevolutionary
genetic programming (CGP) and EM algorithm applied on
partially labeled data. CFS-EM is especially suitable
for image retrieval because the images can be searched
in the synthesized low-dimensional feature space, while
a kernel-based method has to make classification
computation in the original high-dimensional space.
Experiments on real image databases show that CFS-EM
outperforms Radial Basis Function Support Vector
Machine (RBF-SVM), CGP, Discriminant-EM (D-EM) and
Transductive-SVM (TSVM) in terms of classification
performance, and it is computationally more efficient
than RBF-SVM in the query phase.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "coevolutionary feature synthesis; content-based image
retrieval; expectation maximization; semi-supervised
learning",
}
@Article{Xu:2008:AKG,
author = "Min Xu and Changsheng Xu and Lingyu Duan and Jesse S.
Jin and Suhuai Luo",
title = "Audio keywords generation for sports video analysis",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "11:1--11:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352015",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Sports video has attracted a global viewership.
Research effort in this area has been focused on
semantic event detection in sports video to facilitate
accessing and browsing. Most of the event detection
methods in sports video are based on visual features.
However, being a significant component of sports video,
audio may also play an important role in semantic event
detection. In this paper, we have borrowed the concept
of the ``keyword'' from the text mining domain to
define a set of specific audio sounds. These specific
audio sounds refer to a set of game-specific sounds
with strong relationships to the actions of players,
referees, commentators, and audience, which are the
reference points for interesting sports events. Unlike
low-level features, audio keywords can be considered as
a mid-level representation, able to facilitate
high-level analysis from the semantic concept point of
view. Audio keywords are created from low-level audio
features with learning by support vector machines. With
the help of video shots, the created audio keywords can
be used to detect semantic events in sports video by
Hidden Markov Model (HMM) learning. Experiments on
creating audio keywords and, subsequently, event
detection based on audio keywords have been very
encouraging. Based on the experimental results, we
believe that the audio keyword is an effective
representation that is able to achieve satisfying
results for event detection in sports video.
Application in three sports types demonstrates the
practicality of the proposed method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "audio keywords; event detection; semantics analysis;
sports video analysis; support vector machines",
}
@Article{Tullimas:2008:MSU,
author = "Sunand Tullimas and Thinh Nguyen and Rich Edgecomb and
Sen-ching Cheung",
title = "Multimedia streaming using multiple {TCP}
connections",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "12:1--12:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352016",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In recent years, multimedia applications over the
Internet have become increasingly popular. However, packet
loss, delay, and time-varying bandwidth of the Internet
have remained the major problems for multimedia
streaming applications. As such, a number of
approaches, including network infrastructure and
protocol, source and channel coding, have been proposed
to either overcome or alleviate these drawbacks of the
Internet. In this article, we propose the MultiTCP
system, a receiver-driven, TCP-based system for
multimedia streaming over the Internet. Our proposed
algorithm aims at providing resilience against short
term insufficient bandwidth by using multiple TCP
connections for the same application. Our proposed
system enables the application to achieve and control
the desired sending rate during congested periods,
which cannot be achieved using traditional TCP.
Finally, our proposed system is implemented at the
application layer, and hence, no kernel modification to
TCP is necessary. We analyze the proposed system, and
present simulation and experimental results to
demonstrate its advantages over the traditional
single-TCP-based approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "multimedia streaming",
}
@Article{Tjondronegoro:2008:SES,
author = "Dian Tjondronegoro and Yi-Ping Phoebe Chen and Adrien
Joly",
title = "A scalable and extensible segment-event-object-based
sports video retrieval system",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352017",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Sport video data is growing rapidly as a result of the
maturing digital technologies that support digital
video capture, faster data processing, and large
storage. However, (1) semi-automatic content extraction
and annotation, (2) a scalable indexing model, and (3)
effective retrieval and browsing still pose the most
challenging problems for maximizing the usage of large
video databases. This article will present the findings
from a comprehensive work that proposes a scalable and
extensible sports video retrieval system with two major
contributions in the area of sports video indexing and
retrieval. The first contribution is a new sports video
indexing model that utilizes semi-schema-based indexing
scheme on top of an Object-Relationship approach. This
indexing model is scalable and extensible as it enables
gradual index construction which is supported by
ongoing development of future content extraction
algorithms. The second contribution is a set of novel
queries which are based on XQuery to generate dynamic
and user-oriented summaries and event structures. The
proposed sports video retrieval system has been fully
implemented and populated with soccer, tennis,
swimming, and diving video. The system has been
evaluated against 20 users to demonstrate and confirm
its feasibility and benefits. The experimental sports
genres were specifically selected to represent the four
main categories of sports domain: period-, set-point-,
time (race)-, and performance-based sports. Thus, the
proposed system should be generic and robust for all
types of sports.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "automatic content extraction; indexing; mobile video
interaction; MPEG-7; sports video retrieval; video
database system; XML; XQuery",
}
@Article{Zimmermann:2008:DMP,
author = "Roger Zimmermann and Elaine Chew and Sakire Arslan Ay
and Moses Pawar",
title = "Distributed musical performances: {Architecture} and
stream management",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352018",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "An increasing number of novel applications produce a
rich set of different data types that need to be
managed efficiently and coherently. In this article we
present our experience with designing and implementing
a data management infrastructure for a distributed
immersive performance (DIP) application. The DIP
project investigates a versatile framework for the
capture, recording, and replay of video, audio, and
MIDI (Musical Instrument Digital Interface) streams in
an interactive environment for collaborative music
performance. We are focusing on two classes of data
streams that are generated within this environment. The
first category consists of high-resolution isochronous
media streams, namely audio and video. The second class
comprises MIDI data produced by electronic instruments.
MIDI event sequences are alphanumeric in nature and
fall into the category of the data streams that have
been of interest to data management researchers in
recent years.\par
We present our data management architecture, which
provides a repository for all DIP data. Streams of both
categories need to be acquired, transmitted, stored,
and replayed in real time. Data items are correlated
across different streams with temporal indices. The
audio and video streams are managed in our own
High-performance Data Recording Architecture (HYDRA),
which integrates multistream recording and retrieval in
a consistent manner. This paper reports on the
practical issues and challenges that we encountered
during the design, implementation and experimental
phases of our prototype. We also present some analysis
results and discuss future extensions for the
architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "distributed immersive performance; multimedia storage;
multimodal data recorder; networked musical
performance",
}
@Article{Hsu:2008:ACR,
author = "Cheng-Hsin Hsu and Mohamed Hefeeda",
title = "On the accuracy and complexity of rate-distortion
models for fine-grained scalable video sequences",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "15:1--15:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352019",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Rate-distortion (R-D) models are functions that
describe the relationship between the bitrate and
expected level of distortion in the reconstructed video
stream. R-D models enable optimization of the received
video quality in different network conditions. Several
R-D models have been proposed for the increasingly
popular fine-grained scalable video sequences. However,
the models' relative performance has not been
thoroughly analyzed. Moreover, the time complexity of
each model is not known, nor is the range of bitrates
in which the model produces valid results. This lack of
quantitative performance analysis makes it difficult to
select the model that best suits a target streaming
system. In this article, we classify, analyze, and
rigorously evaluate all R-D models proposed for FGS
coders in the literature. We classify R-D models into
three categories: analytic, empirical, and
semi-analytic. We describe the characteristics of each
category. We analyze the R-D models by following their
mathematical derivations, scrutinizing the assumptions
made, and explaining when the assumptions fail and why.
In addition, we implement all R-D models, a total of
eight, and evaluate them using a diverse set of video
sequences. In our evaluation, we consider various
source characteristics, diverse channel conditions,
different encoding/decoding parameters, different frame
types, and several performance metrics including
accuracy, range of applicability, and time complexity
of each model. We also present clear systematic ways
(pseudocode) for constructing various R-D models from
a given video sequence. Based on our experimental
results, we present a justified list of recommendations
on selecting the best R-D models for video-on-demand,
video conferencing, real-time, and peer-to-peer
streaming systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "fine-grained scalable coding; multimedia streaming;
rate-distortion models",
}
@Article{Wang:2008:MST,
author = "Bing Wang and Jim Kurose and Prashant Shenoy and Don
Towsley",
title = "Multimedia streaming via {TCP}: an analytic
performance study",
journal = j-TOMCCAP,
volume = "4",
number = "2",
pages = "16:1--16:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1352012.1352020",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Jun 16 17:12:37 MDT 2008",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "TCP is widely used in commercial multimedia streaming
systems, with recent measurement studies indicating
that a significant fraction of Internet streaming media
is currently delivered over HTTP/TCP. These
observations motivate us to develop analytic
performance models to systematically investigate the
performance of TCP for both live and stored-media
streaming. We validate our models via ns simulations
and experiments conducted over the Internet. Our models
provide guidelines indicating the circumstances under
which TCP streaming leads to satisfactory performance,
showing, for example, that TCP generally provides good
streaming performance when the achievable TCP
throughput is roughly twice the media bitrate, with
only a few seconds of startup delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "multimedia streaming; performance modeling",
}
@Article{Lin:2008:NNB,
author = "Tsungnan Lin and Chiapin Wang and Po-Chiang Lin",
title = "A neural-network-based context-aware handoff algorithm
for multimedia computing",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386110",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The access of multimedia computing in wireless
networks is concerned with the performance of handoff
because of the irretrievable property of real-time data
delivery. To lessen throughput degradation incurred by
unnecessary handoffs or handoff latencies leading to
media disruption perceived by users, this paper
presents a link-quality-based handoff algorithm. Neural
networks are used to learn the cross-layer correlation
between the link quality estimator such as packet
success rate and the corresponding context metric
indicators, for example, the transmitting packet
length, received signal strength, and signal to noise
ratio. Based on a pre-processed learning of link
quality profile, neural networks make essential handoff
decisions efficiently with evaluations of link
quality instead of comparisons of relative
signal strength. The experiment and simulation results
show that the proposed algorithm improves user-perceived
quality in a transmission scenario of VoIP
applications by minimizing both the number of lost
packets and unnecessary handoffs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "context-aware; handoff; Multimedia computing; neural
networks",
}
@Article{Franke:2008:TAC,
author = "Ingmar S. Franke and Sebastian Pannasch and Jens R.
Helmert and Robert Rieger and Rainer Groh and Boris M.
Velichkovsky",
title = "Towards attention-centered interfaces: an aesthetic
evaluation of perspective with eye tracking",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386111",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The established method of representing
three-dimensional space on a two-dimensional surface
involves camera-based, point-of-regard systems,
comparable in design to the early ``camera obscura''.
However, geometrical limitations of such models lead to
distortions of perspective when projected. This
research investigated the influence of single- versus
multi-perspectives on aesthetic choices within one
image. A clear perceptual bias towards
multi-perspective images was found, additionally
supported by an eye tracking study. We propose that
human users are more attracted by multi-perspective
images, which emphasize the ``semantic foci'' of the
scene, than by those being synthesized statically with
only one geometrical prospect.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Eye tracking; perspective projection; scene
perception; subjective evaluation",
}
@Article{Wu:2008:ELS,
author = "Chuan Wu and Baochun Li and Shuqiao Zhao",
title = "Exploring large-scale peer-to-peer live streaming
topologies",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386112",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Real-world live peer-to-peer (P2P) streaming
applications have been successfully deployed in the
Internet, delivering live multimedia content to
millions of users at any given time. With relative
simplicity in design with respect to peer selection and
topology construction protocols and without much
algorithmic sophistication, current-generation live P2P
streaming applications are able to provide users with
adequately satisfying viewing experiences. That said,
little existing research has provided sufficient
insights on the time-varying internal characteristics
of peer-to-peer topologies in live streaming. This
article presents {\em Magellan}, our collaborative work
with UUSee Inc., Beijing, China, for exploring and
charting graph theoretical properties of practical P2P
streaming topologies, gaining important insights in
their topological dynamics over a long period of
time.\par
With more than 120 GB worth of traces starting
September 2006 from a commercially deployed P2P live
streaming system that represents UUSee's core product,
we have completed a thorough and in-depth investigation
of the topological properties in large-scale live P2P
streaming, as well as their evolutionary behavior over
time, for example, at different times of the day and in
flash crowd scenarios. We seek to explore real-world
P2P streaming topologies with respect to their graph
theoretical metrics, such as the degree, clustering
coefficient, and reciprocity. In addition, we compare
our findings with results from existing studies on
topological properties of P2P file sharing
applications, and present new and unique observations
specific to streaming. We have observed that live P2P
streaming sessions demonstrate excellent scalability, a
high level of reciprocity, a clustering phenomenon in
each ISP, and a degree distribution that does {\em
not\/} follow the power-law distribution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Peer-to-peer streaming; topology characterization",
}
@Article{Goel:2008:LLA,
author = "Ashvin Goel and Charles Krasic and Jonathan Walpole",
title = "Low-latency adaptive streaming over {TCP}",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "20:1--20:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386113",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Media streaming over TCP has become increasingly
popular because TCP's congestion control provides
remarkable stability to the Internet. Streaming over
TCP requires adapting to bandwidth availability, but
unfortunately, TCP can introduce significant latency at
the application level, which causes unresponsive and
poor adaptation. This article shows that this latency
is not inherent in TCP but occurs as a result of
throughput-optimized TCP implementations. We show that
this latency can be minimized by dynamically tuning
TCP's send buffer. Our evaluation shows that this
approach leads to better application-level adaptation
and it allows supporting interactive and other
low-latency applications over TCP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "low latency streaming; multimedia applications; TCP",
}
@Article{Lim:2008:DPP,
author = "Seung-Ho Lim and Yo-Won Jeong and Kyu Ho Park",
title = "Data placement and prefetching with accurate bit rate
control for interactive media server",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386114",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "An interactive Media Server should support
unrestricted control to viewers with their service
level agreements. It is important to manage video data
effectively to facilitate efficient retrieval. In this
paper, we propose an efficient placement algorithm as
part of an effective retrieval scheme to increase the
number of clients who can be provided with interactive
service. The proposed management schemes are
incorporated with a bit count control method that is
based on repeated tuning of quantization parameters to
adjust the actual bit count to the target bit count.
The encoder using this method can generate coded frames
whose sizes are synchronized with the RAID stripe size,
so that when various fast-forward levels are accessed
we can reduce the seek and rotational latency and
enhance the disk throughput of each disk in the RAID
system. Experimental results demonstrate that the
proposed schemes can significantly improve the average
service time and guarantee quality of service to more
users, and the interactive media server can thereby
efficiently service a large number of clients.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "bit count control; disk array; Interactive media
server; stripe size; video rate",
}
@Article{Jie:2008:VGD,
author = "Li Jie and James J. Clark",
title = "Video game design using an eye-movement-dependent
model of visual attention",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386115",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Eye movements can be used to infer the allocation of
covert attention. In this article, we propose to model
the allocation of attention in a task-dependent manner
based on different eye movement conditions,
specifically fixation and pursuit. We show that the
image complexity at eye fixation points during
fixation, and the pursuit direction during pursuit are
significant factors in attention allocation. Results of
the study are applied to the design of an interactive
computer game. Real-time eye movement information is
taken as one of the inputs to the game. The utility of
such eye information for controlling game difficulty is
shown.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Entertainment; eye movements; eye tracking; HCI; video
games; visual attention",
}
@Article{Komogortsev:2008:PRT,
author = "Oleg V. Komogortsev and Javed I. Khan",
title = "Predictive real-time perceptual compression based on
eye-gaze-position analysis",
journal = j-TOMCCAP,
volume = "4",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1386109.1386116",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:12 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article designs a real-time perceptual
compression system (RTPCS) based on eye-gaze-position
analysis. Our results indicate that the
eye-gaze-position containment metric provides more
efficient and effective evaluation of an RTPCS than the
eye fixation containment. The presented RTPCS is
designed for a network communication scenario with a
feedback loop delay. The proposed RTPCS uses human
visual system properties to compensate for the delay
and to provide high ratios of multimedia compression.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "human visual system; Real-time multimedia
compression",
}
@Article{Cesar:2008:ISI,
author = "Pablo Cesar and Dick C. A. Bulterman and Luiz Fernando
Gomes Soares",
title = "Introduction to special issue: {Human-centered}
television --- directions in interactive digital
television research",
journal = j-TOMCCAP,
volume = "4",
number = "4",
pages = "24:1--24:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412196.1412197",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:32 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The research area of interactive digital TV is in the
midst of a significant revival. Unlike the first
generation of digital TV, which focused on producer
concerns that effectively limited (re)distribution, the
current generation of research is closely linked to the
role of the user in selecting, producing, and
distributing content. The research field of interactive
digital television is being transformed into a study of
human-centered television. Our guest editorial reviews
relevant aspects of this transformation in the three
main stages of the content lifecycle: content
production, content delivery, and content consumption.
While past research on content production tools focused
on full-fledged authoring tools for professional
editors, current research studies lightweight, often
informal end-user authoring systems. In terms of
content delivery, user-oriented infrastructures such as
peer-to-peer are being seen as alternatives to more
traditional broadcast solutions. Moreover, end-user
interaction is no longer limited to content selection,
but now facilitates nonlinear participatory television
productions. Finally, user-to-user communication
technologies have allowed television to become a
central component of an interconnected social
experience. The background context given in this
article provides a framework for appreciating the
significance of four detailed contributions that
highlight important directions in transforming
interactive television research.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Interactive television; shared experiences; standards;
survey",
}
@Article{Ursu:2008:ITN,
author = "Marian F. Ursu and Maureen Thomas and Ian Kegel and
Doug Williams and Mika Tuomola and Inger Lindstedt and
Terence Wright and Andra Leurdijk and Vilmos Zsombori
and Julia Sussner and Ulf Myrestam and Nina Hall",
title = "Interactive {TV} narratives: {Opportunities},
progress, and challenges",
journal = j-TOMCCAP,
volume = "4",
number = "4",
pages = "25:1--25:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412196.1412198",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:32 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article is motivated by the question whether
television should do more than simply offer interactive
services alongside (and separately from) traditional
linear programs, in the context of its dominance being
seriously challenged and threatened by interactive
forms of screen media entertainment. It suggests: yes.
Interactive {\em narrativity}, that is, the ability to
interact with (and influence) stories whilst they are
being told, represents one clear development path for
interactive television. The capabilities of computing
technology are ripe for exploring this new form of
storytelling, from creation to commercial distribution.
The article starts by looking at the relationship
between narrativity and interactivity in the current
context of screen media, and identifies clear signs of
interest from certain European public broadcasters in
interactive TV narratives. It then presents in detail
four recent experimental interactive TV productions in
the genres of drama, news, and documentary, developed
in collaboration with public broadcasters, which
illustrate the potential and richness of this new form
of storytelling, but also highlight new technological
capabilities necessary for such productions. A number
of essential technological requirements are then
discussed in more detail in the final part. The article
suggests that the ShapeShifting Media Technology,
employed in the implementation of the four productions,
has made significant advances both at the technological
and the creative ends in supporting the development of
interactive TV narrativity, but that further
developments are required before being able to answer
questions such as ``Would end users want such a form of
screen media entertainment?'' and ``Would it be
effective for both end users and producers?''",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "computational narrativity; digital storytelling;
entertainment; Interactive; media; narrativity;
nonlinear; screen media; shapeshifting; television",
}
@Article{Cheng:2008:GIP,
author = "Bin Cheng and Lex Stein and Hai Jin and Xiaofei Liao
and Zheng Zhang",
title = "{GridCast}: {Improving} peer sharing for {P2P VoD}",
journal = j-TOMCCAP,
volume = "4",
number = "4",
pages = "26:1--26:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412196.1412199",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:32 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video-on-Demand (VoD) is a compelling application, but
costly. VoD is costly due to the load it places on
video source servers. Many have proposed using
peer-to-peer (P2P) techniques to shift load from
servers to peers. Yet, nobody has implemented and
deployed a system to openly and systematically evaluate
how these techniques work.\par
This article describes the design, implementation and
evaluation of GridCast, a real deployed P2P VoD system.
GridCast has been live on CERNET since May of 2006. It
provides seek, pause, and play operations, and employs
peer sharing to improve system scalability. In peak
months, GridCast has served videos to 23,000 unique
users. From the first deployment, we have gathered
information to understand the system and evaluate how
to further improve peer sharing through caching and
replication.\par
We first show that GridCast with single video caching
(SVC) can decrease load on source servers by an average
of 22\% from a client-server architecture. We analyze
the net effect on system resources and determine that
peer upload is largely idle. This leads us to change
the caching algorithm to cache multiple videos (MVC).
MVC decreases source load by an average of 51\% over
the client-server architecture. The improvement is greater as user
load increases. This bodes well for peer-assistance at
larger scales.\par
A detailed analysis of MVC shows that departure misses
become a major issue in a P2P VoD system with caching
optimization. Motivated by this observation, we examine
how to use replication to eliminate departure misses
and further reduce server load. A framework for lazy
replication is presented and evaluated in this article.
In this framework, two predictors are plugged in to
create the working replication algorithm. With these
two simple predictors, lazy replication can decrease
server load by 15\% from MVC with only a minor increase
in network traffic.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "caching; peer-to-peer; replication; Video-on-demand",
}
@Article{Metcalf:2008:EPL,
author = "Crysta Metcalf and Gunnar Harboe and Joe Tullio and
Noel Massey and Guy Romano and Elaine M. Huang and
Frank Bentley",
title = "Examining presence and lightweight messaging in a
social television experience",
journal = j-TOMCCAP,
volume = "4",
number = "4",
pages = "27:1--27:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412196.1412200",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:32 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We report on a field evaluation of a prototype social
television system (Social TV) that incorporates
lightweight messaging as well as ambient awareness of
user presence on the system. This evaluation was
conducted over a two-week period and involved the
participation of ten households. Participants
appreciated the ability to see their buddies' presence
on the system, the ability to see or suggest the
programs they were currently watching, and the ability
to send short messages to one another. The presence
facilities available in Social TV also allowed
participants to learn more about one another's TV
viewing habits and preferences, and fostered a sense of
connectedness between them. However, they also felt
constrained by the limitations of the communication
options available to them and demanded free-form text
or voice chat to be able to fully express themselves.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "ambient displays; awareness displays;
computer-mediated communication; Social television",
}
@Article{Cattelan:2008:WCP,
author = "Renan G. Cattelan and Cesar Teixeira and Rudinei
Goularte and Maria Da Gra{\c{c}}a C. Pimentel",
title = "Watch-and-comment as a paradigm toward ubiquitous
interactive video editing",
journal = j-TOMCCAP,
volume = "4",
number = "4",
pages = "28:1--28:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412196.1412201",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:32 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The literature reports research efforts allowing the
editing of interactive TV multimedia documents by
end-users. In this article we propose complementary
contributions relative to end-user generated
interactive video, video tagging, and collaboration. In
earlier work we proposed the {\em watch-and-comment\/}
(WaC) paradigm as the seamless capture of an
individual's comments so that corresponding annotated
interactive videos be automatically generated. As a
proof of concept, we implemented a prototype
application, the WaCTool, that supports the capture of
digital ink and voice comments over individual frames
and segments of the video, producing a declarative
document that specifies both the media stream
structure and its synchronization.\par
In this article, we extend the WaC paradigm in two
ways. First, user-video interactions are associated
with edit commands and digital ink operations. Second,
focusing on collaboration and distribution issues, we
employ annotations as simple containers for context
information by using them as tags in order to organize,
store and distribute information in a P2P-based
multimedia capture platform. We highlight the design
principles of the watch-and-comment paradigm, and
demonstrate related results including the current
version of the WaCTool and its architecture. We also
illustrate how an interactive video produced by the
WaCTool can be rendered in an interactive video
environment, the Ginga-NCL player, and include results
from a preliminary evaluation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Annotation; Ginga-NCL; interactive digital video; P2P
collaboration",
}
@Article{Bailey:2008:SSA,
author = "Brian P. Bailey and Nicu Sebe and Alan Hanjalic",
title = "Special section from the {ACM Multimedia Conference
2007}",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404881",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gleicher:2008:RCI,
author = "Michael L. Gleicher and Feng Liu",
title = "Re-cinematography: {Improving} the camerawork of
casual video",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404882",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents an approach to postprocessing
casually captured videos to improve apparent camera
movement. {\em Re-cinematography\/} transforms each
frame of a video such that the video better follows
cinematic conventions. The approach breaks a video into
shorter segments. Segments of the source video where
there is no intentional camera movement are made to
appear as if the camera is completely static. For
segments with camera motions, camera paths are
keyframed automatically and interpolated with matrix
logarithms to give velocity-profiled movements that
appear intentional and directed. Closeups are inserted
to provide compositional variety in otherwise uniform
segments. The approach automatically balances the
tradeoff between motion smoothness and distortion to
the original imagery. Results from our prototype show
improvements to poor quality home videos.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "casual video; cinematography; Image stabilization",
}
@Article{Qi:2008:CMV,
author = "Guo-Jun Qi and Xian-Sheng Hua and Yong Rui and Jinhui
Tang and Tao Mei and Meng Wang and Hong-Jiang Zhang",
title = "Correlative multilabel video annotation with temporal
kernels",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404883",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Automatic video annotation is an important ingredient
for semantic-level video browsing, search and
navigation. Much attention has been paid to this topic
in recent years. This research has evolved through
two paradigms. In the first paradigm, each concept is
individually annotated by a pre-trained binary
classifier. However, this method ignores the rich
information between the video concepts and only
achieves limited success. Evolved from the first
paradigm, the methods in the second paradigm add an
extra step on top of the first individual
classifiers to fuse the multiple detections of the
concepts. However, the performance of these methods can
be degraded by error propagated from the first
step to the second fusion step. In this article,
another paradigm of the video annotation method is
proposed to address these problems. It simultaneously
annotates the concepts and models the correlations
between them in one step by the proposed {\em
Correlative Multilabel\/} (CML) method, which benefits
from the compensation of complementary information
between different labels. Furthermore, since the video
clips are composed by temporally ordered frame
sequences, we extend the proposed method to exploit the
rich temporal information in the videos. Specifically,
a temporal-kernel is incorporated into the CML method
based on the discriminative information between {\em
Hidden Markov Models\/} (HMMs) that are learned from
the videos. We compare the performance between the
proposed approach and the state-of-the-art approaches
in the first and second paradigms on the widely used
TRECVID data set. As will be shown, the proposed
method achieves superior performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "concept correlation; multilabeling; temporal kernel;
Video annotation",
}
@Article{Chen:2008:DDN,
author = "Yinpeng Chen and Weiwei Xu and Hari Sundaram and
Thanassis Rikakis and Sheng-Min Liu",
title = "A dynamic decision network framework for online media
adaptation in stroke rehabilitation",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404884",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we present a media adaptation
framework for an immersive biofeedback system for
stroke patient rehabilitation. In our biofeedback
system, media adaptation refers to changes in
audio/visual feedback as well as changes in physical
environment. Effective media adaptation frameworks help
patients recover generative plans for arm movement with
potential for significantly shortened therapeutic time.
The media adaptation problem has significant challenges
--- (a) high dimensionality of adaptation parameter
space; (b) variability in the patient performance
across and within sessions; (c) the actual
rehabilitation plan is typically a non-first-order
Markov process, making the learning task hard.\par
Our key insight is to understand media adaptation as a
real-time feedback control problem. We use a
mixture-of-experts based Dynamic Decision Network (DDN)
for online media adaptation. We train DDN mixtures per
patient, per session. The mixture models address two
basic questions --- (a) given a specific adaptation
suggested by the domain experts, predict the patient
performance, and (b) given the expected performance,
determine the optimal adaptation decision. The
questions are answered through an optimality criterion
based search on DDN models trained in previous
sessions. We have also developed new validation metrics
and obtain very good results for both questions on actual
stroke rehabilitation data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Biofeedback; dynamic decision network; media
adaptation; mixture of experts",
}
@Article{Thouin:2008:EAV,
author = "Frederic Thouin and Mark Coates",
title = "Equipment allocation in video-on-demand network
deployments",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404885",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video-on-Demand (VoD) services are very user-friendly,
but also complex and resource demanding. Deployments
involve careful design of many mechanisms where content
attributes and usage models should be taken into
account. We define, and propose a methodology to solve,
the {\em VoD Equipment Allocation Problem\/} of
determining the number and type of streaming servers
with directly attached storage (VoD servers) to install
at each potential location in a metropolitan area
network topology such that deployment costs are
minimized. We develop a cost model for VoD deployments
based on streaming, storage and transport costs and
train a parametric function that maps the amount of
available storage to a worst-case hit ratio. We observe
the impact of having to determine the amount of storage
and streaming jointly, and determine the minimum
demand required to deploy replicas as well as the
average hit ratio at each location. We observe that
common video-on-demand server configurations lead to
the installation of excessive storage, because a
relatively high hit ratio can be achieved with small
amounts of storage, so streaming requirements
dominate.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "equipment allocation; optimization; resource
allocation; Video-on-demand",
}
@Article{Kolan:2008:NLV,
author = "Prakash Kolan and Ram Dantu and Jo{\~a}o W. Cangussu",
title = "Nuisance level of a voice call",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "6:1--6:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404886",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In our everyday life, we communicate with many people
such as family, friends, neighbors, and colleagues. We
communicate with them using different communication
media such as email, telephone calls, and face-to-face
interactions. While email is not real-time and
face-to-face communications require geographic
proximity, voice and video communications are preferred
over other modes of communication. However, real-time
voice/video calls may create nuisance to the receiver.
In this article, we describe a mathematical model for
computing nuisance level of incoming voice/video calls.
We computed the closeness and nuisance level using the
calling patterns between the caller and the callee. To
validate the nuisance model, we collected cell phone
call records of real-life people at our university and
computed the nuisance value for all voice calls. We
validated the nuisance levels using the feedback from
those real-life people. Such a nuisance model is useful
for predicting unwanted voice and video sessions in an
IP communication network.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "behavior; Multimedia communications; nuisance;
presence; security; tolerance; unwantedness",
}
@Article{Zheng:2008:CVP,
author = "Qing-Fang Zheng and Wen Gao",
title = "Constructing visual phrases for effective and
efficient object-based image retrieval",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "7:1--7:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404887",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The explosion of multimedia data necessitates
effective and efficient ways for us to get access to
our desired ones. In this article, we draw an analogy
between image retrieval and text retrieval and propose
a visual phrase-based approach to retrieve images
containing desired objects (object-based image
retrieval). The visual phrase is defined as a pair of
frequently co-occurring adjacent local image patches and
is constructed using data mining. We design methods for
constructing visual phrases and for indexing and
searching images based on visual phrases. We present
experiments showing that our visual phrase-based
approach can be very efficient and more effective than
the current visual word-based approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Content-based image retrieval; inverted index; local
image descriptor; object-based image retrieval; SIFT;
visual phrase",
}
@Article{Gill:2008:SDM,
author = "Phillipa Gill and Liqi Shi and Anirban Mahanti and
Zongpeng Li and Derek L. Eager",
title = "Scalable on-demand media streaming for heterogeneous
clients",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "8:1--8:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404888",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Periodic broadcast protocols enable efficient
streaming of highly popular media files to large
numbers of concurrent clients. Most previous periodic
broadcast protocols, however, assume that all clients
can receive at the same rate, and also assume that
reception bandwidth is not time-varying. In this
article, we first develop a new periodic broadcast
protocol, Optimized Heterogeneous Periodic Broadcast
(OHPB), that can be optimized for a given population of
clients with heterogeneous reception bandwidths and
quality-of-service requirements. The OHPB protocol
utilizes an optimized segment size progression
determined by solving a linear optimization model that
takes as input the client population characteristics
and an objective function such as mean client startup
delay. We then develop a generalization of the OHPB
linear optimization model that allows optimal server
bandwidth allocation among multiple concurrent OHPB
broadcasts, wherein each media file and its clients may
have different characteristics. Finally, we propose
complementary client protocols employing work-ahead
buffering of data during playback, so as to enable more
uniform playback quality when the reception bandwidth
is time-varying.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "linear programming; periodic broadcasts;
quality-of-service; Scalable streaming",
}
@Article{Jung:2008:SSL,
author = "Dawoon Jung and Jaegeuk Kim and Jin-Soo Kim and
Joonwon Lee",
title = "{ScaleFFS}: a scalable log-structured flash file
system for mobile multimedia systems",
journal = j-TOMCCAP,
volume = "5",
number = "1",
pages = "9:1--9:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1404880.1404889",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:51:49 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "NAND flash memory has become one of the most popular
storage media for mobile multimedia systems. A key
issue in designing storage systems for mobile
multimedia systems is handling large-capacity storage
media and numerous large files with limited resources
such as memory. However, existing flash file systems,
including JFFS2 and YAFFS in particular, exhibit many
limitations in addressing the storage capacity of
mobile multimedia systems.\par
In this article, we design and implement a scalable
flash file system, called ScaleFFS, for mobile
multimedia systems. ScaleFFS is designed to require
only a small fixed amount of memory space and to
provide fast mount time, even if the file system size
grows to more than tens of gigabytes. The measurement
results show that ScaleFFS can be instantly mounted
regardless of the file system size, while achieving the
same write bandwidth and up to 22\% higher read
bandwidth compared to JFFS2.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "File system; flash memory; NAND; storage system",
}
@Article{Moncrieff:2008:DPA,
author = "Simon Moncrieff and Svetha Venkatesh and Geoff West",
title = "Dynamic privacy assessment in a smart house
environment using multimodal sensing",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413863",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Surveillance applications in private environments such
as smart houses require a privacy management policy if
such systems are to be accepted by the occupants of the
environment. This is due to the invasive nature of
surveillance, and the private nature of the home. In
this article, we propose a framework for dynamically
altering the privacy policy applied to the monitoring
of a smart house based on the situation within the
environment. Initially the situation, or context,
within the environment is determined; we identify
several factors for determining environmental context,
and propose methods to quantify the context using audio
and binary sensor data. The context is then mapped to
an appropriate privacy policy, which is implemented by
applying data hiding techniques to control access to
data gathered from various information sources. The
significance of this work lies in the examination of
privacy issues related to assisted-living smart house
environments. A single privacy policy in such
applications would be either too restrictive for an
observer, for example, a carer, or too invasive for the
occupants. We address this by proposing a dynamic
method, with the aim of decreasing the invasiveness of
the technology, while retaining the purpose of the
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Assisted living; audio; context aware; privacy;
surveillance and monitoring",
}
@Article{Adams:2008:SUS,
author = "Brett Adams and Dinh Phung and Svetha Venkatesh",
title = "Sensing and using social context",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "11:1--11:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413864",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We present online algorithms to extract social
context: Social spheres are labeled locations of
significance, represented as convex hulls extracted
from GPS traces. Colocation is determined from
Bluetooth and GPS to extract social rhythms, patterns
in time, duration, place, and people corresponding to
real-world activities. Social ties are formulated from
proximity and shared spheres and rhythms. Quantitative
evaluation is performed for 10+ million samples over 45
man-months. Applications are presented with assessment
of perceived utility: {\em Socio-Graph}, a video and
photo browser with filters for social metadata, and
{\em Jive}, a blog browser that uses rhythms to
discover similarity between entries automatically.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Multimedia browsing; social context",
}
@Article{Mohanty:2008:IWB,
author = "Saraju P. Mohanty and Bharat K. Bhargava",
title = "Invisible watermarking based on creation and robust
insertion-extraction of image adaptive watermarks",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "12:1--12:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413865",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel invisible robust
watermarking scheme for embedding and extracting a
digital watermark in an image. The novelty lies in
determining a perceptually important subimage in the
host image. Invisible insertion of the watermark is
performed in the most significant region of the host
image such that tampering of that portion with an
intention to remove or destroy will degrade the
esthetic quality and value of the image. One feature of
the algorithm is that this subimage is used as a region
of interest for the watermarking process and eliminates
the chance of watermark removal. Another feature of the
algorithm is the creation of a compound watermark using
the input user watermark (logo) and attributes of the
host image. This facilitates the homogeneous fusion of
a watermark with the cover image, preserves the quality
of the host image, and allows robust
insertion-extraction. Watermark creation consists of
two distinct phases. During the first phase, a
statistical image is synthesized from a perceptually
important subimage of the image. A compound watermark
is created by embedding a watermark (logo) into the
statistical synthetic image by using a visible
watermarking technique. This compound watermark is
invisibly embedded into the important block of the host
image. The authentication process involves extraction
of the perceptive logo as well as statistical testing for
two-layer evidence. Results of the experimentation
using standard benchmarks demonstrate the robustness
and efficacy of the proposed watermarking approach.
Ownership proof could be established under various
hostile attacks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "content protection; copyright protection; image;
invisible watermarking; Watermarking",
}
@Article{Yiu:2008:ODC,
author = "Wai-Pun Ken Yiu and Shueng-Han Gary Chan",
title = "Offering data confidentiality for multimedia overlay
multicast: {Design} and analysis",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413866",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Application layer multicast (ALM) has been proposed to
overcome current limitations in IP multicast for
large-group multimedia communication. We address
offering data confidentiality tailored for ALM. To
achieve confidentiality, a node may need to
continuously {\em re-encrypt\/} packets before
forwarding them downstream. Furthermore, keys have to
be changed whenever there is a membership change,
leading to {\em rekey\/} processing overhead at the
nodes. For a large and dynamic group, these
reencryption and rekeying operations incur high
processing overhead at the nodes. We propose and
analyze a scalable scheme called Secure Overlay
Multicast (SOM) which clusters ALM peers so as to
localize rekeying within a cluster and to limit
re-encryption at cluster boundaries, thereby minimizing
the total nodal processing overhead. We describe the
operations of SOM and compare its nodal processing
overhead with two other basic approaches, namely,
host-to-host encryption and whole group encryption. We
also present a simplified analytic model for SOM and
show that there exists an optimal cluster size to
minimize the total nodal processing overhead. By
comparing with a recently proposed ALM scheme (DT
protocol), SOM achieves a substantial reduction in
nodal processing overhead with similar network
performance in terms of network stress and delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Key management; multicast security; overlay multicast;
performance analysis",
}
@Article{Nakayama:2008:ECR,
author = "Minoru Nakayama and Yosiyuki Takahasi",
title = "Estimation of certainty for responses to
multiple-choice questionnaires using eye movements",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "14:1--14:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413867",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "To examine the feasibility of estimating the degree of
strength of belief (SOB) of responses using eye
movements, the scan paths of eye movements were
analyzed while subjects reviewed their own responses to
multiple choice tasks. All fixation points of eye
movements were classified into visual areas, or cells,
which corresponded with the positions of answers. Two
estimation procedures are proposed using eye-movement
data. The first one is identifying SOB using scan-path
transitions. By comparing subjects' reports of high and
low SOB and eye-movement estimations, a significant
correct rate of discrimination of SOB was observed.
When the threshold of discrimination was controlled, a
high rate of correct responses was obtained if it was
set at a low level.\par
The second procedure is conducting SOB discrimination
using support vector machines (SVM) trained with
features of fixations. Subjects' gazing features were
analyzed while they reviewed their own responses. A
discrimination model for SOB was trained with several
combinations of features to see whether performance of
a significant level could be obtained. As a result, a
trained model with 3 features (which consist of
interval time, vertical difference, and length between
fixations) can provide significant discrimination
performance for SOB.\par
These results provide evidence that strength of belief
can be estimated using eye movements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "certainty; Eye-movements; scan-path analysis; support
vector machines",
}
@Article{Shipman:2008:AVG,
author = "Frank Shipman and Andreas Girgensohn and Lynn Wilcox",
title = "Authoring, viewing, and generating hypervideo: an
overview of {Hyper-Hitchcock}",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "15:1--15:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413868",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Hyper-Hitchcock consists of three components for
creating and viewing a form of interactive video called
detail-on-demand video: a hypervideo editor, a
hypervideo player, and algorithms for automatically
generating hypervideo summaries. Detail-on-demand video
is a form of hypervideo that supports one hyperlink at
a time for navigating between video sequences. The
Hyper-Hitchcock editor enables authoring of
detail-on-demand video without programming and uses
video processing to aid in the authoring process. The
Hyper-Hitchcock player uses labels and keyframes to
support navigation through and back hyperlinks.
Hyper-Hitchcock includes techniques for automatically
generating hypervideo summaries of one or more videos
that take the form of multiple linear summaries of
different lengths with links from the shorter to the
longer summaries. User studies on authoring and viewing
provided insight into the various roles of links in
hypervideo and found that player interface design
greatly affects people's understanding of hypervideo
structure and the video they access.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Hypervideo; link generation; video editing; video
summarization",
}
@Article{He:2008:EED,
author = "Wenbo He and Klara Nahrstedt and Xue Liu",
title = "End-to-end delay control of multimedia applications
over multihop wireless links",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "16:1--16:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413869",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The proliferation of multimedia applications over
mobile, resource-constrained wireless networks has
raised the need for techniques that adapt these
applications both to clients' Quality of Service (QoS)
requirements and to network resource constraints. This
article investigates the upper-layer adaptation
mechanisms to achieve end-to-end delay control for
multimedia applications. The proposed adaptation
approach spans application layer, middleware layer and
network layer. In application layer, the requirement
adaptor dynamically changes the requirement levels
according to end-to-end delay measurement and
acceptable QoS requirements for the end-users. In
middleware layer, the priority adaptor is used to
dynamically adjust the service classes for applications
using feedback control theory. In network layer, the
service differentiation scheduler assigns different
network resources (e.g., bandwidth) to different
service classes. With the coordination of these three
layers, our approach can adaptively assign resources to
multimedia applications. To evaluate the impact of our
adaptation scheme, we built a real IEEE 802.11 ad hoc
network testbed. The test-bed experiments show that the
proposed upper-layer adaptation for end-to-end delay
control successfully adjusts multimedia applications to
meet delay requirements in many scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "End-to-end delay QoS; wireless ad hoc networks",
}
@Article{Pan:2008:CBM,
author = "Leon Pan and Chang N. Zhang",
title = "A criterion-based multilayer access control approach
for multimedia applications and the implementation
considerations",
journal = j-TOMCCAP,
volume = "5",
number = "2",
pages = "17:1--17:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1413862.1413870",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:17 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, a novel criterion-based multilayer
access control (CBMAC) approach is presented to enhance
existing access control models such as Role-Based,
Mandatory, and Discretionary Access Control models to
support multilayer (multilevel) access control. The
proposed approach is based on a set of predefined
security criteria which are extracted from
authorization rules. The security attributes of objects
and users are specified by security criterion
expressions (serving as locks) and the elements
(serving as keys) of security criterion subsets,
respectively. An object embedded with a number of
security criterion expressions becomes a secure object
while a user associated with a security criterion
subset is called a secure user. The multilayer access
control is achieved by evaluating the embedded security
criterion expressions (actuating locks) by the elements
(keys) in a user's security criterion subset. The paper
also provides the details of integrating the proposed
approach with existing access control models and
presents the implementation considerations of
Criterion-Based Role-Based Multilayer Access Control,
the integration of CBMAC and Role-Based Access
Control.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Multilayer access control; secure object; secure
permission; secure user; security criterion",
}
@Article{Candan:2009:ISS,
author = "K. Sel{\c{c}}uk Candan and Alberto {Del Bimbo} and
Carsten Griwodz and Alejandro Jaimes",
title = "Introduction to the special section for the best
papers of {ACM Multimedia 2008}",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556135",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cesar:2009:FTE,
author = "Pablo Cesar and Dick C. A. Bulterman and Jack Jansen
and David Geerts and Hendrik Knoche and William
Seager",
title = "Fragment, tag, enrich, and send: {Enhancing} social
sharing of video",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556136",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The migration of media consumption to personal
computers retains distributed social viewing, but only
via nonsocial, strictly personal interfaces. This
article presents an architecture and implementation
for media sharing that allows for enhanced social
interactions among users. Using a mixed-device model,
our work allows targeted, personalized enrichment of
content. All recipients see common content, while
differentiated content is delivered to individuals via
their personal secondary screens. We describe the
goals, architecture, and implementation of our system
in this article. In order to validate our results, we
also present results from two user studies involving
disjoint sets of test participants.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Asynchronous media sharing; differentiated content
enrichment; secondary screens",
}
@Article{Knoche:2009:BPS,
author = "H. Knoche and M. A. Sasse",
title = "The big picture on small screens delivering acceptable
video quality in mobile {TV}",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "20:1--20:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556137",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mobile TV viewers can change the viewing distance and
(on some devices) scale the picture to their preferred
viewing ratio, trading off size for angular resolution.
We investigated optimal trade-offs between size and
resolution through a series of studies. Participants
selected their preferred size and rated the
acceptability of the visual experience on a 200ppi
device at a 4:3 aspect ratio. They preferred viewing
ratios similar to living room TV setups regardless of
the much lower resolution: at a minimum 14 pixels per
degree. While traveling on trains, people required
videos with a height larger than 35 mm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Mobile multimedia consumption; resolution; size;
trade-off",
}
@Article{Mondet:2009:CPP,
author = "Sebastien Mondet and Wei Cheng and Geraldine Morin and
Romulus Grigoras and Frederic Boudon and Wei Tsang
Ooi",
title = "Compact and progressive plant models for streaming in
networked virtual environments",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556138",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Just as in the real world, plants are important
objects in virtual worlds for creating pleasant and
realistic environments, especially those involving
natural scenes. As such, much effort has been made in
realistic modeling of plants. As the trend moves
towards networked and distributed virtual environments,
however, the current models are inadequate as they are
not designed for progressive transmissions. In this
article, we fill in this gap by proposing a progressive
representation for plants based on generalized
cylinders. We model the shape and thickness of branches
in a plant as B{\'e}zier curves, group the curves
according to the similarity, and differentially code
the curves to represent the plant in a compact and
progressive manner. To facilitate the transmission of
the plants, we quantify the visual contribution of each
branch and use this weight in packet scheduling. We
show the efficiency of our representations and the
effectiveness of our packet scheduler through
experiments over a wide area network.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "multiresolution; networked virtual environment; plant
models; progressive coding; progressive transmission;
Streaming",
}
@Article{Wei:2009:CCM,
author = "Yong Wei and Suchendra M. Bhandarkar and Kang Li",
title = "Client-centered multimedia content adaptation",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556139",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The design and implementation of a client-centered
multimedia content adaptation system suitable for a
mobile environment comprising resource-constrained
handheld devices or clients is described. The primary
contributions of this work are: (1) the overall
architecture of the client-centered content adaptation
system, (2) a data-driven multi-level Hidden Markov
model (HMM)-based approach to perform both video
segmentation and video indexing in a single pass, and
(3) the formulation and implementation of a
Multiple-choice Multidimensional Knapsack Problem
(MMKP)-based video personalization strategy. In order
to segment and index video data, a video stream is
modeled at both the semantic unit level and video
program level. These models are learned entirely from
training data and no domain-dependent knowledge about
the structure of video programs is used. This makes the
system capable of handling various kinds of videos
without having to manually redefine the program model.
The proposed MMKP-based personalization strategy is
shown to include more relevant video content in
response to the client's request than the existing 0/1
knapsack problem and fractional knapsack problem-based
strategies, and is capable of satisfying multiple
client-side constraints simultaneously. Experimental
results on CNN news videos and Major League Soccer
(MLS) videos are presented and analyzed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "hidden Markov models; multiple choice multidimensional
knapsack problem; video indexing; Video
personalization",
}
@Article{Sivaram:2009:DMS,
author = "G. S. V. S. Sivaram and Mohan S. Kankanhalli and K. R.
Ramakrishnan",
title = "Design of multimedia surveillance systems",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556140",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article addresses the problem of how to select
the optimal combination of sensors and how to determine
their optimal placement in a surveillance region in
order to meet the given performance requirements at a
minimal cost for a multimedia surveillance system. We
propose to solve this problem by obtaining a
performance vector, with its elements representing the
performances of subtasks, for a given input combination
of sensors and their placement. Then we show that the
optimal sensor selection problem can be converted into
the form of Integer Linear Programming problem (ILP) by
using a linear model for computing the optimal
performance vector corresponding to a sensor
combination. The optimal performance vector corresponding
to a sensor combination refers to the performance
vector corresponding to the optimal placement of a
sensor combination. To demonstrate the utility of our
technique, we design and build a surveillance system
consisting of PTZ (Pan-Tilt-Zoom) cameras and active
motion sensors for capturing faces. Finally, we show
experimentally that optimal placement of sensors based
on the design maximizes the system performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Performance vector; sensor selection and placement",
}
@Article{Liu:2009:SSE,
author = "Xiaotao Liu and Mark Corner and Prashant Shenoy",
title = "{\em {SEVA\/}}: {Sensor-enhanced} video annotation",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "24:1--24:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556141",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we study how a sensor-rich world can
be exploited by digital recording devices such as
cameras and camcorders to improve a user's ability to
search through a large repository of image and video
files. We design and implement a digital recording
system that records identities and locations of objects
(as advertised by their sensors) along with visual
images (as recorded by a camera). The process, which we
refer to as {\em Sensor-Enhanced Video Annotation
(SEVA)}, combines a series of correlation,
interpolation, and extrapolation techniques. It
produces a tagged stream that later can be used to
efficiently search for videos or frames containing
particular objects or people. We present detailed
experiments with a prototype of our system using both
stationary and mobile objects as well as GPS and
ultrasound. Our experiments show that: (i) SEVA has
zero error rates for static objects, except very close
to the boundary of the viewable area; (ii) for moving
objects or a moving camera, SEVA only misses objects
leaving or entering the viewable area by 1--2 frames;
(iii) SEVA can scale to 10 fast-moving objects using
current sensor technology; and (iv) SEVA runs online
using relatively inexpensive hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "context-based retrieval; location-based services;
sensor-enhanced; Video annotation",
}
@Article{Wang:2009:MLS,
author = "Bing Wang and Wei Wei and Zheng Guo and Don Towsley",
title = "Multipath live streaming via {TCP}: {Scheme},
performance and benefits",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "25:1--25:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556142",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Motivated by the wide use of TCP for multimedia
streaming in practice and the increasing availability
of multipath between end hosts, we study multipath live
streaming via TCP in this article. We first design a
simple and practical TCP-based multipath streaming
scheme, named {\em Dynamic MPath-streaming
(DMP-streaming)}, which dynamically distributes packets
over multiple paths by {\em implicitly inferring\/} the
available bandwidths on these paths. To allow
systematic performance study, we develop an analytical
model for DMP-streaming and validate the model using
extensive {\em ns\/} simulation and Internet
experiments. We explore the parameter space of this
model and find that DMP-streaming generally provides
satisfactory performance when the aggregate achievable
TCP throughput is 1.6 times the video bitrate and a
few seconds of startup delay is allowed. Lastly, we
comment on the benefits of using multipath versus
single path for TCP-based streaming.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "multimedia streaming; Performance modeling",
}
@Article{Li:2009:PBR,
author = "Mingzhe Li and Mark Claypool and Robert Kinicki",
title = "Playout buffer and rate optimization for streaming
over {IEEE 802.11} wireless networks",
journal = j-TOMCCAP,
volume = "5",
number = "3",
pages = "26:1--26:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1556134.1556143",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:52:39 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Most streaming rate selection and buffer optimization
algorithms are developed for wired networks and can
perform poorly over wireless networks. Wireless MAC
layer behavior, such as rate adaptation,
retransmissions, and medium sharing, can significantly
degrade the effectiveness of current streaming
algorithms. This article presents the Buffer and Rate
Optimization for Streaming (BROS) algorithm to improve
streaming performance. BROS uses a bandwidth estimation
tool designed specifically for wireless networks and
models the relationship between buffer size, streaming
data rate, and available bandwidth distribution. BROS
optimizes the streaming data rate and initial buffer
size, resulting in a high data rate but with few frame
losses and buffer underflow events, while still keeping
a small initial buffer delay. BROS is implemented in
the Emulated Streaming (EmuS) client-server system and
evaluated on an IEEE 802.11 wireless testbed with
various wireless conditions. The evaluation shows that
BROS can effectively optimize the streaming rate and
initial buffer size based on wireless network bandwidth
conditions, thus achieving better performance than
static rate or buffer selection and jitter removal
buffers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Multimedia networking; playout buffer; streaming rate;
wireless networks",
}
@Article{Sauer:2009:MDC,
author = "Danielle Sauer and Yee-Hong Yang",
title = "Music-driven character animation",
journal = j-TOMCCAP,
volume = "5",
number = "4",
pages = "27:1--27:??",
month = oct,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1596990.1596991",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Music-driven character animation extracts musical
features from a song and uses them to create an
animation. This article presents a system that builds a
new animation directly from musical attributes, rather
than simply synchronizing it to the music like similar
systems. Using a simple script that identifies the
movements involved in the performance and their timing,
the user can easily control the animation of
characters. Another unique feature of the system is its
ability to incorporate multiple characters into the
same animation, both with synchronized and
unsynchronized movements. A system that integrates
Celtic dance movements is developed in this article. An
evaluation of the results shows that the majority of
animations are found to be appealing to viewers and
that altering the music can change the attractiveness
of the final result.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Character animation; motion synthesis; music analysis;
primitive movements",
}
@Article{Deng:2009:SCA,
author = "Robert H. Deng and Yanjiang Yang",
title = "A study of content authentication in proxy-enabled
multimedia delivery systems: {Model}, techniques, and
applications",
journal = j-TOMCCAP,
volume = "5",
number = "4",
pages = "28:1--28:??",
month = oct,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1596990.1596992",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Compared with the direct server-user approach, the
server-proxy-user architecture for multimedia delivery
promises significantly improved system scalability. The
introduction of the intermediary transcoding proxies
between content servers and end users in this
architecture, however, brings unprecedented challenges
to content security. In this article, we present a
systematic study on the end-to-end content
authentication problem in the server-proxy-user
context, where intermediary proxies transcode
multimedia content dynamically. We present a formal
model for the authentication problem, propose a
concrete construction for authenticating generic data
modality and formally prove its security. We then apply
the generic construction to authenticating specific
multimedia formats, for example, JPEG2000 code-streams
and MPEG-4 video streams. The prototype implementation
shows that our scheme is suitable for practical
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "end-to-end authentication; Multimedia content
delivery; security",
}
@Article{Cha:2009:TVS,
author = "Jongeun Cha and Mohamad Eid and Abdulmotaleb {El
Saddik}",
title = "Touchable {$3$D} video system",
journal = j-TOMCCAP,
volume = "5",
number = "4",
pages = "29:1--29:??",
month = oct,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1596990.1596993",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Multimedia technologies are reaching the limits of
providing audio-visual media that viewers consume
passively. An important factor, which will ultimately
enhance the user's experience in terms of
impressiveness and immersion, is interaction. Among
daily life interactions, haptic interaction plays a
prominent role in enhancing the quality of experience
of users, and in promoting physical and emotional
development. Therefore, a critical step in multimedia
research is expected to bring the sense of touch, or
haptics, into multimedia systems and applications. This
article proposes a touchable 3D video system where
viewers can actively touch a video scene through a
force-feedback device, and presents the underlying
technologies in three functional components: (1)
contents generation, (2) contents transmission, and (3)
viewing and interaction. First of all, we introduce a
depth image-based haptic representation (DIBHR) method
that adds haptic and heightmap images, in addition to
the traditional depth image-based representation
(DIBR), to encode the haptic surface properties of the
video media. In this representation, the haptic image
contains the stiffness, static friction, and dynamic
friction, whereas the heightmap image contains
roughness of the video contents. Based on this
representation method, we discuss how to generate
synthetic and natural (real) video media through a 3D
modeling tool and a depth camera, respectively. Next,
we introduce a transmission mechanism based on the
MPEG-4 framework where new MPEG-4 BIFS nodes are
designed to describe the haptic scene. Finally, a
haptic rendering algorithm to compute the interaction
force between the scene and the viewer is described. As
a result, the performance of the haptic rendering
algorithm is evaluated in terms of computational time
and smooth contact force. It operates marginally within
the 1 kHz update rate required to provide stable
interaction force, and it uses a median filter to
provide smoother contact force with depth images that
contain high-frequency geometrical noise.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "haptic rendering algorithm; Haptic surface properties;
video representation",
}
@Article{Benevenuto:2009:VIO,
author = "Fabr{\'\i}cio Benevenuto and Tiago Rodrigues and
Virgilio Almeida and Jussara Almeida and Keith Ross",
title = "Video interactions in online video social networks",
journal = j-TOMCCAP,
volume = "5",
number = "4",
pages = "30:1--30:??",
month = oct,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1596990.1596994",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article characterizes video-based interactions
that emerge from YouTube's video response feature,
which allows users to discuss themes and to provide
reviews for products or places using much richer media
than text. Based on crawled data covering a
representative subset of videos and users, we present a
characterization from two perspectives: the video
response view and the interaction network view. In
addition to providing valuable statistical models for
various characteristics, our study uncovers typical
user behavioral patterns in video-based environments
and shows evidence of opportunistic behavior.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "opportunistic behavior; promotion; social media;
social networks; video communication; Video
interactions; video spam; YouTube",
}
@Article{Erdmann:2009:IEB,
author = "Maike Erdmann and Kotaro Nakayama and Takahiro Hara
and Shojiro Nishio",
title = "Improving the extraction of bilingual terminology from
{Wikipedia}",
journal = j-TOMCCAP,
volume = "5",
number = "4",
pages = "31:1--31:??",
month = oct,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1596990.1596995",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:03 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Research on the automatic construction of bilingual
dictionaries has achieved impressive results. Bilingual
dictionaries are usually constructed from parallel
corpora, but since these corpora are available only for
selected text domains and language pairs, the potential
of other resources is being explored as well.\par
In this article, we want to further pursue the idea of
using Wikipedia as a corpus for bilingual terminology
extraction. We propose a method that extracts
term-translation pairs from different types of
Wikipedia link information. After that, an SVM
classifier trained on the features of manually labeled
training data determines the correctness of unseen
term-translation pairs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Bilingual dictionary; link analysis; Wikipedia
mining",
}
@Article{Carlsson:2010:SSL,
author = "Niklas Carlsson and Derek L. Eager",
title = "Server selection in large-scale video-on-demand
systems",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671955",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video on demand, particularly with user-generated
content, is emerging as one of the most
bandwidth-intensive applications on the Internet. Owing
to content control and other issues, some
video-on-demand systems attempt to prevent downloading
and peer-to-peer content delivery. Instead, such
systems rely on server replication, such as via
third-party content distribution networks, to support
video streaming (or pseudostreaming) to their clients.
A major issue with such systems is the cost of the
required server resources.\par
By synchronizing the video streams for clients that
make closely spaced requests for the same video from
the same server, server costs (such as for retrieval of
the video data from disk) can be amortized over
multiple requests. A fundamental trade-off then arises,
however, with respect to server selection. Network
delivery cost is minimized by selecting the {\em
nearest\/} server, while server cost is minimized by
directing closely spaced requests for the same video to
a {\em common\/} server.\par
This article compares classes of server selection
policies within the context of a simple system model.
We conclude that: (i) server selection using dynamic
system state information (rather than only proximities
and average loads) can yield large improvements in
performance, (ii) deferring server selection for a
request as late as possible (i.e., until just before
streaming is to begin) can yield additional large
improvements, and (iii) within the class of policies
using dynamic state information and deferred selection,
policies using only ``local'' (rather than global)
request information are able to achieve most of the
potential performance gains.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "content distribution networks; modeling; Performance
analysis; server selection; video-on-demand",
}
@Article{Agarwal:2010:BRW,
author = "Parag Agarwal and Balakrishnan Prabhakaran",
title = "Blind robust watermarking of {$3$D} motion data",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671956",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The article addresses the problem of copyright
protection for 3D motion-captured data by designing a
robust blind watermarking mechanism. The mechanism
segments motion capture data and identifies clusters of
3D points per segment. A watermark can be embedded and
extracted within these clusters by using a proposed
extension of 3D quantization index modulation. The
watermarking scheme is blind in nature and the encoded
watermarks are shown to be imperceptible, and secure.
The resulting hiding capacity has bounds based on
cluster size. The watermarks are shown to be robust
against attacks such as uniform affine transformations
(scaling, rotation, and translation), cropping,
reordering, and noise addition. The time complexity for
watermark embedding and extraction is estimated as
O({\em n\/} log {\em n\/}) and O({\em n\/}$^2$ log {\em
n\/}), respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "blind; decoding; encoding; spatial; Watermarking",
}
@Article{Yang:2010:DMD,
author = "Bo Yang",
title = "{DSI}: a model for distributed multimedia semantic
indexing and content integration",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671957",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Considerable research has been done on the
content-based multimedia delivery and access in
distributed data repositories. As noted in the
literature, there is always a trade-off between
multimedia quality and access speed. In addition, the
overall performance is greatly determined by the
distribution of the multimedia data. In this article,
an unsupervised multimedia semantic integration
approach for a distributed infrastructure, the
Distributed Semantic Indexing (DSI), is presented that
addresses both the data quality and search performance.
With the ability to summarize content information and
guide data distribution, the proposed approach is
distinguished by: (1) logic-based representation and
concise abstraction of the semantic contents of
multimedia data, which are further integrated to form a
general overview of a multimedia data repository ---
content signature; (2) application of linguistic
relationships to construct a hierarchical metadata
based on the content signatures allowing imprecise
queries; and (3) achieving the optimal performance in
terms of search cost. The fundamental structure of the
proposed model is presented. The proposed scheme has
been simulated and the simulation results are analyzed
and compared against several other approaches that have
been advocated in the literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "distributed indexing; image retrieval; Semantic
representation",
}
@Article{Nystrom:2010:ECO,
author = "Marcus Nystr{\"o}m and Kenneth Holmqvist",
title = "Effect of compressed offline foveated video on viewing
behavior and subjective quality",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671958",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Offline foveation is a technique to improve the
compression efficiency of digitized video. The general
idea behind offline foveation is to blur video regions
where few or no previewers look, without decreasing
the subjective quality for later viewers. It
relies on the fact that peripheral vision is reduced
compared to central vision, and the observation that
during free-viewing humans' gaze positions generally
coincide when watching video. In this article, we
conduct two experiments to assess how offline foveation
affects viewing behavior and subjective quality. In the
first experiment, 15 subjects free-viewed six video
clips before and after offline foveation whereas in the
second experiment we had 17 subjects assessing the
quality of these videos after one, two, and three
consecutive viewings. Eye movements were measured
during the experiments. Results showed that, although
offline foveation prior to encoding with H.264 yielded
data reductions up to 52\% (20\% average) on the tested
videos, it had little or no effect on where people
looked, their intersubject dispersion, fixation
duration, saccade amplitude, or the experienced quality
during first-time viewing. However, seeing the videos
more than once increased the intersubject dispersion
and decreased the subjective quality. In view of these
results, we discuss the usage of offline foveated video
in practical applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Eye-tracking; foveation; subjective quality; video
compression",
}
@Article{Ivanov:2010:RTH,
author = "Yuri V. Ivanov and C. J. Bleakley",
title = "Real-time {H.264} video encoding in software with fast
mode decision and dynamic complexity control",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671959",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel real-time algorithm for
reducing and dynamically controlling the computational
complexity of an H.264 video encoder implemented in
software. A fast mode decision algorithm, based on a
Pareto-optimal macroblock classification scheme, is
combined with a dynamic complexity control algorithm
that adjusts the MB class decisions such that a
constant frame rate is achieved. The average coding
efficiency of the proposed algorithm was found to be
similar to that of conventional encoding operating at
half the frame rate. The proposed algorithm was found
to provide lower average bitrate and distortion than
static complexity scaling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "complexity; complexity control; fast mode decision;
H/264/AVC; mode decision; rate distortion; real time",
}
@Article{Hefeeda:2010:ASM,
author = "Mohamed Hefeeda and Kianoosh Mokhtarian",
title = "Authentication schemes for multimedia streams:
{Quantitative} analysis and comparison",
journal = j-TOMCCAP,
volume = "6",
number = "1",
pages = "6:1--6:??",
month = feb,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671954.1671960",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Mar 16 18:53:23 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the rapid increase in the demand for multimedia
services, securing the delivery of multimedia content
has become an important issue. Accordingly, the problem
of multimedia stream authentication has received
considerable attention by previous research and various
solutions have been proposed. However, these solutions
have not been rigorously analyzed and contrasted to
each other, and thus their relative suitability for
different streaming environments is not clear. This
article presents comprehensive analysis and comparison
among different schemes proposed in the literature to
authenticate multimedia streams. Authentication schemes
for nonscalable and scalable multimedia streams are
analyzed. To conduct this analysis, we define five
important performance metrics, which are computation
cost, communication overhead, receiver buffer size,
delay, and tolerance to packet losses. We derive
analytic formulas for these metrics for all considered
authentication schemes to numerically analyze their
performance. In addition, we implement all schemes in a
simulator to study and compare their performance in
different environments. The parameters for the
simulator are carefully chosen to mimic realistic
settings. We draw several conclusions on the advantages
and disadvantages of each scheme. We extend our
analysis to authentication techniques for scalable
streams. We pay careful attention to the flexibility of
scalable streams and analyze its impacts on the
authentication schemes. Our analysis and comparison
reveal the merits and shortcomings of each scheme,
provide guidelines on choosing the most appropriate
scheme for a given multimedia streaming application,
and could stimulate designing new authentication
schemes or improving existing ones. For example, our
detailed analysis has led us to design a new
authentication scheme that combines the best features
of two previous schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "authentication schemes; Multimedia authentication;
multimedia security; multimedia streaming; scalable
coding; secure streaming",
}
@Article{Yang:2010:EMP,
author = "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and
Gregorij Kurillo and Ruzena Bajcsy",
title = "Enabling multi-party {$3$D} tele-immersive
environments with {{\em ViewCast}}",
journal = j-TOMCCAP,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671962.1671963",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Aug 14 17:17:15 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Three-dimensional tele-immersive (3DTI) environments
have great potential to promote collaborative work
among geographically distributed users. However, most
existing 3DTI systems only work with two sites due to
the huge demand for resources and the lack of a simple
yet powerful networking model to handle connectivity,
scalability, and quality-of-service (QoS)
guarantees.\par
In this article, we explore the design space from the
angle of multi-stream management to enable multi-party
3DTI communication. Multiple correlated 3D video
streams are employed to provide a comprehensive
representation of the physical scene in each 3DTI
environment, and are rendered together to establish a
common cyberspace among all participating 3DTI
environments. The existence of multi-stream correlation
provides the unique opportunity for new approaches in
QoS provisioning. Previous work mostly concentrated on
compression and adaptation techniques on the per-stream
basis while ignoring the application layer semantics
and the coordination required among streams. We propose
an innovative and generalized {\em ViewCast\/} model to
coordinate the multi-stream content dissemination over
an overlay network. ViewCast leverages view semantics
in 3D free-viewpoint video systems to fill the gap
between high-level user interest and low-level stream
management. In ViewCast, only the view information is
specified by the user/application, while the underlying
control dynamically performs stream differentiation,
selection, coordination, and dissemination. We present
the details of ViewCast and evaluate it through both
simulation and 3DTI sessions among tele-immersive
environments residing in different institutes across
the Internet2. Our experimental results demonstrate the
implementation feasibility and performance enhancement
of ViewCast in supporting multi-party 3DTI
collaboration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "3D tele-immersion; application level multicast;
distributed multimedia system; multi-stream
coordination; networking protocol; QoS adaptation",
}
@Article{Wu:2010:ELT,
author = "Junwen Wu and Mohan M. Trivedi",
title = "An eye localization, tracking and blink pattern
recognition system: {Algorithm} and evaluation",
journal = j-TOMCCAP,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671962.1671964",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Aug 14 17:17:15 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This study is to investigate the fundamental problems
of, (1) facial feature detection and localization,
especially eye features; and (2) eye dynamics,
including tracking and blink detection. We first
describe our contribution to eye localization.
Following that, we discuss a simultaneous eye tracking
and blink detection system. Facial feature detection is
solved in a general object detection framework and its
performance for eye localization is presented. A binary
tree representation based on feature dependency
partitions the object feature space in a coarse to fine
manner. In each compact feature subspace, independent
component analysis (ICA) is used to get the independent
sources, whose probability density functions (PDFs) are
modeled by Gaussian mixtures. When applying this
representation for the task of eye detection, a
subwindow is used to scan the entire image and each
obtained image patch is examined using Bayesian
criteria to determine the presence of an eye subject.
After the eyes are automatically located with binary
tree-based probability learning, interactive particle
filters are used for simultaneously tracking the eyes
and detecting the blinks. The particle filters use
classification-based observation models, in which the
posterior probabilities are evaluated by logistic
regressions in tensor subspaces. Extensive experiments
are used to evaluate the performance from two aspects:
(1) blink detection rate and the accuracy of blink
duration in terms of frame numbers; and (2) eye
tracking accuracy. We also present an experimental
setup for obtaining the benchmark data in tracking
accuracy evaluation. The experimental evaluation
demonstrates the capability of this approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Eye blink detection; human computer interface;
particle filtering; video processing",
}
@Article{Jin:2010:DMN,
author = "Xing Jin and S.-H. Gary Chan",
title = "Detecting malicious nodes in peer-to-peer streaming by
peer-based monitoring",
journal = j-TOMCCAP,
volume = "6",
number = "2",
pages = "9:1--9:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671962.1671965",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Aug 14 17:17:15 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Current peer-to-peer (P2P) streaming systems often
assume that nodes cooperate to upload and download
data. However, in the open environment of the Internet,
this is not necessarily true and there exist malicious
nodes in the system. In this article, we study
malicious actions of nodes that can be detected through
peer-based monitoring. We require each node to monitor
the data received and to periodically send monitoring
messages about its neighbors to some trustworthy nodes.
To efficiently store and search messages among multiple
trustworthy nodes, we organize trustworthy nodes into a
threaded binary tree. Trustworthy nodes also
dynamically redistribute monitoring messages among
themselves to achieve load balancing. Our simulation
results show that this scheme can efficiently detect
malicious nodes with high accuracy, and that the
dynamic redistribution method can achieve good load
balancing among trustworthy nodes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Malicious nodes; peer monitoring; peer-to-peer
streaming",
}
@Article{Chiu:2010:FMH,
author = "Chih-Yi Chiu and Hsin-Min Wang and Chu-Song Chen",
title = "Fast min-hashing indexing and robust spatio-temporal
matching for detecting video copies",
journal = j-TOMCCAP,
volume = "6",
number = "2",
pages = "10:1--10:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671962.1671966",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Aug 14 17:17:15 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The increase in the number of video copies, both legal
and illegal, has become a major problem in the
multimedia and Internet era. In this article, we
propose a novel method for detecting various video
copies in a video sequence. To achieve fast and robust
detection, the method fully integrates several
components, namely the min-hashing signature to
compactly represent a video sequence, a spatio-temporal
matching scheme to accurately evaluate video similarity
compiled from the spatial and temporal aspects, and
some speedup techniques to expedite both min-hashing
indexing and spatio-temporal matching. The results of
experiments demonstrate that, compared to several
baseline methods with different feature descriptors and
matching schemes, the proposed method which combines
both global and local feature descriptors yields the
best performance when encountering a variety of video
transformations. The method is very fast, requiring
approximately 0.06 seconds to search for copies of a
thirty-second video clip in a six-hour video
sequence.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Content-based copy detection; histogram pruning;
near-duplicate",
}
@Article{Sarhan:2010:WTP,
author = "Nabil J. Sarhan and Mohammad A. Alsmirat and Musab
Al-Hadrusi",
title = "Waiting-time prediction in scalable on-demand video
streaming",
journal = j-TOMCCAP,
volume = "6",
number = "2",
pages = "11:1--11:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1671962.1671967",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Sat Aug 14 17:17:15 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Providing video streaming users with expected waiting
times enhances their perceived quality-of-service (QoS)
and encourages them to wait. In the absence of any
waiting-time feedback, users are more likely to defect
because of the uncertainty as to when their services
will start. We analyze waiting-time predictability in
scalable video streaming. We propose two prediction
schemes and study their effectiveness when applied with
various stream merging techniques and scheduling
policies. The results demonstrate that the waiting time
can be predicted accurately, especially when enhanced
cost-based scheduling is applied. The combination of
waiting-time prediction and cost-based scheduling leads
to outstanding performance benefits.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
keywords = "Scheduling; stream merging; time-of-service
guarantees; video streaming; waiting-time prediction",
}
@Article{Xu:2010:IBP,
author = "Changsheng Xu and Eckehard Steinbach and Abdulmotaleb
{El Saddik} and Michelle Zhou",
title = "Introduction to the best papers of {ACM Multimedia
2009}",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1830482",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zha:2010:VQS,
author = "Zheng-Jun Zha and Linjun Yang and Tao Mei and Meng
Wang and Zengfu Wang and Tat-Seng Chua and Xian-Sheng
Hua",
title = "Visual query suggestion: {Towards} capturing user
intent in {Internet} image search",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823747",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jiang:2010:AVA,
author = "Wei Jiang and Courtenay Cotton and Shih-Fu Chang and
Dan Ellis and Alexander C. Loui",
title = "Audio-visual atoms for generic video concept
classification",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823748",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{DeOliveira:2010:LND,
author = "Rodrigo {De Oliveira} and Mauro Cherubini and Nuria
Oliver",
title = "Looking at near-duplicate videos from a human-centric
perspective",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "15:1--15:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823749",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yin:2010:LEC,
author = "Hao Yin and Xuening Liu and Tongyu Zhan and Vyas Sekar
and Feng Qiu and Chuang Lin and Hui Zhang and Bo Li",
title = "{LiveSky}: {Enhancing} {CDN} with {P2P}",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "16:1--16:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823750",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Money:2010:EEL,
author = "Arthur G. Money and Harry Agius",
title = "{ELVIS}: {Entertainment-Led VIdeo Summaries}",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823751",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hoi:2010:SSD,
author = "Steven C. h. Hoi and Wei Liu and Shih-Fu Chang",
title = "Semi-supervised distance metric learning for
collaborative image retrieval and clustering",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823752",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Maddage:2010:WLA,
author = "Namunu C. Maddage and Khe Chai Sim and Haizhou Li",
title = "Word level automatic alignment of music and lyrics
using vocal synthesis",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823753",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Qudah:2010:EDD,
author = "Bashar Qudah and Nabil J. Sarhan",
title = "Efficient delivery of on-demand video streams to
heterogeneous receivers",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "20:1--20:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823754",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gomes:2010:STA,
author = "Jo{\~a}o V. P. Gomes and Pedro R. M. In{\'a}cio and
Branka Lakic and M{\'a}rio M. Freire and Henrique J. A.
Da Silva and Paulo P. Monteiro",
title = "Source traffic analysis",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1823755",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Boll:2010:CPA,
author = "Susanne Boll and Jiebo Luo and Ramesh Jain and Dong
Xu",
title = "Call for papers: {ACM Transactions on Multimedia
Computing, Communications and Applications} special
issue on social media",
journal = j-TOMCCAP,
volume = "6",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1823746.1837254",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2010:OOD,
author = "Ralf Steinmetz",
title = "Obituary to our dear friend {Professor Dr. Nicolas D.
Georganas, PhD}",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "23:1--23:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865107",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Haenselmann:2010:FSI,
author = "Thomas Haenselmann",
title = "Foreword to the special issue on multimedia sensor
fusion",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "24:1--24:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865108",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2010:MBA,
author = "Xiangyu Wang and Mohan Kankanhalli",
title = "{MultiFusion}: a boosting approach for multimedia
fusion",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "25:1--25:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865109",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chetty:2010:MSF,
author = "Girija Chetty and Matthew White",
title = "Multimedia sensor fusion for retrieving identity in
biometric access control systems",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "26:1--26:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865110",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Friedland:2010:DAS,
author = "Gerald Friedland and Chuohao Yeo and Hayley Hung",
title = "Dialocalization: {Acoustic} speaker diarization and
visual localization as joint optimization problem",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "27:1--27:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865111",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rahman:2010:SGA,
author = "Abu Saleh Md Mahfujur Rahman and M. Anwar Hossain and
Abdulmotaleb {El Saddik}",
title = "Spatial-geometric approach to physical mobile
interaction based on accelerometer and {IR} sensory
data fusion",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "28:1--28:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865112",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2010:EMT,
author = "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and
Gregorij Kurillo and Ruzena Bajcsy",
title = "Enabling multiparty {$3$D} tele-immersive environments
with {ViewCast}",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "29:1--29:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865113",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Marshall:2010:OCM,
author = "Damien Marshall and S{\'e}amus Mcloone and Tom{\'a}s
Ward",
title = "Optimizing consistency by maximizing bandwidth usage
in distributed interactive applications",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "30:1--30:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865114",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Vu:2010:UOC,
author = "Long Vu and Indranil Gupta and Klara Nahrstedt and Jin
Liang",
title = "Understanding overlay characteristics of a large-scale
peer-to-peer {IPTV} system",
journal = j-TOMCCAP,
volume = "6",
number = "4",
pages = "31:1--31:??",
month = nov,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1865106.1865115",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Tue Nov 23 10:03:16 MST 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Meyer:2011:MRL,
author = "Marek Meyer and Christoph Rensing and Ralf Steinmetz",
title = "Multigranularity reuse of learning resources",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870122",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bouyakoub:2011:SBI,
author = "Samia Bouyakoub and Abdelkader Belkhir",
title = "{SMIL} builder: an incremental authoring tool for
{SMIL Documents}",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870123",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hossain:2011:MAQ,
author = "M. Anwar Hossain and Pradeep K. Atrey and Abdulmotaleb
{El Saddik}",
title = "Modeling and assessing quality of information in
multisensor multimedia monitoring systems",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870124",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhu:2011:NDK,
author = "Jianke Zhu and Steven C. H. Hoi and Michael R. Lyu and
Shuicheng Yan",
title = "Near-duplicate keyframe retrieval by semi-supervised
learning and nonrigid image matching",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870125",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hsu:2011:FCL,
author = "Cheng-Hsin Hsu and Mohamed Hefeeda",
title = "A framework for cross-layer optimization of video
streaming in wireless networks",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870126",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chandra:2011:EAS,
author = "Surendar Chandra and Xuwen Yu",
title = "An empirical analysis of serendipitous media sharing
among campus-wide wireless users",
journal = j-TOMCCAP,
volume = "7",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1870121.1870127",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:41 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gopinathan:2011:OLM,
author = "Ajay Gopinathan and Zongpeng Li",
title = "Optimal layered multicast",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925102",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hsu:2011:USS,
author = "Cheng-Hsin Hsu and Mohamed Hefeeda",
title = "Using simulcast and scalable video coding to
efficiently control channel switching delay in mobile
{TV} broadcast networks",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "8:1--8:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925103",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jin:2011:KDH,
author = "Yohan Jin and Balakrishnan Prabhakaran",
title = "Knowledge discovery from {$3$D} human motion streams
through semantic dimensional reduction",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925104",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheng:2011:MPM,
author = "Wei Cheng and Wei Tsang Ooi and Sebastien Mondet and
Romulus Grigoras and G{\'e}raldine Morin",
title = "Modeling progressive mesh streaming: {Does} data
dependency matter?",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "10:1--10:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925105",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bagchi:2011:FAD,
author = "Susmit Bagchi",
title = "A fuzzy algorithm for dynamically adaptive multimedia
streaming",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "11:1--11:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925106",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hsu:2011:SMV,
author = "Cheng-Hsin Hsu and Mohamed Hefeeda",
title = "Statistical multiplexing of variable-bit-rate videos
streamed to mobile devices",
journal = j-TOMCCAP,
volume = "7",
number = "2",
pages = "12:1--12:??",
month = feb,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1925101.1925107",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Wed Mar 16 09:25:42 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2011:EN,
author = "Ralf Steinmetz",
title = "Editorial notice",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000487",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Korshunov:2011:VQF,
author = "Pavel Korshunov and Wei Tsang Ooi",
title = "Video quality for face detection, recognition, and
tracking",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000488",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2011:PCI,
author = "Pei-Yu Lin and Jung-San Lee and Chin-Chen Chang",
title = "Protecting the content integrity of digital imagery
with fidelity preservation",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "15:1--15:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000489",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{VanLeuken:2011:SVO,
author = "Reinier H. {Van Leuken} and Remco C. Veltkamp",
title = "Selecting vantage objects for similarity indexing",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "16:1--16:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000490",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Feng:2011:SRI,
author = "Wu-Chi Feng and Thanh Dang and John Kassebaum and Tim
Bauman",
title = "Supporting region-of-interest cropping through
constrained compression",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000491",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2011:DBA,
author = "Qingzhong Liu and Andrew H. Sung and Mengyu Qiao",
title = "Derivative-based audio steganalysis",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000492",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2011:GDO,
author = "Frederick W. B. Li and Rynson W. H. Lau and Danny
Kilis and Lewis W. F. Li",
title = "Game-on-demand:: an online game engine based on
geometry streaming",
journal = j-TOMCCAP,
volume = "7",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000486.2000493",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
bibdate = "Mon Sep 5 17:00:22 MDT 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shirmohammadi:2011:IAM,
author = "Shervin Shirmohammadi and Jiebo Luo and Jie Yang and
Abdulmotaleb {El Saddik}",
title = "Introduction to {ACM Multimedia 2010} best paper
candidates",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "20:1--20:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037677",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bhattacharya:2011:HAA,
author = "Subhabrata Bhattacharya and Rahul Sukthankar and
Mubarak Shah",
title = "A holistic approach to aesthetic enhancement of
photographs",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "21:1--21:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037678",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tan:2011:URS,
author = "Shulong Tan and Jiajun Bu and Chun Chen and Bin Xu and
Can Wang and Xiaofei He",
title = "Using rich social media information for music
recommendation via hypergraph model",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "22:1--22:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037679",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Milani:2011:CAE,
author = "Simone Milani and Giancarlo Calvagno",
title = "A cognitive approach for effective coding and
transmission of {$3$D} video",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "23:1--23:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037680",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hong:2011:VAE,
author = "Richang Hong and Meng Wang and Xiao-Tong Yuan and
Mengdi Xu and Jianguo Jiang and Shuicheng Yan and
Tat-Seng Chua",
title = "Video accessibility enhancement for hearing-impaired
users",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "24:1--24:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037681",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Boll:2011:ISI,
author = "Susanne Boll and Ramesh Jain and Jiebo Luo and Dong
Xu",
title = "Introduction to special issue on social media",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "25:1--25:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037682",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2011:EOM,
author = "Yu-Ching Lin and Yi-Hsuan Yang and Homer H. Chen",
title = "Exploiting online music tags for music emotion
classification",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "26:1--26:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037683",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rabbath:2011:ACP,
author = "Mohamad Rabbath and Philipp Sandhaus and Susanne
Boll",
title = "Automatic creation of photo books from stories in
social media",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "27:1--27:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037684",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2011:RAI,
author = "Weiming Hu and Haiqiang Zuo and Ou Wu and Yunfei Chen
and Zhongfei Zhang and David Suter",
title = "Recognition of adult images, videos, and web page
bags",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "28:1--28:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037685",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2011:SSC,
author = "Yu-Ru Lin and K. Sel{\c{c}}cuk Candan and Hari
Sundaram and Lexing Xie",
title = "{SCENT}: {Scalable} compressed monitoring of evolving
multirelational social networks",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "29:1--29:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037686",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sang:2011:BCT,
author = "Jitao Sang and Changsheng Xu",
title = "Browse by chunks: {Topic} mining and organizing on
web-scale social media",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "30:1--30:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037687",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ji:2011:MFL,
author = "Rongrong Ji and Yue Gao and Bineng Zhong and Hongxun
Yao and Qi Tian",
title = "Mining {\tt flickr} landmarks by modeling
reconstruction sparsity",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "31:1--31:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037688",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mandel:2011:CTI,
author = "Michael I. Mandel and Razvan Pascanu and Douglas Eck
and Yoshua Bengio and Luca M. Aiello and Rossano
Schifanella and Filippo Menczer",
title = "Contextual tag inference",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "32:1--32:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037689",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Biel:2011:VCB,
author = "Joan-Isaac Biel and Daniel Gatica-Perez",
title = "{VlogSense}: {Conversational} behavior and social
attention in {YouTube}",
journal = j-TOMCCAP,
volume = "7S",
number = "1",
pages = "33:1--33:??",
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2037676.2037690",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Nov 6 06:36:59 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Anonymous:2011:TCO,
author = "Anonymous",
title = "Table of Contents: Online Supplement Volume {7S},
Number 1",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "34:1--34:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043620",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hong:2011:BSE,
author = "Richang Hong and Jinhui Tang and Hung-Khoon Tan and
Chong-Wah Ngo and Shuicheng Yan and Tat-Seng Chua",
title = "Beyond search: Event-driven summarization for {Web}
videos",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "35:1--35:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043613",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kuo:2011:TPQ,
author = "Wen-Kuang Kuo and Kuo-Wei Wu",
title = "Traffic prediction and {QoS} transmission of real-time
live {VBR} videos in {WLANs}",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "36:1--36:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043614",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Maddage:2011:BSS,
author = "Namunu C. Maddage and Haizhou Li",
title = "Beat space segmentation and octave scale cepstral
feature for sung language recognition in pop music",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "37:1--37:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043615",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Santini:2011:ECQ,
author = "Simone Santini",
title = "Efficient computation of queries on feature streams",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "38:1--38:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043616",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Verdugo:2011:IFC,
author = "Renato Verdugo and Miguel Nussbaum and Pablo Corro and
Pablo Nu{\~n}ez and Paula Navarrete",
title = "Interactive films and coconstruction",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "39:1--39:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043617",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghandeharizadeh:2011:DCC,
author = "Shahram Ghandeharizadeh and Shahin Shayandeh",
title = "Domical cooperative caching for streaming media in
wireless home networks",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "40:1--40:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043618",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghandeharizadeh:2011:CPS,
author = "Shahram Ghandeharizadeh and Shahin Shayandeh",
title = "Call for papers: Special issue on {$3$D} mobile
multimedia",
journal = j-TOMCCAP,
volume = "7",
number = "4",
pages = "41:1--41:??",
month = nov,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043612.2043619",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 15 08:53:32 MST 2011",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2012:ENC,
author = "Ralf Steinmetz",
title = "Editorial note and call for nominations: {Nicolas D.
Georganas} best paper award",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071397",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghinea:2012:SSS,
author = "Georghita Ghinea and Oluwakemi Ademoye",
title = "The sweet smell of success: Enhancing multimedia
applications with olfaction",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071398",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Olfaction, or smell, is one of the last challenges
which multimedia applications have to conquer. As far
as computerized smell is concerned, there are several
difficulties to overcome, particularly those associated
with the ambient nature of smell. In this article, we
present results from an empirical study exploring
users' perception of olfaction-enhanced multimedia
displays. Findings show that olfaction significantly
adds to the user's multimedia experience. Moreover, the use
of olfaction leads to an increased sense of reality and
relevance. Our results also show that users are
tolerant of the interference and distortion effects
caused by the olfactory effect in multimedia.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hefeeda:2012:DET,
author = "Mohamed Hefeeda and Cheng-Hsin Hsu",
title = "Design and evaluation of a testbed for mobile {TV}
networks",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071399",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents the design of a complete,
open-source, testbed for broadcast networks that offer
mobile TV services. Although basic architectures and
protocols have been developed for such networks,
detailed performance tuning and analysis are still
needed, especially when these networks scale to serve
many diverse TV channels to numerous subscribers. The
detailed performance analysis could also motivate
designing new protocols and algorithms for enhancing
future mobile TV networks. Currently, many researchers
evaluate the performance of mobile TV networks using
simulation and/or theoretical modeling methods. These
methods, while useful for early assessment, typically
abstract away many necessary details of actual, fairly
complex, networks. Therefore, an open-source platform
for evaluating new ideas in a real mobile TV network is
needed. This platform is currently not possible with
commercial products, because they are sold as black
boxes without the source code. In this article, we
summarize our experiences in designing and implementing
a testbed for mobile TV networks. We integrate
off-the-shelf hardware components with carefully
designed software modules to realize a scalable testbed
that covers almost all aspects of real networks. We use
our testbed to empirically analyze various performance
aspects of mobile TV networks and validate/refute
several claims made in the literature as well as
discover/quantify multiple important performance
tradeoffs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2012:DMS,
author = "Yu-Ru Lin and Hari Sundaram and Munmun {De Choudhury}
and Aisling Kelliher",
title = "Discovering multirelational structure in social media
streams",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071400",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we present a novel algorithm to
discover multirelational structures from social media
streams. A media item such as a photograph exists as
part of a meaningful interrelationship among several
attributes, including time, visual content, users, and
actions. Discovery of such relational structures
enables us to understand the semantics of human
activity and has applications in content organization,
recommendation algorithms, and exploratory social
network analysis. We propose a novel nonnegative
matrix factorization framework to characterize
relational structures of group photo streams. The
factorization incorporates image content features and
contextual information. The idea is to consider a
cluster as having similar relational patterns; each
cluster consists of photos relating to similar content
or context. Relations represent different aspects of
the photo stream data, including visual content,
associated tags, photo owners, and post times. The
extracted structures minimize the mutual information of
the predicted joint distribution. We also introduce a
relational modularity function to determine the
structure cost penalty, and hence determine the number
of clusters. Extensive experiments on a large Flickr
dataset suggest that our approach is able to extract
meaningful relational patterns from group photo
streams. We evaluate the utility of the discovered
structures through a tag prediction task and through a
user study. Our results show that our method, based on
relational structures, outperforms baseline methods,
including feature and tag frequency based techniques,
by 35\%--420\%. We have conducted a qualitative user
study to evaluate the benefits of our framework in
exploring group photo streams. The study indicates that
users found the extracted clustering results clearly
represent major themes in a group; the clustering
results not only reflect how users describe the group
data but often lead the users to discover the evolution
of the group activity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheng:2012:EIC,
author = "Xu Cheng and Jiangchuan Liu",
title = "Exploring interest correlation for peer-to-peer
socialized video sharing",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071401",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The last five years have witnessed an explosion of
networked video sharing, represented by YouTube, as a
new killer Internet application. Their sustainable
development, however, is severely hindered by the
intrinsic limit of their client/server architecture. A
shift to the peer-to-peer paradigm has been widely
suggested with success already shown in live video
streaming and movie-on-demand. Unfortunately, our
latest measurement demonstrates that short video clips
exhibit drastically different statistics, which would
simply render these existing solutions suboptimal, if
not entirely inapplicable. Our long-term measurement
over five million YouTube videos, on the other hand,
reveals interesting social networks with strong
correlation among the videos, thus opening new
opportunities to explore. In this article, we present
NetTube, a novel peer-to-peer assisted delivering
framework that explores the user interest correlation
for short video sharing. We address a series of key
design issues to realize the system, including a
bi-layer overlay, an efficient indexing scheme, a
delay-aware scheduling mechanism, and a prefetching
strategy leveraging interest correlation. We evaluate
NetTube through both simulations and prototype
experiments, which show that it greatly reduces the
server workload, improves the playback quality and
scales well.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mei:2012:ITC,
author = "Tao Mei and Lusong Li and Xian-Sheng Hua and Shipeng
Li",
title = "{ImageSense}: Towards contextual image advertising",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071402",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The daunting volumes of community-contributed media
contents on the Internet have become one of the primary
sources for online advertising. However, conventional
advertising treats image and video advertising as
general text advertising by displaying relevant ads
based on the contents of the Web page, without
considering the inherent characteristics of visual
contents. This article presents a contextual
advertising system driven by images, which
automatically associates relevant ads with an image
rather than the entire text in a Web page and
seamlessly inserts the ads in the nonintrusive areas
within each individual image. The proposed system,
called ImageSense, supports scalable advertising of,
from root to node, Web sites, pages, and images. In
ImageSense, the ads are selected based on not only
textual relevance but also visual similarity, so that
the ads yield contextual relevance to both the text in
the Web page and the image content. The ad insertion
positions are detected based on image salience, as well
as face and text detection, to minimize intrusiveness
to the user. We evaluate ImageSense on large-scale
real-world images and Web pages, and demonstrate the
effectiveness of ImageSense for online image
advertising.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Snidaro:2012:FMV,
author = "Lauro Snidaro and Ingrid Visentini and Gian Luca
Foresti",
title = "Fusing multiple video sensors for surveillance",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071403",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Real-time detection, tracking, recognition, and
activity understanding of moving objects from multiple
sensors represent fundamental issues to be solved in
order to develop surveillance systems that are able to
autonomously monitor wide and complex environments. The
algorithms that are needed therefore span from image
processing to event detection and behaviour
understanding, and each of them requires dedicated
study and research. In this context, sensor fusion
plays a pivotal role in managing the information and
improving system performance. Here we present a novel
fusion framework for combining the data coming from
multiple and possibly heterogeneous sensors observing a
surveillance area.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2012:TAM,
author = "Jiun-Long Huang and Shih-Chuan Chiu and Man-Kwan
Shan",
title = "Towards an automatic music arrangement framework using
score reduction",
journal = j-TOMCCAP,
volume = "8",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2071396.2071404",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:02 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Score reduction is a process that arranges music for a
target instrument by reducing original music. In this
study we present a music arrangement framework that
uses score reduction to automatically arrange music for
a target instrument. The original music is first
analyzed to determine the type of arrangement element
of each section, then the phrases are identified and
each is assigned a utility according to its type of
arrangement element. For a set of utility-assigned
phrases, we transform the music arrangement into an
optimization problem and propose a phrase selection
algorithm. The music is arranged by selecting
appropriate phrases satisfying the playability
constraints of a target instrument. Using the proposed
framework, we implement a music arrangement system for
the piano. An approach similar to the Turing test is used
to evaluate the quality of the music arranged by our
system. The experimental results show that our system is
able to create viable music for the piano.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2012:EN,
author = "Ralf Steinmetz",
title = "Editorial note",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "9:1--9:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089086",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2012:BET,
author = "Dongyu Liu and Fei Li and Bo Shen and Songqing Chen",
title = "Building an efficient transcoding overlay for {P2P}
streaming to heterogeneous devices",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "10:1--10:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089087",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the increasing deployment of Internet P2P/overlay
streaming systems, more and more clients use mobile
devices, such as smart phones and PDAs, to access these
Internet streaming services. Compared to wired
desktops, mobile devices normally have a smaller screen
size, lower color depth, and lower bandwidth, and thus
cannot correctly and effectively render and display the
data streamed to desktops. To address this problem, in
this paper, we propose PAT (Peer-Assisted Transcoding)
to enable effective online transcoding in P2P/overlay
streaming. PAT has the following unique features.
First, it leverages active peer cooperation without
demanding infrastructure support such as transcoding
servers. Second, as online transcoding is
computationally intensive while the various devices
used by participating clients may have limited
computing power and related resources (e.g., battery,
bandwidth), an additional overlay, called metadata
overlay, is constructed to instantly share the
intermediate transcoding result of a transcoding
procedure with other transcoding nodes to minimize the
total computing overhead in the system. The
experimental results collected within a realistically
simulated testbed show that by consuming 6\% extra
bandwidth, PAT could save up to 58\% CPU cycles for
online transcoding.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2012:IFP,
author = "Zhijie Shen and Roger Zimmermann",
title = "{ISP}-friendly {P2P} live streaming: a roadmap to
realization",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "11:1--11:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089088",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Peer-to-Peer (P2P) applications generate large amounts
of Internet network traffic. The wide-reaching
connectivity of P2P systems is creating resource
inefficiencies for network providers. Recent studies
have demonstrated that localizing cross-ISP (Internet
service provider) traffic can mitigate this challenge.
However, bandwidth sensitivity and display quality
requirements complicate the ISP-friendly design for
live streaming systems. To date, although some
prior techniques focusing on live streaming systems
exist, the correlation between traffic localization and
streaming quality guarantee has not been well explored.
Additionally, the proposed solutions are often not easy
to apply in practice. In our presented work, we
demonstrate that the cross-ISP traffic of P2P live
streaming systems can be significantly reduced with
little impact on the streaming quality. First, we
analytically investigate and quantify the tradeoff
between traffic localization and streaming quality
guarantee, determining the lower bound of the inter-AS
(autonomous system) streaming rate below which
streaming quality cannot be preserved. Based on the
analysis, we further propose a practical ISP-friendly
solution, termed IFPS, which requires only minor
changes to the peer selection mechanism and can easily
be integrated into both new and existing systems.
Additionally, the significant opportunity for
localizing traffic is underscored by our collected
traces from PPLive, which also enabled us to derive
realistic parameters to guide our simulations. The
experimental results demonstrate that IFPS reduces
cross-ISP traffic from 81\% up to 98\% while keeping
streaming quality virtually unaffected.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lou:2012:QDD,
author = "Xiaosong Lou and Kai Hwang",
title = "Quality of data delivery in peer-to-peer video
streaming",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "12:1--12:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089089",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "QoS in a P2P video streaming system is evaluated in
three stages: content generation, data delivery and
video playback. We use jitter-free probability as the
main performance metric to study Quality of Data
delivery (QoD). A new model that incorporates both
bandwidth and data availability of a P2P network is
proposed. Our model relies on a sharing factor that
models data availability among all peers. We simulate
on a minimalistic network to demonstrate how to apply
the analytical model to design a P2P video streaming
system with a very low jitter rate. Our simulation
experimental results reveal that the lower bound on
jitter-free probability is indeed effective to reflect
the QoD of the entire system. Our model captures the
impact of many design choices, including upload
bandwidth limit, peer selection strategies, and video
stream chunking schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2012:DNW,
author = "Chuan Wu and Baochun Li and Shuqiao Zhao",
title = "Diagnosing network-wide {P2P} live streaming
inefficiencies",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "13:1--13:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089090",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Large-scale live peer-to-peer (P2P) streaming
applications have been successfully deployed in today's
Internet. While they can accommodate hundreds of
thousands of users simultaneously with hundreds of
channels of programming, there still commonly exist
channels and times where and when the streaming quality
is unsatisfactory. In this paper, based on more than
two terabytes and one year worth of live traces from
UUSee, a large-scale commercial P2P live streaming
system, we show an in-depth network-wide diagnosis of
streaming inefficiencies, commonly present in typical
mesh-based P2P live streaming systems. As the first
highlight of our work, we identify an evolutionary
pattern of low streaming quality in the system, and the
distribution of streaming inefficiencies across various
streaming channels and in different geographical
regions. We then carry out an extensive investigation
to explore the causes of such streaming inefficiencies
over different times and across different
channels/regions at specific times, by investigating
the impact of factors such as the number of peers, peer
upload bandwidth, inter-peer bandwidth availability,
server bandwidth consumption, and many more. The
original discoveries we have brought forward include
the two-sided effects of peer population on the
streaming quality in a streaming channel, the
significant impact of inter-peer bandwidth bottlenecks
at peak times, and the inefficient utilization of
server capacities across concurrent channels. Based on
these insights, we identify problems within the
existing P2P live streaming design and discuss a number
of suggestions to improve real-world streaming
protocols operating at a large scale.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2012:ABP,
author = "Chuan Wu and Zongpeng Li and Xuanjia Qiu and Francis
C. M. Lau",
title = "Auction-based {P2P VoD} streaming: Incentives and
optimal scheduling",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "14:1--14:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089091",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Real-world large-scale Peer-to-Peer (P2P)
Video-on-Demand (VoD) streaming applications face more
design challenges as compared to P2P live streaming,
due to higher peer dynamics and less buffer overlap.
The situation is further complicated when we consider
the selfish nature of peers, who in general wish to
download more and upload less, unless otherwise
motivated. Taking a new perspective of distributed
dynamic auctions, we design efficient P2P VoD streaming
algorithms with simultaneous consideration of peer
incentives and streaming optimality. In our solution,
media block exchanges among peers are carried out
through local auctions, in which budget-constrained
peers bid for desired blocks from their neighbors,
which in turn deliver blocks to the winning bidders and
collect revenue. With strategic design of a
discriminative second price auction with seller
reservation, a supplying peer has full incentive to
maximally contribute its bandwidth to increase its
budget; requesting peers are also motivated to bid in
such a way that optimal media block scheduling is
achieved effectively in a fully decentralized fashion.
Applying techniques from convex optimization and
mechanism design, we prove (a) the incentive
compatibility at the selling and buying peers, and (b)
the optimality of the induced media block scheduling in
terms of social welfare maximization. Large-scale
empirical studies are conducted to investigate the
behavior of the proposed auction mechanisms in dynamic
P2P VoD systems based on real-world settings.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2012:PHL,
author = "Tieying Zhang and Xueqi Cheng and Jianming Lv and
Zhenhua Li and Weisong Shi",
title = "Providing hierarchical lookup service for {P2P--VoD}
systems",
journal = j-TOMCCAP,
volume = "8s",
number = "1",
pages = "15:1--15:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2089085.2089092",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Mar 16 15:56:04 MDT 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Supporting random jump in P2P-VoD systems requires
efficient lookup for the `best' suppliers, where `best'
means the suppliers should meet two requirements:
content match and network quality match. Most studies
use a DHT-based method to provide content lookup;
however, these methods are neither able to meet the
network quality requirements nor suitable for VoD
streaming due to the large overhead. In this paper, we
propose Mediacoop, a novel hierarchical lookup scheme
combining both content and quality match to provide
random jumps for P2P-VoD systems. It exploits the play
position to efficiently locate the candidate suppliers
with required data (content match), and performs
refined lookup within the candidates to meet quality
match. Theoretical analysis and simulation results show
that Mediacoop is able to achieve lower jump latency
and control overhead than the typical DHT-based method.
Moreover, we implement Mediacoop in a BitTorrent-like
P2P-VoD system called CoolFish and make optimizations
for such `total cache' applications. The
implementation and evaluation in CoolFish show that
Mediacoop is able to improve user experiences,
especially the jump latency, which verifies the
practicability of our design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Anonymous:2012:TCO,
author = "Anonymous",
title = "Table of Contents: Online Supplement Volume {8S},
Number 1",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "16:1--16:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2169004",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Dornaika:2012:IRF,
author = "Fadi Dornaika and James H. Elder",
title = "Image registration for foveated panoramic sensing",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "17:1--17:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2168997",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article addresses the problem of registering
high-resolution, small field-of-view images with
low-resolution panoramic images provided by a panoramic
catadioptric video sensor. Such systems may find
application in surveillance and telepresence systems
that require a large field of view and high resolution
at selected locations. Although image registration has
been studied in more conventional applications, the
problem of registering panoramic and conventional video
has not previously been addressed, and this problem
presents unique challenges due to (i) the extreme
differences in resolution between the sensors (more
than a 16:1 linear resolution ratio in our
application), and (ii) the resolution inhomogeneity of
panoramic images. The main contributions of this
article are as follows. First, we introduce our
foveated panoramic sensor design. Second, we show how a
coarse registration can be computed from the raw images
using parametric template matching techniques. Third,
we propose two refinement methods allowing automatic
and near real-time registration between the two image
streams. The first registration method is based on
matching extracted interest points using a closed form
method. The second registration method is featureless
and based on minimizing the intensity discrepancy
allowing the direct recovery of both the geometric and
the photometric transforms. Fourth, a comparison
between the two registration methods is carried out,
which shows that the featureless method is superior in
accuracy. Registration examples using the developed
methods are presented.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2012:CPC,
author = "Xin Zhang and Tom{\'a}s Ward and S{\'e}amus Mcloone",
title = "Comparison of predictive contract mechanisms from an
information theory perspective",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "18:1--18:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2168998",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Inconsistency arises across a Distributed Virtual
Environment due to network latency induced by state
change communications. Predictive Contract Mechanisms
(PCMs) combat this problem through reducing the amount
of messages transmitted in return for perceptually
tolerable inconsistency. To date there are no methods
to quantify the efficiency of PCMs in communicating
this reduced state information. This article presents
an approach derived from concepts in information theory
for a deeper understanding of PCMs. Through a
comparison of representative PCMs, the worked analysis
illustrates interesting aspects of PCM operation and
demonstrates how they can be interpreted as a form of
lossy information compression.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Olsen:2012:ITN,
author = "Dan R. Olsen and Derek Bunn and Trent Boulter and
Robert Walz",
title = "Interactive television news",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "19:1--19:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2168999",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A new interactive television experience has been
created for watching television news. The goal is to
create a news experience that is similar to the way
people watch television in their living rooms while
giving viewers the power to make choices about what
they see. We partnered with existing news organizations
to create tools consistent with current news production
practices. The viewer experience allows selection of
the order of news content, skipping unwanted content
and exploring stories in more depth. These tools were
used to produce seven days of interactive commercial
news that were viewed in ten homes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Armitage:2012:ROF,
author = "Grenville Armitage and Amiel Heyde",
title = "{REED}: {Optimizing} first person shooter game server
discovery using network coordinates",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "20:1--20:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2169000",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online First Person Shooter (FPS) games typically use
a client-server communication model, with thousands of
enthusiast-hosted game servers active at any time.
Traditional FPS server discovery may take minutes, as
clients create thousands of short-lived packet flows
while probing all available servers to find a selection
of game servers with tolerable round trip time (RTT).
REED reduces a client's probing time and network
traffic to 1\% of traditional server discovery. REED
game servers participate in a centralized, incremental
calculation of their network coordinates, and clients
use these coordinates to expedite the discovery of
servers with low RTTs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2012:ILC,
author = "Xiaobai Liu and Shuicheng Yan and Tat-Seng Chua and
Hai Jin",
title = "Image label completion by pursuing contextual
decomposability",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "21:1--21:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2169001",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article investigates how to automatically
complete the missing labels for the partially annotated
images, without image segmentation. The label
completion procedure is formulated as a nonnegative
data factorization problem, to decompose the global
image representations that are used for describing the
entire images, for instance, various image feature
descriptors, into their corresponding label
representations, that are used for describing the local
semantic regions within images. The solution provided
in this work is motivated by the following observations.
First, label representations of the regions with the
same label often share certain commonness, yet may be
essentially different due to the large intraclass
variations. Thus, each label or concept should be
represented by using a subspace spanned by an ensemble
of bases, instead of a single one, to characterize the
intralabel diversities. Second, the subspaces for
different labels are different from each other. Third,
when two images are similar to each other, the
corresponding label representations should be similar.
We formulate this cross-image context as well as the
given partial label annotations in the framework of
nonnegative data factorization and then propose
efficient multiplicative nonnegative update rules to
alternately optimize the subspaces and the
reconstruction coefficients. We also provide the
theoretic proof of algorithmic convergence and
correctness. Extensive experiments over several
challenging image datasets clearly demonstrate the
effectiveness of our proposed solution in boosting the
quality of image label completion and image annotation
accuracy. Based on the same formulation, we further
develop a label ranking algorithm to refine the
noisy image labels without any manual supervision. We
compare the proposed label ranking algorithm with the
state of the art on popular evaluation databases
and achieve encouraging improvements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2012:SGU,
author = "Yi Chen and Abhidnya A. Deshpande and Ramazan S.
Ayg{\"u}un",
title = "Sprite generation using sprite fusion",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "22:1--22:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2169002",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "There has been related research for sprite or mosaic
generation for over 15 years. In this article, we try
to understand the methodologies for sprite generation
and identify what has not actually been covered for
sprite generation. We first identify issues and focus
on the domain of videos for sprite generation. We
introduce a novel sprite fusion method that blends two
sprites. The sprite fusion method produces good results for
tracking videos and does not require object
segmentation. We present sample results of our
experiments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Weng:2012:CVR,
author = "Ming-Fang Weng and Yung-Yu Chuang",
title = "Collaborative video reindexing via matrix
factorization",
journal = j-TOMCCAP,
volume = "8",
number = "2",
pages = "23:1--23:??",
month = may,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2168996.2169003",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:03 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Concept-based video indexing generates a matrix of
scores predicting the possibilities of concepts
occurring in video shots. Based on the idea of
collaborative filtering, this article presents
unsupervised methods to refine the initial scores
generated by concept classifiers by taking into account
the concept-to-concept correlation and shot-to-shot
similarity embedded within the score matrix. Given a
noisy matrix, we refine the inaccurate scores via
matrix factorization. This method is further improved
by learning multiple local models and incorporating
contextual-temporal structures. Experiments on the
TRECVID 2006--2008 datasets demonstrate relative
performance gains ranging from 13\% to 52\% without
using any user annotations or external knowledge
resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kankanhalli:2012:ISI,
author = "Mohan S. Kankanhalli",
title = "Introduction to special issue on multimedia security",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "31:1--31:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344437",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Weir:2012:IHV,
author = "Jonathan Weir and Weiqi Yan and Mohan S. Kankanhalli",
title = "Image hatching for visual cryptography",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "32:1--32:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344438",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Image hatching (or nonphotorealistic line-art) is a
technique widely used in the printing or engraving of
currency. Diverse styles of brush strokes have
previously been adopted for different areas of an image
to create aesthetically pleasing textures and shading.
Because there is no continuous tone within these types
of images, a multilevel scheme is proposed, which uses
different textures based on a threshold level. These
textures are then applied to the different levels and
are then combined to build up the final hatched image.
The proposed technique allows a secret to be hidden
using Visual Cryptography (VC) within the hatched
images. Visual cryptography provides a very powerful
means by which one secret can be distributed into two
or more pieces known as shares. When the shares are
superimposed exactly together, the original secret can
be recovered without computation. Also provided is a
comparison between the original grayscale images and
the resulting hatched images that are generated by the
proposed algorithm. This reinforces that the overall
quality of the hatched scheme is sufficient. The
Structural SIMilarity index (SSIM) is used to perform
this comparison.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2012:RIB,
author = "Jian Li and Hongmei Liu and Jiwu Huang and Yun Q.
Shi",
title = "Reference index-based {H.264} video watermarking
scheme",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "33:1--33:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344439",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video watermarking has received much attention over
the past years as a promising solution to copy
protection. Watermark robustness is still a key issue
of research, especially when a watermark is embedded in
the compressed video domain. In this article, a robust
watermarking scheme for H.264 video is proposed. During
video encoding, the watermark is embedded in the index
of the reference frame, referred to as reference index,
a bitstream syntax element newly proposed in the H.264
standard. Furthermore, the video content (current coded
blocks) is modified based on an optimization model,
aiming at improving watermark robustness without
unacceptably degrading the video's visual quality or
increasing the video's bit rate. Compared with the
existing schemes, our method has the following three
advantages: (1) The bit rate of the watermarked video
is adjustable; (2) the robustness against common video
operations can be achieved; (3) the watermark embedding
and extraction are simple. Extensive experiments have
verified the good performance of the proposed
watermarking scheme.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gao:2012:RHC,
author = "Xifeng Gao and Caiming Zhang and Yan Huang and Zhigang
Deng",
title = "A robust high-capacity affine-transformation-invariant
scheme for watermarking {$3$D} geometric models",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "34:1--34:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344440",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article we propose a novel, robust, and
high-capacity watermarking method for 3D meshes with
arbitrary connectivities in the spatial domain based on
affine invariants. Given a 3D mesh model, a watermark
is embedded as affine-invariant length ratios of one
diagonal segment to the residing diagonal intersected
by the other one in a coplanar convex quadrilateral. In
the extraction process, a watermark is recovered by
combining all the watermark pieces embedded in length
ratios through majority voting. Extensive experimental
results demonstrate the robustness, high computational
efficiency, high capacity, and
affine-transformation-invariant characteristics of the
proposed approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2012:EMA,
author = "Rui Yang and Zhenhua Qu and Jiwu Huang",
title = "Exposing {MP3} audio forgeries using frame offsets",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "35:1--35:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344441",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Audio recordings should be authenticated before they
are used as evidence. Although audio watermarking and
signatures are widely applied for authentication, these
two techniques require accessing the original audio
before it is published. Passive authentication is
necessary for digital audio, especially for the most
popular audio format: MP3. In this article, we propose
a passive approach to detect forgeries of MP3 audio.
During the process of MP3 encoding, the audio samples
are divided into frames, and thus each frame has its
own frame offset after encoding. Forgeries lead to the
breaking of framing grids. So the frame offset is a
good indication for locating forgeries, and it can be
retrieved by the identification of the quantization
characteristic. In this way, the doctored positions can
be automatically located. Experimental results
demonstrate that the proposed approach is effective in
detecting some common forgeries, such as deletion,
insertion, substitution, and splicing. Even when the
bit rate is as low as 32 kbps, the detection rate is
above 99\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Feng:2012:CAO,
author = "Hui Feng and Hefei Ling and Fuhao Zou and Weiqi Yan
and Zhengding Lu",
title = "A collusion attack optimization strategy for digital
fingerprinting",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "36:1--36:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344442",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Collusion attack is a cost-efficient attack for
digital fingerprinting. In this article, we propose a
novel collusion attack strategy, Iterative Optimization
Collusion Attack (IOCA), which is based upon the
gradient attack and the principle of informed watermark
embedding. We evaluate the performance of the proposed
collusion attack strategy in defeating four typical
fingerprinting schemes under a well-constructed
evaluation framework. The simulation results show that
the proposed strategy performs more effectively than
the gradient attack, and adopting no more than three
fingerprinted copies is sufficient to collapse the examined
fingerprinting schemes. Meanwhile, the content resulting
from the proposed attack still preserves high
perceptual quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sachan:2012:ALV,
author = "Amit Sachan and Sabu Emmanuel and Mohan S.
Kankanhalli",
title = "Aggregate licenses validation for digital rights
violation detection",
journal = j-TOMCCAP,
volume = "8",
number = "2S",
pages = "37:1--37:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2344436.2344443",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:05 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Digital Rights Management (DRM) is the term associated
with the set of technologies to prevent illegal
multimedia content distribution and consumption. DRM
systems generally involve multiple parties such as
owner, distributors, and consumers. The owner issues
redistribution licenses to its distributors. The
distributors in turn using their received
redistribution licenses can generate and issue new
redistribution licenses to other distributors and new
usage licenses to consumers. As a part of rights
violation detection, these newly generated licenses
must be validated by a validation authority against the
redistribution license used to generate them. The
validation of these newly generated licenses becomes
quite complex when there exist multiple redistribution
licenses for a media with the distributors. In such
cases, the validation process requires validation using
an exponential number (in the number of redistribution
licenses) of validation inequalities and each
validation inequality may contain up to an exponential
number of summation terms. This makes the validation
process computationally intensive and necessitates
doing the validation efficiently. To overcome this, we
propose validation tree, a prefix-tree-based validation
method to do the validation efficiently. Theoretical
analysis and experimental results show that our
proposed technique reduces the validation time
significantly.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Riiser:2012:VSU,
author = "Haakon Riiser and Tore Endestad and Paul Vigmostad and
Carsten Griwodz and P{\aa}l Halvorsen",
title = "Video streaming using a location-based
bandwidth-lookup service for bitrate planning",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "24:1--24:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240137",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A lot of people around the world commute using public
transportation and would like to spend this time
viewing streamed video content such as news or sports
updates. However, mobile wireless networks typically
suffer from severe bandwidth fluctuations, and the
networks are often completely unresponsive for several
seconds, sometimes minutes. Today, there are several
ways of adapting the video bitrate and thus the video
quality to such fluctuations, for example, using
scalable video codecs or segmented adaptive HTTP
streaming that switches between nonscalable video
streams encoded in different bitrates. Still, for a
better long-term video playout experience that avoids
disruptions and frequent quality changes while using
existing video adaptation technology, it is desirable
to perform bandwidth prediction and planned quality
adaptation. This article describes a video streaming
system for receivers equipped with a GPS. A receiver's
download rate is constantly monitored, and periodically
reported back to a central database along with
associated GPS positional data. Thus, based on the
current location, a streaming device can use a
GPS-based bandwidth-lookup service in order to better
predict the near-future bandwidth availability and
create a schedule for the video playout that takes
likely future availability into account. To create a
prototype and perform initial tests, we conducted
several field trials while commuting using public
transportation. We show how our database has been used
to predict bandwidth fluctuations and network outages,
and how this information helps maintain uninterrupted
playback with less compromise on video quality than
possible without prediction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Valdes:2012:AEV,
author = "Victor Valdes and Jose M. Martinez",
title = "Automatic evaluation of video summaries",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "25:1--25:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240138",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article describes a method for the automatic
evaluation of video summaries based on the training of
individual predictors for different quality measures
from the TRECVid 2008 BBC Rushes Summarization Task.
The obtained results demonstrate that, with a large set
of evaluation data, it is possible to train fully
automatic evaluation systems based on visual features
automatically extracted from the summaries. The
proposed approach will enable faster and easier
estimation of the results of newly developed
abstraction algorithms and the study of which summary
characteristics influence their perceived quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tian:2012:STL,
author = "Xinmei Tian and Dacheng Tao and Yong Rui",
title = "Sparse transfer learning for interactive video search
reranking",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "26:1--26:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240139",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Visual reranking is effective to improve the
performance of the text-based video search. However,
existing reranking algorithms can only achieve limited
improvement because of the well-known semantic gap
between low-level visual features and high-level
semantic concepts. In this article, we adopt
interactive video search reranking to bridge the
semantic gap by introducing user's labeling effort. We
propose a novel dimension reduction tool, termed sparse
transfer learning (STL), to effectively and efficiently
encode user's labeling information. STL is particularly
designed for interactive video search reranking.
Technically, it (a) considers the pair-wise
discriminative information to maximally separate
labeled query relevant samples from labeled query
irrelevant ones, (b) achieves a sparse representation
for the subspace to encode the user's intention by
applying the elastic net penalty, and (c) propagates
user's labeling information from labeled samples to
unlabeled samples by using the data distribution
knowledge. We conducted extensive experiments on the
TRECVID 2005, 2006 and 2007 benchmark datasets and
compared STL with popular dimension reduction
algorithms. We report superior performance by using the
proposed STL-based interactive video search
reranking.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2012:IBD,
author = "Xin Zhang and Tom{\'a}s E. Ward and S{\'e}amus
Mcloone",
title = "An information-based dynamic extrapolation model for
networked virtual environments",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "27:1--27:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240140",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Various Information Management techniques have been
developed to help maintain a consistent shared virtual
world in a Networked Virtual Environment. However, such
techniques have to be carefully adapted to the
application state dynamics and the underlying network.
This work presents a novel framework that minimizes
inconsistency by optimizing bandwidth usage to deliver
useful information. This framework measures the state
evolution using an information model and dynamically
switches extrapolation models and the packet rate to
make the most information-efficient usage of the
available bandwidth. The results shown demonstrate that
this approach can help optimize consistency under
constrained and time-varying network conditions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2012:UCM,
author = "Linjun Yang and Bo Geng and Alan Hanjalic and
Xian-Sheng Hua",
title = "A unified context model for web image retrieval",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "28:1--28:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240141",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Content-based web image retrieval based on the
query-by-example (QBE) principle remains a challenging
problem due to the semantic gap as well as the gap
between a user's intent and the representativeness of a
typical image query. In this article, we propose to
address this problem by integrating query-related
contextual information into an advanced query model to
improve the performance of QBE-based web image
retrieval. We consider both the local and global
context of the query image. The local context can be
inferred from the web pages and the click-through log
associated with the query image, while the global
context is derived from the entire corpus comprising
all web images and the associated web pages. To
effectively incorporate the local query context we
propose a language modeling based approach to deal with
the combined structured query representation from the
contextual and visual information. The global query
context is integrated by the multi-modal relevance
model to ``reconstruct'' the query from the document
models indexed in the corpus. In this way, the global
query context is employed to address the noise or
missing information in the query and its local context,
so that a comprehensive and robust query model can be
obtained. We evaluated the proposed approach on a
representative product image dataset collected from the
web and demonstrated that the inclusion of the local
and global query contexts significantly improves the
performance of QBE-based web image retrieval.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Patras:2012:CTS,
author = "Paul Patras and Albert Banchs and Pablo Serrano",
title = "A control theoretic scheme for efficient video
transmission over {IEEE 802.11e EDCA WLANs}",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "29:1--29:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240142",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The EDCA mechanism of the IEEE 802.11 standard has
been designed to support, among others, video traffic.
This mechanism relies on a number of parameters whose
configuration is left open by the standard. Although
there are some recommended values for these parameters,
they are fixed independent of the WLAN conditions,
which results in suboptimal performance. Following this
observation, a number of approaches in the literature
have been devised to set the EDCA parameters based on
an estimation of the WLAN conditions. However, these
previous approaches are based on heuristics and hence
do not guarantee optimized performance. In this article
we propose a novel algorithm to adjust the EDCA
parameters to carry video traffic which, in contrast to
previous approaches, is sustained on mathematical
foundations that guarantee optimal performance. In
particular, our approach builds upon (i) an analytical
model of the WLAN performance under video traffic, used
to derive the optimal point of operation of EDCA, and
(ii) a control theoretic designed mechanism which
drives the WLAN to this point of operation. Via
extensive simulations, we show that the proposed
approach performs optimally and substantially
outperforms the standard recommended configuration as
well as previous adaptive proposals.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhu:2012:JLS,
author = "Xinglei Zhu and Chang W. Chen",
title = "A joint layered scheme for reliable and secure mobile
{JPEG-2000} streaming",
journal = j-TOMCCAP,
volume = "8",
number = "3",
pages = "30:1--30:??",
month = jul,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2240136.2240143",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:06 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel joint layered approach
to simultaneously achieve both reliable and secure
mobile JPEG-2000 image streaming. With a priori
knowledge of JPEG-2000 source coding and channel
coding, the proposed joint system integrates
authentication into the media error protection
components to ensure that every source-decodable media
unit is authenticated. By such a dedicated design, the
proposed scheme protects both compressed JPEG-2000
codestream and the authentication data from wireless
channel impairments. It is fundamentally different from
many existing systems that consider the problem of
media authentication separately from the other
operations in the media transmission system. By
utilizing the contextual relationship, such as coding
dependency and content importance between media slices
for authentication hash appending, the proposed scheme
generates an extremely low authentication overhead.
Under this joint layered coding framework, an optimal
rate allocation algorithm for source coding, channel
coding, and media authentication is developed to
guarantee end-to-end media quality. Experiment results
on JPEG-2000 images validate the proposed scheme and
demonstrate that the performance of the proposed scheme
is approaching its upper bound, in which case no
authentication is applied to the media stream.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gatica-Perez:2012:ISS,
author = "Daniel Gatica-Perez and Gang Hua and Wei Tsang Ooi and
P{\aa}l Halvorsen",
title = "Introduction to the special section of best papers of
{ACM Multimedia 2011}",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "38:1--38:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348817",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2012:CPA,
author = "Wanmin Wu and Ahsan Arefin and Gregorij Kurillo and
Pooja Agarwal and Klara Nahrstedt and Ruzena Bajcsy",
title = "{CZLoD}: a psychophysical approach for {$3$D}
tele-immersive video",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "39:1--39:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348818",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a psychophysical study that
measures the perceptual thresholds of a new factor
called Color-plus-Depth Level-of-Details (CZLoD)
peculiar to polygon-based 3D tele-immersive video. The
results demonstrate the existence of Just Noticeable
Degradation and Just Unacceptable Degradation
thresholds on the factor. In light of the results, we
design and implement a real-time perception-based
quality adaptor for 3D tele-immersive video. Our
experimental results show that the adaptation scheme
can reduce resource usage (e.g., CPU cycles) while
considerably enhancing the overall perceived visual
quality. Our analysis confirms the potential temporal
and spatial performance benefits achievable with CZLoD
adaptation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ji:2012:AQS,
author = "Rongrong Ji and Felix X. Yu and Tongtao Zhang and
Shih-Fu Chang",
title = "Active query sensing: {Suggesting} the best query view
for mobile visual search",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "40:1--40:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348819",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "While much exciting progress is being made in mobile
visual search, one important question has been left
unexplored in all current systems. When searching
objects or scenes in the 3D world, which viewing angle
is more likely to be successful? More particularly, if
the first query fails to find the right target, how
should the user control the mobile camera to form the
second query? In this article, we propose a novel
Active Query Sensing system for mobile location search,
which actively suggests the best subsequent query view
to recognize the physical location in the mobile
environment. The proposed system includes two unique
components: (1) an offline process for analyzing the
saliencies of different views associated with each
geographical location, which predicts the location
search precisions of individual views by modeling their
self-retrieval score distributions; and (2) an online
process for estimating the view of an unseen query and
suggesting the best subsequent view change.
Specifically, the optimal viewing angle change for the
next query can be formulated as an online
information-theoretic problem. Using a scalable visual search
system implemented over a NYC street view dataset (0.3
million images), we show a performance gain by reducing
the failure rate of mobile location search to only 12\%
after the second query. We have also implemented an
end-to-end functional system, including user interfaces
on iPhones, client-server communication, and a remote
search server. This work may open up an exciting new
direction for developing interactive mobile media
applications through the innovative exploitation of
active sensing and query formulation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shirmohammadi:2012:ISS,
author = "Shervin Shirmohammadi and Mohamed Hefeeda and Wei
Tsang Ooi and Romulus Grigoras",
title = "Introduction to special section on {$3$D} mobile
multimedia",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "41:1--41:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348820",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2012:QOV,
author = "Yanwei Liu and Song Ci and Hui Tang and Yun Ye and
Jinxia Liu",
title = "{QoE}-oriented {$3$D} video transcoding for mobile
streaming",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "42:1--42:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348821",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With advance in mobile 3D display, mobile 3D video is
already enabled by the wireless multimedia networking,
and it will be gradually popular since it can make
people enjoy the natural 3D experience anywhere and
anytime. In current stage, mobile 3D video is generally
delivered over the heterogeneous network combined by
wired and wireless channels. How to guarantee the
optimal 3D visual quality of experience (QoE) for the
mobile 3D video streaming is one of the important
topics concerned by the service provider. In this
article, we propose a QoE-oriented transcoding approach
to enhance the quality of mobile 3D video service. By
learning the pre-controlled QoE patterns of 3D
contents, the proposed 3D visual QoE inference model
can be utilized to regulate the transcoding
configurations in real time according to feedback on
network conditions and user-end device information. In the
learning stage, we propose a piecewise linear mean
opinion score (MOS) interpolation method to further
reduce the cumbersome manual work of preparing QoE
patterns. Experimental results show that the proposed
transcoding approach can provide an adapted 3D stream
to the heterogeneous network, and offers superior QoE
performance compared to fixed quantization parameter
(QP) transcoding and mean squared error (MSE) optimized
transcoding for mobile 3D video streaming.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2012:NVT,
author = "Shujie Liu and Chang Wen Chen",
title = "A novel {$3$D} video transcoding scheme for adaptive
{$3$D} video transmission to heterogeneous terminals",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "43:1--43:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348822",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Three-dimensional video (3DV) is attracting many
interests with its enhanced viewing experience and more
user driven features. 3DV has several unique
characteristics different from 2D video: (1) It has a
much larger amount of data captured and compressed, and
corresponding video compression techniques can be much
more complicated in order to explore data redundancy.
This will lead to more constraints on users' network
access and computational capability, (2) Most users
only need part of the 3DV data at any given time, while
the users' requirements exhibit large diversity, (3)
Only a limited number of views are captured and
transmitted for 3DV. View rendering is thus necessary
to generate virtual views based on the received 3DV
data. However, many terminal devices do not have the
functionality to generate virtual views. To enable 3DV
experience for the majority of users with limited
capabilities, adaptive 3DV transmission is necessary to
extract/generate the required data content and
represent it with supported formats and bitrates for
heterogeneous terminal devices. 3DV transcoding is an
emerging and effective technique to achieve desired
adaptive 3DV transmission. In this article, we propose
the first efficient 3DV transcoding scheme that can
obtain any desired view, either an encoded one or a
virtual one, and compress it with the more universal
H.264/AVC. The key idea of the proposed scheme is to
appropriately utilize motion information contained in
the bitstream to generate candidate motion information.
Original information from both the desired view and the
reference views is used to obtain this candidate
information and a proper motion refinement process is
carried out for certain blocks. Simulation results show
that, compared to the straightforward cascade
algorithm, the proposed scheme is able to output
compressed bitstream of the required view with
significantly reduced complexity while incurring
negligible performance loss. Such a 3DV transcoding can
be applied to most gateways that usually have
constraints on computational complexity and time
delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Roodaki:2012:NMD,
author = "Hoda Roodaki and Mahmoud Reza Hashemi and Shervin
Shirmohammadi",
title = "A new methodology to derive objective quality
assessment metrics for scalable multiview {$3$D} video
coding",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "44:1--44:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348823",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the growing demand for 3D video, efforts are
underway to incorporate it in the next generation of
broadcast and streaming applications and standards. 3D
video is currently available in games, entertainment,
education, security, and surveillance applications. A
typical scenario for multiview 3D consists of several
3D video sequences captured simultaneously from the
same scene with the help of multiple cameras from
different positions and through different angles.
Multiview video coding provides a compact
representation of these multiple views by exploiting
the large amount of inter-view statistical
dependencies. One of the major challenges in this field
is how to transmit the large amount of data of a
multiview sequence over error prone channels to
heterogeneous mobile devices with different bandwidth,
resolution, and processing/battery power, while
maintaining a high visual quality. Scalable Multiview
3D Video Coding (SMVC) is one of the methods to address
this challenge; however, the evaluation of the overall
visual quality of the resulting scaled-down video
requires a new objective perceptual quality measure
specifically designed for scalable multiview 3D video.
Although several subjective and objective quality
assessment methods have been proposed for multiview 3D
sequences, no comparable attempt has been made for
quality assessment of scalable multiview 3D video. In
this article, we propose a new methodology to build
suitable objective quality assessment metrics for
different scalable modalities in multiview 3D video.
Our proposed methodology considers the importance of
each layer and its content as a quality of experience
factor in the overall quality. Furthermore, in addition
to the quality of each layer, the concept of disparity
between layers (inter-layer disparity) and disparity
between the units of each layer (intra-layer disparity)
is considered as an effective feature to evaluate
overall perceived quality more accurately. Simulation
results indicate that, by using this methodology, more
efficient objective quality assessment metrics can be
introduced for each multiview 3D video scalable
modality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hamza:2012:EEM,
author = "Ahmed Hamza and Mohamed Hefeeda",
title = "Energy-efficient multicasting of multiview {$3$D}
videos to mobile devices",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "45:1--45:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348824",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Multicasting multiple video streams over wireless
broadband access networks enables the delivery of
multimedia content to large-scale user communities in a
cost-efficient manner. Three-dimensional (3D) videos
are the next natural step in the evolution of digital
media technologies. In order to provide 3D perception,
3D video streams contain one or more views that greatly
increase their bandwidth requirements. Due to the
limited channel capacity and variable bit rate of the
videos, multicasting multiple 3D videos over wireless
broadband networks is a challenging problem. In this
article, we consider a 4G wireless access network in
which a number of 3D videos represented in two-view
plus depth format and encoded using scalable video
coders are multicast. We formulate the optimal 3D video
multicasting problem to maximize the quality of
rendered virtual views on the receivers' displays. We
show that this problem is NP-complete and present a
polynomial time approximation algorithm to solve it. We
then extend the proposed algorithm to efficiently
schedule the transmission of the chosen substreams from
each video in order to maximize the power saving on the
mobile receivers. Our simulation-based experimental
results show that our algorithm provides solutions that
are within 0.3 dB of the optimal solutions while
satisfying real-time requirements of multicast systems.
In addition, our algorithm results in an average power
consumption reduction of 86\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shi:2012:RTR,
author = "Shu Shi and Klara Nahrstedt and Roy Campbell",
title = "A real-time remote rendering system for interactive
mobile graphics",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "46:1--46:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mobile devices are gradually changing people's
computing behaviors. However, due to the limitations of
physical size and power consumption, they are not
capable of delivering a 3D graphics rendering
experience comparable to desktops. Many applications
with intensive graphics rendering workloads are unable
to run on mobile platforms directly. This issue can be
addressed with the idea of remote rendering: the heavy
3D graphics rendering computation runs on a powerful
server and the rendering results are transmitted to the
mobile client for display. However, the simple remote
rendering solution inevitably suffers from the large
interaction latency caused by wireless networks, and is
not acceptable for many applications that have very
strict latency requirements. In this article, we
present an advanced low-latency remote rendering system
that assists mobile devices to render interactive 3D
graphics in real time. Our design takes advantage of an
image-based rendering technique, 3D image warping, to
synthesize the mobile display from the depth images
generated on the server. The research indicates that
the system can successfully reduce the interaction
latency while maintaining high rendering quality by
generating multiple depth images at carefully
selected viewpoints. We study the problem of viewpoint
selection, propose a real-time reference viewpoint
prediction algorithm, and evaluate the algorithm
performance with real-device experiments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Guan:2012:EMM,
author = "Wei Guan and Suya You and Ulrich Neumann",
title = "Efficient matchings and mobile augmented reality",
journal = j-TOMCCAP,
volume = "8",
number = "3s",
pages = "47:1--47:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2348816.2348826",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Nov 6 18:13:07 MST 2012",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the fast-growing popularity of smart phones in
recent years, augmented reality (AR) on mobile devices
is gaining more attention and becomes more demanding
than ever before. However, the limited processors in
mobile devices are not quite promising for AR
applications that require real-time processing speed.
The challenge exists due to the fact that, while fast
features are usually not robust enough in matchings,
robust features like SIFT or SURF are not
computationally efficient. There is always a tradeoff
between robustness and efficiency and it seems that we
have to sacrifice one for the other. While this is true
for most existing features, researchers have been
working on designing new features with both robustness
and efficiency. In this article, we are not trying to
present a completely new feature. Instead, we propose
an efficient matching method for robust features. An
adaptive scoring scheme and a more distinctive
descriptor are also proposed for performance
improvements. In addition, we have developed an outdoor
augmented reality system based on our proposed
methods. The system demonstrates that not only can it
achieve robust matchings efficiently, it is also
capable of handling large occlusions such as passengers
and moving vehicles, which is another challenge for
many AR applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{TOMCCAP-STAFF:2012:TCO,
author = "{TOMCCAP-STAFF}",
title = "Table of contents: Online supplement volume 8, number
2s, online supplement volume 8, number 3s",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "48:1--48:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2382432",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2012:E,
author = "Ralf Steinmetz",
title = "Editorial",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "49:1--49:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379791",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2012:LRC,
author = "Xiaobai Liu and Shuicheng Yan and Bin Cheng and Jinhui
Tang and Tat-Seng Chua and Hai Jin",
title = "Label-to-region with continuity-biased bi-layer
sparsity priors",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "50:1--50:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379792",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this work, we investigate how to reassign the fully
annotated labels at image level to those contextually
derived semantic regions, namely Label-to-Region (L2R),
in a collective manner. Given a set of input images
with label annotations, the basic idea of our approach
to L2R is to first discover the patch correspondence
across images, and then propagate the common labels
shared in image pairs to these correlated patches.
Specifically, our approach consists of the following
aspects. First, each of the input images is encoded as a
Bag-of-Hierarchical-Patch (BOP) for capturing the rich
cues at varying scales, and the individual patches are
expressed by patch-level feature descriptors. Second,
we present a sparse representation formulation for
discovering how well an image or a semantic region can
be robustly reconstructed by all the other image
patches from the input image set. The underlying
philosophy of our formulation is that an image region
can be sparsely reconstructed with the image patches
belonging to the other images with common labels, while
the robustness in label propagation across images
requires that these selected patches come from very few
images. This preference of being sparse at both the
patch and image levels is named the bi-layer sparsity
prior. Meanwhile, we enforce a preference for choosing
larger-size patches in reconstruction, referred to as
the continuity-biased prior in this work, which may further
enhance the reliability of L2R assignment. Finally, we
harness the reconstruction coefficients to propagate
the image labels to the matched patches, and fuse the
propagation results over all patches to finalize the
L2R task. As a by-product, the proposed
continuity-biased bi-layer sparse representation
formulation can be naturally applied to perform image
annotation on new testing images. Extensive experiments
on three public image datasets clearly demonstrate the
effectiveness of our proposed framework in both L2R
assignment and image annotation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rooij:2012:ETS,
author = "Ork De Rooij and Marcel Worring",
title = "Efficient targeted search using a focus and context
video browser",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "51:1--51:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379793",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Currently there are several interactive content-based
video retrieval techniques and systems available.
However, retrieval performance depends heavily on the
means of interaction. We argue that effective CBVR
requires efficient, specialized user interfaces. In
this article we propose guidelines for such an
interface, and we propose an effective CBVR engine: the
ForkBrowser, which builds upon the principle of focus
and context. This browser is evaluated using a
combination of user simulation and real user
evaluation. Results indicate that the ideas have merit,
and that the browser performs very well when compared
to the state-of-the-art in video retrieval.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghinea:2012:UPM,
author = "Gheorghita Ghinea and Oluwakemi Ademoye",
title = "User perception of media content association in
olfaction-enhanced multimedia",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "52:1--52:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379794",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Olfaction is an exciting challenge facing multimedia
applications. In this article we have investigated user
perception of the association between olfactory media
content and video media content in olfactory-enhanced
multimedia. Results show that the association between
scent and content has a significant impact on the
user-perceived experience of olfactory-enhanced
multimedia.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Spicer:2012:NAD,
author = "Ryan Spicer and Yu-Ru Lin and Aisling Kelliher and
Hari Sundaram",
title = "{NextSlidePlease}: Authoring and delivering agile
multimedia presentations",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "53:1--53:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379795",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Presentation support tools, such as Microsoft
PowerPoint, pose challenges both in terms of creating
linear presentations from complex data and fluidly
navigating such linear structures when presenting to
diverse audiences. NextSlidePlease is a slideware
application that addresses these challenges using a
directed graph structure approach for authoring and
delivering multimedia presentations. The application
combines novel approaches for searching and analyzing
presentation datasets, composing meaningfully
structured presentations, and efficiently delivering
material under a variety of time constraints. We
introduce and evaluate a presentation analysis
algorithm intended to simplify the process of authoring
dynamic presentations, and a time management and path
selection algorithm that assists users in prioritizing
content during the presentation process. Results from
two comparative user studies indicate that the directed
graph approach promotes the creation of hyperlinks, the
consideration of connections between content items, and
a richer understanding of the time management
consequences of including and selecting presentation
material.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Qi:2012:OBI,
author = "Heng Qi and Keqiu Li and Yanming Shen and Wenyu Qu",
title = "Object-based image retrieval with kernel on adjacency
matrix and local combined features",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "54:1--54:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379796",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In object-based image retrieval, there are two
important issues: an effective image representation
method for representing image content and an effective
image classification method for processing user
feedback to find more images containing the
user-desired object categories. In the image
representation method, the local-based representation
is the best selection for object-based image retrieval.
As a kernel-based classification method, Support Vector
Machine (SVM) has shown impressive performance on image
classification. But SVM cannot work on the local-based
representation unless there is an appropriate kernel.
To address this problem, several representative kernels
have been proposed in the literature. However, these
kernels cannot work effectively in object-based image
retrieval because they ignore the spatial context and
the combination of local features. In this article, we
present the Adjacency Matrix (AM) and the Local
Combined Features (LCF) to
incorporate the spatial context and the combination of
local features into the kernel. We propose the AM-LCF
feature vector to represent image content and the
AM-LCF kernel to measure the similarities between
AM-LCF feature vectors. According to the detailed
analysis, we show that the proposed kernel can overcome
the deficiencies of existing kernels. Moreover, we
evaluate the proposed kernel through experiments of
object-based image retrieval on two public image sets.
The experimental results show that the performance of
object-based image retrieval can be improved by the
proposed kernel.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2012:VPA,
author = "Guangda Li and Meng Wang and Zheng Lu and Richang Hong
and Tat-Seng Chua",
title = "In-video product annotation with {Web} information
mining",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "55:1--55:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379797",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Product annotation in videos is of great importance
for video browsing, search, and advertisement. However,
most of the existing automatic video annotation
research focuses on the annotation of high-level
concepts, such as events, scenes, and object
categories. This article presents a novel solution to
the annotation of specific products in videos by mining
information from the Web. It collects a set of
high-quality training data for each product by
simultaneously leveraging Amazon and the Google image
search engine. A visual signature for each product is
then built based on the bag-of-visual-words
representation of the training images. A correlative
sparsification approach is employed to remove noisy
bins in the visual signatures. These signatures are
used to annotate video frames. We conduct experiments
on more than 1,000 videos and the results demonstrate
the feasibility and effectiveness of our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gopinathan:2012:ASO,
author = "Ajay Gopinathan and Zongpeng Li",
title = "Algorithms for stochastic optimization of multicast
content delivery with network coding",
journal = j-TOMCCAP,
volume = "8",
number = "4",
pages = "56:1--56:??",
month = nov,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2379790.2379798",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:21 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The usage of network resources by content providers is
commonly governed by Service-Level Agreements (SLA)
between the content provider and the network service
provider. Resource usage exceeding the limits specified
in the SLA incurs the content provider additional
charges, usually at a higher cost. Hence, the content
provider's goal is to provision adequate resources in
the SLA based on forecasts of future demand. We study
capacity purchasing strategies when the content
provider employs network coded multicast as the media
delivery mechanism, with uncertainty in its future
customer set explicitly taken into consideration. The
latter requires the content provider to make capacity
provisioning decisions based on market predictions and
historical customer usage patterns. The probabilistic
element suggests a stochastic optimization approach. We
model this problem as a two-stage stochastic
optimization problem with recourse. Such optimizations
are \#P-hard to solve directly, and we design two
approximation algorithms for them. The first is a
heuristic algorithm that exploits properties unique to
network coding, so that only polynomial-time operations
are needed. It performs well in general scenarios, but
the gap from the optimal solution is not bounded by any
constant in the worst case. This motivates our second
approach, a sampling algorithm partly inspired from the
work of Gupta et al. [2004a]. We employ techniques from
duality theory in linear optimization to prove that the
sampling algorithm provides a 3-approximation to the
stochastic multicast problem. We conduct extensive
simulations to illustrate the efficacy of both
algorithms, and show that the performance of both is
usually within 10\% of the optimal solution in
practice.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hendrikx:2013:PCG,
author = "Mark Hendrikx and Sebastiaan Meijer and Joeri {Van Der
Velden} and Alexandru Iosup",
title = "Procedural content generation for games: a survey",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422957",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Hundreds of millions of people play computer games
every day. For them, game content --- from 3D objects to
abstract puzzles --- plays a major entertainment role.
Manual labor has so far ensured that the quality and
quantity of game content matched the demands of the
playing community, but is facing new scalability
challenges due to the exponential growth over the last
decade of both the gamer population and the production
costs. Procedural Content Generation for Games (PCG-G)
may address these challenges by automating, or aiding
in, game content generation. PCG-G is difficult, since
the generator has to create the content, satisfy
constraints imposed by the artist, and return
interesting instances for gamers. Despite a large body
of research focusing on PCG-G, particularly over the
past decade, ours is the first comprehensive survey of
the field of PCG-G. We first introduce a comprehensive,
six-layered taxonomy of game content: bits, space,
systems, scenarios, design, and derived. Second, we
survey the methods used across the whole field of PCG-G
from a large research body. Third, we map PCG-G methods
to game content layers; it turns out that many of the
methods used to generate game content from one layer
can be used to generate content from another. We also
survey the use of methods in practice, that is, in
commercial or prototype games. Fourth and last, we
discuss several directions for future research in
PCG-G, which we believe deserve close attention in the
near future.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2013:IRQ,
author = "Dong Liu and Shuicheng Yan and Rong-Rong Ji and
Xian-Sheng Hua and Hong-Jiang Zhang",
title = "Image retrieval with query-adaptive hashing",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422958",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Hashing-based approximate nearest-neighbor search may
well realize scalable content-based image retrieval.
The existing semantic-preserving hashing methods
leverage the labeled data to learn a fixed set of
semantic-aware hash functions. However, a fixed hash
function set is unable to well encode all semantic
information simultaneously, and ignores the specific
user's search intention conveyed by the query. In this
article, we propose a query-adaptive hashing method
which is able to generate the most appropriate binary
codes for different queries. Specifically, a set of
semantic-biased discriminant projection matrices are
first learnt for each of the semantic concepts, through
which a semantic-adaptable hash function set is learnt
via a joint sparsity variable selection model. At query
time, we further use the sparsity representation
procedure to select the most appropriate hash function
subset that is informative to the semantic information
conveyed by the query. Extensive experiments over three
benchmark image datasets well demonstrate the
superiority of our proposed query-adaptive hashing
method over the state-of-the-art ones in terms of
retrieval accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zheng:2013:GSD,
author = "Yan-Tao Zheng and Shuicheng Yan and Zheng-Jun Zha and
Yiqun Li and Xiangdong Zhou and Tat-Seng Chua and
Ramesh Jain",
title = "{GPSView}: a scenic driving route planner",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422959",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "GPS devices have been widely used in automobiles to
compute navigation routes to destinations. The
generated driving route targets the minimal traveling
distance, but neglects the sightseeing experience of
the route. In this study, we propose an augmented GPS
navigation system, GPSView, to incorporate a scenic
factor into the routing. The goal of GPSView is to plan
a driving route with scenery and sightseeing qualities,
and therefore allow travelers to enjoy sightseeing on
the drive. To do so, we first build a database of
scenic roadways with vistas of landscapes and sights
along the roadside. Specifically, we adapt an
attention-based approach to exploit
community-contributed GPS-tagged photos on the Internet
to discover scenic roadways. The premise is: a
multitude of photos taken along a roadway imply that
this roadway is probably appealing and catches the
public's attention. By analyzing the geospatial
distribution of photos, the proposed approach discovers
the roadside sight spots, or Points-Of-Interest (POIs),
which have good scenic qualities and visibility to
travelers on the roadway. Finally, we formulate scenic
driving route planning as an optimization task towards
the best trade-off between sightseeing experience and
traveling distance. Testing in the northern California
area shows that the proposed system can deliver
promising results.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhou:2013:SMV,
author = "Wengang Zhou and Houqiang Li and Yijuan Lu and Qi
Tian",
title = "{SIFT} match verification by geometric coding for
large-scale partial-duplicate {Web} image search",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422960",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Most large-scale image retrieval systems are based on
the bag-of-visual-words model. However, the traditional
bag-of-visual-words model does not capture the
geometric context among local features in images well,
which plays an important role in image retrieval. In
order to fully explore geometric context of all visual
words in images, efficient global geometric
verification methods have been attracting lots of
attention. Unfortunately, existing methods for
global geometric verification are either too
computationally expensive to ensure real-time response
or unable to handle rotation well. To solve the preceding
problems, in this article, we propose a novel geometric
coding algorithm, to encode the spatial context among
local features for large-scale partial-duplicate Web
image retrieval. Our geometric coding consists of
geometric square coding and geometric fan coding, which
describe the spatial relationships of SIFT features
into three geo-maps for global verification to remove
geometrically inconsistent SIFT matches. Our approach
is not only computationally efficient, but also
effective in detecting partial-duplicate images with
rotation, scale changes, partial-occlusion, and
background clutter. Experiments in partial-duplicate
Web image search, using two datasets with one million
Web images as distractors, reveal that our approach
outperforms the baseline bag-of-visual-words approach
even following a RANSAC verification in mean average
precision. Besides, our approach achieves comparable
performance to other state-of-the-art global geometric
verification methods, for example, spatial coding
scheme, but is more computationally efficient.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Park:2013:ISL,
author = "Jong-Seung Park and Ramesh Jain",
title = "Identification of scene locations from geotagged
images",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422961",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Due to geotagging capabilities of consumer cameras, it
has become easy to capture the exact geometric location
where a picture is taken. However, the location is not
the whereabouts of the scene taken by the photographer
but the whereabouts of the photographer himself. To
determine the actual location of an object seen in a
photo, sophisticated and tiresome steps are required on
a special camera rig, which is generally not available
with common digital cameras. This article
proposes a novel method to determine the geometric
location corresponding to a specific image pixel. A new
technique of stereo triangulation is introduced to
compute the relative depth of a pixel position.
Geographical metadata embedded in images are utilized
to convert relative depths to absolute coordinates.
When a geographic database is available, we can also
infer the semantically meaningful description of a
scene object from where the specified pixel is
projected onto the photo. Experimental results
demonstrate the effectiveness of the proposed approach
in accurately identifying actual locations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2013:RAA,
author = "Yichuan Wang and Ting-An Lin and Cheng-Hsin Hsu and
Xin Liu",
title = "Region- and action-aware virtual world clients",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "6:1--6:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422962",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We propose region- and action-aware virtual world
clients. To develop such clients, we present a
parameterized network traffic model, based on a large
collection of Second Life traces gathered by us. Our
methodology is also applicable to virtual worlds other
than Second Life. With the traffic model, various
optimization criteria can be adopted, including visual
quality, response time, and energy consumption. We use
energy consumption as the showcase, and demonstrate
via trace-driven simulations that, compared to two
existing schemes, a mobile client can save up to 36\%
and 41\% communication energy by selectively turning on
its WiFi network interface.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Khodabakhshi:2013:SSF,
author = "Naghmeh Khodabakhshi and Mohamed Hefeeda",
title = "{Spider}: a system for finding {$3$D} video copies",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "7:1--7:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422963",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel content-based copy
detection system for 3D videos. The system creates
compact and robust depth and visual signatures from the
3D videos. Then, the signature of a query video is compared
against an indexed database of reference videos'
signatures. The system returns a score, using both
spatial and temporal characteristics of videos,
indicating whether the query video matches any video in
the reference video database, and in case of matching,
which portion of the reference video matches the query
video. Analysis shows that the system is efficient,
both computationally and storage-wise. The system can
be used, for example, by video content owners, video
hosting sites, and third-party companies to find
illegally copied 3D videos. We implemented Spider, a
complete realization of the proposed system, and
conducted rigorous experiments on it. Our experimental
results show that the proposed system can achieve high
accuracy in terms of precision and recall even if the
3D videos are subjected to several transformations at
the same time. For example, the proposed system yields
100\% precision and recall when copied videos are parts
of original videos, and more than 90\% precision and
recall when copied videos are subjected to different
individual transformations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Abrams:2013:WAG,
author = "Austin Abrams and Robert Pless",
title = "{Web}-accessible geographic integration and
calibration of webcams",
journal = j-TOMCCAP,
volume = "9",
number = "1",
pages = "8:1--8:??",
month = feb,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2422956.2422964",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun May 5 09:14:22 MDT 2013",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A global network of webcams offers unique viewpoints
from tens of thousands of locations. Understanding the
geographic context of this imagery is vital in using
these cameras for quantitative environmental monitoring
or surveillance applications. We derive robust
geo-calibration constraints that allow users to
geo-register static or pan-tilt-zoom cameras by
specifying a few corresponding points, and describe our
Web interface suitable for novices. We discuss design
decisions that support our scalable, publicly
accessible Web service that allows webcam textures to
be displayed live on 3D geographic models. Finally, we
demonstrate several multimedia applications for
geo-calibrated cameras.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2013:EN,
author = "Ralf Steinmetz",
title = "Editorial note",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "31:1--31:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523001.2523002",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Nahrstedt:2013:ISS,
author = "Klara Nahrstedt and Rainer Lienhart and Malcolm
Slaney",
title = "Introduction to the special section on the 20th
anniversary of the {ACM International Conference on
Multimedia}",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "32:1--32:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523001.2523003",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2013:TDI,
author = "Baochun Li and Zhi Wang and Jiangchuan Liu and Wenwu
Zhu",
title = "Two decades of {Internet} video streaming: a
retrospective view",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "33:1--33:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2505805",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For over two decades, video streaming over the
Internet has received a substantial amount of attention
from both academia and industry. Starting from the
design of transport protocols for streaming video,
research interests have later shifted to the
peer-to-peer paradigm of designing streaming protocols
at the application layer. More recent research has
focused on building more practical and scalable
systems, using Dynamic Adaptive Streaming over HTTP. In
this article, we provide a retrospective view of the
research results over the past two decades, with a
focus on peer-to-peer streaming protocols and the
effects of cloud computing and social media.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2013:ETM,
author = "Zixia Huang and Klara Nahrstedt and Ralf Steinmetz",
title = "Evolution of temporal multimedia synchronization
principles: a historical viewpoint",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "34:1--34:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490821",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The evolution of multimedia applications has
drastically changed human life and behaviors. New
communication technologies lead to new requirements for
multimedia synchronization. This article presents a
historical view of temporal synchronization studies
focusing on continuous multimedia. We demonstrate how
the development of multimedia systems has created new
challenges for synchronization technologies. We
conclude with a new application-dependent,
multilocation, multirequirement synchronization
framework to address these new challenges.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bulterman:2013:SAM,
author = "Dick C. A. Bulterman and Pablo Cesar and Rodrigo
Laiola Guimar{\~a}es",
title = "Socially-aware multimedia authoring: {Past}, present,
and future",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "35:1--35:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2491893",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Creating compelling multimedia productions is a
nontrivial task. This is as true for creating
professional content as it is for nonprofessional
editors. During the past 20 years, authoring networked
content has been a part of the research agenda of the
multimedia community. Unfortunately, authoring has been
seen as an initial enterprise that occurs before `real'
content processing takes place. This limits the options
open to authors and to viewers of rich multimedia
content for creating and receiving focused, highly
personal media presentations. This article reflects on
the history of multimedia authoring. We focus on the
particular task of supporting socially-aware
multimedia, in which the relationships within
particular social groups among authors and viewers can
be exploited to create highly personal media
experiences. We provide an overview of the requirements
and characteristics of socially-aware multimedia
authoring within the context of exploiting community
content. We continue with a short historical
perspective on authoring support for these types of
situations. We then present an overview of a current
system for supporting socially-aware multimedia
authoring within the community content. We conclude
with a discussion of the issues that we feel can
provide a fruitful basis for future multimedia
authoring support. We argue that providing support for
socially-aware multimedia authoring can have a profound
impact on the nature and architecture of the entire
multimedia information processing pipeline.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2013:IST,
author = "Lei Zhang and Yong Rui",
title = "Image search-from thousands to billions in 20 years",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "36:1--36:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490823",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a comprehensive review and
analysis on image search in the past 20 years,
emphasizing the challenges and opportunities brought by
the astonishing increase of dataset scales from
thousands to billions in the same time period, which
was witnessed first-hand by the authors as active
participants in this research area. Starting with a
retrospective review of three stages of image search in
its history, the article highlights major breakthroughs
around the year 2000 in image search features, indexing
methods, and commercial systems, which marked the
transition from stage two to stage three. Subsequent
sections describe the image search research from four
important aspects: system framework, feature extraction
and image representation, indexing, and big data's
potential. Based on the review, the concluding section
discusses open research challenges and suggests future
research directions in effective visual representation,
image knowledge base construction, implicit user
feedback and crowdsourcing, mobile image search, and
creative multimedia interfaces.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rowe:2013:LFY,
author = "Lawrence A. Rowe",
title = "Looking forward 10 years to multimedia successes",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "37:1--37:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A panel at ACM Multimedia 2012 addressed research
successes in the past 20 years. While the panel focused
on the past, this article discusses successes since the
ACM SIGMM 2003 Retreat and suggests research directions
in the next ten years. While significant progress has
been made, more research is required to allow
multimedia to impact our everyday computing
environment. The importance of hardware changes on
future research directions is discussed. We believe
ubiquitous computing (meaning abundant computation and
network bandwidth) should be applied in novel ways to
solve multimedia grand challenges and continue the IT
revolution of the past century.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shenoy:2013:MSR,
author = "Prashant Shenoy",
title = "Multimedia systems research: {The} first twenty years
and lessons for the next twenty",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "38:1--38:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490859",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This retrospective article examines the past two
decades of multimedia systems research through the lens
of three research topics that were in vogue in the
early days of the field and offers perspectives on the
evolution of these research topics. We discuss the
eventual impact of each line of research and offer
lessons for future research in the field.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hua:2013:OVD,
author = "Kien A. Hua",
title = "Online video delivery: {Past}, present, and future",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "39:1--39:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502435",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video streaming is the core technology for online
video delivery systems. Initial research on this
technology faced many challenges. In this article,
lessons learned from early trials are discussed;
some pioneering works that provided early solutions and
inspired subsequent research are presented; and new
techniques required for emerging applications are
examined.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Swaminathan:2013:WMV,
author = "Viswanathan Swaminathan",
title = "Are we in the middle of a video streaming
revolution?",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "40:1--40:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490826",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "It has been roughly 20 years since the beginning of
video streaming over the Internet. Until very recently,
video streaming experiences left much to be desired.
Over the last few years, this has significantly
improved, making monetization of streaming possible.
Recently, there has been an explosion of commercial
video delivery services over the Internet, sometimes
referred to as over-the-top (OTT) delivery. All these
services invariably use streaming technologies.
Initially, streaming had all the promise, then for a
long time, it was download and play, later progressive
download for short content, and now it is streaming
again. Did streaming win the download versus streaming
contest? Did the best technology win? The improvement
in streaming experience has been possible through a
variety of new streaming technologies, some proprietary
and others extensions to standard protocols. The
primary delivery mechanism for entertainment video,
both premium content like movies and user generated
content (UGC), tends to be HTTP streaming. Is HTTP
streaming the panacea for all problems? The goal of
this article is to give an industry perspective on what
fundamentally changed in video streaming that makes it
commercially viable now. This article outlines how a
blend of technology choices between download and
streaming makes the current wave of ubiquitous
streaming possible for entertainment video delivery.
After identifying problems that still need to be
solved, the article concludes with the lessons learnt
from the video streaming evolution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chou:2013:AIC,
author = "Philip A. Chou",
title = "Advances in immersive communication: (1) {Telephone},
(2) {Television}, (3) {Teleportation}",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "41:1--41:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492704",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The last great advances in immersive communication
were the invention of the telephone over 137 years ago
and the invention of the video telephone (n{\'e}e
television) over 86 years ago. However, a perfect storm
is brewing for the next advance in immersive
communication, thanks to the convergence of massive
amounts of computation, bandwidth, resolution, new
sensors, and new displays. It could well be the
Multimedia community that turns this brew into the next
great advance in immersive communication, something
akin to teleportation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chang:2013:HFW,
author = "Shih-Fu Chang",
title = "How far we've come: {Impact} of 20 years of multimedia
information retrieval",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "42:1--42:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2491844",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article reviews the major research trends that
emerged in the last two decades within the broad area
of multimedia information retrieval, with a focus on
the ACM Multimedia community. Trends are defined
(nonscientifically) to be topics that appeared in ACM
multimedia publications and have had a significant
number of citations. The article also assesses the
impacts of these trends on real-world applications. The
views expressed are subjective and likely biased but
hopefully useful for understanding the heritage of the
community and stimulating new research directions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Effelsberg:2013:PLB,
author = "Wolfgang Effelsberg",
title = "A personal look back at twenty years of research in
multimedia content analysis",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "43:1--43:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502434",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This paper is a personal look back at twenty years of
research in multimedia content analysis. It addresses
the areas of audio, photo and video analysis for the
purpose of indexing and retrieval from the perspective
of a multimedia researcher. Whereas a general analysis
of content is impossible due to the personal bias of
the user, significant progress was made in the
recognition of specific objects or events. The paper
concludes with a brief outlook on the future.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hanjalic:2013:MRM,
author = "Alan Hanjalic",
title = "Multimedia retrieval that matters",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "44:1--44:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490827",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article emphasizes the need to refocus multimedia
information retrieval (MIR) research towards bridging
the utility gap, the gap between the expected and
de facto usefulness of MIR solutions. This requires us
to revisit the notion of relevance, but also to
consider other criteria for assessing MIR solutions,
like the informativeness of the retrieved results and
how helpful they are for the users. The article also
states that this focus shift cannot be realized
incrementally, but only by revisiting the foundations of MIR
solutions, that is, by a utility-by-design approach. In
this respect, a number of research challenges are
proposed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Turk:2013:TYE,
author = "Matthew Turk",
title = "Over twenty years of eigenfaces",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "45:1--45:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2490824",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The inaugural ACM Multimedia Conference coincided with
a surge of interest in computer vision technologies for
detecting and recognizing people and their activities
in images and video. Face recognition was the first of
these topics to broadly engage the vision and
multimedia research communities. The Eigenfaces
approach was, deservedly or not, the method that
captured much of the initial attention, and it
continues to be taught and used as a benchmark over 20
years later. This article is a brief personal view of
the genesis of Eigenfaces for face recognition and its
relevance to the multimedia community.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Whitman:2013:CSF,
author = "Brian Whitman",
title = "Care and scale: {Fifteen} years of music retrieval",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "46:1--46:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492703",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The co-founder of The Echo Nest, a music intelligence
company that now powers recommendation and discovery
for most music services, discusses the notion of care
and scale, cultural analysis of music, a brief history
of music retrieval, and how and why The Echo Nest got
started.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Szeliski:2013:NWC,
author = "Richard Szeliski and Noah Snavely and Steven M.
Seitz",
title = "Navigating the worldwide community of photos",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "47:1--47:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492208",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The last decade has seen an explosion in the number of
photographs available on the Internet. The sheer volume
of interesting photos makes it a challenge to explore
this space. Various Web and social media sites, along
with search and indexing techniques, have been
developed in response. One natural way to navigate
these images is in a 3D geo-located context. In this
article, we reflect on our work in this area, with a
focus on techniques that build partial 3D scene models
to help find and navigate interesting photographs in an
interactive, immersive 3D setting. We also discuss how
finding such relationships among photographs opens up
exciting new possibilities for multimedia authoring,
visualization, and editing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Andre:2013:EUU,
author = "Elisabeth Andre",
title = "Exploiting unconscious user signals in multimodal
human-computer interaction",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "48:1--48:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502433",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents the idea of empathic stimulation
that relies on the power and potential of unconsciously
conveyed attentive and emotional information to
facilitate human-machine interaction. Starting from a
historical review of related work presented at past ACM
Multimedia conferences, we discuss challenges that
arise when exploiting unconscious human signals for
empathic stimulation, such as the real-time analysis of
psychological user states and the smooth adaptation of
the human-machine interface based on this analysis. A
classical application field that might benefit from the
idea of unconscious human-computer interaction is the
exploration of massive datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sundaram:2013:EMS,
author = "Hari Sundaram",
title = "Experiential media systems",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "49:1--49:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502432",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a personalized narrative on the
early discussions within the Multimedia community and
the subsequent research on experiential media systems.
I discuss two different research initiatives: the
design of real-time, immersive multimedia feedback
environments for stroke rehabilitation, and exploratory
environments for events that exploited the user's
ability to make connections. I discuss the issue of
foundations: the
question of multisensory integration and
superadditivity; the need for identification of
``first-class'' Multimedia problems; expanding the
scope of Multimedia research.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kompatsiaris:2013:ISS,
author = "Ioannis (Yiannis) Kompatsiaris and Wenjun (Kevin) Zeng
and Gang Hua and Liangliang Cao",
title = "Introduction to the special section of best papers of
{ACM} multimedia 2012",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "50:1--50:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523001.2523004",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2013:RAM,
author = "Heng Liu and Tao Mei and Houqiang Li and Jiebo Luo and
Shipeng Li",
title = "Robust and accurate mobile visual localization and its
applications",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "51:1--51:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2491735",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mobile applications are becoming increasingly popular.
More and more people are using their phones to enjoy
ubiquitous location-based services (LBS). The
increasing popularity of LBS creates a fundamental
problem: mobile localization. Besides traditional
localization methods that use GPS or wireless signals,
using phone-captured images for localization has drawn
significant interest from researchers. Photos contain
more scene context information than the embedded
sensors, leading to a more precise location
description. With the goal being to accurately sense
real geographic scene contexts, this article presents a
novel approach to mobile visual localization based on a
given image (typically associated with a rough GPS
position). The proposed approach is capable of
providing a complete set of more accurate parameters
about the scene geo-context including the real
locations of both the mobile user and, perhaps more
importantly, the captured scene, as well as the viewing
direction. To figure out how to make image localization
quick and accurate, we investigate various techniques
for large-scale image retrieval and 2D-to-3D matching.
Specifically, we first generate scene clusters using
joint geo-visual clustering, with each scene being
represented by a reconstructed 3D model from a set of
images. The 3D models are then indexed using a visual
vocabulary tree structure. Taking geo-tags of the
database image as prior knowledge, a novel
location-based codebook weighting scheme is proposed to
embed this additional information into the codebook.
The discriminative power of the codebook is enhanced,
thus leading to better image retrieval performance. The
query image is aligned with the models obtained from
the image retrieval results, and eventually registered
to a real-world map. We evaluate the effectiveness of
our approach using several large-scale datasets,
achieving estimation accuracy of a user's location
within 13 meters, viewing direction within 12 degrees,
and viewing distance within 26 meters. Of particular
note is our showcase of three novel applications based
on localization results: (1) an on-the-spot tour guide,
(2) collaborative routing, and (3) a sight-seeing
guide. The evaluations through user studies demonstrate
that these applications are effective in facilitating
the ideal rendezvous for mobile users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2013:PBS,
author = "Zhi Wang and Wenwu Zhu and Xiangwen Chen and Lifeng
Sun and Jiangchuan Liu and Minghua Chen and Peng Cui
and Shiqiang Yang",
title = "Propagation-based social-aware multimedia content
distribution",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "52:1--52:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2523001.2523005",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online social networks have reshaped how multimedia
contents are generated, distributed, and consumed on
today's Internet. Given the massive number of
user-generated contents shared in online social
networks, users are moving to directly access these
contents in their preferred social network services. It
is intriguing to study the service provision of social
contents for global users with satisfactory quality of
experience. In this article, we conduct large-scale
measurement of a real-world online social network
system to study the social content propagation. We have
observed important propagation patterns, including
social locality, geographical locality, and temporal
locality. Motivated by the measurement insights, we
propose a propagation-based social-aware delivery
framework using a hybrid edge-cloud and peer-assisted
architecture. We also design replication strategies for
the architecture based on three propagation predictors
designed by jointly considering user, content, and
context information. In particular, we design a
propagation region predictor and a global audience
predictor to guide how the edge-cloud servers back up
the contents, and a local audience predictor to guide
how peers cache the contents for their friends. Our
trace-driven experiments further demonstrate the
effectiveness and superiority of our design.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sang:2013:SIA,
author = "Jitao Sang and Changsheng Xu",
title = "Social influence analysis and application on
multimedia sharing websites",
journal = j-TOMCCAP,
volume = "9",
number = "1s",
pages = "53:1--53:??",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502436",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:45 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Social media is becoming popular these days, where
users necessarily interact with each other to form
social networks. Influence network, as one special case
of social network, has been recognized as significantly
impacting social activities and user decisions. We
emphasize in this article that the inter-user influence
is essentially topic-sensitive, as for different tasks
users tend to trust different influencers and be
influenced most by them. While existing research
focuses on global influence modeling and applies to
text-based networks, this work investigates the problem
of topic-sensitive influence modeling in the multimedia
domain. According to temporal data justification, we
propose a multimodal probabilistic model, considering
both users' textual annotation and uploaded visual
images. This model is capable of simultaneously
extracting user topic distributions and topic-sensitive
influence strengths. By identifying the topic-sensitive
influencer, we are able to conduct applications, like
collective search and collaborative recommendation. A
risk minimization-based general framework for
personalized image search is further presented, where
the image search task is transferred to measure the
distance of image and personalized query language
models. The framework considers the noisy tag issue and
enables easy incorporation of social influence. We have
conducted experiments on a large-scale Flickr dataset.
Qualitative as well as quantitative evaluation results
have validated the effectiveness of the topic-sensitive
influencer mining model, and demonstrated the advantage
of incorporating topic-sensitive influence in
personalized image search and topic-based image
recommendation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Silva:2013:HPH,
author = "Juan M. Silva and Mauricio Orozco and Jongeun Cha and
Abdulmotaleb {El Saddik} and Emil M. Petriu",
title = "Human perception of haptic-to-video and
haptic-to-audio skew in multimedia applications",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457451",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The purpose of this research is to assess the
sensitivity of humans to perceive asynchrony among
media signals coming from a computer application.
In particular, we examine haptic-to-video and
haptic-to-audio skew. For this purpose we have designed
an experimental setup, where users are exposed to a
basic multimedia presentation resembling a ping-pong
game. For every collision between a ball and a racket,
the user is able to perceive auditory, visual, and
haptic cues about the collision event. We artificially
introduce negative and positive delay to the auditory
and visual cues with respect to the haptic stream. We
subjectively evaluate the inter-stream asynchrony
perceived by the users using two types of
haptic devices. The statistical results of our
evaluation show perception rates of around 100 ms
regardless of modality and type of device.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bhatt:2013:RPB,
author = "Chidansh A. Bhatt and Pradeep K. Atrey and Mohan S.
Kankanhalli",
title = "A reward-and-punishment-based approach for concept
detection using adaptive ontology rules",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "10:1--10:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457452",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Despite the fact that performance improvements have
been reported in recent years, semantic concept
detection in video remains a challenging problem.
Existing concept detection techniques, with ontology
rules, exploit the static correlations among primitive
concepts but not the dynamic spatiotemporal
correlations. The proposed method rewards (or punishes)
detected primitive concepts using dynamic
spatiotemporal correlations of the given ontology rules
and updates these ontology rules based on the accuracy
of detection. Adaptively learned ontology rules
significantly help in improving the overall accuracy of
concept detection as shown in the experimental
result.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Alsulaiman:2013:IVB,
author = "Fawaz A. Alsulaiman and Nizar Sakr and Julio J.
Vald{\'e}s and Abdulmotaleb {El Saddik}",
title = "Identity verification based on handwritten signatures
with haptic information using genetic programming",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "11:1--11:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457453",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, haptic-based handwritten signature
verification using Genetic Programming (GP)
classification is presented. A comparison of GP-based
classification with classical classifiers including
support vector machine, $k$-nearest neighbors,
na{\"\i}ve Bayes, and random forest is conducted. In
addition, the use of GP in discovering small
knowledge-preserving subsets of features in
high-dimensional datasets of haptic-based signatures is
investigated and several approaches are explored.
Subsets of features extracted from GP-generated models
(analytic functions) are also exploited to determine
the importance and relevance of different haptic data
types (e.g., force, position, torque, and orientation)
in user identity verification. The results revealed
that GP classifiers compare favorably with the
classical methods and use a much smaller number of
attributes (with simple function sets).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2013:MAS,
author = "Qianni Zhang and Ebroul Izquierdo",
title = "Multifeature analysis and semantic context learning
for image classification",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "12:1--12:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457454",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article introduces an image classification
approach in which the semantic context of images and
multiple low-level visual features are jointly
exploited. The context consists of a set of semantic
terms defining the classes to be associated with
unclassified images. Initially, a multiobjective
optimization technique is used to define a multifeature
fusion model for each semantic class. Then, a Bayesian
learning procedure is applied to derive a context model
representing relationships among semantic classes.
Finally, this context model is used to infer object
classes within images. Selected results from a
comprehensive experimental evaluation are reported to
show the effectiveness of the proposed approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2013:MEU,
author = "Zhen Wei Zhao and Sameer Samarth and Wei Tsang Ooi",
title = "Modeling the effect of user interactions on mesh-based
{P2P VoD} streaming systems",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457455",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "User interactions such as seeks and pauses are widely
supported by existing Peer-to-Peer Video-on-Demand (P2P
VoD) streaming systems. Their effect on the streaming
system, however, has not been well studied. Seeks cause
peers to skip part of the video, making them stay in
the system for shorter time, and thus contribute less.
On the other hand, only part of the video is downloaded
due to seeks, reducing peers' demand from the system.
It is unclear which factor dominates the effect of
seeks on the streaming system. Pauses during playback,
on one hand, allow peers to stay longer in the system
and upload more content. When interleaved with seeks,
however, long pauses may increase peers' demand
unnecessarily as peers may download content that will
eventually be skipped by subsequent forward seeks. The
collective effect of seeks and pauses, together with
the known random peer departure, is unintuitive and
needs to be addressed properly so as to understand the
effect of human factors on the streaming system
performance. In this article, we develop an analytical
model to both qualitatively and quantitatively study
the effect of seeks and pauses on mesh-based P2P VoD
streaming systems, in particular, the effect on the
server cost. Our model can help in understanding how
human factors such as seeks and pauses affect the
streaming system performance, tuning a P2P VoD system
towards better system performance and stability, and
providing a framework for capacity planning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2013:ETT,
author = "Yang Yang and Yi Yang and Heng Tao Shen",
title = "Effective transfer tagging from image to video",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457456",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Recent years have witnessed a great explosion of
user-generated videos on the Web. In order to achieve
an effective and efficient video search, it is critical
for modern video search engines to associate videos
with semantic keywords automatically. Most of the
existing video tagging methods can hardly achieve
reliable performance due to the deficiency of training
data. It is noticed that abundant well-tagged data are
available in other relevant types of media (e.g.,
images). In this article, we propose a novel video
tagging framework, termed as Cross-Media Tag Transfer
(CMTT), which utilizes the abundance of well-tagged
images to facilitate video tagging. Specifically, we
build a ``cross-media tunnel'' to transfer knowledge
from images to videos. To this end, an optimal kernel
space, in which distribution distance between images
and video is minimized, is found to tackle the
domain-shift problem. A novel cross-media video tagging
model is proposed to infer tags by exploring the
intrinsic local structures of both labeled and
unlabeled data, and learn reliable video classifiers.
An efficient algorithm is designed to optimize the
proposed model in an iterative and alternating way.
Extensive experiments illustrate the superiority of our
proposal compared to the state-of-the-art algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2013:AAP,
author = "Zhen Wei Zhao and Wei Tsang Ooi",
title = "{APRICOD}: an access-pattern-driven distributed
caching middleware for fast content discovery of
noncontinuous media access",
journal = j-TOMCCAP,
volume = "9",
number = "2",
pages = "15:1--15:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2457450.2457457",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:48 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Content discovery is a major source of latency in
peer-to-peer (P2P) media streaming systems, especially
in the presence of noncontinuous user access, such as
random seek in Video-on-Demand (VoD) streaming and
teleportation in a Networked Virtual Environment (NVE).
After the aforementioned user interactions, streaming
systems often need to initiate the content discovery
process to identify where to retrieve the requested
media objects. Short content lookup latency is demanded
to ensure a smooth user experience. Existing content
discovery systems based on either a Distributed Hash
Table (DHT) or gossip mechanism cannot cope with
noncontinuous access efficiently due to their long
lookup latency. In this work, we propose an
access-pattern-driven distributed caching middleware
named APRICOD, which caters for fast and scalable
content discovery in peer-to-peer media streaming
systems, especially when user interactions are present.
APRICOD exploits correlations among media objects
accessed by users, and adapts to shift in the user
access pattern automatically. We first present a
general APRICOD design that can be used with any
existing content discovery system. We then present an
implementation of APRICOD on top of Pastry, which we
use to evaluate APRICOD. Our evaluation in a 1024-node
system, using a Second Life trace with 5,735 users and
a VoD trace with 54 users, shows that APRICOD can
effectively resolve all continuous access queries
deterministically within a single hop (except in the
case of node failure), and resolve noncontinuous access
queries within a single hop with high probability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Anonymous:2013:CPM,
author = "Anonymous",
title = "Call for papers: {Multiple} sensorial {(MulSeMedia)}
multi-modal media: {Advances} and applications",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "15:1--15:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2500818",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mei:2013:NLS,
author = "Tao Mei and Lin-Xie Tang and Jinhui Tang and
Xian-Sheng Hua",
title = "Near-lossless semantic video summarization and its
applications to video analysis",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "16:1--16:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487269",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The ever increasing volume of video content on the Web
has created profound challenges for developing
efficient indexing and search techniques to manage
video data. Conventional techniques such as video
compression and summarization strive for the two
commonly conflicting goals of low storage and high
visual and semantic fidelity. With the goal of
balancing both video compression and summarization,
this article presents a novel approach, called
Near-Lossless Semantic Summarization (NLSS), to
summarize a video stream with the least high-level
semantic information loss by using an extremely small
piece of metadata. The summary consists of compressed
image and audio streams, as well as the metadata for
temporal structure and motion information. Although at
a very low compression rate (around $ 1 / 40 $ of
H.264 baseline, where traditional compression
techniques can hardly preserve an acceptable visual
fidelity), the proposed NLSS still can be applied to
many video-oriented tasks, such as visualization,
indexing and browsing, duplicate detection, concept
detection, and so on. We evaluate the NLSS on TRECVID
and other video collections, and demonstrate that it is
a powerful tool for significantly reducing storage
consumption, while keeping high-level semantic
fidelity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ademoye:2013:IRT,
author = "Oluwakemi A. Ademoye and Gheorghita Ghinea",
title = "Information recall task impact in olfaction-enhanced
multimedia",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "17:1--17:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487270",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Enhancing multimedia applications with olfactory
sensations is one of the last challenges in the area.
While there is evidence, both scientific and anecdotal,
that olfactory cues help users in information recall
tasks, there is a lack of work when the targeted
information is that contained in a multimedia
presentation, which is precisely the focus of this
article. Accordingly, we present the results of two
experimental studies. The first study measured the
impact of olfactory media variation on the user's
ability to perceive, synthesize, and analyze the
informational content of olfactory-enhanced multimedia
videos; the second study measured the impact of
information content and an information recall task on
user perception of the relevance, sense of
reality, and acceptability of the olfactory media
content, as well as the overall enjoyment of the
experience. Results show that the use of olfactory
media content, both pleasant and unpleasant, in
multimedia displays does not significantly impact
information assimilation in a negative way. Moreover,
the addition of a performance task may enhance the
user's understanding of the correlation between the
characteristic odor(s) and the scenario under
consideration, as well as enable users to consciously
learn the odors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yeh:2013:CAS,
author = "Lo-Yao Yeh and Jiun-Long Huang",
title = "A conditional access system with efficient key
distribution and revocation for mobile pay-{TV}
systems",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "18:1--18:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487271",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Current mobile pay-TV systems have two types of
Conditional Access Systems (CAS): group-key-based and
public-key systems. The best feature of group-key-based
systems is the ability to enjoy the broadcast nature in
delivering multimedia contents, while the major advantage
of public-key systems is consolidating the security
foundation to withstand various attacks, such as
collusion attacks. However, the problems of
group-key-based systems include collusion attacks, lack
of nonrepudiation, and troublesome key distribution.
Even worse, the benefit of broadcast efficiency is
confined to a group size of no more than 512
subscribers. For public-key systems, the poor delivery
scalability is the major shortcoming because the unique
private key feature is only suitable for one-to-one
delivery. In this article, we introduce a scalable
access control scheme that integrates the merits of
broadcasting, regardless of group size, with sound
security assurance, including fine-grained access
control and collusion attack resistance. For subscriber
revocation, a single message is broadcast to the other
subscribers to get the updated key, thus significantly
boosting subscriber revocation scalability. Due to
mobile subscribers' dynamic movements, this article
also analyzes the benefit of retransmission cases in
our system. Through the performance evaluation and
functionality comparison, the proposed scheme should be
a decent candidate to enhance the security strength and
transmission efficiency in a mobile pay-TV system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Naskar:2013:GTL,
author = "Ruchira Naskar and Rajat Subhra Chakraborty",
title = "A generalized tamper localization approach for
reversible watermarking algorithms",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "19:1--19:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487272",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In general reversible watermarking algorithms, the
convention is to reject the entire cover image at the
receiver end if it fails authentication, since there is
no way to detect the exact locations of tampering. This
feature may be exploited by an adversary to bring about
a form of DoS attack. Here we provide a solution to
this problem in the form of a tamper localization mechanism
for reversible watermarking algorithms, which allows
selective rejection of distorted cover image regions in
case of authentication failure, thus avoiding rejection
of the complete image. Additionally, it minimizes the
bandwidth requirement of the communication channel.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Doherty:2013:SSA,
author = "Jonathan Doherty and Kevin Curran and Paul Mckevitt",
title = "A self-similarity approach to repairing large dropouts
of streamed music",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "20:1--20:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487273",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Enjoyment of audio has now become about flexibility
and personal freedom. Digital audio content can be
acquired from many sources and wireless networking
allows digital media devices and associated peripherals
to be unencumbered by wires. However, despite recent
improvements in capacity and quality of service,
wireless networks are inherently unreliable
communications channels for the streaming of audio,
being susceptible to the effects of range,
interference, and occlusion. This time-varying
reliability of wireless audio transfer introduces data
corruption and loss, with unpleasant audible effects
that can be profound and prolonged in duration.
Traditional communications techniques for error
mitigation perform poorly and in a bandwidth
inefficient manner in the presence of such large-scale
defects in a digital audio stream. A novel solution
that can complement existing techniques takes account
of the semantics and natural repetition of music.
Through the use of self-similarity metadata, missing or
damaged audio segments can be seamlessly replaced with
similar undamaged segments that have already been
successfully received. We propose a technology to
generate relevant self-similarity metadata for
arbitrary audio material and to utilize this metadata
within a wireless audio receiver to provide
sophisticated and real-time correction of large-scale
errors. The primary objectives are to match the current
section of a song being received with previous sections
while identifying incomplete sections and determining
replacements based on previously received portions of
the song. This article outlines our approach to Forward
Error Correction (FEC) technology that is used to
``repair'' a bursty dropout when listening to
time-dependent media on a wireless network. Using
self-similarity analysis on a music file, we can
``automatically'' repair the dropout with a similar
portion of the music already received, thereby
minimizing a listener's discomfort.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ho:2013:IPC,
author = "Edmond S. L. Ho and Jacky C. P. Chan and Taku Komura
and Howard Leung",
title = "Interactive partner control in close interactions for
real-time applications",
journal = j-TOMCCAP,
volume = "9",
number = "3",
pages = "21:1--21:??",
month = jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487268.2487274",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:50 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a new framework for synthesizing
motion of a virtual character in response to the
actions performed by a user-controlled character in
real time. In particular, the proposed method can
handle scenes in which the characters are closely
interacting with each other such as those in partner
dancing and fighting. In such interactions,
coordinating the virtual characters with the human
player automatically is extremely difficult because the
system has to predict the intention of the player
character. In addition, the style variations from
different users affect the accuracy in recognizing the
movements of the player character when determining the
responses of the virtual character. To solve these
problems, our framework makes use of the spatial
relationship-based representation of the body parts
called interaction mesh, which has been proven
effective for motion adaptation. The method is
computationally efficient, enabling real-time character
control for interactive applications. We demonstrate
its effectiveness and versatility in synthesizing a
wide variety of motions with close interactions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2013:ER,
author = "Ralf Steinmetz",
title = "Editorial: Reviewers",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "22:1--22:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501644",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sakai:2013:PPC,
author = "Kazuya Sakai and Wei-Shinn Ku and Min-Te Sun and Roger
Zimmermann",
title = "Privacy preserving continuous multimedia streaming in
{MANETs}",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "23:1--23:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501645",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "At present, mobile devices are prevalent with end
users and continuous media streaming services in mobile
ad-hoc networks (MANETs) support popular applications.
It is required for applications that stream isochronous
media that the network link be continuously available.
In this study, we introduce two group-server scheduling
schemes to improve link continuity: static group-server
scheduling and dynamic group-server scheduling. With
our solution, if one of the current links between a
client and a server instance breaks, the client can
still download the multimedia content from another
scheduled server peer. In addition, we incorporate the
data link layer constraints as well as privacy concerns
into our protocol design. The simulation results show
that the proposed schemes significantly improve the
effective link duration, overall system performance,
and degree of privacy in MANETs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Dong:2013:RIA,
author = "Jian Dong and Bin Cheng and Xiangyu Chen and Tat-Seng
Chua and Shuicheng Yan and Xi Zhou",
title = "Robust image annotation via simultaneous feature and
sample outlier pursuit",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "24:1--24:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501646",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Graph-based semi-supervised image annotation has
achieved great success in a variety of studies, yet it
essentially and intuitively suffers from both the
irrelevant/noisy features (referred to as feature
outliers) and the unusual/corrupted samples (referred
to as sample outliers). In this work, we investigate
how to derive robust sample affinity matrix via
simultaneous feature and sample outlier pursuit. This
task is formulated as a Dual-outlier and Prior-driven
Low-Rank Representation (DP-LRR) problem, which
has a convex objective function. In DP-LRR,
the clean data are assumed to be self-reconstructible
with low-rank coefficient matrix as in LRR; while the
error matrix is decomposed as the sum of a row-wise
sparse matrix and a column-wise sparse matrix, the
$l_{2,1}$-norm minimization of which encourages the
pursuit of feature and sample outliers respectively.
The DP-LRR is further regularized by the priors from
side information, that is, the inhomogeneous data
pairs. An efficient iterative procedure based on
linearized alternating direction method is presented to
solve the DP-LRR problem, with closed-form solutions
within each iteration. The derived low-rank
reconstruction coefficient matrix is then fed into any
graph-based semi-supervised label propagation algorithm
for image annotation, and as a by-product, the cleaned
data from DP-LRR can also be utilized as a better image
representation to generally boost image annotation
performance. Extensive experiments on MIRFlickr,
Corel30K, NUS-WIDE-LITE and NUS-WIDE databases well
demonstrate the effectiveness of the proposed
formulation for robust image annotation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Villanueva:2013:HMB,
author = "Arantxa Villanueva and Victoria Ponz and Laura
Sesma-Sanchez and Mikel Ariz and Sonia Porta and Rafael
Cabeza",
title = "Hybrid method based on topography for robust detection
of iris center and eye corners",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "25:1--25:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501647",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A multistage procedure to detect eye features is
presented. Multiresolution and topographic
classification are used to detect the iris center. The
eye corner is calculated combining valley detection and
eyelid curve extraction. The algorithm is tested on the
BioID database and on a proprietary database containing
more than 1200 images. The results show that the
suggested algorithm is robust and accurate. Regarding
the iris center, our method obtains the best average
behavior on the BioID database compared to other
available algorithms. Additional contributions are that
our algorithm runs in real time and does not
require complex post-processing stages.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2013:ECR,
author = "Bo Wang and Jinqiao Wang and Hanqing Lu",
title = "Exploiting content relevance and social relevance for
personalized ad recommendation on {Internet TV}",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "26:1--26:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501648",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "There have been not many interactions between the two
dominant forms of mass communication: television and
the Internet, while nowadays the appearance of Internet
television makes them more closely. Different with
traditional TV in a passive mode of transmission,
Internet TV makes it more possible to make personalized
service recommendation because of the interactivity
between users and the Internet. In this article, we
introduce a scheme to provide targeted ad
recommendation to Internet TV users by exploiting the
content relevance and social relevance. First, we
annotate TV videos in terms of visual content analysis
and textual analysis by aligning visual and textual
information. Second, with user-user, video-video and
user-video relationships, we employ Multi-Relationship
based Probabilistic Matrix Factorization (MRPMF) to
learn representative tags for modeling user preference.
Then semantic content relevance (between product/ad
and TV video) and social relevance (between product/ad
and user interest) are calculated by projecting the
corresponding tags into our advertising concept space.
Finally, using the relevance scores, we rank the
relevant products/ads to provide users with effective
personalized recommendations. The experimental results
demonstrate the attractiveness and effectiveness of our
proposed approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Alam:2013:MHB,
author = "Kazi Masudul Alam and Abu Saleh Md Mahfujur Rahman and
Abdulmotaleb {El Saddik}",
title = "Mobile haptic e-book system to support {$3$D}
immersive reading in ubiquitous environments",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "27:1--27:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501649",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In order to leverage the use of various modalities
such as audio-visual materials in instilling effective
learning behavior we present an intuitive approach of
annotation based hapto-audio-visual interaction with
the traditional digital learning materials such as
e-books. By integrating the home entertainment system
in the user's reading experience combined with haptic
interfaces we want to examine whether such augmentation
of modalities influence the user's learning patterns.
The proposed Haptic E--Book (HE-Book) system leverages
the haptic jacket, haptic arm band as well as haptic
sofa interfaces to receive haptic emotive signals
wirelessly in the form of patterned vibrations of the
actuators and expresses the learning material by
incorporating image, video, 3D environment based
augmented display in order to pave ways for intimate
reading experience in the popular mobile e-book
platform.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Nguyen:2013:TDA,
author = "Tam V. Nguyen and Si Liu and Bingbing Ni and Jun Tan
and Yong Rui and Shuicheng Yan",
title = "Towards decrypting attractiveness via multi-modality
cues",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "28:1--28:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501650",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Decrypting the secret of beauty or attractiveness has
been the pursuit of artists and philosophers for
centuries. To date, the computational model for
attractiveness estimation has been actively explored in
the computer vision and multimedia communities, yet with the
focus mainly on facial features. In this article, we
conduct a comprehensive study on female attractiveness
conveyed by single/multiple modalities of cues, that
is, face, dressing and/or voice, and aim to discover
how different modalities individually and collectively
affect the human sense of beauty. To extensively
investigate the problem, we collect the Multi-Modality
Beauty (M$^2$B) dataset, which is annotated with
attractiveness levels converted from manual $k$-wise
ratings and semantic attributes of different
modalities. Inspired by the common consensus that
middle-level attribute prediction can assist
higher-level computer vision tasks, we manually labeled
many attributes for each modality. Next, a tri-layer
Dual-supervised Feature-Attribute-Task (DFAT) network
is proposed to jointly learn the attribute model and
attractiveness model of single/multiple modalities. To
remedy possible loss of information caused by
incomplete manual attributes, we also propose a novel
Latent Dual-supervised Feature-Attribute-Task (LDFAT)
network, where latent attributes are combined with
manual attributes to contribute to the final
attractiveness estimation. The extensive experimental
evaluations on the collected M$^2$B dataset well
demonstrate the effectiveness of the proposed DFAT and
LDFAT networks for female attractiveness prediction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tang:2013:TOH,
author = "Jinhui Tang and Qiang Chen and Meng Wang and Shuicheng
Yan and Tat-Seng Chua and Ramesh Jain",
title = "Towards optimizing human labeling for interactive
image tagging",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "29:1--29:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501651",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Interactive tagging is an approach that combines human
and computer to assign descriptive keywords to image
contents in a semi-automatic way. It can avoid the
problems in automatic tagging and pure manual tagging
by achieving a compromise between tagging performance
and manual cost. However, conventional research efforts
on interactive tagging mainly focus on sample selection
and models for tag prediction. In this work, we
investigate interactive tagging from a different
aspect. We introduce an interactive image tagging
framework that makes fuller use of human labeling
efforts. That is, it can achieve a
specified tagging performance with less manual
labeling effort or achieve better tagging performance
with a specified labeling cost. In the framework,
hashing is used to enable a quick clustering of image
regions and a dynamic multiscale clustering labeling
strategy is proposed such that users can label a large
group of similar regions each time. We also employ a
tag refinement method such that several inappropriate
tags can be automatically corrected. Experiments on a
large dataset demonstrate the effectiveness of our
approach",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Carbunar:2013:FNA,
author = "Bogdan Carbunar and Rahul Potharaju and Michael Pearce
and Venugopal Vasudevan and Michael Needham",
title = "A framework for network aware caching for video on
demand systems",
journal = j-TOMCCAP,
volume = "9",
number = "4",
pages = "30:1--30:??",
month = aug,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2501643.2501652",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:51 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
note = "See errata \cite{Carbunar:2014:EFN}.",
abstract = "Video on Demand (VoD) services allow users to select
and locally consume remotely stored content. We
investigate the use of caching to solve the scalability
issues of several existing VoD providers. We propose
metrics and goals that define the requirements of a
caching framework for CDNs of VoD systems. Using data
logs collected from Motorola equipment from Comcast VoD
deployments, we show that several classic caching
solutions do not satisfy the proposed goals. We address
this issue by developing novel techniques for
predicting future values of several metrics of
interest. We rely on computed predictions to define the
penalty imposed on the system, both network and caching
sites, when not storing individual items. We use item
penalties to devise novel caching and static content
placement strategies. We use the previously mentioned
data logs to validate our solutions and show that they
satisfy all the defined goals.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2013:ENO,
author = "Zechao Li and Jing Liu and Meng Wang and Changsheng Xu
and Hanqing Lu",
title = "Enhancing news organization for convenient retrieval
and browsing",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "1:1--1:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2488732",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "To facilitate users to access news quickly and
comprehensively, we design a news search and browsing
system named GeoVisNews, in which the news elements of
``Where'', ``Who'', ``What'' and ``When'' are enhanced
via news geo-localization, image enrichment and joint
ranking, respectively. For news geo-localization, an
Ordinal Correlation Consistent Matrix Factorization
(OCCMF) model is proposed to maintain the relevance
rankings of locations to a specific news document and
simultaneously capture intra-relations among locations
and documents. To visualize news, we develop a novel
method to enrich news documents with appropriate web
images. Specifically, multiple queries are first
generated from news documents for image search, and
then the appropriate images are selected from the
collected web images by an intelligent fusion approach
based on multiple features. Obtaining the geo-localized
and image enriched news resources, we further employ a
joint ranking strategy to provide relevant, timely and
popular news items in response to user search
queries. Extensive experiments on a large-scale news
dataset collected from the web demonstrate the superior
performance of the proposed approaches over related
methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Knees:2013:SMS,
author = "Peter Knees and Markus Schedl",
title = "A survey of music similarity and recommendation from
music context data",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "2:1--2:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2542205.2542206",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this survey article, we give an overview of methods
for music similarity estimation and music
recommendation based on music context data. Unlike
approaches that rely on music content and have been
researched for almost two decades, music-context-based
(or contextual) approaches to music retrieval are a
quite recent field of research within music information
retrieval (MIR). Contextual data refers to all
music-relevant information that is not included in the
audio signal itself. In this article, we focus on
contextual aspects of music primarily accessible
through web technology. We discuss different sources of
context-based data for individual music pieces and for
music artists. We summarize various approaches for
constructing similarity measures based on the
collaborative or cultural knowledge incorporated into
these data sources. In particular, we identify and
review three main types of context-based similarity
approaches: text-retrieval-based approaches (relying on
web-texts, tags, or lyrics), co-occurrence-based
approaches (relying on playlists, page counts,
microblogs, or peer-to-peer-networks), and approaches
based on user ratings or listening habits. This article
elaborates the characteristics of the presented
context-based measures and discusses their strengths as
well as their weaknesses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2013:DPO,
author = "Yi-Liang Zhao and Qiang Chen and Shuicheng Yan and
Tat-Seng Chua and Daqing Zhang",
title = "Detecting profilable and overlapping communities with
user-generated multimedia contents in {LBSNs}",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "3:1--3:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2502415",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In location-based social networks (LBSNs), users
implicitly interact with each other by visiting places,
issuing comments and/or uploading photos. These
heterogeneous interactions convey the latent
information for identifying meaningful user groups,
namely social communities, which exhibit unique
location-oriented characteristics. In this work, we aim
to detect and profile social communities in LBSNs by
representing the heterogeneous interactions with a
multimodality nonuniform hypergraph. Here, the vertices
of the hypergraph are users, venues, textual comments
or photos, and the hyperedges characterize the
$k$-partite heterogeneous interactions such as posting
certain comments or uploading certain photos while
visiting certain places. We then view each detected
social community as a dense subgraph within the
heterogeneous hypergraph, where the user community is
constructed by the vertices and edges in the dense
subgraph and the profile of the community is
characterized by the vertices related with venues,
comments and photos and their inter-relations. We
present an efficient algorithm to detect the overlapped
dense subgraphs, where the profile of each social
community is guaranteed to be available by constraining
the minimal number of vertices in each modality.
Extensive experiments on Foursquare data well validated
the effectiveness of the proposed framework in terms of
detecting meaningful social communities and uncovering
their underlying profiles in LBSNs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bhatnagar:2013:SRI,
author = "Gaurav Bhatnagar and Q. M. Jonathan Wu and Pradeep K.
Atrey",
title = "Secure randomized image watermarking based on singular
value decomposition",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "4:1--4:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2542205.2542207",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, a novel logo watermarking scheme is
proposed based on wavelet frame transform, singular
value decomposition and automatic thresholding. The
proposed scheme essentially rectifies the ambiguity
problem in the SVD-based watermarking. The core idea is
to randomly upscale the size of the host image using a
reversible random extension transform, followed by
embedding of the logo watermark in the wavelet frame
domain. After embedding, a verification phase is cast
with the help of a binary watermark and toral
automorphism. At the extraction end, the binary
watermark is first extracted followed by the
verification of watermarked image. The logo watermark
is extracted if and only if the watermarked image is
verified. The security, attack and comparative analysis
confirm high security, efficiency and robustness of the
proposed watermarking system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mou:2013:CBC,
author = "Luntian Mou and Tiejun Huang and Yonghong Tian and
Menglin Jiang and Wen Gao",
title = "Content-based copy detection through multimodal
feature representation and temporal pyramid matching",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "5:1--5:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2542205.2542208",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Content-based copy detection (CBCD) is drawing
increasing attention as an alternative technology to
watermarking for video identification and copyright
protection. In this article, we present a comprehensive
method to detect copies that are subjected to
complicated transformations. A multimodal feature
representation scheme is designed to exploit the
complementarity of audio features, global and local
visual features so that optimal overall robustness to a
wide range of complicated modifications can be
achieved. Meanwhile, a temporal pyramid matching
algorithm is proposed to assemble frame-level
similarity search results into sequence-level matching
results through similarity evaluation over multiple
temporal granularities. Additionally, inverted indexing
and locality sensitive hashing (LSH) are also adopted
to speed up similarity search. Experimental results
over benchmarking datasets of TRECVID 2010 and 2009
demonstrate that the proposed method outperforms other
methods for most transformations in terms of copy
detection accuracy. The evaluation results also suggest
that our method can achieve competitive copy
localization precision.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2013:LSM,
author = "Xiangyu Chen and Yadong Mu and Hairong Liu and
Shuicheng Yan and Yong Rui and Tat-Seng Chua",
title = "Large-scale multilabel propagation based on efficient
sparse graph construction",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "6:1--6:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2542205.2542209",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the popularity of photo-sharing websites, the
number of web images has exploded to an unprecedented
magnitude. Annotating such large-scale data would cost
a huge amount of human effort and is thus
unaffordable. Motivated by this challenging problem, we
propose a novel sparse graph based multilabel
propagation (SGMP) scheme for super large scale
datasets. Both the efficacy and accuracy of the image
annotation are further investigated under different
graph construction strategies, where Gaussian noise and
non-Gaussian sparse noise are simultaneously considered
in the formulations of these strategies. Our proposed
approach outperforms the state-of-the-art algorithms by
focusing on: (1) For large-scale graph construction, a
simple yet efficient LSH (Locality Sensitive
Hashing)-based sparse graph construction scheme is
proposed to speed up the construction. We perform the
multilabel propagation on this hashing-based graph
construction, which is derived with LSH approach
followed by sparse graph construction within the
individual hashing buckets; (2) To further improve the
accuracy, we propose a novel sparsity induced scalable
graph construction scheme, which is based on a general
sparse optimization framework. Sparsity essentially
implies a very strong prior: for large scale
optimization, the values of most variables shall be
zeros when the solution reaches the optimum. By
utilizing this prior, the solutions of large-scale
sparse optimization problems can be derived by solving
a series of much smaller scale subproblems; (3) For
multilabel propagation, different from the traditional
algorithms that propagate over individual label
independently, our proposed propagation first encodes
the label information of an image as a unit label
confidence vector and naturally imposes inter-label
constraints and manipulates labels interactively. Then,
the entire propagation problem is formulated on the
concept of Kullback--Leibler divergence defined on
probabilistic distributions, which guides the
propagation of the supervision information. Extensive
experiments on the benchmark dataset NUS-WIDE with 270k
images and its lite version NUS-WIDE-LITE with 56k
images well demonstrate the effectiveness and
scalability of the proposed multi-label propagation
scheme.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Houle:2013:API,
author = "Michael E. Houle and Vincent Oria and Shin'ichi Satoh
and Jichao Sun",
title = "Annotation propagation in image databases using
similarity graphs",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "7:1--7:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2487736",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The practicality of large-scale image indexing and
querying methods depends crucially upon the
availability of semantic information. The manual
tagging of images with semantic information is in
general very labor intensive, and existing methods for
automated image annotation may not always yield
accurate results. The aim of this paper is to reduce to
a minimum the amount of human intervention required in
the semantic annotation of images, while preserving a
high degree of accuracy. Ideally, only one copy of each
object of interest would be labeled manually, and the
labels would then be propagated automatically to all
other occurrences of the objects in the database. To
this end, we propose an influence propagation strategy,
SW-KProp, that requires no human intervention beyond
the initial labeling of a subset of the images.
SW-KProp distributes semantic information within a
similarity graph defined on all images in the database:
each image iteratively transmits its current label
information to its neighbors, and then readjusts its
own label according to the combined influences of its
neighbors. SW-KProp influence propagation can be
efficiently performed by means of matrix computations,
provided that pairwise similarities of images are
available. We also propose a variant of SW-KProp which
enhances the quality of the similarity graph by
selecting a reduced feature set for each prelabeled
image and rebuilding its neighborhood. The performances
of the SW-KProp method and its variant were evaluated
against several competing methods on classification
tasks for three image datasets: a handwritten digit
dataset, a face dataset and a web image dataset. For
the digit images, SW-KProp and its variant performed
consistently better than the other methods tested. For
the face and web images, SW-KProp outperformed its
competitors for the case when the number of prelabeled
images was relatively small. The performance was seen
to improve significantly when the feature selection
strategy was applied.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mallik:2013:MOR,
author = "Anupama Mallik and Hiranmay Ghosh and Santanu
Chaudhury and Gaurav Harit",
title = "{MOWL}: an ontology representation language for
{Web}-based multimedia applications",
journal = j-TOMCCAP,
volume = "10",
number = "1",
pages = "8:1--8:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2542205.2542210",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Several multimedia applications need to reason with
concepts and their media properties in specific domain
contexts. Media properties of concepts exhibit some
unique characteristics that cannot be dealt with
conceptual modeling schemes followed in the existing
ontology representation and reasoning schemes. We have
proposed a new perceptual modeling technique for
reasoning with media properties observed in multimedia
instances and the latent concepts. Our knowledge
representation scheme uses a causal model of the world
where concepts manifest in media properties with
uncertainties. We introduce a probabilistic reasoning
scheme for belief propagation across domain concepts
through observation of media properties. In order to
support the perceptual modeling and reasoning paradigm,
we propose a new ontology language, Multimedia Web
Ontology Language (MOWL). Our primary contribution in
this article is to establish the need for the new
ontology language and to introduce the semantics of its
novel language constructs. We establish the generality
of our approach with two disparate knowledge-intensive
applications involving reasoning with media properties
of concepts.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Deng:2014:DLB,
author = "Yunhua Deng and Rynson W. H. Lau",
title = "Dynamic load balancing in distributed virtual
environments using heat diffusion",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "16:1--16:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499906",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Distributed virtual environments (DVEs) are attracting
a lot of attention in recent years, due to the
increasing popularity of online gaming and social
networks. As the number of concurrent users of a DVE
increases, a critical problem is on how the workload
among multiple servers can be balanced in order to
maintain real-time performance. Although a number of
load balancing methods have been proposed, they either
produce high-quality load balancing results but are
too slow, or emphasize efficiency at the cost of less
effective load balancing results. In this
article, we propose a new approach to address this
problem based on heat diffusion. Our work has two main
contributions. First, we propose a local and a global
load balancing method for DVEs based on heat
diffusion. Second, we investigate two performance
factors of the proposed methods, the convergence
threshold and the load balancing interval. We have
conducted a number of experiments to extensively
evaluate the performance of the proposed methods. Our
experimental results show that the proposed methods
outperform existing methods in that our methods are
effective in reducing server overloading while at the
same time being efficient.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{She:2014:CID,
author = "James She and Jon Crowcroft and Hao Fu and Flora Li",
title = "Convergence of interactive displays with smart mobile
devices for effective advertising: a survey",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "17:1--17:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2557450",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The trend of replacing public static signages with
digital displays creates opportunities for interactive
display systems, which can be used in collaborative
workspaces, social gaming platforms and advertising.
Based on marketing communication concepts and existing
models for consumer behavior, three stages, namely
attraction, interaction and conation, are defined in
this article to analyze the effectiveness of
interactive display advertising. By reviewing various
methods and strategies employed by existing systems
with attraction, interaction and conation stages, this
article concludes that smart mobile devices should be
integrated as a component to increase the effectiveness
of interactive displays as advertising tools. Future
research challenges related to this topic are also
discussed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gonina:2014:SMC,
author = "Ekaterina Gonina and Gerald Friedland and Eric
Battenberg and Penporn Koanantakool and Michael
Driscoll and Evangelos Georganas and Kurt Keutzer",
title = "Scalable multimedia content analysis on parallel
platforms using {Python}",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "18:1--18:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2517151",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this new era dominated by consumer-produced media
there is a high demand for web-scalable solutions to
multimedia content analysis. A compelling approach to
making applications scalable is to explicitly map their
computation onto parallel platforms. However,
developing efficient parallel implementations and fully
utilizing the available resources remains a challenge
due to the increased code complexity, limited
portability and required low-level knowledge of the
underlying hardware. In this article, we present
PyCASP, a Python-based framework that automatically
maps computation from Python application code onto a
variety of parallel platforms.
PyCASP is designed using a systematic, pattern-oriented
approach to offer a single software development
environment for multimedia content analysis
applications. Using PyCASP, applications can be
prototyped in a couple hundred lines of Python code and
automatically scale to modern parallel processors.
Applications written with PyCASP are portable to a
variety of parallel platforms and efficiently scale
from a single desktop Graphics Processing Unit (GPU) to
an entire cluster with a small change to application
code. To illustrate our approach, we present three
multimedia content analysis applications that use our
framework: a state-of-the-art speaker diarization
application, a content-based music recommendation
system based on the Million Song Dataset, and a video
event detection system for consumer-produced videos. We
show that across this wide range of applications, our
approach achieves the goal of automatic portability and
scalability while at the same time allowing easy
prototyping in a high-level language and efficient
performance of low-level optimized code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chandra:2014:HPM,
author = "Surendar Chandra and John Boreczky and Lawrence A.
Rowe",
title = "High performance many-to-many intranet screen sharing
with {DisplayCast}",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "19:1--19:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2534328",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "DisplayCast is a many to many Intranet screen sharing
system. Its screen capture mechanism creates a sequence
of pixmap images of the screen updates. Prior systems
that used a similar approach were designed to operate
over constrained wide-area networks and did not exploit
the Intranet network conditions to achieve high capture
rates. First we empirically analyzed the screen
contents for a variety of scenarios. We showed that
screen updates were sporadic with long periods of
inactivity. When active, screens were updated at far
higher rates than was supported by earlier systems. The
mismatch was pronounced for interactive scenarios. Even
during active screen updates, the number of updated
pixels was frequently small. We showed that crucial
information could be lost if individual updates were
merged. When the available system resources could not
support high capture rates, we showed ways in which
updates can be effectively collapsed. Next, we
investigated compression mechanisms for streaming these
updates. Even while using a hardware encoder, lossy
compressors such as H.264 were unable to sustain high
frame rates. Though Zlib lossless compression operated
within the latency and compression rate requirements,
the compression efficiency was poor. By analyzing the
screen pixels, we developed a practical transformation
that significantly improved compression rates.
DisplayCast incorporates these observations. It shares
the processor and network resources required for screen
capture, compression and transmission with host
applications whose output needs to be shared.
DisplayCast is agile and uses faster processing
capability to achieve even higher performance. Our
system components operate natively in Windows 7, Mac OS
X and iOS and are deployed in a production setting.
DisplayCast is released under a New BSD License.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lee:2014:NDH,
author = "Ya-Lin Lee and Wen-Hsiang Tsai",
title = "A new data hiding method via revision history records
on collaborative writing platforms",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "20:1--20:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2534408",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A new data hiding method via collaboratively-written
articles with forged revision history records on
collaborative writing platforms is proposed. The hidden
message is camouflaged as a stego-document consisting
of a stego-article and a revision history created
through a simulated process of collaborative writing.
The revisions are forged using a database constructed
by mining word sequences used in real cases from an
English Wikipedia XML dump. Four characteristics of
article revisions are identified and utilized to embed
secret messages, including the author of each revision,
the number of corrected word sequences, the content of
the corrected word sequences, and the word sequences
replacing the corrected ones. Related problems arising
in utilizing these characteristics for data hiding are
identified and solved skillfully, resulting in an
effective multiway method for hiding secret messages
into the revision history. To create more realistic
revisions, Huffman coding based on the word sequence
frequencies collected from Wikipedia is applied to
encode the word sequences. Good experimental results
show the feasibility of the proposed method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yuan:2014:MRB,
author = "Jin Yuan and Yi-Liang Zhao and Huanbo Luan and Meng
Wang and Tat-Seng Chua",
title = "Memory recall based video search: Finding videos you
have seen before based on your memory",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "21:1--21:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2534409",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We often remember images and videos that we have seen
or recorded before but cannot quite recall the exact
venues or details of the contents. We typically have
vague memories of the contents, which can often be
expressed as a textual description and/or rough visual
descriptions of the scenes. Using these vague memories,
we then want to search for the corresponding videos of
interest. We call this ``Memory Recall based Video
Search'' (MRVS). To tackle this problem, we propose a
video search system that permits a user to input
his/her vague and incomplete query as a combination of
text query, a sequence of visual queries, and/or
concept queries. Here, a visual query is often in the
form of a visual sketch depicting the outline of scenes
within the desired video, while each corresponding
concept query depicts a list of visual concepts that
appears in that scene. As the query specified by users
is generally approximate or incomplete, we need to
develop techniques to handle this inexact and
incomplete specification by also leveraging user
feedback to refine the specification. We utilize
several innovative approaches to enhance the automatic
search. First, we employ a visual query suggestion
model to automatically suggest potential visual
features to users as better queries. Second, we utilize
a color similarity matrix to help compensate for
inexact color specification in visual queries. Third,
we leverage the ordering of visual queries and/or
concept queries to rerank the results by using a greedy
algorithm. Moreover, as the query is inexact and there
is likely to be only one or few possible answers, we
incorporate an interactive feedback loop to permit the
users to label related samples which are visually
similar or semantically close to the relevant sample.
Based on the labeled samples, we then propose
optimization algorithms to update visual queries and
concept weights to refine the search results. We
conduct experiments on two large-scale video datasets:
TRECVID 2010 and YouTube. The experimental results
demonstrate that our proposed system is effective for
MRVS tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2014:MIK,
author = "Xianglong Liu and Yadong Mu and Bo Lang and Shih-Fu
Chang",
title = "Mixed image-keyword query adaptive hashing over
multilabel images",
journal = j-TOMCCAP,
volume = "10",
number = "2",
pages = "22:1--22:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2540990",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 13 07:37:57 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article defines a new hashing task motivated by
real-world applications in content-based image
retrieval, that is, effective data indexing and
retrieval given mixed query (query image together with
user-provided keywords). Our work is distinguished from
state-of-the-art hashing research by two unique
features: (1) Unlike conventional image retrieval
systems, the input query is a combination of an
exemplar image and several descriptive keywords, and
(2) the input image data are often associated with
multiple labels. This assumption is more consistent
with realistic scenarios. The mixed
image-keyword query significantly extends traditional
image-based query and better explicates the user
intention. Meanwhile it complicates semantics-based
indexing on the multilabel data. Though several
existing hashing methods can be adapted to solve the
indexing task, unfortunately they all prove to suffer
from low effectiveness. To enhance the hashing
efficiency, we propose a novel scheme ``boosted shared
hashing''. Unlike prior works that learn the hashing
functions on either all image labels or a single label,
we observe that the hashing function can be more
effective if it is designed to index over an optimal
label subset. In other words, the association between
labels and hash bits are moderately sparse. The
sparsity of the bit-label association indicates greatly
reduced computation and storage complexities for
indexing a new sample, since only a limited number of
hashing functions will become active for the specific
sample. We develop a Boosting style algorithm for
simultaneously optimizing both the optimal label
subsets and hashing functions in a unified formulation,
and further propose a query-adaptive retrieval
mechanism based on hash bit selection for mixed
queries, no matter whether or not the query words exist
in the training data. Moreover, we show that the
proposed method can be easily extended to the case
where the data similarity is gauged by nonlinear kernel
functions. Extensive experiments are conducted on
standard image benchmarks like CIFAR-10, NUS-WIDE and
a-TRECVID. The results validate both the sparsity of
the bit-label association and the convergence of the
proposed algorithm, and demonstrate that the proposed
hashing scheme achieves substantially superior
performances over state-of-the-art methods under the
same hash bit budget.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Anonymous:2014:TCO,
author = "Anonymous",
title = "Table of Contents: Online Supplement Volume 10, Number
1s",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "22:1--22:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2602969",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2014:DUB,
author = "Ning Liu and Huajie Cui and S.-H. Gary Chan and
Zhipeng Chen and Yirong Zhuang",
title = "Dissecting User Behaviors for a Simultaneous Live and
{VoD IPTV} System",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "23:1--23:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2568194",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "IPTV services deployed nowadays often consist of both
live TV and Video-on-Demand (VoD), offered by the same
service provider to the same pool of users over the
same managed network. Understanding user behaviors in
such a setting is hence an important step for system
modelling and optimization. Previous studies on user
behavior on video services were on either live TV or
VoD. For the first time, we conduct an in-depth
large-scale behavior study of IPTV users who are offered
both live TV and VoD choices at the same
time. Our data is from the largest IPTV service
provider in China, offering hundreds of live channels
and hundreds of thousands of VoD files, with traces
covering more than 1.9 million users over a period of 5
months. This large dataset provides us a unique
opportunity to cross-compare user viewing behaviors for
these services on the same platform, and sheds valuable
insights on how users interact with such a simultaneous
system. Our results lead to new understanding on IPTV
user behaviors which have strong implications on system
design. For example, we find that the average holding
time for VoD is significantly longer than live TV. live
TV users tend to surf more. However, if such channel
surfing is discounted, the holding times of both
services are not much different. While users in VoD
tend to view HD longer, channel popularity for live TV
is much less dependent on its video quality. In
contrast to some popular assumptions on user
interactivity, the transitions among live TV, VoD, and
offline modes are far from a Markov model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gaeta:2014:DDI,
author = "Rossano Gaeta and Marco Grangetto and Lorenzo Bovio",
title = "{DIP}: {Distributed Identification of Polluters} in
{P2P} Live Streaming",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "24:1--24:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2568223",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Peer-to-peer live streaming applications are
vulnerable to malicious actions of peers that
deliberately modify data to decrease or prevent the
fruition of the media (pollution attack). In this
article we propose DIP, a fully distributed, accurate,
and robust algorithm for the identification of
polluters. DIP relies on checks that are computed by
peers upon completing reception of all blocks composing
a data chunk. A check is a special message that
contains the set of peer identifiers that provided
blocks of the chunk as well as a bit to signal if the
chunk has been corrupted. Checks are periodically
transmitted by peers to their neighbors in the overlay
network; peers receiving checks use them to maintain a
factor graph. This graph is bipartite and an
incremental belief propagation algorithm is run on it
to compute the probability of a peer being a polluter.
Using a prototype deployed over PlanetLab, we show by
extensive experimentation that DIP allows honest peers
to identify polluters with very high accuracy and
completeness, even when polluters collude to deceive
them. Furthermore, we show that DIP is efficient,
requiring low computational, communication, and storage
overhead at each peer.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hoque:2014:SEM,
author = "Mohammad Asharful Hoque and Matti Siekkinen and Jukka
K. Nurminen and Sasu Tarkoma and Mika Aalto",
title = "Saving Energy in Mobile Devices for On-Demand
Multimedia Streaming --- A Cross-Layer Approach",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "25:1--25:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2556942",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes a novel energy-efficient
multimedia delivery system called EStreamer. First, we
study the relationship between buffer size at the
client, burst-shaped TCP-based multimedia traffic, and
energy consumption of wireless network interfaces in
smartphones. Based on the study, we design and
implement EStreamer for constant bit rate and
rate-adaptive streaming. EStreamer can improve battery
lifetime by 3x, 1.5x, and 2x while streaming over
Wi-Fi, 3G, and 4G, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2014:HEK,
author = "Feng Wang and Wan-Lei Zhao and Chong-Wah Ngo and
Bernard Merialdo",
title = "A {Hamming} Embedding Kernel with Informative
Bag-of-Visual Words for Video Semantic Indexing",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "26:1--26:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535938",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose a novel Hamming embedding
kernel with informative bag-of-visual words to address
two main problems existing in traditional BoW
approaches for video semantic indexing. First, Hamming
embedding is employed to alleviate the information loss
caused by SIFT quantization. The Hamming distances
between keypoints in the same cell are calculated and
integrated into the SVM kernel to better discriminate
different image samples. Second, to highlight the
concept-specific visual information, we propose to
weight the visual words according to their
informativeness for detecting specific concepts. We
show that our proposed kernels can significantly
improve the performance of concept detection.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2014:MDF,
author = "Ying Yang and Ioannis Ivrissimtzis",
title = "Mesh Discriminative Features for {$3$D} Steganalysis",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "27:1--27:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2535555",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We propose a steganalytic algorithm for triangle
meshes, based on the supervised training of a
classifier by discriminative feature vectors. After a
normalization step, the triangle mesh is calibrated by
one step of Laplacian smoothing and then a feature
vector is computed, encoding geometric information
corresponding to vertices, edges and faces. For a given
steganographic or watermarking algorithm, we create a
training set containing unmarked meshes and meshes
marked by that algorithm, and train a classifier using
Quadratic Discriminant Analysis. The performance of the
proposed method was evaluated on six well-known
watermarking/steganographic schemes with satisfactory
accuracy rates.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hamam:2014:QEM,
author = "Abdelwahab Hamam and Abdulmotaleb {El Saddik} and
Jihad Alja'am",
title = "A Quality of Experience Model for Haptic Virtual
Environments",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "28:1--28:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2540991",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Haptic-based Virtual Reality (VR) applications have
many merits. What is still obscure, from the
perspective of the designers of these applications, is
the experience users will undergo when they use the VR
system.
Quality of Experience (QoE) is an evaluation metric
from the user's perspective that unfortunately has
received limited attention from the research community.
Assessing the QoE of VR applications reflects the
amount of overall satisfaction and benefits gained from
the application in addition to laying the foundation
for ideal user-centric design in the future. In this
article, we propose a taxonomy for the evaluation of
QoE for multimedia applications and in particular VR
applications. We model this taxonomy using a Fuzzy
Logic Inference System (FIS) to quantitatively measure
the QoE of haptic virtual environments. We build and
test our FIS by conducting a user study to evaluate the
QoE of a haptic game application. Our results
demonstrate that the proposed FIS model closely
reflects the user's estimation of the application's
quality, with low error, and hence is well suited for
QoE evaluation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Botta:2014:PCI,
author = "Marco Botta and Davide Cavagnino and Victor Pomponiu",
title = "Protecting the Content Integrity of Digital Imagery
with Fidelity Preservation: An Improved Version",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "29:1--29:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2568224",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Fragile watermarking has attracted a lot of attention
in the last decade. An interesting approach, presented
in 2011 by Lin et al., results in very high quality of
the watermarked images. However, after a thorough
examination of the paper, a few improvements are
proposed in our revised version of the algorithm in
order to overcome some shortcomings. In particular,
changes to the pseudocode and modifications to deal
with pixel saturation are suggested, along with a way
to improve the scheme security. Finally, a deeper
analysis of the security is presented.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Luo:2014:ICH,
author = "Da Luo and Weiqi Luo and Rui Yang and Jiwu Huang",
title = "Identifying Compression History of Wave Audio and Its
Applications",
journal = j-TOMCCAP,
volume = "10",
number = "3",
pages = "30:1--30:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2575978",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Apr 15 12:20:53 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Audio signal is sometimes stored and/or processed in
WAV (waveform) format without any knowledge of its
previous compression operations. To perform some
subsequent processing, such as digital audio forensics,
audio enhancement and blind audio quality assessment,
it is necessary to identify its compression history. In
this article, we will investigate how to identify a
decompressed wave audio that went through one of three
popular compression schemes, including MP3, WMA
(windows media audio) and AAC (advanced audio coding).
By analyzing the corresponding frequency coefficients,
including modified discrete cosine transform (MDCT) and
Mel-frequency cepstral coefficients (MFCCs), of those
original audio clips and their decompressed versions
with different compression schemes and bit rates, we
propose several statistics to identify the compression
scheme as well as the corresponding bit rate previously
used for a given WAV signal. The experimental results
evaluated on 8,800 audio clips with various contents
have shown the effectiveness of the proposed method. In
addition, some potential applications of the proposed
method are discussed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2014:CDM,
author = "Tianzhu Zhang and Changsheng Xu",
title = "Cross-Domain Multi-Event Tracking via {CO-PMHT}",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "31:1--31:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2602633",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the massive growth of events on the Internet,
efficient organization and monitoring of events become
a practical challenge. To deal with this problem, we
propose a novel CO-PMHT (CO-Probabilistic
Multi-Hypothesis Tracking) algorithm for cross-domain
multi-event tracking to obtain their informative
summary details and evolutionary trends over time. We
collect a large-scale dataset by searching keywords on
two domains (Google News and Flickr) and downloading
both images and textual content for an event. Given the
input data, our algorithm can track multiple events in
the two domains collaboratively and boost the tracking
performance. Specifically, the bridge between two
domains is a semantic posterior probability that
avoids the domain gap. After tracking, we can visualize
the whole evolutionary process of the event over time
and mine the semantic topics of each event for deep
understanding and event prediction. The extensive
experimental evaluations on the collected dataset well
demonstrate the effectiveness of the proposed algorithm
for cross-domain multi-event tracking.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2014:PVR,
author = "Qinghua Huang and Bisheng Chen and Jingdong Wang and
Tao Mei",
title = "Personalized Video Recommendation through Graph
Propagation",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "32:1--32:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2598779",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The rapid growth of the number of videos on the
Internet provides enormous potential for users to find
content of interest. However, the vast quantity of
videos also turns the finding process into a difficult
task. In this article, we address the problem of
providing personalized video recommendation for users.
Rather than only exploring the user-video bipartite
graph that is formulated using click information, we
first combine the click and query information to
build a tripartite graph. In the tripartite graph, the
query nodes act as bridges to connect user nodes and
video nodes. Then, to further enrich the connections
between users and videos, three subgraphs between the
same kinds of nodes are added to the tripartite graph
by exploring content-based information (video tags and
textual queries). We propose an iterative propagation
algorithm over the enhanced graph to compute the
preference information of each user. Experiments
conducted on a dataset with 1,369 users, 8,765 queries,
and 17,712 videos collected from a commercial video
search engine demonstrate the effectiveness of the
proposed method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2014:UVS,
author = "Haitao Li and Xu Cheng and Jiangchuan Liu",
title = "Understanding Video Sharing Propagation in Social
Networks: Measurement and Analysis",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "33:1--33:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2594440",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Modern online social networking has drastically
changed the information distribution landscape.
Recently, video has become one of the most important
types of objects spreading among social networking
service users. The sheer and ever-increasing data
volume, the broader coverage, and the longer access
durations of video objects, however, present
significantly more challenges than other types of
objects. This article takes an initial step toward
understanding the unique characteristics of video
sharing propagation in social networks. Based on
real-world data traces from a large-scale online social
network, we examine the user behavior from diverse
aspects and identify different types of users involved
in video propagation. We closely investigate the
temporal distribution during propagation as well as the
typical propagation structures, revealing more details
beyond stationary coverage. We further extend the
conventional epidemic models to accommodate diverse
types of users and their probabilistic viewing and
sharing behaviors. The model, effectively capturing the
essentials of the propagation process, serves as a
valuable basis for such applications as workload
synthesis, traffic prediction, and resource provision
of video servers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2014:BCM,
author = "Zhiyu Wang and Peng Cui and Lexing Xie and Wenwu Zhu
and Yong Rui and Shiqiang Yang",
title = "Bilateral Correspondence Model for Words-and-Pictures
Association in Multimedia-Rich Microblogs",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "34:1--34:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2611388",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Nowadays, the amount of multimedia contents in
microblogs is growing significantly. More than 20\% of
microblogs link to a picture or video in certain large
systems. The rich semantics in microblogs provides an
opportunity to endow images with higher-level semantics
beyond object labels. However, this raises new
challenges for understanding the association between
multimodal multimedia contents in multimedia-rich
microblogs. Disobeying the fundamental assumptions of
traditional annotation, tagging, and retrieval systems,
pictures and words in multimedia-rich microblogs are
loosely associated and a correspondence between
pictures and words cannot be established. To address
the aforementioned challenges, we present the first
study analyzing and modeling the associations between
multimodal contents in microblog streams, aiming to
discover multimodal topics from microblogs by
establishing correspondences between pictures and words
in microblogs. We first use a data-driven approach to
analyze the new characteristics of the words, pictures,
and their association types in microblogs. We then
propose a novel generative model called the Bilateral
Correspondence Latent Dirichlet Allocation (BC-LDA)
model. Our BC-LDA model can assign flexible
associations between pictures and words, allowing not
only picture-word co-occurrence in both directions but
also single-modality association. This flexible
association can best fit the
data distribution, so that the model can discover
various types of joint topics and generate pictures and
words with the topics accordingly. We evaluate this
model extensively on a large-scale real multimedia-rich
microblogs dataset. We demonstrate the advantages of
the proposed model in several application scenarios,
including image tagging, text illustration, and topic
discovery. The experimental results demonstrate that
our proposed model can significantly and consistently
outperform traditional approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lei:2014:FND,
author = "Yanqiang Lei and Guoping Qiu and Ligang Zheng and Jiwu
Huang",
title = "Fast Near-Duplicate Image Detection Using Uniform
Randomized Trees",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "35:1--35:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2602186",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Indexing structure plays an important role in the
application of fast near-duplicate image detection,
since it can narrow down the search space. In this
article, we develop a cluster of uniform randomized
trees (URTs) as an efficient indexing structure to
perform fast near-duplicate image detection. The main
contribution in this article is that we introduce
``uniformity'' and ``randomness'' into the indexing
construction. The uniformity requires classifying the
object images into subsets of the same scale. Such a
decision makes good use of the two facts in
near-duplicate image detection, namely: (1) the number
of categories is huge; (2) a single category usually
contains only a small number of images. Therefore, the
uniform distribution is very beneficial to narrow down
the search space and does not significantly degrade the
detection accuracy. The randomness is embedded into the
generation of feature subspace and projection
direction, improving the flexibility of indexing
construction. The experimental results show that the
proposed method is more efficient than the popular
locality-sensitive hashing and more stable and flexible
than the traditional KD-tree.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yeh:2014:PPR,
author = "Che-Hua Yeh and Brian A. Barsky and Ming Ouhyoung",
title = "Personalized Photograph Ranking and Selection System
Considering Positive and Negative User Feedback",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "36:1--36:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2584105",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose a novel personalized
ranking system for amateur photographs. The proposed
framework treats the photograph assessment as a ranking
problem and we introduce the idea of personalized
ranking, which ranks photographs considering both their
aesthetic qualities and personal preferences.
Photographs are described using three types of
features: photo composition, color and intensity
distribution, and personalized features. An aesthetic
prediction model is learned from labeled photographs by
using the proposed image features and RBF-ListNet
learning algorithm. The experimental results show that
the proposed framework excels in ranking
performance: a Kendall's tau value of 0.432 is
significantly higher than those obtained by the
features proposed in one of the state-of-the-art
approaches (0.365) and by learning based on support
vector regression (0.384). To realize personalization
in ranking, three approaches are proposed: the
feature-based approach allows users to select
photographs with specific rules, the example-based
approach takes the positive feedback from users to
rerank the photographs, and the list-based approach
takes both positive and negative feedback from users
into consideration. User studies indicate that all
three approaches are effective in both aesthetic and
personalized ranking.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tan:2014:PVS,
author = "Song Tan and Yu-Gang Jiang and Chong-Wah Ngo",
title = "Placing Videos on a Semantic Hierarchy for Search
Result Navigation",
journal = j-TOMM,
volume = "10",
number = "4",
pages = "37:1--37:??",
month = jun,
year = "2014",
DOI = "https://doi.org/10.1145/2578394",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 8 11:32:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Organizing video search results in a list view is
widely adopted by current commercial search engines,
which cannot support efficient browsing for complex
search topics that have multiple semantic facets. In
this article, we propose to organize video search
results in a highly structured way. Specifically,
videos are placed on a semantic hierarchy that
accurately organizes various facets of a given search
topic. To pick the most suitable videos for each node
of the hierarchy, we define and utilize three important
criteria: relevance, uniqueness, and diversity.
Extensive evaluations on a large YouTube video dataset
demonstrate the effectiveness of our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Steinmetz:2014:EN,
author = "Ralf Steinmetz",
title = "Editorial Note",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "1:1--1:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2634234",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2014:SBA,
author = "Yong-Jin Liu and Cui-Xia Ma and Qiufang Fu and Xiaolan
Fu and Sheng-Feng Qin and Lexing Xie",
title = "A Sketch-Based Approach for Interactive Organization
of Video Clips",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "2:1--2:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2645643",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the rapid growth of video resources, techniques
for efficient organization of video clips are becoming
appealing in the multimedia domain. In this article, a
sketch-based approach is proposed to intuitively
organize video clips by: (1) enhancing their narrations
using sketch annotations and (2) structuring the
organization process by gesture-based free-form
sketching on touch devices. There are two main
contributions of this work. The first is a sketch
graph, a novel representation for the narrative
structure of video clips to facilitate content
organization. The second is a method to perform
context-aware sketch recommendation scalable to large
video collections, enabling common users to easily
organize sketch annotations. A prototype system
integrating the proposed approach was evaluated on the
basis of five different aspects concerning its
performance and usability. Two sketch searching
experiments showed that the proposed context-aware
sketch recommendation outperforms, in terms of accuracy
and scalability, two state-of-the-art sketch searching
methods. Moreover, a user study showed that the sketch
graph is consistently preferred over traditional
representations such as keywords and keyframes. The
second user study showed that the proposed approach is
applicable in those scenarios where the video annotator
and organizer were the same person. The third user
study showed that, for video content organization,
users of the sketch graph took on average 1/3 less time
than with the mass-market tool Movie Maker and on
average 1/4 less time than with a state-of-the-art
sketch alternative. These results demonstrated that the
proposed sketch graph approach is a promising video
organization tool.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2014:CSA,
author = "Junshi Huang and Si Liu and Junliang Xing and Tao Mei
and Shuicheng Yan",
title = "Circle \& Search: Attribute-Aware Shoe Retrieval",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "3:1--3:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632165",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Taking the shoe as a concrete example, we present an
innovative product retrieval system that leverages
object detection and retrieval techniques to support a
brand-new online shopping experience in this article.
The system, called Circle \& Search, enables users to
naturally indicate any preferred product by simply
circling the product in images as the visual query, and
then returns visually and semantically similar products
to the users. The system is characterized by
introducing attributes in both the detection and
retrieval of the shoe. Specifically, we first develop
an attribute-aware part-based shoe detection model. By
maintaining the consistency between shoe parts and
attributes, this shoe detector has the ability to model
high-order relations between parts and thus the
detection performance can be enhanced. Meanwhile, the
attributes of this detected shoe can also be predicted
as the semantic relations between parts. Based on the
result of shoe detection, the system ranks all the
shoes in the repository using an attribute refinement
retrieval model that takes advantage of query-specific
information and attribute correlation to provide an
accurate and robust shoe retrieval. To evaluate this
retrieval system, we build a large dataset with 17,151
shoe images, in which each shoe is annotated with 10
shoe attributes (e.g., heel height, heel shape, and
sole shape). According to the experimental results and
the user study, our Circle \& Search system achieves
promising shoe retrieval performance and thus
significantly improves the users' online shopping
experience.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Guan:2014:TAV,
author = "Genliang Guan and Zhiyong Wang and Shaohui Mei and Max
Ott and Mingyi He and David Dagan Feng",
title = "A Top-Down Approach for Video Summarization",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "4:1--4:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632267",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "While most existing video summarization approaches aim
to identify important frames of a video from either a
global or local perspective, we propose a top-down
approach consisting of scene identification and scene
summarization. For scene identification, we represent
each frame with global features and utilize a scalable
clustering method. We then formulate scene
summarization as choosing those frames that best cover
a set of local descriptors with minimal redundancy. In
addition, we develop a visual word-based approach to
make our approach more computationally scalable.
Experimental results on two benchmark datasets
demonstrate that our proposed approach clearly
outperforms the state-of-the-art.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pazzi:2014:PPP,
author = "Richard W. Pazzi and Azzedine Boukerche",
title = "{PROPANE}: a Progressive Panorama Streaming Protocol
to Support Interactive {$3$D} Virtual Environment
Exploration on Graphics-Constrained Devices",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "5:1--5:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2602222",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Image-Based Rendering (IBR) has become widely known by
its relatively low requirements for generating new
scenes based on a sequence of reference images. This
characteristic of IBR shows a remarkable potential
impact in rendering complex 3D virtual environments on
graphics-constrained devices, such as head-mounted
displays, set-top boxes, media streaming devices, and
so on. If well exploited, IBR coupled with remote
rendering would enable the exploration of complex
virtual environments on these devices. However, remote
rendering requires the transmission of a large volume
of images. In addition, existing solutions consider
limited and/or deterministic navigation schemes as a
means of decreasing the volume of streamed data. This
article proposes the PROgressive PANorama StrEaming
protocol (PROPANE) to offer users a smoother virtual
navigation experience by prestreaming the imagery data
required to generate new views as the user wanders
within a 3D environment. PROPANE is based on a very
simple yet effective trigonometry model and uses a
strafe (lateral movement) technique to minimize the
delay between image updates at the client end. This
article introduces the concept of key partial
panoramas, namely panorama segments that cover
movements in any direction by simply strafing from an
appropriate key partial panorama and streaming only the
lost pixels. Therefore, PROPANE can provide a
constrained device with sufficient imagery data to
cover the user's future viewpoints, thereby minimizing
the impact of transmission delay and jitter. PROPANE
has been implemented and compared to two baseline
remote rendering schemes. The evaluation results show
that the proposed technique outperforms the selected
and closely related existing schemes by minimizing the
response time while not limiting the user to predefined
paths as opposed to previous protocols.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2014:FEM,
author = "Xiangyu Wang and Yong Rui and Mohan Kankanhalli",
title = "{Up-Fusion}: an Evolving Multimedia Fusion Method",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "6:1--6:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2611777",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The amount of multimedia data on the Internet has
increased exponentially in the past few decades and
this trend is likely to continue. Multimedia content
inherently has multiple information sources, therefore
effective fusion methods are critical for data analysis
and understanding. So far, most of the existing fusion
methods are static with respect to time, making it
difficult for them to handle the evolving multimedia
content. To address this issue, in recent years,
several evolving fusion methods have been proposed; however,
their requirements are difficult to meet, making them
useful only in limited applications. In this article,
we propose a novel evolving fusion method based on the
online portfolio selection theory. The proposed method
takes into account the correlation among different
information sources and evolves the fusion model when
new multimedia data is added. It performs effectively
on both crisp and soft decisions without requiring
additional context information. Extensive experiments
on concept detection and human detection tasks over the
TRECVID dataset and surveillance data have been
conducted and significantly better performance has been
obtained.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2014:EIP,
author = "Xinxi Wang and Yi Wang and David Hsu and Ye Wang",
title = "Exploration in Interactive Personalized Music
Recommendation: a Reinforcement Learning Approach",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "7:1--7:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2623372",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Current music recommender systems typically act in a
greedy manner by recommending songs with the highest
user ratings. Greedy recommendation, however, is
suboptimal over the long term: it does not actively
gather information on user preferences and fails to
recommend novel songs that are potentially interesting.
A successful recommender system must balance the needs
to explore user preferences and to exploit this
information for recommendation. This article presents a
new approach to music recommendation by formulating
this exploration-exploitation trade-off as a
reinforcement learning task. To learn user preferences,
it uses a Bayesian model that accounts for both audio
content and the novelty of recommendations. A
piecewise-linear approximation to the model and a
variational inference algorithm help to speed up
Bayesian inference. One additional benefit of our
approach is a single unified model for both music
recommendation and playlist generation. We demonstrate
the strong potential of the proposed approach with
simulation results and a user study.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Katti:2014:OEE,
author = "Harish Katti and Anoop Kolar Rajagopal and Mohan
Kankanhalli and Ramakrishnan Kalpathi",
title = "Online Estimation of Evolving Human Visual Interest",
journal = j-TOMM,
volume = "11",
number = "1",
pages = "8:1--8:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632284",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Sep 1 12:38:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Regions in video streams attracting human interest
contribute significantly to human understanding of the
video. Being able to predict salient and informative
Regions of Interest (ROIs) through a sequence of eye
movements is a challenging problem. Applications such
as content-aware retargeting of videos to different
aspect ratios while preserving informative regions and
smart insertion of dialog (closed-caption text) into
the video stream can be significantly improved
using the predicted ROIs. We propose an interactive
human-in-the-loop framework to model eye movements and
predict visual saliency in yet-unseen frames. Eye
tracking and video content are used to model visual
attention in a manner that accounts for important
eye-gaze characteristics such as temporal
discontinuities due to sudden eye movements, noise, and
behavioral artifacts. A novel statistical and
algorithmic method, gaze buffering, is proposed for
eye-gaze analysis and its fusion with content-based
features. Our robust saliency prediction is
instantiated for two challenging and exciting
applications. The first application alters video aspect
ratios on-the-fly using content-aware video
retargeting, thus making them suitable for a variety of
display sizes. The second application dynamically
localizes active speakers and places dialog captions
on-the-fly in the video stream. Our method ensures that
dialogs are faithful to active speaker locations and do
not interfere with salient content in the video stream.
Our framework naturally accommodates personalisation of
the application to suit biases and preferences of
individual users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghinea:2014:ISI,
author = "Gheorghita Ghinea and Christian Timmerer and Weisi Lin
and Stephen Gulliver",
title = "Introduction to Special Issue on Multiple Sensorial
{(MulSeMedia)} Multimodal Media: Advances and
Applications",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "9:1--9:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661333",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lv:2014:MHF,
author = "Zhihan Lv and Alaa Halawani and Shengzhong Feng and
Haibo Li and Shafiq Ur R{\'e}hman",
title = "Multimodal Hand and Foot Gesture Interaction for
Handheld Devices",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "10:1--10:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2645860",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We present a hand-and-foot-based multimodal
interaction approach for handheld devices. Our method
combines input modalities (i.e., hand and foot) and
provides a coordinated output to both modalities along
with audio and video. Human foot gesture is detected
and tracked using contour-based template detection
(CTD) and Tracking-Learning-Detection (TLD) algorithm.
3D foot pose is estimated from the passive homography
matrix of the camera. 3D stereoscopic rendering and
vibrotactile feedback are used to enhance the immersive
feeling. We developed a multimodal football game based
on the multimodal approach as a proof of concept. We
confirm our system's user satisfaction through a user
study.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Prasad:2014:DVC,
author = "Manoj Prasad and Murat Russell and Tracy A. Hammond",
title = "Designing Vibrotactile Codes to Communicate Verb
Phrases",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "11:1--11:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637289",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Soldiers, to guard themselves from enemy assault, have
to maintain visual and auditory awareness of their
environment. Their visual and auditory senses are thus
saturated. This makes these channels less usable for
communication. The tactile medium of communication with
users is appropriate for displaying information in such
situations. Research in interpersonal communication
among soldiers shows that the most common form of
communication between soldiers involves the use of verb
phrases. In this article, we have developed a
three-by-three tactile display and proposed a method
for mapping the components of a verb phrase to two
dimensions of tactile codes: shape and waveform.
Perception of tactile codes by users depends on the
ability of users to distinguish shape and waveform of
the code. We have proposed a measure to rate the
distinguishability of any two shapes and created a
graph-based user-centric model using this measure to
select distinguishable shapes from a set of all
presentable shapes. We conducted two user studies to
evaluate the ability of users to perceive tactile
information. The results from our first study showed
users' ability to perceive tactile shapes, tactile
waveforms, and form verb phrases from tactile codes.
The recognition accuracy and time taken to distinguish
were better when the shapes were selected from the
graph model than when shapes were chosen based on
intuition. The second user study was conducted to test
the performance of users while performing a primary
visual task simultaneously with a secondary audio or
haptic task. Users were more familiar with perceiving
information from an auditory medium than from a haptic
medium, which was reflected in their performance. Thus
the performance of users in the primary visual task was
better while using an audio medium of communication
than while using a haptic medium of communication.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Murray:2014:MSE,
author = "Niall Murray and Brian Lee and Yuansong Qiao and
Gabriel-Miro Muntean",
title = "Multiple-Scent Enhanced Multimedia Synchronization",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "12:1--12:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637293",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This study looked at users' perception of interstream
synchronization between audiovisual media and two
olfactory streams. The ability to detect skews and the
perception and impact of skews on user Quality of
Experience (QoE) is analyzed. The olfactory streams are
presented with the same skews (i.e., delay) and with
variable skews (i.e., jitter and mix of scents). This
article reports the limits beyond which
desynchronization reduces user-perceived quality
levels. Also, a minimum gap between the presentations
of consecutive scents is identified, which is necessary
to ensure enhanced user-perceived quality. There is no
evidence (not considering scent type) that overlapping
or mixing of scents increases user QoE levels for
olfaction-enhanced multimedia.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kroupi:2014:ECP,
author = "Eleni Kroupi and Ashkan Yazdani and Jean-Marc Vesin
and Touradj Ebrahimi",
title = "{EEG} Correlates of Pleasant and Unpleasant Odor
Perception",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "13:1--13:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637287",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Olfaction-enhanced multimedia experience is becoming
vital for strengthening the sensation of reality and
the quality of user experience. One approach to
investigate olfactory perception is to analyze the
alterations in brain activity during stimulation with
different odors. In this article, the changes in the
electroencephalogram (EEG) when perceiving
hedonically-different odors are studied. Results of
within and across-subject analysis are presented. We
show that EEG-based odor classification using brain
activity is possible and can be used to automatically
recognize odor pleasantness when a subject-specific
classifier is trained. However, it is a challenging
problem to design a generic classifier.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rainer:2014:GUM,
author = "Benjamin Rainer and Christian Timmerer",
title = "A Generic Utility Model Representing the Quality of
Sensory Experience",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "14:1--14:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2648429",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Current QoE research is mainly focusing on single
modalities (audio, visual) or combinations thereof. In
our research, we propose annotating traditional
multimedia content with additional sensory effects,
such as ambient light, vibration, wind, and olfaction,
which could potentially stimulate all human senses.
Investigating the influence of individual sensory
effects and combinations thereof is important in order
to understand how these individual sensory effects
influence the Quality of Experience (QoE) as a whole.
In this article, we describe the results of such a
subjective quality assessment of audio-visual sequences
which are annotated with additional sensory effects
such as ambient light, wind, and vibration using the
MPEG-V standard. The results of this assessment allow
us to derive a utility model representing the Quality
of Sensory Experience (QuaSE) complementary to existing
QoE models described in terms of Quality of Service
(QoS) parameters. For validating our proposed utility
model, we provide an example instantiation and validate
it against results of subjective quality assessments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yuan:2014:UQE,
author = "Zhenhui Yuan and Shengyang Chen and Gheorghita Ghinea
and Gabriel-Miro Muntean",
title = "User Quality of Experience of Mulsemedia
Applications",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "15:1--15:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661329",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "User Quality of Experience (QoE) is of fundamental
importance in multimedia applications and has been
extensively studied for decades. However, user QoE in
the context of the emerging multiple-sensorial media
(mulsemedia) services, which involve different media
components than traditional multimedia applications,
has not been comprehensively studied.
This article presents the results of subjective tests
which have investigated user perception of mulsemedia
content. In particular, the impact of intensity of
certain mulsemedia components including haptic and
airflow on user-perceived experience are studied.
Results demonstrate that by making use of mulsemedia
the overall user enjoyment levels increased by up to
77\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Luque:2014:IMS,
author = "Francisco Pedro Luque and Iris Galloso and Claudio
Feijoo and Carlos Alberto Mart{\'\i}n and Guillermo
Cisneros",
title = "Integration of Multisensorial Stimuli and Multimodal
Interaction in a Hybrid {$3$DTV} System",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "16:1--16:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617992",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes the integration of
multisensorial stimuli and multimodal interaction
components into a sports multimedia asset under two
dimensions: immersion and interaction. The first
dimension comprises a binaural audio system and a set
of sensory effects synchronized with the audiovisual
content, whereas the second explores interaction
through the insertion of interactive 3D objects into
the main screen and on-demand presentation of
additional information in a second touchscreen. We
present an end-to-end solution integrating these
components into a hybrid (internet-broadcast)
television system using current 3DTV standards. Results
from an experimental study analyzing the perceived
quality of these stimuli and their influence on the
Quality of Experience are presented.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ghinea:2014:MSA,
author = "Gheorghita Ghinea and Christian Timmerer and Weisi Lin
and Stephen R. Gulliver",
title = "Mulsemedia: State of the Art, Perspectives, and
Challenges",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "17:1--17:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2617994",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mulsemedia-multiple sensorial media-captures a wide
variety of research efforts and applications. This
article presents a historic perspective on mulsemedia
work and reviews current developments in the area.
These take place across the traditional multimedia
spectrum-from virtual reality applications to computer
games-as well as efforts in the arts, gastronomy, and
therapy, to mention a few. We also describe
standardization efforts, via the MPEG-V standard, and
identify future developments and exciting challenges
the community needs to overcome.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zha:2014:ISI,
author = "Zheng-Jun Zha and Lei Zhang and Max M{\"u}hlh{\"a}user
and Alan F. Smeaton",
title = "Introduction to the Special Issue Best Papers of {ACM
Multimedia 2013}",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "18:1--18:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661331",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Fang:2014:DGI,
author = "Quan Fang and Jitao Sang and Changsheng Xu",
title = "Discovering Geo-Informative Attributes for Location
Recognition and Exploration",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "19:1--19:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2648581",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article considers the problem of automatically
discovering geo-informative attributes for location
recognition and exploration. The attributes are
expected to be both discriminative and representative,
which correspond to certain distinctive visual patterns
and associate with semantic interpretations. For our
solution, we analyze the attribute at the region level.
Each segmented region in the training set is assigned a
binary latent variable indicating its discriminative
capability. A latent learning framework is proposed for
discriminative region detection and geo-informative
attribute discovery. Moreover, we use user-generated
content to obtain the semantic interpretation for the
discovered visual attributes. Discriminative and
search-based attribute annotation methods are developed
for geo-informative attribute interpretation. The
proposed approach is evaluated on a challenging
dataset including Google Street View and Flickr photos.
Experimental results show that (1) geo-informative
attributes are discriminative and useful for location
recognition; (2) the discovered semantic interpretation
is meaningful and can be exploited for further location
exploration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2014:WYB,
author = "Luoqi Liu and Junliang Xing and Si Liu and Hui Xu and
Xi Zhou and Shuicheng Yan",
title = "{``Wow! You Are So Beautiful Today!''}",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "20:1--20:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2659234",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Beauty e-Experts, a fully automatic system for
makeover recommendation and synthesis, is developed in
this work. The makeover recommendation and synthesis
system simultaneously considers many kinds of makeover
items on hairstyle and makeup. Given a user-provided
frontal face image with short/bound hair and no/light
makeup, the Beauty e-Experts system not only recommends
the most suitable hairdo and makeup, but also
synthesizes the virtual hairdo and makeup effects. To
acquire enough knowledge for beauty modeling, we built
the Beauty e-Experts Database, which contains 1,505
female photos with a variety of attributes annotated
with different discrete values. We organize these
attributes into two different categories, beauty
attributes and beauty-related attributes. Beauty
attributes refer to those values that are changeable
during the makeover process and thus need to be
recommended by the system. Beauty-related attributes
are those values that cannot be changed during the
makeup process but can help the system to perform
recommendation. Based on this Beauty e-Experts Database,
two problems are addressed for the Beauty e-Experts
system: what to recommend and how to wear it, which
mirrors the process of selecting a hairstyle and
cosmetics in daily life. For the what-to-recommend
problem, we propose a multiple tree-structured
supergraph model to explore the complex relationships
among high-level beauty attributes, mid-level
beauty-related attributes, and low-level image
features. Based on this model, the most compatible
beauty attributes for a given facial image can be
efficiently inferred. For the how-to-wear-it problem,
an effective and efficient facial image synthesis
module is designed to seamlessly synthesize the
recommended makeovers into the user's facial image. We
have conducted extensive experiments on testing images
of various conditions to evaluate and analyze the
proposed system. The experimental results clearly
demonstrate the effectiveness and efficiency of the
proposed system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2014:AAS,
author = "Hanwang Zhang and Zheng-Jun Zha and Yang Yang and
Shuicheng Yan and Yue Gao and Tat-Seng Chua",
title = "Attribute-Augmented Semantic Hierarchy: Towards a
Unified Framework for Content-Based Image Retrieval",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "21:1--21:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637291",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel attribute-augmented
semantic hierarchy (A$^2$ SH) and demonstrates its
effectiveness in bridging both the semantic and
intention gaps in content-based image retrieval (CBIR).
A$^2$ SH organizes semantic concepts into multiple
semantic levels and augments each concept with a set of
related attributes. The attributes are used to describe
the multiple facets of the concept and act as the
intermediate bridge connecting the concept and
low-level visual content. A hierarchical semantic
similarity function is learned to characterize the
semantic similarities among images for retrieval. To
better capture user search intent, a hybrid feedback
mechanism is developed, which collects hybrid feedback
on attributes and images. This feedback is then used to
refine the search results based on A$^2$ SH. We use
A$^2$ SH as a basis to develop a unified content-based
image retrieval system. We conduct extensive
experiments on a large-scale dataset of over one
million Web images. Experimental results show that the
proposed A$^2$ SH can characterize the semantic
affinities among images accurately and can shape user
search intent quickly, leading to more accurate search
results as compared to state-of-the-art CBIR
solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2014:SSS,
author = "Xin Zhao and Xue Li and Chaoyi Pang and Quan Z. Sheng
and Sen Wang and Mao Ye",
title = "Structured Streaming Skeleton --- A New Feature for
Online Human Gesture Recognition",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "22:1--22:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2648583",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online human gesture recognition has a wide range of
applications in computer vision, especially in
human-computer interaction applications. The recent
introduction of cost-effective depth cameras brings a
new trend of research on body-movement gesture
recognition. However, there are two major challenges:
(i) how to continuously detect gestures from
unsegmented streams, and (ii) how to differentiate
different styles of the same gesture from other types
of gestures. In this article, we solve these two
problems with a new, effective, and efficient feature
extraction method, the Structured Streaming Skeleton
(SSS), which uses a dynamic matching approach to
construct a feature vector for each frame. Our
comprehensive experiments on the MSRC-12 Kinect Gesture,
Huawei/3DLife-2013, and MSR-Action3D datasets have
demonstrated superior performance compared with
state-of-the-art approaches. We also demonstrate model
selection based on the proposed SSS feature, where a
squared-loss regression classifier with $l_{2,1}$-norm
regularization is recommended for best performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Carbunar:2014:EFN,
author = "Bogdan Carbunar and Rahul Potharaju and Michael Pearce
and Venugopal Vasudevan and Michael Needham",
title = "Errata for: {A Framework for Network Aware Caching for
Video on Demand Systems}",
journal = j-TOMM,
volume = "11",
number = "1s",
pages = "23:1--23:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661298",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Oct 3 12:44:25 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
note = "See \cite{Carbunar:2013:FNA}.",
abstract = "Some errors were introduced into this article in the
preparation of the final source files. The errors are
summarized in the following text and revised pages with
the corrected elements indicated in red are provided.
The full corrected article can be accessed in the ACM
DL, DOI https://doi.org/10.1145/2501643.2501652.
Page 8: new Figure 6(a). Page 16: new Figures 8(a),
8(b), and 9(a). Page 17: new Figure 10(b). Page 18:
new Figures 11 and 12, corrected text reference.
Page 19: final sentence deleted.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2014:AGS,
author = "Ying Zhang and Luming Zhang and Roger Zimmermann",
title = "Aesthetics-Guided Summarization from Multiple User
Generated Videos",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "24:1--24:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2659520",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In recent years, with the rapid development of camera
technology and portable devices, we have witnessed a
flourish of user generated videos, which are gradually
reshaping the traditional professional video oriented
media market. The volume of user generated videos in
repositories is increasing at a rapid rate. In today's
video retrieval systems, a simple query will return
many videos which seriously increase the viewing
burden. To manage these video retrievals and provide
viewers with an efficient way to browse, we introduce a
system to automatically generate a summarization from
multiple user generated videos and present their
salience to viewers in an enjoyable manner. Among
multiple consumer videos, we find their qualities to be
highly diverse due to various factors such as a
photographer's experience or environmental conditions
at the time of capture. Such quality diversity inspires
us to include a video quality evaluation component in the
video summarization since videos with poor qualities
can seriously degrade the viewing experience. We first
propose a probabilistic model to evaluate the aesthetic
quality of each user generated video. This model
compares the rich aesthetics information from several
well-known photo databases with generic unlabeled
consumer videos, under a human perception component
indicating the correlation between a video and its
constituent frames. Subjective studies were carried
out with the results indicating that our method is
reliable. Then a novel graph-based formulation is
proposed for the multi-video summarization task.
Desirable summarization criteria are incorporated as the
graph attributes and the problem is solved through a
dynamic programming framework. Comparisons with several
state-of-the-art methods demonstrate that our algorithm
performs better than other methods at generating a
skimming video that preserves the essential scenes from
the original multiple input videos, with smooth
transitions among consecutive segments and appealing
overall aesthetics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Calagari:2014:AAL,
author = "Kiana Calagari and Mohammad Reza Pakravan and Shervin
Shirmohammadi and Mohamed Hefeeda",
title = "{ALP}: Adaptive Loss Protection Scheme with Constant
Overhead for Interactive Video Applications",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "25:1--25:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656203",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "There has been an increasing demand for interactive
video transmission over the Internet for applications
such as video conferencing, video calls, and
telepresence applications. These applications are
increasingly moving towards providing High Definition
(HD) video quality to users. A key challenge in these
applications is to preserve the quality of video when
it is transported over best-effort networks that do not
guarantee lossless transport of video packets. In such
conditions, it is important to protect the transmitted
video by using intelligent and adaptive protection
schemes. Applications such as HD video conferencing
require live interaction among participants, which
limits the overall delay the system can tolerate.
Therefore, the protection scheme should add little or
no extra delay to video transport. We propose a novel
Adaptive Loss Protection (ALP) scheme for interactive
HD video applications such as video conferencing and
video chats. This scheme adds negligible delay to the
transmission process and is shown to achieve better
quality than other schemes in lossy networks. The
proposed ALP scheme adaptively applies four different
protection modes to cope with the dynamic network
conditions, which results in high video quality in all
network conditions. Our ALP scheme consists of four
protection modes; each of these modes utilizes a
protection method. Two of the modes rely on the
state-of-the-art protection methods, and we propose a
new Integrated Loss Protection (ILP) method for the
other two modes. In the ILP method we integrate three
factors for distributing the protection among packets.
These three factors are error propagation, region of
interest and header information. In order to decide
when to switch between the protection modes, a new
metric is proposed based on the effectiveness of each
mode in performing protection, rather than just
considering network statistics such as packet loss
rate. Results show that using this metric not only
improves the overall quality but also decreases the
variance of quality. One of the main
advantages of the proposed ALP scheme is that it does
not increase the bit rate overhead in poor network
conditions. Our results show a significant gain in
video quality: up to 3 dB PSNR improvement is achieved
using our scheme, compared to protecting all packets
equally with the same amount of overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ren:2014:BGO,
author = "Dongni Ren and Yisheng Xu and S.-H. Gary Chan",
title = "Beyond {1Mbps} Global Overlay Live Streaming: The Case
of Proxy Helpers",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "26:1--26:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2652485",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In order to provide live streaming over the global
Internet, a content provider often deploys an overlay
network consisting of distributed proxies placed close
to user pools. Streaming of multi-Mbps video over such
an overlay is challenging because of bandwidth
bottlenecks in paths. To effectively overcome these
bottlenecks, we consider employing proxy helpers in the
overlay to provide rich path diversity. The helpers do
not have any attached users, and hence may forward
partial video streams (or not at all) if necessary. In
this way, the helpers serve as stepping stones to
supply full streams to the servers. The issue is how to
involve the helpers in the overlay to achieve low
streaming delay while meeting a certain high streaming
bitrate requirement. To address the issue, we first
formulate the problem, which captures various delay and
bandwidth components, and show that it is NP-hard. We
then propose an efficient algorithm called
Stepping-Stones (SS) which can be efficiently
implemented in a controller. Given the encouraging
simulation results, we develop a novel streaming
testbed for SS and explore, through sets of Internet
experiments, the effectiveness of helpers to achieve
high bitrate (multi-Mbps) global live streaming. In our
experiments, proxies are deployed with a reasonably
wide global footprint. We collect more than a hundred
hours of streaming traces with bitrate ranging from
500kbps to a few Mbps. Our experimental data validates
that helpers indeed play an important role in achieving
high bitrate in today's Internet. Global multi-Mbps
streaming is possible due to their multihop and
multipath advantages. Our experimental trials and data
also provide valuable insights on the design of a
global push-based streaming network. There are strong
benefits of using proxy helpers to achieve high bitrate
and low delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Qian:2014:SEC,
author = "Shengsheng Qian and Tianzhu Zhang and Changsheng Xu
and M. Shamim Hossain",
title = "Social Event Classification via Boosted Multimodal
Supervised Latent {Dirichlet} Allocation",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "27:1--27:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2659521",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the rapidly increasing popularity of social media
sites (e.g., Flickr, YouTube, and Facebook), it is
convenient for users to share their own comments on
many social events, which successfully facilitates
social event generation, sharing and propagation and
results in a large amount of user-contributed media
data (e.g., images, videos, and text) for a wide
variety of real-world events of different types and
scales. As a consequence, it has become more and more
difficult to find the events of interest in massive
social media data, even though the ability to browse,
search, and monitor social events is useful to users
and governments. To deal with these issues, we propose a
novel boosted multimodal supervised Latent Dirichlet
Allocation (BMM-SLDA) for social event classification
by integrating a supervised topic model, denoted as
multi-modal supervised Latent Dirichlet Allocation
(mm-SLDA), in the boosting framework. Our proposed
BMM-SLDA has a number of advantages. (1) Our mm-SLDA
can effectively exploit the multimodality and the
multiclass property of social events jointly, and make
use of the supervised category label information to
classify multiclass social events directly. (2) It is
suitable for large-scale data analysis by utilizing
boosting weighted sampling strategy to iteratively
select a small subset of data to efficiently train the
corresponding topic models. (3) It effectively exploits
social event structure by the document weight
distribution with classification error and can
iteratively learn a new topic model to correct the
previously misclassified event documents. We evaluate
our BMM-SLDA on a real-world dataset and show extensive
experimental results, which demonstrate that our model
outperforms state-of-the-art methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ye:2014:OBL,
author = "Jun Ye and Kien A. Hua",
title = "Octree-Based {$3$D} Logic and Computation of Spatial
Relationships in Live Video Query Processing",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "28:1--28:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2645864",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Live video computing (LVC) on distributed smart
cameras has many important applications, and a database
approach based on a Live Video DataBase Management
System (LVDBMS) has been shown to be effective for general
LVC application development. The performance of such a
database system relies on accurate interpretation of
spatial relationships among objects in the live video.
With the popularity of affordable depth cameras, 3D
spatial computation techniques have been applied.
However, the 3D object models currently used are
expensive to compute, and offer limited scalability. We
address this drawback in this article by proposing an
octree-based 3D spatial logic and presenting algorithms
for computing 3D spatial relationships using depth
cameras. To support continuous query processing on live
video streams, we also develop a GPU-based
implementation of the proposed technique to further
enhance scalability for real-time applications.
Extensive performance studies based on a public RGB-D
dataset as well as the LVDBMS prototype demonstrate
the correctness and efficiency of our techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yin:2014:STT,
author = "Yifang Yin and Zhijie Shen and Luming Zhang and Roger
Zimmermann",
title = "Spatial-Temporal Tag Mining for Automatic Geospatial
Video Annotation",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "29:1--29:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2658981",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Videos are increasingly geotagged and used in
practical and powerful GIS applications. However, video
search and management operations are typically
supported by manual textual annotations, which are
subjective and laborious. Therefore, research has been
conducted to automate or semi-automate this process.
Since a diverse vocabulary for video annotations is of
paramount importance towards good search results, this
article proposes to leverage crowdsourced data from
social multimedia applications that host tags of
diverse semantics to build a spatio-temporal tag
repository, consequently acting as input to our
auto-annotation approach. In particular, to build the
tag store, we retrieve the necessary data from several
social multimedia applications, mine both the spatial
and temporal features of the tags, and then refine and
index them accordingly. To better integrate the tag
repository, we extend our previous approach by
leveraging the temporal characteristics of videos as
well. Moreover, we set up additional ranking criteria
on the basis of tag similarity, popularity and location
bias. Experimental results demonstrate that, by making
use of such a tag repository, the generated tags have a
wide range of semantics, and the resulting rankings are
more consistent with human perception.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2014:LAM,
author = "Chih-Wei Lin and Kuan-Wen Chen and Shen-Chi Chen and
Cheng-Wu Chen and Yi-Ping Hung",
title = "Large-Area, Multilayered, and High-Resolution Visual
Monitoring Using a Dual-Camera System",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "30:1--30:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2645862",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Large-area, high-resolution visual monitoring systems
are indispensable in surveillance applications. To
construct such systems, high-quality image capture and
display devices are required. Whereas high-quality
displays have rapidly developed, as exemplified by the
announcement of the 85-inch 4K ultrahigh-definition TV
by Samsung at the 2013 Consumer Electronics Show (CES),
high-resolution surveillance cameras have progressed
slowly and remain less widely used compared with
displays. In this study, we designed an innovative
framework, using a dual-camera system comprising a
wide-angle fixed camera and a high-resolution
pan-tilt-zoom (PTZ) camera to construct a large-area,
multilayered, and high-resolution visual monitoring
system that features multiresolution monitoring of
moving objects. First, we developed a novel calibration
approach to estimate the relationship between the two
cameras and calibrate the PTZ camera. The PTZ camera
was calibrated based on the consistent property of
distinct pan-tilt angle at various zooming factors,
accelerating the calibration process without affecting
accuracy; this calibration process has not been
reported previously. After calibrating the dual-camera
system, we used the PTZ camera to synthesize a
large-area and high-resolution background image. When
foreground targets were detected in the images captured
by the wide-angle camera, the PTZ camera was controlled
to continuously track the user-selected target. Last,
we integrated preconstructed high-resolution background
and low-resolution foreground images captured using the
wide-angle camera and the high-resolution foreground
image captured using the PTZ camera to generate a
large-area, multilayered, and high-resolution view of
the scene.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Deng:2014:TFP,
author = "Zhengyu Deng and Ming Yan and Jitao Sang and
Changsheng Xu",
title = "{Twitter} is Faster: Personalized Time-Aware Video
Recommendation from {Twitter} to {YouTube}",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "31:1--31:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637285",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Traditional personalized video recommendation methods
focus on utilizing user profile or user history
behaviors to model user interests, which follows a
static strategy and fails to capture the swift shift of
the short-term interests of users. According to our
cross-platform data analysis, the information emergence
and propagation are faster in social textual
stream-based platforms than in multimedia sharing
platforms at the micro (user) level. Inspired by this, we
propose a dynamic user modeling strategy to tackle
personalized video recommendation issues in the
multimedia sharing platform YouTube, by transferring
knowledge from the social textual stream-based platform
Twitter. In particular, the cross-platform video
recommendation strategy is divided into two steps. (1)
Real-time hot topic detection: the hot topics that
users are currently following are extracted from users'
tweets, which are utilized to obtain the related videos
in YouTube. (2) Time-aware video recommendation: for
the target user in YouTube, the obtained videos are
ranked by considering the user profile in YouTube, time
factor, and quality factor to generate the final
recommendation list. In this way, the short-term (hot
topics) and long-term (user profile) interests of users
are jointly considered. Carefully designed experiments
have demonstrated the advantages of the proposed
method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2014:SFV,
author = "Yongtao Hu and Jan Kautz and Yizhou Yu and Wenping
Wang",
title = "Speaker-Following Video Subtitles",
journal = j-TOMM,
volume = "11",
number = "2",
pages = "32:1--32:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632111",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 7 17:48:10 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We propose a new method for improving the presentation
of subtitles in video (e.g., TV and movies). With
conventional subtitles, the viewer has to constantly
look away from the main viewing area to read the
subtitles at the bottom of the screen, which disrupts
the viewing experience and causes unnecessary
eyestrain. Our method places on-screen subtitles next
to the respective speakers to allow the viewer to
follow the visual content while simultaneously reading
the subtitles. We use novel identification algorithms
to detect the speakers based on audio and visual
information. Then the placement of the subtitles is
determined using global optimization. A comprehensive
usability study indicated that our subtitle placement
method outperformed both conventional fixed-position
subtitling and another previous dynamic subtitling
method in terms of enhancing the overall viewing
experience and reducing eyestrain.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2015:ISI,
author = "Kuan-Ta Chen and Songqing Chen and Wei Tsang Ooi",
title = "Introduction to the Special Issue on {MMSys 2014} and
{NOSSDAV 2014}",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "41:1--41:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2717509",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Schaber:2015:CAM,
author = "Philipp Schaber and Stephan Kopf and Sina Wetzel and
Tyler Ballast and Christoph Wesch and Wolfgang
Effelsberg",
title = "{CamMark}: Analyzing, Modeling, and Simulating
Artifacts in Camcorder Copies",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "42:1--42:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700295",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "To support the development of any system that includes
the generation and evaluation of camcorder copies, as
well as to provide a common benchmark for robustness
against camcorder copies, we present a tool to simulate
digital video re-acquisition using a digital video
camera. By resampling each video frame, we simulate the
typical artifacts occurring in a camcorder copy:
geometric modifications (aspect ratio changes,
cropping, perspective and lens distortion), temporal
sampling artifacts (due to different frame rates,
shutter speeds, rolling shutters, or playback), spatial
and color subsampling (rescaling, filtering, Bayer
color filter array), and processing steps (automatic
gain control, automatic white balance). We also support
the simulation of camera movement (e.g., a hand-held
camera) and background insertion. Furthermore, we allow
for an easy setup and calibration of all the simulated
artifacts, using sample/reference pairs of images and
videos. Specifically, temporal subsampling effects are
analyzed in detail to create realistic frame blending
artifacts in the simulated copies. We carefully
evaluated our entire camcorder simulation system and
found that the models we developed describe and match
the real artifacts quite well.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Toni:2015:OSA,
author = "Laura Toni and Ramon Aparicio-Pardo and Karine Pires
and Gwendal Simon and Alberto Blanc and Pascal
Frossard",
title = "Optimal Selection of Adaptive Streaming
Representations",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "43:1--43:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700294",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Adaptive streaming addresses the increasing and
heterogeneous demand of multimedia content over the
Internet by offering several encoded versions for each
video sequence. Each version (or representation) is
characterized by a resolution and a bit rate, and it is
aimed at a specific set of users, like TV or mobile
phone clients. While most existing works on adaptive
streaming deal with effective playout-buffer control
strategies on the client side, in this article we take
a provider's perspective and propose solutions to
improve user satisfaction by optimizing the set of
available representations. We formulate an integer
linear program that maximizes users' average
satisfaction, taking into account network dynamics,
type of video content, and user population
characteristics. The solution of the optimization is a
set of encoding parameters corresponding to the
representations set that maximizes user satisfaction.
We evaluate this solution by simulating multiple
adaptive streaming sessions characterized by realistic
network statistics, showing that the proposed solution
outperforms commonly used vendor recommendations, not
only in terms of user satisfaction but also in terms of
fairness and outage probability. The simulation results
show that video content information as well as network
constraints and users' statistics play a crucial role
in selecting proper encoding parameters to provide
fairness among users and to reduce network resource
usage. We finally propose a few theoretical guidelines
that can be used, in realistic settings, to choose the
encoding parameters based on the user characteristics,
the network capacity and the type of video content.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2015:ADF,
author = "Liang Chen and Yipeng Zhou and Dah Ming Chiu",
title = "Analysis and Detection of Fake Views in Online Video
Services",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "44:1--44:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700290",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online video-on-demand(VoD) services invariably
maintain a view count for each video they serve, and it
has become an important currency for various
stakeholders, from viewers to content owners,
advertisers, and the online service providers
themselves. There is often significant financial
incentive to use a robot (or a botnet) to artificially
create fake views. How can we detect fake views? Can we
detect them (and stop them) efficiently? What is the
extent of fake views with current VoD service
providers? These are the questions we study in this
article. We develop some algorithms and show that they
are quite effective for this problem.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Song:2015:SVT,
author = "Minseok Song and Yeongju Lee and Jinhan Park",
title = "Scheduling a Video Transcoding Server to Save Energy",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "45:1--45:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700282",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Recent popular streaming services such as TV
Everywhere, N-Screen, and dynamic adaptive streaming
over HTTP (DASH) need to deliver content to the wide
range of devices, requiring video content to be
transcoded into different versions. Transcoding tasks
require a lot of computation, and each task typically
has its own real-time constraint. These make it
difficult to manage transcoding, but the more efficient
use of energy in servers is an imperative. We
characterize transcoding workloads in terms of
deadlines and computation times, and propose a new
dynamic voltage and frequency scaling (DVFS) scheme
that allocates a frequency and a workload to each CPU
with the aim of minimizing power consumption while
meeting all transcoding deadlines. This scheme has been
simulated, and also implemented in a Linux transcoding
server, in which a frontend node distributes
transcoding requests to heterogeneous backend nodes.
This required a new protocol for communication between
nodes, a DVFS management scheme to reduce power
consumption, and thread management and scheduling
schemes that ensure that transcoding deadlines are
met. Power measurements show that this approach can
reduce system-wide energy consumption by 17\% to 31\%,
compared with the Linux Ondemand governor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Langroodi:2015:DCA,
author = "Mohsen Jamali Langroodi and Joseph Peters and Shervin
Shirmohammadi",
title = "Decoder-Complexity-Aware Encoding of Motion
Compensation for Multiple Heterogeneous Receivers",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "46:1--46:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700300",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For mobile multimedia systems, advances in battery
technology have been much slower than those in memory,
graphics, and processing power, making power
consumption a major concern in mobile systems. The
computational complexity of video codecs, which
consists of CPU operations and memory accesses, is one
of the main factors affecting power consumption. In
this article, we propose a method that achieves
near-optimal video quality while respecting
user-defined bounds on the complexity needed to decode
a video. We specifically focus on the motion
compensation process, including motion vector
prediction and interpolation, because it is the single
largest component of computation-based power
consumption. We start by formulating a scenario with a
single receiver as a rate-distortion optimization
problem and we develop an efficient
decoder-complexity-aware video encoding method to solve
it. Then we extend our approach to handle multiple
heterogeneous receivers, each with a different
complexity requirement. We test our method
experimentally using the H.264 standard for the single
receiver scenario and the H.264 SVC extension for the
multiple receiver scenario. Our experimental results
show that our method can achieve up to 97\% of the
optimal solution value in the single receiver scenario,
and an average of 97\% of the optimal solution value in
the multiple receiver scenario. Furthermore, our tests
with actual power measurements show a power saving of
up to 23\% at the decoder when the complexity threshold
is halved in the encoder.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2015:TAT,
author = "Shannon Chen and Zhenhuan Gao and Klara Nahrstedt and
Indranil Gupta",
title = "{$3$DTI} Amphitheater: Towards {$3$DTI} Broadcasting",
journal = j-TOMM,
volume = "11",
number = "2s",
pages = "47:1--47:??",
month = feb,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700297",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 25 17:56:15 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "3DTI Amphitheater is a live broadcasting system for
dissemination of 3DTI (3D Tele-immersive) content. The
virtual environment constructed by the system mimics an
amphitheater in the real world, where performers
interact with each other in the central circular stage,
and the audience is placed in virtual seats that
surround the stage. Users of the Amphitheater can be
geographically dispersed and the streams created by the
performer sites are disseminated in a P2P network among
the participants. To deal with the high bandwidth
demand and strict latency bound of the service, we
identify the hierarchical priority of streams in
construction of the content dissemination forest.
Results show that the Amphitheater outperforms prior
3DTI systems by boosting the application QoS by a
factor of 2.8 while sustaining the same hundred-scale
audience group.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2015:PMV,
author = "Ke Chen and Zhong Zhou and Wei Wu",
title = "Progressive Motion Vector Clustering for Motion
Estimation and Auxiliary Tracking",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "33:1--33:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700296",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The motion vector similarity between neighboring
blocks is widely used in motion estimation algorithms.
However, nonneighboring blocks may also have
similar motions because they have similar depths or
belong to the same object in the scene. Therefore, the motion
vectors usually have several kinds of patterns, which
reveal a clustering structure. In this article, we
propose a progressive clustering algorithm, which
periodically counts the motion vectors of the past
blocks to make incremental clustering statistics. These
statistics are used as the motion vector predictors for
the following blocks. It is proved to be much more
efficient for one block to find the best-matching
candidate with the predictors. We also design the
clustering based search with CUDA for GPU acceleration.
Another interesting application of the clustering
statistics is persistent static object tracking. Based
on the statistics, several auxiliary tracking areas are
created to guide the object tracking. Even when the
target object has significant changes in appearance or
it disappears occasionally, its position can still be
predicted. Experiments on the Xiph.org Video Test Media
dataset illustrate that our clustering-based search
algorithm outperforms the mainstream and some
state-of-the-art motion estimation algorithms. It is 33
times faster on average than the full search algorithm
with only slightly higher mean-square error values in
the experiments. The tracking results show that the
auxiliary tracking areas help to locate the target
object effectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2015:HFM,
author = "Liquan Shen and Ping An and Zhaoyang Zhang and
Qianqian Hu and Zhengchuan Chen",
title = "A {$3$D--HEVC} Fast Mode Decision Algorithm for
Real-Time Applications",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "34:1--34:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700298",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "3D High Efficiency Video Coding (3D-HEVC) is an
extension of the HEVC standard for coding of multiview
videos and depth maps. It inherits the same quadtree
coding structure as HEVC for both components, which
allows recursively splitting into four equal-sized
coding units (CUs). One of 11 different prediction modes
is chosen to code a CU in inter-frames. Similar to the
joint model of H.264/AVC, the mode decision process in
HM (the reference software of HEVC) is performed using all
the possible depth levels and prediction modes to find
the one with the least rate-distortion cost using a
Lagrange multiplier. Furthermore, both motion
estimation and disparity estimation need to be
performed in the encoding process of 3D-HEVC. Those
tools achieve high coding efficiency, but lead to
significant computational complexity. In this article,
we propose a fast mode decision algorithm for 3D-HEVC.
Since multiview videos and their associated depth maps
represent the same scene, at the same time instant,
their prediction modes are closely linked. Furthermore,
the prediction information of a CU at the depth level X
is strongly related to that of its parent CU at the
depth level X-1 in the quadtree coding structure of
HEVC since two corresponding CUs from two neighboring
depth levels share similar video characteristics. The
proposed algorithm jointly exploits the inter-view
coding mode correlation, the inter-component
(texture-depth) correlation and the inter-level
correlation in the quadtree structure of 3D-HEVC.
Experimental results show that our algorithm saves 66\%
encoder runtime on average with only a 0.2\% BD-Rate
increase on coded views and 1.3\% BD-Rate increase on
synthesized views.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2015:BML,
author = "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu and
Ming-Hsuan Yang",
title = "Boosted Multifeature Learning for Cross-Domain
Transfer",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "35:1--35:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700286",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Conventional learning algorithm assumes that the
training data and test data share a common
distribution. However, this assumption will greatly
hinder the practical application of the learned model
for cross-domain data analysis in multimedia. To deal
with this issue, transfer learning based technology
should be adopted. As a typical version of transfer
learning, domain adaptation has been extensively studied
recently due to its theoretical value and practical
interest. In this article, we propose a boosted
multifeature learning (BMFL) approach to iteratively
learn multiple representations within a boosting
procedure for unsupervised domain adaptation. The
proposed BMFL method has a number of properties. (1) It
reuses all instances with different weights assigned by
the previous boosting iteration and avoids discarding
labeled instances as in conventional methods. (2) It
models the instance weight distribution effectively by
considering the classification error and the domain
similarity, which facilitates learning a new feature
representation to correct the previously misclassified
instances. (3) It learns multiple different feature
representations to effectively bridge the source and
target domains. We evaluate the BMFL by comparing its
performance on three applications: image
classification, sentiment classification and spam
filtering. Extensive experimental results demonstrate
that the proposed BMFL algorithm performs favorably
against state-of-the-art domain adaptation methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2015:DVS,
author = "Pei-Yu Lin",
title = "Double Verification Secret Sharing Mechanism Based on
Adaptive Pixel Pair Matching",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "36:1--36:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700291",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Verifiability is essential for the secret sharing
approach, which allows the involved participants to
detect cheaters during the secret retrieval process. In
this article, we propose a double verification secret
sharing (DVSS) mechanism that can not only prevent
fraudulent participants but also satisfy the
requirements of secret payload, camouflage, image
fidelity, and lossless revealed secret. DVSS offers a
double verification process to enhance cheater
detectability; experimental results reveal that the
designed scheme can share a larger secret capacity and
retain superior image quality compared with related
secret sharing methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2015:INB,
author = "Shuang Wang and Shuqiang Jiang",
title = "{INSTRE}: a New Benchmark for Instance-Level Object
Retrieval and Recognition",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "37:1--37:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700292",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Over the last several decades, researches on visual
object retrieval and recognition have achieved fast and
remarkable success. However, while the category-level
tasks prevail in the community, the instance-level
tasks (especially recognition) have not yet received
adequate focus. Applications such as content-based
search engines and robot vision systems have raised
awareness of the need to bring instance-level tasks into
more realistic and challenging scenarios. Motivated by the
limited scope of existing instance-level datasets, in
this article we propose a new benchmark for
INSTance-level visual object REtrieval and REcognition
(INSTRE). Compared with existing datasets, INSTRE has
the following major properties: (1) balanced data
scale, (2) more diverse intraclass instance variations,
(3) cluttered and less contextual backgrounds, (4)
object localization annotation for each image, (5)
well-manipulated double-labelled images for measuring
the multiple-object (within one image) case. We
quantify and visualize the merits of the INSTRE data
and extensively compare them against existing datasets.
Then, on INSTRE, we comprehensively evaluate several
popular algorithms for the large-scale object retrieval
problem with multiple evaluation metrics. Experimental
results show that all the methods suffer a performance
drop on INSTRE, indicating that this remains a
challenging problem. Finally, we integrate these
algorithms into a simple yet efficient scheme for
recognition and compare it with classification-based
methods. Importantly, we introduce the realistic
multi-object recognition problem. All experiments are
conducted in both the single-object and multiple-object
cases.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lathey:2015:IEE,
author = "Ankita Lathey and Pradeep K. Atrey",
title = "Image Enhancement in Encrypted Domain over Cloud",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "38:1--38:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656205",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Cloud-based multimedia systems are becoming
increasingly common. These systems offer not only a
storage facility but also high-end computing
infrastructure that can be used to process data for
various analysis tasks, ranging from low-level data
quality enhancement to high-level activity and behavior
identification operations. However, cloud data centers,
being third-party servers, are often prone to
information leakage, raising security and privacy
concerns. In this article, we present a Shamir's secret
sharing based method to enhance the quality of
encrypted image data over the cloud. Using the proposed
method, we show that several image enhancement
operations such as noise removal, antialiasing, edge
and contrast enhancement, and dehazing can be performed
in encrypted domain with near-zero loss in accuracy and
minimal computation and data overhead. Moreover, the
proposed method is proven to be information
theoretically secure.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yin:2015:CVC,
author = "Yifang Yin and Beomjoo Seo and Roger Zimmermann",
title = "Content vs. Context: Visual and Geographic Information
Use in Video Landmark Retrieval",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "39:1--39:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700287",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Due to the ubiquity of sensor-equipped smartphones, it
has become increasingly feasible for users to capture
videos together with associated geographic metadata,
for example the location and the orientation of the
camera. Such contextual information creates new
opportunities for the organization and retrieval of
geo-referenced videos. In this study we explore the
task of landmark retrieval through the analysis of two
types of state-of-the-art techniques, namely
media-content-based and geocontext-based retrievals.
For the content-based method, we choose the Spatial
Pyramid Matching (SPM) approach combined with two
advanced coding methods: Sparse Coding (SC) and
Locality-Constrained Linear Coding (LLC). For the
geo-based method, we present the Geo Landmark
Visibility Determination (GeoLVD) approach which
computes the visibility of a landmark based on
intersections of a camera's field-of-view (FOV) and the
landmark's geometric information available from
Geographic Information Systems (GIS) and services. We
first compare the retrieval results of the two methods,
and discuss the strengths and weaknesses of each
approach in terms of precision, recall and execution
time. Next we analyze the factors that affect the
effectiveness for the content-based and the geo-based
methods, respectively. Finally we propose a hybrid
retrieval method based on the integration of the visual
(content) and geographic (context) information, which
is shown to achieve significant improvements in our
experiments. We believe that the results and
observations in this work will inform the design of
future geo-referenced video retrieval systems, improve
our understanding of how to select the most appropriate
visual features for indexing and searching, and help in
choosing the most suitable retrieval method under
different conditions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2015:RCI,
author = "Hong-Ying Yang and Xiang-Yang Wang and Pan-Pan Niu and
Ai-Long Wang",
title = "Robust Color Image Watermarking Using Geometric
Invariant Quaternion Polar Harmonic Transform",
journal = j-TOMM,
volume = "11",
number = "3",
pages = "40:1--40:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700299",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Feb 5 17:03:39 MST 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "It is a challenging work to design a robust color
image watermarking scheme against geometric
distortions. Moments and moment invariants have become
a powerful tool in robust image watermarking owing to
their image description capability and geometric
invariance property. However, existing moment-based
watermarking schemes were mainly designed for gray
images rather than color images, and detection quality
and robustness are lowered when the watermark is
directly embedded into the luminance component or the
three color channels of a color image. Furthermore, the
imperceptibility of the embedded watermark is not well
guaranteed. Based on the algebra of quaternions and the
polar harmonic transform (PHT), we introduce in this
article the quaternion polar harmonic transform (QPHT)
for invariant color image watermarking, which can be
seen as a generalization of the PHT for gray-level
images. It is shown that the QPHT can be obtained from
the PHT of each color channel. We derive and analyze
the rotation, scaling, and translation (RST) invariance
property of the QPHT. We also discuss the problem of
color image watermarking using the QPHT. Experimental
results are provided to illustrate the efficiency of
the proposed color image watermarking against geometric
distortions and common image processing operations
(including color attacks).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Krishnappa:2015:CCV,
author = "Dilip Kumar Krishnappa and Michael Zink and Carsten
Griwodz and P{\aa}l Halvorsen",
title = "Cache-Centric Video Recommendation: an Approach to
Improve the Efficiency of {YouTube} Caches",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "48:1--48:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2716310",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we take advantage of the user
behavior of requesting videos from the top of the
related list provided by YouTube to improve the
performance of YouTube caches. We recommend that local
caches reorder the related lists associated with
YouTube videos, presenting the cached content above
noncached content. We argue that the likelihood that
viewers select content from the top of the related list
is higher than selection from the bottom, and pushing
contents already in the cache to the top of the related
list would increase the likelihood of choosing cached
content. To verify that the position on the list really
is a more dominant selection criterion than the
content itself, we conduct a user study with 40
YouTube-using volunteers who were presented with random
related lists in their everyday YouTube use. After
confirming our assumption, we analyze the benefits of
our approach by an investigation that is based on two
traces collected from a university campus. Our analysis
shows that the proposed reordering approach for related
lists would lead to a 2 to 5 times increase in cache
hit rate compared to an approach without reordering the
related list. This increase in hit rate would lead to
reduction in server load and backend bandwidth usage,
which in turn reduces the latency in streaming the
video requested by the viewer and has the potential to
improve the overall performance of YouTube's content
distribution system. An analysis of YouTube's
recommendation system reveals that related lists are
created from a small pool of videos, which increases
the potential for caching content from related lists
and reordering based on the content in the cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2015:PMC,
author = "Yu Zhang and James Z. Wang and Jia Li",
title = "Parallel Massive Clustering of Discrete
Distributions",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "49:1--49:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700293",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The trend of analyzing big data in artificial
intelligence demands highly scalable machine learning
algorithms, among which clustering is a fundamental and
arguably the most widely applied method. To extend the
applications of regular vector-based clustering
algorithms, the Discrete Distribution (D2) clustering
algorithm has been developed, aiming at clustering data
represented by bags of weighted vectors, which are
widely adopted data signatures in many emerging
information retrieval and multimedia learning
applications.
However, the high computational complexity of
D2-clustering limits its impact in solving massive
learning problems. Here we present the parallel
D2-clustering (PD2-clustering) algorithm with
substantially improved scalability. We developed a
hierarchical multipass algorithm structure for parallel
computing in order to achieve a balance between the
individual-node computation and the integration process
of the algorithm. Experiments and extensive comparisons
between PD2-clustering and other clustering algorithms
are conducted on synthetic datasets. The results show
that the proposed parallel algorithm achieves
significant speed-up with minor accuracy loss. We apply
PD2-clustering to image concept learning. In addition,
by extending D2-clustering to symbolic data, we apply
PD2-clustering to protein sequence clustering. For both
applications, we demonstrate the high competitiveness
of our new algorithm in comparison with other
state-of-the-art methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Baik:2015:EMR,
author = "Eilwoo Baik and Amit Pande and Prasant Mohapatra",
title = "Efficient {MAC} for Real-Time Video Streaming over
Wireless {LAN}",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "50:1--50:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2744412",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Wireless communication systems are highly prone to
channel errors. With video being a major player in
Internet traffic and undergoing exponential growth in
the wireless domain, we argue for the need for a
Video-aware MAC (VMAC) to significantly improve the
throughput and delay performance of real-time video
streaming services. VMAC makes two changes to optimize
wireless LANs for video traffic: (a) It incorporates
Perceptual-Error-Tolerance (PET) into MAC frames,
reducing MAC retransmissions while minimizing any
impact on perceptual video quality; and (b) It uses a
group NACK-based Adaptive Window (NAW) of MAC frames to
improve both throughput and delay performance in
varying channel conditions. Through simulations and
experiments, we observe 56--89\% improvement in
throughput and 34--48\% improvement in delay
performance over legacy DCF and 802.11e schemes. VMAC
also shows 15--78\% improvement over legacy schemes
with multiple clients.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Antaris:2015:SSC,
author = "Stefanos Antaris and Dimitrios Rafailidis",
title = "Similarity Search over the Cloud Based on Image
Descriptors' Dimensions Value Cardinalities",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "51:1--51:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2716315",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In recognition that in modern applications billions of
images are stored in distributed databases in
different logical or physical locations, we propose a
similarity search strategy over the cloud based on the
dimensions value cardinalities of image descriptors.
Our strategy has low preprocessing requirements by
dividing the computational cost of the preprocessing
steps into several nodes over the cloud and locating
the descriptors with similar dimensions value
cardinalities logically close. New images are inserted
into the distributed databases over the cloud
efficiently, supporting dynamic updates in
real time. The proposed insertion algorithm has low
computational complexity, depending exclusively on the
dimensionality of descriptors and a small subset of
descriptors with similar dimensions value
cardinalities. Finally, an efficient query processing
algorithm is proposed, where the dimensions of image
descriptors are prioritized in the searching strategy,
assuming that dimensions of high value cardinalities
have more discriminative power than the dimensions of
low ones. The computation effort of the query
processing algorithm is divided into several nodes over
the cloud infrastructure. In our experiments with seven
publicly available datasets of image descriptors, we
show that the proposed similarity search strategy
outperforms competitive methods of single node,
parallel and cloud-based architectures, in terms of
preprocessing cost, search time and accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2015:AMD,
author = "Yin-Tzu Lin and I-Ting Liu and Jyh-Shing Roger Jang
and Ja-Ling Wu",
title = "Audio Musical Dice Game: a User-Preference-Aware
Medley Generating System",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "52:1--52:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2710015",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes a framework for creating
user-preference-aware music medleys from users' music
collections. We treat the medley generation process as
an audio version of a musical dice game. Once the
user's collection has been analyzed, the system is able
to generate various pleasing medleys. This flexibility
allows users to create medleys according to the
specified conditions, such as the medley structure or
the must-use clips. Even users without musical
knowledge can compose medley songs from their favorite
tracks. The effectiveness of the system has been
evaluated through both objective and subjective
experiments on individual components in the system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2015:AVR,
author = "Bo-Hao Chen and Shih-Chia Huang",
title = "An Advanced Visibility Restoration Algorithm for
Single Hazy Images",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "53:1--53:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2726947",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Haze removal is the process by which horizontal
obscuration is eliminated from hazy images captured
during inclement weather. Images captured in natural
environments with varied weather conditions frequently
exhibit localized light sources or color-shift effects.
The occurrence of these effects presents a difficult
challenge for hazy image restoration, with which many
traditional restoration methods cannot adequately
contend. In this article, we present a new image haze
removal approach based on Fisher's linear
discriminant-based dual dark channel prior scheme in
order to solve the problems associated with the
presence of localized light sources and color shifts,
and thereby achieve effective restoration. Experimental
restoration results via qualitative and quantitative
evaluations show that our proposed approach can provide
higher haze-removal efficacy for images captured in
varied weather conditions than can the other
state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bao:2015:CPE,
author = "Bing-Kun Bao and Changsheng Xu and Weiqing Min and
Mohammod Shamim Hossain",
title = "Cross-Platform Emerging Topic Detection and
Elaboration from Multimedia Streams",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "54:1--54:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2730889",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the explosive growth of online media platforms in
recent years, it becomes more and more attractive to
provide users a solution of emerging topic detection
and elaboration. And this posts a real challenge to
both industrial and academic researchers because of the
overwhelming information available in multiple
modalities and with large outlier noises. This article
provides a method on emerging topic detection and
elaboration using multimedia streams cross different
online platforms. Specifically, Twitter, New York Times
and Flickr are selected for the work to represent the
microblog, news portal and imaging sharing platforms.
The emerging keywords of Twitter are firstly extracted
using aging theory. Then, to overcome the nature of
short length message in microblog, Robust
Cross-Platform Multimedia Co-Clustering (RCPMM-CC) is
proposed to detect emerging topics with three
novelties: (1) the data from different media platforms
are in multiple modalities; (2) the coclustering is
performed based on a pairwise correlated structure, in
which the three involved media platforms are pairwise
dependent; (3) noninformative samples are
automatically pruned away during
coclustering. In the last step of cross-platform
elaboration, we enrich each emerging topic with the
samples from New York Times and Flickr by computing the
implicit links between social topics and samples from
selected news and Flickr image clusters, which are
obtained by RCPMM-CC. Qualitative and quantitative
evaluation results demonstrate the effectiveness of our
method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2015:QQG,
author = "Yang Li and Azzedine Boukerche",
title = "{QuGu}: a Quality Guaranteed Video Dissemination
Protocol Over Urban Vehicular Ad Hoc Networks",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "55:1--55:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2725469",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video dissemination over Vehicular Ad Hoc Networks is
an attractive technology that supports many novel
applications. The merit of this work lies in the design
of an efficient video dissemination protocol that
provides high video quality at different data rates for
urban scenarios. Our objective is to improve received
video quality while meeting delay and packet loss
constraints. In this work, we first employ a reliable
scheme known as
connected dominating set, which is an efficient
receiver-based routing scheme for broadcasting video
content. To avoid repeated computing of the connected
dominating set, we add three statuses to each node. In
nonscalable video coding, the distribution of lost
frames can cause a major impact on video quality at the
receiver's end. Therefore, for the second step, we
employ Interleaving to spread out the burst losses and
to reduce the influence of loss distributions. Although
Interleaving can reduce the influence of cluster frame
loss, single packet loss is also a concern due to
collisions, and to intermittent disconnection in the
topology. In order to fix these single packet losses,
we propose a store-carry-forward scheme in which nodes
retransmit packets stored in their local buffers.
The results, when compared to the selected base
protocols, show that our proposed protocol is an
efficient solution for video dissemination over urban
Vehicular Ad Hoc Networks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gaddam:2015:COM,
author = "Vamsidhar Reddy Gaddam and Ragnhild Eg and Ragnar
Langseth and Carsten Griwodz and P{\aa}l Halvorsen",
title = "The Cameraman Operating My Virtual Camera is
Artificial: Can the Machine Be as Good as a Human?",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "56:1--56:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2744411",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we argue that the energy spent in
designing autonomous camera control systems is not
spent in vain. We present a real-time virtual camera
system that can create smooth camera motion. Similar
systems are frequently benchmarked with the human
operator as the best possible reference; however, we
avoid a priori assumptions in our evaluations. Our main
question is simply whether we can design algorithms to
steer a virtual camera whose user experience competes
with recordings from an expert operator with several
years of experience. In this respect, we
present two low-complexity servoing methods that are
explored in two user studies. The results from the user
studies give a promising answer to the question
pursued. Furthermore, all components of the system meet
the real-time requirements on commodity hardware. The
growing capabilities of both hardware and network in
mobile devices give us hope that this system can be
deployed to mobile users in the near future. Moreover,
the design of the presented system takes into account
that services to concurrent users must be supported.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Natarajan:2015:MCC,
author = "Prabhu Natarajan and Pradeep K. Atrey and Mohan
Kankanhalli",
title = "Multi-Camera Coordination and Control in Surveillance
Systems: a Survey",
journal = j-TOMM,
volume = "11",
number = "4",
pages = "57:1--57:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2710128",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 7 08:29:56 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The use of multiple heterogeneous cameras is becoming
more common in today's surveillance systems. In order
to perform surveillance tasks, effective coordination
and control in multi-camera systems is very important,
and is catching significant research attention these
days. This survey aims to provide researchers with a
state-of-the-art overview of various techniques for
multi-camera coordination and control (MC$^3$) that
have been adopted in surveillance systems. The existing
literature on MC$^3$ is presented through several
classifications based on the applicable architectures,
frameworks and the associated surveillance tasks.
Finally, a discussion on the open problems in the
surveillance area that can be solved effectively using
MC$^3$ and the future directions in MC$^3$ research is
presented.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{You:2015:UPD,
author = "Shingchern D. You and Yi-Han Pu",
title = "Using Paired Distances of Signal Peaks in Stereo
Channels as Fingerprints for Copy Identification",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "1:1--1:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742059",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes to use the relative distances
between adjacent envelope peaks detected in stereo
audio as fingerprints for copy identification. The
matching algorithm used is the rough longest common
subsequence (RLCS) algorithm. The experimental results
show that the proposed approach has better
identification accuracy than an MPEG-7 based scheme for
distorted and noisy audio. When compared with other
schemes, the proposed scheme uses fewer bits with
comparable performance. The proposed fingerprints can
also be used in conjunction with the MPEG-7 based
scheme for lower computational burden.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{ElEssaili:2015:QBC,
author = "Ali {El Essaili} and Zibin Wang and Eckehard Steinbach
and Liang Zhou",
title = "{QoE}-Based Cross-Layer Optimization for Uplink Video
Transmission",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "2:1--2:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2801124",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We study the problem of resource-efficient uplink
distribution of user-generated video content over
fourth-generation mobile networks. This is challenged
by (1) the capacity-limited and time-variant uplink
channel, (2) the resource-hungry upstreamed videos and
their dynamically changing complexity, and (3) the
different playout times of the video consumers. To
address these issues, we propose a systematic approach
for quality-of-experience (QoE)-based resource
optimization and uplink transmission of multiuser
generated video content. More specifically, we present
an analytical model for distributed scalable video
transmission at the mobile producers which considers
these constraints. This is complemented by a multiuser
cross-layer optimizer in the mobile network which
determines the transmission capacity for each mobile
terminal under current cell load and radio conditions.
Both optimal and low-complexity solutions are
presented. Simulation results for LTE uplink
transmission show that significant gains in perceived
video quality can be achieved by our cross-layer
resource optimization scheme. In addition, the
distributed optimization at the mobile producers can
further improve the user experience across the
different types of video consumers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2015:CSN,
author = "Li-Jia Li and David A. Shamma and Xiangnan Kong and
Sina Jafarpour and Roelof {Van Zwol} and Xuanhui Wang",
title = "{CelebrityNet}: a Social Network Constructed from
Large-Scale Online Celebrity Images",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "3:1--3:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2801125",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Photos are an important information carrier for
implicit relationships. In this article, we introduce
an image based social network, called CelebrityNet,
built from implicit relationships encoded in a
collection of celebrity images. We analyze the social
properties reflected in this image-based social network
and automatically infer communities among the
celebrities. We demonstrate interesting discoveries
enabled by CelebrityNet. In particular, we compare the
inferred communities with manually labeled ones and
show quantitatively that the automatically detected
communities are highly aligned with human
interpretation. Inspired by the uniqueness of visual
content and tag concepts within each community of the
CelebrityNet, we further demonstrate that the
constructed social network can serve as a knowledge
base for high-level visual recognition tasks. In
particular, this social network is capable of
significantly improving the performance of automatic
image annotation and classification of unknown
images.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2015:SDP,
author = "Bo Zhang and Nicola Conci and Francesco G. B. {De
Natale}",
title = "Segmentation of Discriminative Patches in Human
Activity Video",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "4:1--4:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2750780",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we present a novel approach to
segment discriminative patches in human activity
videos. First, we adopt the spatio-temporal interest
points (STIPs) to represent significant motion patterns
in the video sequence. Then, nonnegative sparse coding
is exploited to generate a sparse representation of
each STIP descriptor. We construct the feature vector
for each video by applying a two-stage sum-pooling and
l$_2$ -normalization operation. After training a
multi-class classifier through the error-correcting
code SVM, the discriminative portion of each video is
determined as the patch that has the highest confidence
while also being correctly classified according to the
video category. Experimental results show that the
video patches extracted by our method are more
separable, while preserving the perceptually relevant
portion of each activity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2015:WMZ,
author = "Hui Wang and Mun Choon Chan and Wei Tsang Ooi",
title = "Wireless Multicast for Zoomable Video Streaming",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "5:1--5:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2801123",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Zoomable video streaming refers to a new class of
interactive video applications, where users can zoom
into a video stream to view a selected region of
interest in higher resolutions and pan around to move
the region of interest. The zoom and pan effects are
typically achieved by breaking the source video into a
grid of independently decodable tiles. Streaming the
tiles to a set of heterogeneous users using broadcast
is challenging, as users have different link rates and
different regions of interest at different resolution
levels. In this article, we consider the following
problem: Given the subset of tiles that each user
requested, the link rate of each user, and the
available time slots, at which resolution should each
tile be sent, to maximize the overall video quality
received by all users. We design an efficient algorithm
to solve this problem and evaluate the solution on a
testbed using 10 mobile devices. Our method is able to
achieve up to 12 dB improvements over other heuristic
methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bianco:2015:UPM,
author = "Simone Bianco and Gianluigi Ciocca",
title = "User Preferences Modeling and Learning for Pleasing
Photo Collage Generation",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "6:1--6:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2801126",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we consider how to automatically
create pleasing photo collages created by placing a set
of images on a limited canvas area. The task is
formulated as an optimization problem. Differently from
existing state-of-the-art approaches, we here exploit
subjective experiments to model and learn pleasantness
from user preferences. To this end, we design an
experimental framework for the identification of the
criteria that need to be taken into account to generate
a pleasing photo collage. Five different thematic photo
datasets are used to create collages using
state-of-the-art criteria. A first subjective
experiment, in which several subjects evaluated the
collages, emphasizes that different criteria are
involved in the subjective definition of pleasantness.
We then identify new global and local criteria and
design algorithms to quantify them. The relative
importance of these criteria is automatically learned
by exploiting the user preferences, and new collages
are generated. To validate our framework, we performed
several psycho-visual experiments involving different
users. The results show that the proposed framework
allows learning a novel computational model that
effectively encodes an inter-user definition of
pleasantness. The learned definition of pleasantness
generalizes well to new photo datasets of different
themes and sizes not used in the learning. Moreover,
compared with two state-of-the-art approaches, the
collages created using our framework are preferred by
the majority of the users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Fu:2015:QBS,
author = "Bo Fu and Dirk Staehle and Gerald Kunzmann and
Eckehard Steinbach and Wolfgang Kellerer",
title = "{QoE}-Based {SVC} Layer Dropping in {LTE} Networks
Using Content-Aware Layer Priorities",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "7:1--7:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2754167",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The increasing popularity of mobile video streaming
applications has led to a high volume of video traffic
in mobile networks. As the base station, for instance,
the eNB in LTE networks, has limited physical
resources, it can be overloaded by this traffic. This
problem can be addressed by using Scalable Video Coding
(SVC), which allows the eNB to drop layers of the video
streams to dynamically adapt the bitrate. The impact of
bitrate adaptation on the Quality of Experience (QoE)
for the users depends on the content characteristics of
videos. As the current mobile network architectures do
not support the eNB in obtaining video content
information, QoE optimization schemes with explicit
signaling of content information have been proposed.
These schemes, however, require the eNB or a specific
optimization module to process the video content on the
fly in order to extract the required information. This
increases the computation and signaling overhead
significantly, raising the OPEX for mobile operators.
To address this issue, in this article, a content-aware
(CA) priority marking and layer dropping scheme is
proposed. The CA priority indicates a transmission
order for the layers of all transmitted videos across
all users, resulting from a comparison of their utility
versus rate characteristics. The CA priority values can
be determined at the P-GW on the fly, allowing mobile
operators to control the priority marking process.
Alternatively, they can be determined offline at the
video servers, avoiding real-time computation in the
core network. The eNB can perform content-aware SVC
layer dropping using only the priority values. No
additional content processing is required. The proposed
scheme is lightweight both in terms of architecture and
computation. The improvement in QoE is substantial and
very close to the performance obtained with the
computation and signaling-intensive QoE optimization
schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2015:ASM,
author = "Siqi Shen and Shun-Yun Hu and Alexandru Iosup and Dick
Epema",
title = "Area of Simulation: Mechanism and Architecture for
Multi-Avatar Virtual Environments",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "8:1--8:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2764463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Although Multi-Avatar Distributed Virtual Environments
(MAVEs) such as Real-Time Strategy (RTS) games
entertain hundreds of millions of online players daily,
their current designs do not scale. For example, even
popular RTS games such as the StarCraft series support
in a single game instance only up to 16 players and
only a few hundreds of avatars loosely controlled by
these players, which is a consequence of the
Event-Based Lockstep Simulation (EBLS) scalability
mechanism they employ. Through empirical analysis, we
show that a single Area of Interest (AoI), which is a
scalability mechanism that is sufficient for
single-avatar virtual environments (such as
Role-Playing Games), also cannot meet the scalability
demands of MAVEs. To enable scalable MAVEs, in this
work we propose Area of Simulation (AoS), a new
scalability mechanism, which combines and extends the
mechanisms of AoI and EBLS. Unlike traditional AoI
approaches, which employ only update-based operational
models, our AoS mechanism uses both event-based and
update-based operational models to manage not single,
but multiple areas of interest. Unlike EBLS, which is
traditionally used to synchronize the entire virtual
world, our AoS mechanism synchronizes only selected
areas of the virtual world. We further design an
AoS-based architecture, which is able to use both our
AoS and traditional AoI mechanisms simultaneously,
dynamically trading-off consistency guarantees for
scalability. We implement and deploy this architecture
and we demonstrate that it can operate with an order of
magnitude more avatars and a larger virtual world
without exceeding the resource capacity of players'
computers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lee:2015:LAR,
author = "Suk Kyu Lee and Seungho Yoo and Jongtack Jung and
Hwangnam Kim and Jihoon Ryoo",
title = "Link-Aware Reconfigurable Point-to-Point Video
Streaming for Mobile Devices",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "9:1--9:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2771438",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Even though people of all social standings use current
mobile devices in the wide spectrum of purpose from
entertainment tools to communication means, some issues
with real-time video streaming in hostile wireless
environment still exist. In this article, we introduce
CoSA, a link-aware real-time video streaming system for
mobile devices. The proposed system utilizes a 3D
camera to distinguish the region of importance (ROI)
and non-ROI region within the video frame. Based on the
link-state feedback from the receiver, the proposed
system allocates a higher bandwidth for the region that
is classified as ROI and a lower bandwidth for non-ROI
in the video stream by reducing the video's bit rate.
We implemented CoSA in a real test-bed where the IEEE
802.11 is employed as a medium for wireless networking.
Furthermore, we verified the effectiveness of the
proposed system by conducting a thorough empirical
study. The results indicate that the proposed system
enables real-time video streaming while maintaining a
consistent visual quality by dynamically reconfiguring
video coding parameters according to the link
quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2015:CAM,
author = "Ming-Ju Wu and Jyh-Shing R. Jang",
title = "Combining Acoustic and Multilevel Visual Features for
Music Genre Classification",
journal = j-TOMM,
volume = "12",
number = "1",
pages = "10:1--10:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2801127",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Aug 28 06:14:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Most music genre classification approaches extract
acoustic features from frames to capture timbre
information, leading to the common framework of
bag-of-frames analysis. However, time-frequency
analysis is also vital for modeling music genres. This
article proposes multilevel visual features for
extracting spectrogram textures and their temporal
variations. A confidence-based late fusion is proposed
for combining the acoustic and visual features. The
experimental results indicated that the proposed method
achieved an accuracy improvement of approximately 14\%
and 2\% in the world's largest benchmark dataset (MASD)
and Unique dataset, respectively. In particular, the
proposed approach won the Music Information Retrieval
Evaluation eXchange (MIREX) music genre classification
contests from 2011 to 2013, demonstrating the
feasibility and necessity of combining acoustic and
visual features for classifying music genres.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{She:2015:ISI,
author = "James She and Alvin Chin and Feng Xia and Jon
Crowcroft",
title = "Introduction to: Special Issue on {Smartphone}-Based
Interactive Technologies, Systems, and Applications",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "11:1--11:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2820398",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhu:2015:SSB,
author = "Biao Zhu and Hongxin Zhang and Wei Chen and Feng Xia
and Ross Maciejewski",
title = "{ShotVis}: {Smartphone}-Based Visualization of {OCR}
Information from Images",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "12:1--12:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808210",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "While visualization has been widely used as a data
presentation tool in both desktop and mobile devices,
the rapid visualization of information from images is
still underexplored. In this work, we present a
smartphone image acquisition and visualization approach
for text-based data. Our prototype, ShotVis, takes
images of text captured from mobile devices and
extracts information for visualization. First,
scattered characters in the text are processed and
interactively reformulated to be stored as structured
data (i.e., tables of numbers, lists of words,
sentences). From there, ShotVis allows users to
interactively bind visual forms to the underlying data
and produce visualizations of the selected forms
through touch-based interactions. In this manner,
ShotVis can quickly summarize text from images into
word clouds, scatterplots, and various other
visualizations all through a simple click of the
camera. In this way, ShotVis facilitates the
interactive exploration of text data captured via
cameras in smartphone devices. To demonstrate our
prototype, several case studies are presented, along
with a user study that shows the effectiveness of
our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Venkatagiri:2015:ALG,
author = "Seshadri Padmanabha Venkatagiri and Mun Choon Chan and
Wei Tsang Ooi",
title = "Automated Link Generation for Sensor-Enriched
{Smartphone} Images",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "13:1--13:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808209",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The ubiquity of the smartphones makes them ideal
platforms for generating in-situ content. In
well-attended events, photos captured by attendees have
diverse views that can be subject to occlusion and
abnormal lighting effects that obscure the view.
Such unstructured photo collections also have
significant redundancy. Thus, a scene that is partially
occluded or has bad contrast in one photo may be
captured in another photo, possibly with higher
details. We propose an application called Autolink that
automatically establishes content-based links between
sensor-annotated photos in unstructured photo
collections captured using smartphones, such that users
could navigate between high-context and high-detail
images. This hierarchically structured image collection
facilitates the design of applications for navigation
and discovery, analytics about user photography
patterns, user taste, and content/event popularity.
Autolink includes a framework that constructs this
hierarchy efficiently and with little content-specific
training data by combining photo content processing
with associated sensor logs obtained from multiple
participants. We evaluated the performance of Autolink
on two real-world sensor tagged photo datasets. The
results show that Autolink is able to efficiently
cluster photos into the appropriate hierarchy 20 times
faster than candidate algorithms, with at least 70\%
precision and 37\% better recall than those
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chu:2015:VCS,
author = "Chung-Hua Chu",
title = "Visual Comfort for Stereoscopic {$3$D} by Using Motion
Sensors on {$3$D} Mobile Devices",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "14:1--14:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808211",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Advanced 3D mobile devices attract a lot of attentions
for 3D visualization nowadays. Stereoscopic images and
video taken from the 3D mobile devices are
uncomfortable for 3D viewing experiences due to the
limited hardware for stereoscopic 3D stabilization. The
existing stereoscopic 3D stabilization methods are
computationally inefficient for the 3D mobile devices.
In this article, we point out that this critical issue
deteriorates the 3D viewing experiences on the 3D
mobile devices. To improve visual comfort, we propose
an efficient and effective algorithm to stabilize the
stereoscopic images and video for the 3D mobile
devices. To rectify the video jitter, we use the
gyroscope and accelerometer embedded on the mobile
devices to obtain the geometry information of the
cameras. Using a different method than
video-content-based motion estimation, our algorithm
based on the gyroscope and acceleration data can
achieve higher accuracy to effectively stabilize the
video. Therefore, our approach is robust in video
stabilization even under poor lighting and substantial
foreground motion. Our algorithm outperforms previous
approaches not only in running time but also in the
visual comfort of stereoscopic 3D visualization on
3D mobile devices.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2015:ECA,
author = "Kaikai Liu and Xiaolin Li",
title = "Enabling Context-Aware Indoor Augmented Reality via
{Smartphone} Sensing and Vision Tracking",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "15:1--15:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808208",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Augmented reality (AR) aims to render the world that
users see and overlay information that reflects the
real physical dynamics. The digital view could be
potentially projected near the Point-of-Interest (POI)
in a way that makes the virtual view attached to the
POI even when the camera moves. Achieving smooth
support for movements is a subject of extensive
study. One of the key problems is where the augmented
information should be added to the field of vision in
real time. Existing solutions either leverage GPS
location for rendering outdoor AR views (hundreds of
kilometers away) or rely on image markers for
small-scale presentation (only for the marker region).
To realize AR applications under various scales and
dynamics, we propose a suite of algorithms for
fine-grained AR view tracking to improve the accuracy
of attitude and displacement estimation, reduce the
drift, eliminate the marker, and lower the computation
cost. Instead of requiring extremely accurate absolute
locations, we propose multimodal solutions according to
mobility levels without additional hardware
requirements. Experimental results demonstrate
significantly less error in projecting and tracking the
AR view. These results are expected to make users
excited to explore their surroundings with enriched
content.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ahn:2015:SHG,
author = "Junho Ahn and James Williamson and Mike Gartrell and
Richard Han and Qin Lv and Shivakant Mishra",
title = "Supporting Healthy Grocery Shopping via Mobile
Augmented Reality",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "16:1--16:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808207",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Augmented reality (AR) applications have recently
become popular on modern smartphones. We explore the
effectiveness of this mobile AR technology in the
context of grocery shopping, in particular as a means
to assist shoppers in making healthier decisions as
they decide which grocery products to buy. We construct
an AR-assisted mobile grocery-shopping application that
makes real-time, customized recommendations of healthy
products to users and also highlights products to avoid
for various types of health concerns, such as allergies
to milk or nut products, low-sodium or low-fat diets,
and general caloric intake. We have implemented a
prototype of this AR-assisted mobile grocery shopping
application and evaluated its effectiveness in grocery
store aisles. Our application's evaluation with typical
grocery shoppers demonstrates that AR overlay tagging
of products reduces the search time to find healthy
food items, and that coloring the tags helps to improve
the user's ability to quickly and easily identify
recommended products, as well as products to avoid. We
have evaluated our application's functionality by
analyzing the data we collected from 15 subjects who
shopped in an actual grocery store and from 104 online
application survey participants.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ma:2015:PUC,
author = "Sixuan Ma and Zheng Yan",
title = "{PSNController}: an Unwanted Content Control System in
Pervasive Social Networking Based on Trust Management",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "17:1--17:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808206",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Pervasive social networking (PSN) supports online and
instant social activities and communications in a
universal and pervasive manner on the basis of
heterogeneous networks. However, at the same time, when
mobile users expect useful and valuable contents via
PSN, they may also receive unwanted, unexpected, or
even malicious contents. These contents may intrude
user devices, occupy device memories, and irritate
mobile users. Unwanted content control in PSN has
become a crucial issue that impacts the success of PSN
usage. Nowadays, the literature still lacks a robust
and generic unwanted content control system that can be
practically applied. In this article, we present the
design and implementation of PSNController, an unwanted
content control system in PSN based on trust
management. We evaluate the system performance under a
variety of intrusions and attacks. The results show that
the system is effective with regard to accuracy,
efficiency, and robustness, and that it can control
unwanted contents in PSN according to trust evaluation.
We further study user acceptance of the PSNController
prototype system through a small-scale user study and
receive sound user feedback on PSNController with
regard to perceived ease of use, perceived usefulness,
interface design, playfulness, and acceptance
attitude.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hao:2015:LEP,
author = "Fei Hao and Mingjie Jiao and Geyong Min and Laurence
T. Yang",
title = "Launching an Efficient Participatory Sensing Campaign:
a Smart Mobile Device-Based Approach",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "18:1--18:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808198",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Participatory sensing is a promising sensing paradigm
that enables collection, processing, dissemination and
analysis of the phenomena of interest by ordinary
citizens through their handheld sensing devices.
Participatory sensing has huge potential in many
applications, such as smart transportation and air
quality monitoring. However, participants may submit
low-quality, misleading, inaccurate, or even malicious
data if a participatory sensing campaign is not
launched effectively. Therefore, it has become a
significant issue to establish an efficient
participatory sensing campaign for improving the data
quality. This article proposes a novel five-tier
framework of participatory sensing and addresses
several technical challenges in this proposed framework
including: (1) optimized deployment of data collection
points (DC-points); and (2) efficient recruitment
strategy of participants. Toward this end, the
deployment of DC-points is formulated as an
optimization problem that maximizes sensor utilization,
and then a Wise-Dynamic DC-points Deployment (WD3)
algorithm is designed for high-quality sensing.
Furthermore, to guarantee the reliable sensing data
collection and communication, a trajectory-based
strategy for participant recruitment is proposed to
enable campaign organizers to identify well-suited
participants for data sensing based on a joint
consideration of temporal availability, trust, and
energy. Extensive experiments and performance analysis
of the proposed framework and associated algorithms are
conducted. The results demonstrate that the proposed
algorithm can achieve good sensing coverage with a
smaller number of DC-points and that well-suited
participants, termed social sensors, are easily
selected, confirming the feasibility and extensibility
of the proposed recruitment strategies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rawat:2015:CAP,
author = "Yogesh Singh Rawat and Mohan S. Kankanhalli",
title = "Context-Aware Photography Learning for Smart Mobile
Devices",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "19:1--19:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808199",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this work we have developed a photography model
based on machine learning which can assist a user in
capturing high quality photographs. As scene
composition and camera parameters play a vital role in
aesthetics of a captured image, the proposed method
addresses the problem of learning photographic
composition and camera parameters. Further, we observe
that context is an important factor from a photography
perspective, we therefore augment the learning with
associated contextual information. The proposed method
utilizes publicly available photographs along with
social media cues and associated metainformation in
photography learning. We define context features based
on factors such as time, geolocation, environmental
conditions and type of image, which have an impact on
photography. We also propose the idea of computing the
photographic composition basis, eigenrules and
baserules, to support our composition learning. The
proposed system can be used to provide feedback to the
user regarding scene composition and camera parameters
while the scene is being captured. It can also
recommend the position in the frame where people should
stand for better composition. Moreover, it also
provides camera motion guidance for pan, tilt and zoom
to the user for improving scene composition.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Canazza:2015:ATM,
author = "Sergio Canazza and Carlo Fantozzi and Niccol`o
Pretto",
title = "Accessing Tape Music Documents on Mobile Devices",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "20:1--20:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808200",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The aim of this article is to present and discuss an
innovative methodology aimed at accessing digitized
copies of historical tape music audio documents; the
methodology leverages on the multimedia and
multisensory capabilities of mobile devices to provide
an unprecedented level of fruition. In addition to the
methodology, and stemming from it, we present an actual
software application for Android tablet devices. This
novel piece of software was designed and developed in a
multidisciplinary team involving engineers as well as
musicians, composers, and archivists. The strongest
element in our work is the fact that it follows a
rigorous process and it is based on the principles of
philological awareness; thus, it also takes into
consideration the critical points in the musicologist's
domain such as (i) the definition of preservation
(i.e., master) copy, (ii) the importance of secondary
information, (iii) the history of production and
transmission of audio documents.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2015:SCC,
author = "Xiping Hu and Junqi Deng and Jidi Zhao and Wenyan Hu
and Edith C.-H. Ngai and Renfei Wang and Johnny Shen
and Min Liang and Xitong Li and Victor C. M. Leung and
Yu-Kwong Kwok",
title = "{SAfeDJ}: a Crowd-Cloud Codesign Approach to
Situation-Aware Music Delivery for Drivers",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "21:1--21:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808201",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Driving is an integral part of our everyday lives, but
it is also a time when people are uniquely vulnerable.
Previous research has demonstrated that not only does
listening to suitable music while driving not impair
driving performance, but it could lead to an improved
mood and a more relaxed body state, which could improve
driving performance and promote safe driving
significantly. In this article, we propose SAfeDJ, a
smartphone-based situation-aware music recommendation
system, which is designed to turn driving into a safe
and enjoyable experience. SAfeDJ aims at helping
drivers to diminish fatigue and negative emotion. Its
design is based on novel interactive methods, which
enable in-car smartphones to orchestrate multiple
sources of sensing data and the drivers' social
context, in collaboration with cloud computing to form
a seamless crowdsensing solution. This solution enables
different smartphones to collaboratively recommend
preferable music to drivers according to each driver's
specific situations in an automated and intelligent
manner. Practical experiments of SAfeDJ have proved its
effectiveness in music-mood analysis, and mood-fatigue
detections of drivers with reasonable computation and
communication overheads on smartphones. Also, our user
studies have demonstrated that SAfeDJ helps to decrease
fatigue degree and negative mood degree of drivers by
49.09\% and 36.35\%, respectively, compared to a
traditional smartphone-based music player under similar
driving situations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Baldauf:2015:ISG,
author = "Matthias Baldauf and Peter Fr{\"o}hlich and Florence
Adegeye and Stefan Suette",
title = "Investigating On-Screen Gamepad Designs for
{Smartphone}-Controlled Video Games",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "22:1--22:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808202",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "On-screen gamepads are increasingly used as
controllers for video games on distant screens, yet
lack the typical tactile feedback known from hardware
controllers. We conducted a comparative lab study to
investigate four smartphone gamepads inspired by
traditional game controllers and mobile game controls
(directional buttons, directional pad, floating
joystick, tilt control). The study consisted of both
completing a formal control test as well as controlling
two popular video games of different genres (Pac-Man
and Super Mario Bros.). The results indicate that the
directional buttons require the most attention from the
user but work precisely for direction-restricted
navigational tasks. The directional pad and joystick showed
a similar performance, yet they encourage drifting and
unintended operations when the user is focused on the
remote screen. While currently unfamiliar to many
users, the floating joystick can reduce the glances at
the device. Tilt control turned out to be insufficiently
precise and quick for the investigated tasks. The
article concludes with derived design guidelines with
easily realizable measures for typical contexts such as
casual gaming at home or spontaneous gaming on public
displays.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bental:2015:SSL,
author = "Diana S. Bental and Eliza Papadopoulou and Nicholas K.
Taylor and M. Howard Williams and Fraser R. Blackmun
and Idris S. Ibrahim and Mei Yii Lim and Ioannis
Mimtsoudis and Stuart W. Whyte and Edel Jennings",
title = "Smartening Up the Student Learning Experience with
Ubiquitous Media",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "23:1--23:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808203",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article describes how an experimental platform
for social, mobile and ubiquitous computing has been
used in a wide-ranging longitudinal ``in the wild''
case study of the platform with a set of third-party
services. The article outlines some of the relevant
aspects of the platform, including built-in support for
community formation, for context sensitivity, automated
learning and adaptation to the user, and for management
of privacy and trust relationships. The platform
architecture is based on the notion of Cooperating
Smart Spaces (CSSs), where a CSS is a partition of the
platform corresponding to a single user and distributed
over the devices belonging to that user. Three of the
case study services were intended for use in a physical
environment specifically created to support ubiquitous
intelligence; they were highly interactive and used
shared screens, voice input and gestural interaction.
Another three ubiquitous services were available
throughout the university environment as mobile and
desktop services. The case study exploited this
architecture's ability to integrate multiple novel
applications and interface devices and to deliver them
flexibly in these different environments. The platform
proved to be stable and reliable and the study shows
that treating a provider of services and resources (the
University) as a CSS is instrumental in enabling the
platform to provide this range of services across
differing environments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hung:2015:ISI,
author = "Hayley Hung and George Toderici",
title = "Introduction to: Special Issue on Extended Best Papers
from {ACM Multimedia 2014}",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "24:1--24:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2820400",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kim:2015:ERD,
author = "Yelin Kim and Emily Mower Provost",
title = "Emotion Recognition During Speech Using Dynamics of
Multiple Regions of the Face",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "25:1--25:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808204",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The need for human-centered, affective multimedia
interfaces has motivated research in automatic emotion
recognition. In this article, we focus on facial
emotion recognition. Specifically, we target a domain
in which speakers produce emotional facial expressions
while speaking. The main challenge of this domain is
the presence of modulations due to both emotion and
speech. For example, an individual's mouth movement may
be similar when he smiles and when he pronounces the
phoneme /IY/, as in ``cheese''. The result of this
confusion is a decrease in performance of facial
emotion recognition systems. In our previous work, we
investigated the joint effects of emotion and speech on
facial movement. We found that it is critical to employ
proper temporal segmentation and to leverage knowledge
of spoken content to improve classification
performance. In the current work, we investigate the
temporal characteristics of specific regions of the
face, such as the forehead, eyebrow, cheek, and mouth.
We present methodology that uses the temporal patterns
of specific regions of the face in the context of a
facial emotion recognition system. We test our proposed
approaches on two emotion datasets, the IEMOCAP and
SAVEE datasets. Our results demonstrate that the
combination of emotion recognition systems based on
different facial regions improves overall accuracy
compared to systems that do not leverage different
characteristics of individual regions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Feng:2015:CAC,
author = "Fangxiang Feng and Xiaojie Wang and Ruifan Li and
Ibrar Ahmad",
title = "Correspondence Autoencoders for Cross-Modal
Retrieval",
journal = j-TOMM,
volume = "12",
number = "1s",
pages = "26:1--26:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2808205",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 21 16:37:02 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article considers the problem of cross-modal
retrieval, such as using a text query to search for
images and vice-versa. Based on different autoencoders,
several novel models are proposed here for solving this
problem. These models are constructed by correlating
hidden representations of a pair of autoencoders. A
novel optimal objective, which minimizes a linear
combination of the representation learning errors for
each modality and the correlation learning error
between hidden representations of two modalities, is
used to train the model as a whole. Minimizing the
correlation learning error forces the model to learn
hidden representations with only common information in
different modalities, while minimizing the
representation learning error makes hidden
representations good enough to reconstruct inputs of
each modality. To balance the two kinds of errors
induced by representation learning and correlation
learning, we set a specific parameter in our models.
Furthermore, according to the modalities the models
attempt to reconstruct, they are divided into two
groups. One group, including three models, is named
multimodal reconstruction correspondence autoencoder
since it reconstructs both modalities. The other group,
including two models, is named unimodal reconstruction
correspondence autoencoder since it reconstructs a
single modality. The proposed models are evaluated on
three publicly available datasets, and our experiments
demonstrate that our proposed correspondence
autoencoders perform significantly better than three
canonical correlation analysis based models and two
popular multimodal deep models on cross-modal retrieval
tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2016:SPS,
author = "Longyu Zhang and Haiwei Dong and Abdulmotaleb {El
Saddik}",
title = "From {$3$D} Sensing to Printing: a Survey",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "27:1--27:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818710",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Three-dimensional (3D) sensing and printing
technologies have reshaped our world in recent years.
In this article, a comprehensive overview of techniques
related to the pipeline from 3D sensing to printing is
provided. We compare the latest 3D sensors and 3D
printers and introduce several sensing, postprocessing,
and printing techniques available from both commercial
deployments and published research. In addition, we
demonstrate several devices, software, and experimental
results of our related projects to further elaborate
details of this process. A case study is conducted to
further illustrate the possible tradeoffs during the
process of this pipeline. Current progress, future
research trends, and potential risks of 3D technologies
are also discussed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Petrangeli:2016:QDR,
author = "Stefano Petrangeli and Jeroen Famaey and Maxim Claeys
and Steven Latr{\'e} and Filip {De Turck}",
title = "{QoE}-Driven Rate Adaptation Heuristic for Fair
Adaptive Video Streaming",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "28:1--28:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818361",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "HTTP Adaptive Streaming (HAS) is quickly becoming the
de facto standard for video streaming services. In HAS,
each video is temporally segmented and stored in
different quality levels. Rate adaptation heuristics,
deployed at the video player, allow the most
appropriate level to be dynamically requested, based on
the current network conditions. It has been shown that
today's heuristics underperform when multiple clients
consume video at the same time, due to fairness issues
among clients. Concretely, this means that different
clients negatively influence each other as they compete
for shared network resources. In this article, we
propose a novel rate adaptation algorithm called FINEAS
(Fair In-Network Enhanced Adaptive Streaming), capable
of increasing clients' Quality of Experience (QoE) and
achieving fairness in a multiclient setting. A key
element of this approach is an in-network system of
coordination proxies in charge of facilitating fair
resource sharing among clients. The strength of this
approach is threefold. First, fairness is achieved
without explicit communication among clients and thus
no significant overhead is introduced into the network.
Second, the system of coordination proxies is
transparent to the clients, that is, the clients do not
need to be aware of its presence. Third, the HAS
principle is maintained, as the in-network components
only provide the clients with new information and
suggestions, while the rate adaptation decision remains
the sole responsibility of the clients themselves. We
evaluate this novel approach through simulations, under
highly variable bandwidth conditions and in several
multiclient scenarios. We show how the proposed
approach can improve fairness up to 80\% compared to
state-of-the-art HAS heuristics in a scenario with
three networks, each containing 30 clients streaming
video at the same time.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sun:2016:SOR,
author = "Shaoyan Sun and Wengang Zhou and Qi Tian and Houqiang
Li",
title = "Scalable Object Retrieval with Compact Image
Representation from Generic Object Regions",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "29:1--29:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818708",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In content-based visual object retrieval, image
representation is one of the fundamental issues in
improving retrieval performance. Existing works adopt
either local SIFT-like features or holistic features,
and may suffer sensitivity to noise or poor
discrimination power. In this article, we propose a
compact representation for scalable object retrieval
from few generic object regions. The regions are
identified with a general object detector and are
described with a fusion of learning-based features and
aggregated SIFT features. Further, we compress feature
representation in large-scale image retrieval
scenarios. We evaluate the performance of the proposed
method on two public ground-truth datasets, with
promising results. Experimental results on a
million-scale image database demonstrate superior
retrieval accuracy with efficiency gain in both
computation and memory usage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ebrahim:2016:MIB,
author = "Mansoor Ebrahim and Wai Chong Chia",
title = "Multiview Image Block Compressive Sensing with Joint
Multiphase Decoding for Visual Sensor Network",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "30:1--30:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818712",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, a multiview image compression
framework, which involves the use of Block-based
Compressive Sensing (BCS) and Joint Multiphase Decoding
(JMD), is proposed for a Visual Sensor Network (VSN).
In the proposed framework, one of the sensor nodes is
configured to serve as the reference node, the others
as nonreference nodes. The images are encoded
independently using the BCS to produce two observed
measurements that are transmitted to the host
workstation. In this case, the nonreference nodes
always encode their images (I$_{NR}$) at a lower
subrate when compared with the images from the
reference node (I$_R$). The idea is to improve the
reconstruction of I$_{NR}$ using I$_R$. After the two
observed measurements are received by the host
workstation, they are first decoded independently, then
image registration is applied to align I$_R$ onto the
same plane of I$_{NR}$. The aligned I$_R$ is then fused
with I$_{NR}$, using wavelets to produce the projected
image I$_P$. Subsequently, the difference between the
measurements of the I$_P$ and I$_{NR}$ is calculated.
The difference is then decoded and added to I$_P$ to
produce the final reconstructed I$_{NR}$. The
simulation results show that the proposed framework is
able to improve the quality of I$_{NR}$ on average by
2 dB to 3 dB at lower subrates when compared with other
Compressive Sensing (CS)--based multiview image
compression frameworks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pang:2016:OQA,
author = "Lei Pang and Chong-Wah Ngo",
title = "Opinion Question Answering by Sentiment Clip
Localization",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "31:1--31:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818711",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article considers multimedia question answering
beyond factoid and how-to questions. We are interested
in searching videos for answering opinion-oriented
questions that are controversial and hotly debated.
Examples of questions include ``Should Edward Snowden
be pardoned?'' and ``Obamacare-unconstitutional or
not?''. These questions often invoke emotional
response, either positively or negatively, hence are
likely to be better answered by videos than texts, due
to the vivid display of emotional signals visible
through facial expression and speaking tone.
Nevertheless, a potential answer of duration 60s may be
embedded in a video of 10min, resulting in degraded
user experience compared to reading the answer in text
only. Furthermore, a text-based opinion question may be
short and vague, while the video answers could be
verbal, less structured grammatically, and noisy
because of errors in speech transcription. Direct
matching of words or syntactic analysis of sentence
structure, such as adopted by factoid and how-to
question-answering, is unlikely to find video answers.
The first problem, the answer localization, is
addressed by audiovisual analysis of the emotional
signals in videos for locating video segments likely
expressing opinions. The second problem, questions and
answers matching, is tackled by a deep architecture
that nonlinearly matches text words in questions and
speeches in videos. Experiments are conducted on eight
controversial topics based on questions crawled from
Yahoo! Answers and Internet videos from YouTube.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Papapanagiotou:2016:ICB,
author = "Vasileios Papapanagiotou and Christos Diou and
Anastasios Delopoulos",
title = "Improving Concept-Based Image Retrieval with Training
Weights Computed from Tags",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "32:1--32:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2790230",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents a novel approach to training
classifiers for concept detection using tags and a
variant of Support Vector Machine that enables the
usage of training weights per sample. Combined with an
appropriate tag weighting mechanism, more relevant
samples play a more important role in the calibration
of the final concept-detector model. We propose a
complete, automated framework that (i) calculates
relevance scores for each image-concept pair based on
image tags, (ii) transforms the scores into relevance
probabilities and automatically annotates each image
according to this probability, (iii) transforms either
the relevance scores or the probabilities into
appropriate training weights and finally, (iv)
incorporates the training weights and the visual
features into a Fuzzy Support Vector Machine classifier
to build the concept-detector model. The framework can
be applied to online public collections, by gathering a
large pool of diverse images, and using the calculated
probability to select a training set and the associated
training weights. To evaluate our argument, we
experiment on two large annotated datasets. Experiments
highlight the retrieval effectiveness of the proposed
approach. Furthermore, experiments with various levels
of annotation error show that using weights derived
from tags significantly increases the robustness of the
resulting concept detectors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2016:AGV,
author = "Xuyong Yang and Tao Mei and Ying-Qing Xu and Yong Rui
and Shipeng Li",
title = "Automatic Generation of Visual-Textual Presentation
Layout",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "33:1--33:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2818709",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Visual-textual presentation layout (e.g., digital
magazine cover, poster, Power Point slides, and any
other rich media), which combines beautiful image and
overlaid readable texts, can result in an eye candy
touch to attract users' attention. The designing of
visual-textual presentation layout is therefore
becoming ubiquitous in both commercially printed
publications and online digital magazines. However,
handcrafting aesthetically compelling layouts still
remains challenging for many small businesses and
amateur users. This article presents a system to
automatically generate visual-textual presentation
layouts by investigating a set of aesthetic design
principles, through which an average user can easily
create visually appealing layouts. The system is
attributed with a set of topic-dependent layout
templates and a computational framework integrating
high-level aesthetic principles (in a top-down manner)
and low-level image features (in a bottom-up manner).
The layout templates, designed with prior knowledge
from domain experts, define spatial layouts, semantic
colors, harmonic color models, and font emotion and
size constraints. We formulate the typography as an
energy optimization problem by minimizing the cost of
text intrusion, the utility of visual space, and the
mismatch of information importance in perception and
semantics, constrained by the automatically selected
template and further preserving color harmonization. We
demonstrate that our designs achieve the best reading
experience compared with the reimplementation of parts
of existing state-of-the-art designs through a series
of user studies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2016:MCR,
author = "Xuelong Li and Mulin Chen and Qi Wang",
title = "Measuring Collectiveness via Refined Topological
Similarity",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "34:1--34:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2854000",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Crowd system has motivated a surge of interests in
many areas of multimedia, as it contains plenty of
information about crowd scenes. In crowd systems,
individuals tend to exhibit collective behaviors, and
the motion of all those individuals is called
collective motion. As a comprehensive descriptor of
collective motion, collectiveness has been proposed to
reflect the degree of individuals moving as an
entirety. Nevertheless, existing works mostly have
limitations to correctly find the individuals of a
crowd system and precisely capture the various
relationships between individuals, both of which are
essential to measure collectiveness. In this article,
we propose a collectiveness-measuring method that is
capable of quantifying collectiveness accurately. Our
main contributions are threefold: (1) we compute
relatively accurate collectiveness by making the
tracked feature points represent the individuals more
precisely with a point selection strategy; (2) we
jointly investigate the spatial-temporal information of
individuals and utilize it to characterize the
topological relationship between individuals by
manifold learning; (3) we propose a stability
descriptor to deal with the irregular individuals,
which influence the calculation of collectiveness.
Intensive experiments on the simulated and real-world
datasets demonstrate that the proposed method is able
to compute relatively accurate collectiveness and keep
high consistency with human perception.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tyson:2016:MAM,
author = "Gareth Tyson and Yehia Elkhatib and Nishanth Sastry
and Steve Uhlig",
title = "Measurements and Analysis of a Major Adult Video
Portal",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "35:1--35:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2854003",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Today, the Internet is a large multimedia delivery
infrastructure, with websites such as YouTube appearing
at the top of most measurement studies. However, most
traffic studies have ignored an important domain: adult
multimedia distribution. Whereas, traditionally, such
services were provided primarily via bespoke websites,
recently these have converged towards what is known as
``Porn 2.0''. These services allow users to upload,
view, rate, and comment on videos for free (much like
YouTube). Despite their scale, we still lack even a
basic understanding of their operation. This article
addresses this gap by performing a large-scale study of
one of the most popular Porn 2.0 websites: YouPorn. Our
measurements reveal a global delivery infrastructure
that we have repeatedly crawled to collect statistics
(on 183k videos). We use this data to characterise the
corpus, as well as to inspect popularity trends and how
they relate to other features, for example, categories
and ratings. To explore our discoveries further, we use
a small-scale user study, highlighting key system
implications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Thomee:2016:FSP,
author = "Bart Thomee and Ioannis Arapakis and David A. Shamma",
title = "Finding Social Points of Interest from Georeferenced
and Oriented Online Photographs",
journal = j-TOMM,
volume = "12",
number = "2",
pages = "36:1--36:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2854004",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 3 17:36:33 MST 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Points of interest are an important requirement for
location-based services, yet they are editorially
curated and maintained, either professionally or
through community effort. Beyond the laborious manual
annotation task, further complications arise as points
of interest may appear, relocate, or disappear over
time, and may be relevant only to specific communities.
To assist, complement, or even replace manual
annotation, we propose a novel method for the automatic
localization of points of interest depicted in photos
taken by people across the world. Our technique
exploits the geographic coordinates and the compass
direction supplied by modern cameras, while accounting
for possible measurement errors due to the variability
in accuracy of the sensors that produced them. We
statistically demonstrate that our method significantly
outperforms techniques from the research literature on
the task of estimating the geographic coordinates and
geographic footprints of points of interest in various
cities, even when photos are involved in the estimation
process that do not show the point of interest at
all.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{delBimbo:2016:PEC,
author = "Alberto del Bimbo",
title = "From the Past {Editor-In-Chief}",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "37:1--37:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2903774",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37e",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2016:SPR,
author = "Luming Zhang and Xuelong Li and Liqiang Nie and Yan
Yan and Roger Zimmermann",
title = "Semantic Photo Retargeting Under Noisy Image Labels",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "37:1--37:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2886775",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the popularity of mobile devices, photo
retargeting has become a useful technique that adapts a
high-resolution photo onto a low-resolution screen.
Conventional approaches are limited in two aspects. The
first is the de-emphasized role of semantic
content, which is many times more important than
low-level features in photo aesthetics. The second is the
importance of image spatial modeling: toward a
semantically reasonable retargeted photo, the spatial
distribution of objects within an image should be
accurately learned. To solve these two problems, we
propose a new semantically aware photo retargeting that
shrinks a photo according to region semantics. The key
technique is a mechanism transferring semantics of
noisy image labels (inaccurate labels predicted by a
learner like an SVM) into different image regions. In
particular, we first project the local aesthetic
features (graphlets in this work) onto a semantic
space, wherein image labels are selectively encoded
according to their noise level. Then, a
category-sharing model is proposed to robustly discover
the semantics of each image region. The model is
motivated by the observation that the semantic
distribution of graphlets from images tagged by a
common label remains stable in the presence of noisy
labels. Thereafter, a spatial pyramid is constructed to
hierarchically encode the spatial layout of graphlet
semantics. Based on this, a probabilistic model is
proposed to enforce the spatial layout of a retargeted
photo to be maximally similar to those from the
training photos. Experimental results show that (1)
noisy image labels predicted by different learners can
improve the retargeting performance, according to both
qualitative and quantitative analysis, and (2) the
category-sharing model stays stable even when 32.36\%
of image labels are incorrectly predicted.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhou:2016:MDD,
author = "Liang Zhou",
title = "Mobile Device-to-Device Video Distribution: Theory and
Application",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "38:1--38:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2886776",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "As video traffic has dominated the data flow of
smartphones, traditional cellular communications face
substantial transmission challenges. In this work, we
study mobile device-to-device (D2D) video distribution
that leverages the storage and communication capacities
of smartphones. In such a mobile distributed framework,
D2D communication represents an opportunistic process
to selectively store and transmit local videos to meet
the future demand of others. The performance is
measured by the service time, which denotes the elapsed
period for fulfilling the demand, and the corresponding
implementation of each device depends on the video's
demand, availability, and size. The main contributions
of this work lie in (1) considering the impact of video
size in a practical mobile D2D video distribution
scenario and proposing a general global estimation of
the video distribution based on limited and local
observations; (2) designing a purely distributed D2D
video distribution scheme without the monitoring of any
central controller; and (3) providing a practical
implementation of the scheme, which does not need to
know the video availability, user demand, and device
mobility. Numerical results have demonstrated the
efficiency and robustness of the proposed scheme.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ravi:2016:FAL,
author = "Hareesh Ravi and A. V. Subramanyam and Sabu Emmanuel",
title = "Forensic Analysis of Linear and Nonlinear Image
Filtering Using Quantization Noise",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "39:1--39:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2857069",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The availability of intelligent image editing
techniques and antiforensic algorithms, make it
convenient to manipulate an image and to hide the
artifacts that it might have produced in the process.
Real world forgeries are generally followed by the
application of enhancement techniques such as filtering
and/or conversion of the image format to suppress the
forgery artifacts. Though several techniques evolved in
the direction of detecting some of these manipulations,
additional operations like recompression, nonlinear
filtering, and other antiforensic methods during
forgery are not deeply investigated. Toward this, we
propose a robust method to detect whether a given image
has undergone filtering (linear or nonlinear) based
enhancement, possibly followed by format conversion
after forgery. In the proposed method, JPEG
quantization noise is obtained using natural image
prior and quantization noise models. Transition
probability features extracted from the quantization
noise are used for machine learning based detection and
classification. We test the effectiveness of the
algorithm in classifying the class of the filter
applied and the efficacy in detecting filtering in low
resolution images. Experiments are performed to compare
the performance of the proposed technique with
state-of-the-art forensic filtering detection
algorithms. It is found that the proposed technique is
superior in most of the cases. Also, experiments
against popular antiforensic algorithms show the
counter antiforensic robustness of the proposed
technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2016:SND,
author = "Xianjun Hu and Weiming Zhang and Ke Li and Honggang Hu
and Nenghai Yu",
title = "Secure Nonlocal Denoising in Outsourced Images",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "40:1--40:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2886777",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Signal processing in the encrypted domain becomes a
desired technique to protect privacy of outsourced data
in cloud. In this article, we propose a double-cipher
scheme to implement nonlocal means (NLM) denoising in
encrypted images. In this scheme, one ciphertext is
generated by the Paillier scheme, which enables the
mean filter, and the other is obtained by a
privacy-preserving transform, which enables the
nonlocal search. By the privacy-preserving transform,
the cloud server can search the similar pixel blocks in
the ciphertexts with the same speed as in the
plaintexts; thus, the proposed method can be executed
fast. To enhance the security, we randomly permute
both ciphertexts. To reduce the denoising complexity
caused by random permutation, a random NLM method is
exploited in the encrypted domain. The experimental
results show that the quality of denoised images in the
encrypted domain is comparable to that obtained in the
plain domain.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Calagari:2016:DPS,
author = "Kiana Calagari and Tarek Elgamal and Khaled Diab and
Krzysztof Templin and Piotr Didyk and Wojciech Matusik
and Mohamed Hefeeda",
title = "Depth Personalization and Streaming of Stereoscopic
Sports Videos",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "41:1--41:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2890103",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Current three-dimensional displays cannot fully
reproduce all depth cues used by a human observer in
the real world. Instead, they create only an illusion
of looking at a three-dimensional scene. This leads to
a number of challenges during the content creation
process. To assure correct depth reproduction and
visual comfort, either the acquisition setup has to be
carefully controlled or additional postprocessing
techniques have to be applied. Furthermore, these
manipulations need to account for a particular setup
that is used to present the content, for example,
viewing distance or screen size. This creates
additional challenges in the context of personal use
when stereoscopic content is shown on TV sets, desktop
monitors, or mobile devices. We address this problem by
presenting a new system for streaming stereoscopic
content. Its key feature is a computationally efficient
depth adjustment technique which can automatically
optimize viewing experience for videos of field sports
such as soccer, football, and tennis. Additionally, the
method enables depth personalization to allow users to
adjust the amount of depth according to their
preferences. Our stereoscopic video streaming system
was implemented, deployed, and tested with real
users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2016:ERM,
author = "Qiong Wu and Pierre Boulanger",
title = "Enhanced Reweighted {MRFs} for Efficient Fashion Image
Parsing",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "42:1--42:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2890104",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Previous image parsing methods usually model the
problem in a conditional random field which describes a
statistical model learned from a training dataset and
then processes a query image using the conditional
probability. However, for clothing images, fashion
items have a large variety of layering and
configuration, and it is hard to learn a certain
statistical model of features that applies to general
cases. In this article, we take fashion images as an
example to show how Markov Random Fields (MRFs) can
outperform Conditional Random Fields when the
application does not follow a certain statistical model
learned from the training data set. We propose a new
method for automatically parsing fashion images in high
processing efficiency with significantly less training
time by applying a modification of MRFs, named
reweighted MRF (RW-MRF), which resolves the problem of
over-smoothing infrequent labels. We further enhance
RW-MRF with an occlusion prior and a background prior to
resolve two other common problems in clothing parsing:
occlusion and background spill. Our experimental
results indicate that our proposed clothing parsing
method significantly improves processing time and
training time over state-of-the-art methods, while
ensuring comparable parsing accuracy and improving
label recall rate.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2016:ADA,
author = "Yao Hu and Chen Zhao and Deng Cai and Xiaofei He and
Xuelong Li",
title = "Atom Decomposition with Adaptive Basis Selection
Strategy for Matrix Completion",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "43:1--43:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2903716",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Estimating missing entries in matrices has attracted
much attention due to its wide range of applications
like image inpainting and video denoising, which are
usually considered as low-rank matrix completion
problems theoretically. It is common to consider
the nuclear norm as a surrogate of the rank operator since
it is the tightest convex lower bound of the rank
operator under certain conditions. However, most
approaches based on nuclear norm minimization involve a
number of singular value decomposition (SVD)
operations. Given a matrix $ X \in R^{m \times n} $,
the time complexity of the SVD operation is $ O(m n^2)
$, which brings prohibitive computational burden on
large-scale matrices, limiting the further usage of
these methods in real applications. Motivated by this
observation, a series of atom-decomposition-based
matrix completion methods have been studied. The key to
these methods is to reconstruct the target matrix by
pursuit methods in a greedy way, which only involves
the computation of the top SVD and has great advantages
in efficiency compared with the SVD-based matrix
completion methods. However, due to gradually serious
accumulation errors, atom-decomposition-based methods
usually result in unsatisfactory reconstruction
accuracy. In this article, we propose a new efficient
and scalable atom decomposition algorithm for matrix
completion called Adaptive Basis Selection Strategy
(ABSS). Different from traditional greedy atom
decomposition methods, a two-phase strategy is
conducted to generate the basis separately via
different strategies according to their different
nature. At first, we globally prune the basis space to
eliminate the unimportant basis as much as possible and
locate the probable subspace containing the most
informative basis. Then, another group of basis spaces
is learned to improve the recovery accuracy based on
local information. In this way, our proposed algorithm
breaks through the accuracy bottleneck of traditional
atom-decomposition-based matrix completion methods;
meanwhile, it reserves the innate efficiency advantages
over SVD-based matrix completion methods. We
empirically evaluate the proposed algorithm ABSS on
real visual image data and large-scale recommendation
datasets. Results have shown that ABSS has much better
reconstruction accuracy with comparable cost to
atom-decomposition-based methods. At the same time, it
outperforms the state-of-the-art SVD-based matrix
completion algorithms by similar or better
reconstruction accuracy with enormous advantages on
efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Miao:2016:HFL,
author = "Dan Miao and Jingjing Fu and Yan Lu and Shipeng Li and
Chang Wen Chen",
title = "A High-Fidelity and Low-Interaction-Delay Screen
Sharing System",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "44:1--44:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2897395",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The pervasive computing environment and wide network
bandwidth provide users more opportunities to share
screen content among multiple devices. In this article,
we introduce a remote display system to enable screen
sharing among multiple devices with high fidelity and
responsive interaction. In the developed system, the
frame-level screen content is compressed and
transmitted to the client side for screen sharing, and
the instant control inputs are simultaneously
transmitted to the server side for interaction. Even if
the screen responds immediately to the control messages
and updates at a high frame rate on the server side, it
is difficult to update the screen content with low
delay and a high frame rate on the client side due to
non-negligible time consumption on the whole screen
frame compression, transmission, and display buffer
updating. To address this critical problem, we propose
a layered structure for screen coding and rendering to
deliver diverse screen content to the client side with
an adaptive frame rate. More specifically, the
interaction content with small-region screen updates is
compressed by a blockwise screen codec and rendered at
a high frame rate to achieve smooth interaction, while
the natural video screen content is compressed by
standard video codec and rendered at a regular frame
rate for a smooth video display. Experimental results
with real applications demonstrate that the proposed
system can successfully reduce transmission bandwidth
cost and interaction delay during screen sharing.
Especially for user interaction in small regions, the
proposed system can achieve a higher frame rate than
most previous counterparts.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wilk:2016:CAV,
author = "Stefan Wilk and Stephan Kopf and Wolfgang Effelsberg",
title = "Collaborative Annotation of Videos Relying on Weak
Consistency",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "45:1--45:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2907983",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This work discusses a distributed interactive video
system that supports video annotation using
simultaneous hyperlinking by multiple users. The users
mark and annotate objects within the video with links
to other media such as text, images, websites, or other
videos. Annotations are visualized on the client user
interface as an overlay close to the objects. Our
system is intuitive to use; for example, it contains
automatic object-tracking functionality that correctly
positions the annotations, even when the form or
location of an object changes. Thus, our first
contribution discusses the adaptive object-tracking
algorithm used for this repositioning. It shows
improved precision and reliability in comparison to
nonadaptive algorithms. A second key issue is to keep
the system responsive when the number of concurrent
annotators increases. Thus, we rely on the concept of
eventual consistency between different network
entities. While this weak form of consistency allows
temporary inconsistencies, it ensures that a consistent
state can be reached. Thus, the second contribution is
the design and evaluation of our distributed
interactive video system, which relies on the weak
consistency paradigm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Merani:2016:ASP,
author = "Maria Luisa Merani and Laura Natali",
title = "Adaptive Streaming in {P2P} Live Video Systems: {A}
Distributed Rate Control Approach",
journal = j-TOMM,
volume = "12",
number = "3",
pages = "46:1--46:??",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2912123",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 16 09:38:16 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is a
recently proposed standard that offers different
versions of the same media content to adapt the
delivery process over the Internet to dynamic bandwidth
fluctuations and different user device capabilities.
The peer-to-peer (P2P) paradigm for video streaming
allows us to leverage the cooperation among peers,
guaranteeing the service of video requests with
increased scalability and reduced cost. We propose to
combine these two approaches in a P2P-DASH
architecture, exploiting the potentiality of both. The
new platform is made up of several swarms, and a different
DASH representation is streamed within each of them;
unlike client-server DASH architectures, where each
client autonomously selects which version to download
according to current network conditions and to its
device resources, we put forth a new rate control
strategy implemented at the peer site to maintain good
viewing quality for the local user and to simultaneously
guarantee the successful operation of the P2P swarms.
The effectiveness of the solution is demonstrated
through simulation and it indicates that the P2P-DASH
platform is able to provide its users with very good
performance, much more satisfying than in a
conventional P2P environment where DASH is not
employed. Through a comparison with a reference DASH
system modeled via the Integer Linear Programming (ILP)
approach, the new system is shown to outperform such
reference architecture. To further validate the
proposal, in terms of both robustness and scalability,
system behavior is investigated in the critical
condition of a flash crowd, showing that the strong
upsurge of new users can be successfully revealed and
gradually accommodated.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jia:2016:WGB,
author = "Adele Lu Jia and Siqi Shen and Dick H. J. Epema and
Alexandru Iosup",
title = "When Game Becomes Life: The Creators and Spectators of
Online Game Replays and Live Streaming",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "47:1--47:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957750",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online gaming franchises such as World of Tanks,
Defense of the Ancients, and StarCraft have attracted
hundreds of millions of users who, apart from playing
the game, also socialize with each other through gaming
and viewing gamecasts. As a form of User Generated
Content (UGC), gamecasts play an important role in user
entertainment and gamer education. They deserve the
attention of both industrial partners and the academic
communities, owing to the large amount of
revenue involved and the interesting research problems
associated with UGC sites and social networks. Although
previous work has put much effort into analyzing
general UGC sites such as YouTube, relatively little is
known about the gamecast sharing sites. In this work,
we provide the first comprehensive study of gamecast
sharing sites, including commercial streaming-based
sites such as Amazon's Twitch.tv and
community-maintained replay-based sites such as
WoTreplays. We collect and share a novel dataset on
WoTreplays that includes more than 380,000 game
replays, shared by more than 60,000 creators with more
than 1.9 million gamers. Together with an earlier
published dataset on Twitch.tv, we investigate basic
characteristics of gamecast sharing sites, and we
analyze the activities of their creators and
spectators. Among our results, we find that (i)
WoTreplays and Twitch.tv are both fast-consumed
repositories, with millions of gamecasts being
uploaded, viewed, and soon forgotten; (ii) both the
gamecasts and the creators exhibit highly skewed
popularity, with a significant heavy tail phenomenon;
and (iii) the upload and download preferences of
creators and spectators are different: while the
creators emphasize their individual skills, the
spectators appreciate team-wise tactics. Our findings
provide important knowledge for infrastructure and
service improvement, for example, in the design of
proper resource allocation mechanisms that consider
future gamecasting and in the tuning of incentive
policies that further help player retention.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Rana:2016:DBV,
author = "Shuvendu Rana and Arijit Sur",
title = "Depth-Based View-Invariant Blind {$3$D} Image
Watermarking",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "48:1--48:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957751",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the huge advance in Internet technology as well
as the availability of low-cost 3D display devices, 3D
image transmission has become popular in recent times.
Since watermarking has come to be regarded as a potential
Digital Rights Management (DRM) tool in the past
decade, 3D image watermarking is an emerging research
topic. With the introduction of the Depth Image-Based
Rendering (DIBR) technique, 3D image watermarking has become a
more challenging task, especially for synthetic view
generation. In this article, synthetic view generation
is regarded as a potential attack, and a blind
watermarking scheme is proposed that can resist it. In
the proposed scheme, the watermark is embedded into the
low-pass filtered dependent view region of 3D images.
Block Discrete Cosine Transform (DCT) is used for
spatial filtering of the dependent view region to find
the DC coefficient with horizontally shifted coherent
regions from the left and right views to make the scheme
robust against synthesis view attack. A comprehensive
set of experiments has been carried out to justify the
robustness of the proposed scheme over related existing
schemes with respect to Stereo JPEG compression and
different noise addition attacks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Silva:2016:MIB,
author = "Bruno M. C. Silva and Joel J. P. C. Rodrigues and
Neeraj Kumar and Mario L. {Proen{\c{c}}a, Jr.} and Guangjie
Han",
title = "{MobiCoop}: an Incentive-Based Cooperation Solution
for Mobile Applications",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "49:1--49:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957752",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Network architectures based on mobile devices and
wireless communications present several constraints
(e.g., processor, energy storage, and bandwidth) that
affect the overall network performance. Cooperation
strategies have been considered as a solution to
address these network limitations. In the presence of
unstable network infrastructures, mobile nodes
cooperate with each other, forwarding data and
performing other specific network functionalities. This
article proposes a generalized incentive-based
cooperation solution for mobile services and
applications called MobiCoop. This reputation-based
scheme includes an application framework for mobile
applications that uses a Web service to handle all the
nodes' reputations and network permissions. The main goal
of MobiCoop is to provide Internet services to mobile
devices without network connectivity through
cooperation with neighbor devices. The article includes
a performance evaluation study of MobiCoop considering
both a real scenario (using a prototype) and a
simulation-based study. Results show that the proposed
approach provides network connectivity independence to
users with mobile apps when Internet connectivity is
unavailable. It is concluded that MobiCoop
significantly improves the overall system performance
and the service provided for a given mobile
application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shivani:2016:PVC,
author = "Shivendra Shivani and Suneeta Agarwal",
title = "Progressive Visual Cryptography with Unexpanded
Meaningful Shares",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "50:1--50:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2935618",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The traditional $k$-out-of-$n$ Visual Cryptography
(VC) scheme is based on the concept of ``all or nothing'' for
$n$ participants to share a secret image. The original
secret image can be visually revealed only when a
subset of $k$ or more shares is superimposed,
but if the number of stacked shares is less than $k$,
nothing will be revealed. On the other hand, a
Progressive Visual Cryptography (PVC) scheme differs
from the traditional VC with respect to decoding. In
PVC, clarity and contrast of the decoded secret image
will be increased progressively with the number of
stacked shares. Much of the existing state-of-the-art
research on PVC has problems with pixel expansion and
random patterns of the shares. In this article, a novel
scheme of progressive visual cryptography with four or
more unexpanded as well as meaningful shares
is proposed. For this, a novel and efficient
Candidate Block Replacement preprocessing approach and
a basis matrix creation algorithm have also been
introduced. The proposed method also eliminates many
unnecessary encryption constraints like a predefined
codebook for encoding and decoding the secret image,
restriction on the number of participants, and so on.
From the experiments, it is observed that the
reconstruction probability of black pixels in the
decoded image corresponding to the black pixel in the
secret image is always 1, whereas that of white pixels
is 0.5 irrespective of the meaningful contents visible
in the shares, thus ensuring that the contrast is
always 50\%. Therefore, a reconstructed image can be
easily identified by a human visual system without any
computation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ademoye:2016:AME,
author = "Oluwakemi A. Ademoye and Niall Murray and Gabriel-Miro
Muntean and Gheorghita Ghinea",
title = "Audio Masking Effect on Inter-Component Skews in
Olfaction-Enhanced Multimedia Presentations",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "51:1--51:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957753",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Media-rich content plays a vital role in consumer
applications today, as these applications try to find
new and interesting ways to engage their users. Video,
audio, and the more traditional forms of media content
continue to dominate with respect to the use of media
content to enhance the user experience. Tactile
interactivity has also now become widely popular in
modern computing applications, while our olfactory and
gustatory senses continue to have a limited role.
However, in recent times, there have been significant
advancements regarding the use of olfactory media
content (i.e., smell), and there are a variety of
devices now available to enable its computer-controlled
emission. This paper explores the impact of the audio
stream on user perception of olfactory-enhanced video
content in the presence of skews between the olfactory
and video media. This research uses the results from
two experimental studies of user-perceived quality of
olfactory-enhanced multimedia, where audio was present
and absent, respectively. Specifically, the paper shows
that the user Quality of Experience (QoE) is generally
higher in the absence of audio for nearly perfectly
synchronized olfactory-enhanced multimedia
presentations (i.e., an olfactory media skew between
-10s and +10s); however, for greater olfactory media
skews (ranging from -30s to -10s or from +10s to +30s) user QoE
is higher when the audio stream is present. It can be
concluded that the presence of the audio has the
ability to mask larger synchronization skews between
the other media components in olfaction-enhanced
multimedia presentations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhong:2016:FED,
author = "Sheng-Hua Zhong and Yan Liu and Kien A. Hua",
title = "Field Effect Deep Networks for Image Recognition with
Incomplete Data",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "52:1--52:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957754",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Image recognition with incomplete data is a well-known
hard problem in computer vision and machine learning.
This article proposes a novel deep learning technique
called Field Effect Bilinear Deep Networks (FEBDN) for
this problem. To address the difficulties of
recognizing incomplete data, we design a novel
second-order deep architecture with the Field Effect
Restricted Boltzmann Machine, which models the
reliability of the delivered information according to
the availability of the features. Based on this new
architecture, we propose a new three-stage learning
procedure with field effect bilinear initialization,
field effect abstraction and estimation, and global
fine-tuning with missing features adjustment. By
integrating the reliability of features into the new
learning procedure, the proposed FEBDN can jointly
determine the classification boundary and estimate the
missing features. FEBDN has demonstrated impressive
performance on recognition and estimation tasks in
various standard datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yan:2016:UVR,
author = "Ming Yan and Jitao Sang and Changsheng Xu and M.
Shamim Hossain",
title = "A Unified Video Recommendation by Cross-Network User
Modeling",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "53:1--53:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2957755",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online video sharing sites are increasingly
encouraging their users to connect to the social
network venues such as Facebook and Twitter, with goals
to boost user interaction and better disseminate the
high-quality video content. This in turn provides huge
possibilities to conduct cross-network collaboration
for personalized video recommendation. However, very
few efforts have been devoted to leveraging users'
social media profiles in the auxiliary network to
capture and personalize their video preferences, so as
to recommend videos of interest. In this article, we
propose a unified YouTube video recommendation solution
by transferring and integrating users' rich social and
content information in the Twitter network. While general
recommender systems often suffer from typical problems
like cold-start and data sparsity, our proposed
recommendation solution is able to effectively learn
from users' abundant auxiliary information on Twitter
for enhanced user modeling and to address these typical
problems in a unified framework. In this framework, two
stages are mainly involved: (1) auxiliary-network data
transfer, where user preferences are transferred from
an auxiliary network by learning cross-network
knowledge associations; and (2) cross-network data
integration, where transferred user preferences are
integrated with the observed behaviors on a target
network in an adaptive fashion. Experimental results
show that the proposed cross-network collaborative
solution achieves superior performance not only in
terms of accuracy, but also in improving the diversity
and novelty of the recommended videos.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jiang:2016:CVI,
author = "Yijing Jiang and Shanyu Tang and Liping Zhang and
Muzhou Xiong and Yau Jim Yip",
title = "Covert Voice over {Internet} Protocol Communications
with Packet Loss Based on Fractal Interpolation",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "54:1--54:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2961053",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The last few years have witnessed an explosive growth
in the research of information hiding in multimedia
objects, but few studies have taken into account packet
loss in multimedia networks. As one of the most popular
real-time services in the Internet, Voice over Internet
Protocol (VoIP) contributes to a large part of network
traffic owing to its advantages of real-time delivery, high
flow, and low cost. Packet loss is thus inevitable in multimedia
networks and affects the performance of VoIP
communications. In this study, a fractal-based VoIP
steganographic approach was proposed to realize covert
VoIP communications in the presence of packet loss. In
the proposed scheme, secret data to be hidden were
divided into blocks after being encrypted with the
block cipher, and each block of the secret data was
then embedded into VoIP streaming packets. The VoIP
packets went through a packet-loss system based on
the Gilbert model, which simulates a real network situation.
And a prediction model based on fractal interpolation
was built to decide whether a VoIP packet was suitable
for data hiding. The experimental results indicated
that the speech quality degradation increased with the
escalating packet-loss level. The average variance of
speech quality metrics (PESQ score) between the
``no-embedding'' speech samples and the
``with-embedding'' stego-speech samples was about
0.717, and the variances narrowed with the increasing
packet-loss level. Both the average PESQ scores and the
SNR values of stego-speech samples and the
data-retrieving rates had almost the same varying
trends when the packet-loss level increased, indicating
that the success rate of the fractal prediction model
played an important role in the performance of covert
VoIP communications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2016:SFM,
author = "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu",
title = "Semantic Feature Mining for Video Event
Understanding",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "55:1--55:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2962719",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Content-based video understanding is extremely
difficult due to the semantic gap between low-level
vision signals and the various semantic concepts
(object, action, and scene) in videos. Though feature
extraction from videos has achieved significant
progress, most of the previous methods rely only on
low-level features, such as the appearance and motion
features. Recently, visual-feature extraction has been
improved significantly with machine-learning
algorithms, especially deep learning. However, there is
still not enough work focusing on extracting semantic
features from videos directly. The goal of this article
is to adopt unlabeled videos with the help of text
descriptions to learn an embedding function, which can
be used to extract more effective semantic features
from videos when only a few labeled samples are
available for video recognition. To achieve this goal,
we propose a novel embedding convolutional neural
network (ECNN). We evaluate our algorithm by comparing
its performance on three challenging benchmarks with
several popular state-of-the-art methods. Extensive
experimental results show that the proposed ECNN
consistently and significantly outperforms the existing
methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Nilsson:2016:ASD,
author = "Tommy Nilsson and Carl Hogsden and Charith Perera and
Saeed Aghaee and David J. Scruton and Andreas Lund and
Alan F. Blackwell",
title = "Applying Seamful Design in Location-Based Mobile
Museum Applications",
journal = j-TOMM,
volume = "12",
number = "4",
pages = "56:1--56:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2962720",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 25 07:28:05 MDT 2016",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The application of mobile computing is currently
altering patterns of our behavior to a greater degree
than perhaps any other invention. In combination with
the introduction of power-efficient wireless
communication technologies, such as Bluetooth Low
Energy (BLE), designers are today increasingly
empowered to shape the way we interact with our
physical surroundings and thus build entirely new
experiences. However, our evaluations of BLE and its
abilities to facilitate mobile location-based
experiences in public environments revealed a number of
potential problems. Most notably, the position and
orientation of the user in combination with various
environmental factors, such as crowds of people
traversing the space, were found to cause major
fluctuations of the received BLE signal strength. These
issues are rendering a seamless functioning of any
location-based application practically impossible.
Instead of achieving seamlessness by eliminating these
technical issues, we thus choose to advocate the use of
a seamful approach, that is, to reveal and exploit
these problems and turn them into a part of the actual
experience. In order to demonstrate the viability of
this approach, we designed, implemented, and evaluated
the Ghost Detector, an educational location-based
museum game for children. By presenting a qualitative
evaluation of this game and by motivating our design
decisions, this article provides insight into some of
the challenges and possible solutions connected to the
process of developing location-based BLE-enabled
experiences for public cultural spaces.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yan:2017:LCI,
author = "Zheng Yan",
title = "Learning from Collective Intelligence: Feature
Learning Using Social Images and Tags",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2978656",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Feature representation for visual content is the key
to the progress of many fundamental applications such
as annotation and cross-modal retrieval. Although
recent advances in deep feature learning offer a
promising route towards these tasks, they are limited
in application domains where high-quality and
large-scale training data are expensive to obtain. In
this article, we propose a novel deep feature learning
paradigm based on social collective intelligence, which
can be acquired from the inexhaustible social
multimedia content on the Web, in particular, largely
social images and tags. Differing from existing feature
learning approaches that rely on high-quality
image-label supervision, our weak supervision is
acquired by mining the visual-semantic embeddings from
noisy, sparse, and diverse social image collections.
The resultant image-word embedding space can be used to
(1) fine-tune deep visual models for low-level feature
extractions and (2) seek sparse representations as
high-level cross-modal features for both image and
text. We offer an easy-to-use implementation for the
proposed paradigm, which is fast and compatible with
any state-of-the-art deep architectures. Extensive
experiments on several benchmarks demonstrate that the
cross-modal features learned by our paradigm
significantly outperform others in various
applications such as content-based retrieval,
classification, and image captioning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheung:2017:PVT,
author = "Ming Cheung and James She and Alvin Junus and Lei
Cao",
title = "Prediction of Virality Timing Using Cascades in Social
Media",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2978771",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Predicting content going viral in social networks is
attractive for viral marketing, advertisement,
entertainment, and other applications, but it remains a
challenge in the big data era today. Previous works
mainly focus on predicting the possible popularity of
content rather than the timing of reaching such
popularity. This work proposes a novel yet practical
iterative algorithm to predict virality timing, in
which the correlation between the timing and growth of
content popularity is captured by using its own big
data naturally generated from users' sharing. Such data
is not only able to correlate the dynamics and
associated timings in social cascades of viral content
but also can be useful to self-correct the predicted
timing against the actual timing of the virality in
each iterative prediction. The proposed prediction
algorithm is verified by datasets from two popular
social networks, Twitter and Digg, as well as two
synthesized datasets with extreme network densities and
infection rates. With about 50\% of the required
content virality data available (i.e., halfway before
reaching its actual virality timing), the error of the
predicted timing is proven to be bounded within a 40\%
deviation from the actual timing. To the best of our
knowledge, this is the first work that predicts content
virality timing iteratively by capturing social
cascades dynamics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chiu:2017:AAS,
author = "Chih-Yi Chiu and Yu-Cyuan Liou and Amorntip
Prayoonwong",
title = "Approximate Asymmetric Search for Binary Embedding
Codes",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2990504",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose a method of approximate
asymmetric nearest-neighbor search for binary embedding
codes. The asymmetric distance takes advantage of less
information loss at the query side. However,
calculating asymmetric distances through exhaustive
search is prohibitive in a large-scale dataset. We
present a novel method, called multi-index voting, that
integrates the multi-index hashing technique with a
voting mechanism to select appropriate candidates and
calculate their asymmetric distances. We show that the
candidate selection scheme can be formulated as the
tail of the binomial distribution function. In
addition, a binary feature selection method based on
minimal quantization error is proposed to address the
memory insufficiency issue and improve the search
accuracy. Substantial experimental evaluations were
made to demonstrate that the proposed method can yield
accuracy close to that of the exhaustive search method
while significantly accelerating the runtime. For
example, one result shows that in a dataset of one
billion 256-bit binary codes, examining only 0.5\% of
the dataset can reach 95--99\% of the accuracy of the
exhaustive search method while accelerating the search by
73--128 times. It also demonstrates an excellent
tradeoff between the search accuracy and time
efficiency compared to the state-of-the-art
nearest-neighbor search methods. Moreover, the proposed
feature selection method shows its effectiveness and
improves the accuracy by up to 8.35\% compared with other
feature selection methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Miller:2017:QBL,
author = "Konstantin Miller and Abdel-Karim Al-Tamimi and Adam
Wolisz",
title = "{QoE}-Based Low-Delay Live Streaming Using Throughput
Predictions",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2990505",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Recently, Hypertext Transfer Protocol (HTTP)-based
adaptive streaming has become the de facto standard for
video streaming over the Internet. It allows clients to
dynamically adapt media characteristics to the varying
network conditions to ensure a high quality of
experience (QoE), that is, minimize playback
interruptions while maximizing video quality at a
reasonable level of quality changes. In the case of
live streaming, this task becomes particularly
challenging due to the latency constraints. The
challenge further increases if a client uses a wireless
access network, where the throughput is subject to
considerable fluctuations. Consequently, live streams
often exhibit latencies of up to 20 to 30 seconds. In
the present work, we introduce an adaptation algorithm
for HTTP-based live streaming called LOLYPOP (short for
low-latency prediction-based adaptation), which is
designed to operate with a transport latency of a few
seconds. To reach this goal, LOLYPOP leverages
Transmission Control Protocol throughput predictions on
multiple time scales, from 1 to 10 seconds, along with
estimations of the relative prediction error
distributions. In addition to satisfying the latency
constraint, the algorithm heuristically maximizes the
QoE by maximizing the average video quality as a
function of the number of skipped segments and quality
transitions. To select an efficient prediction method,
we studied the performance of several time series
prediction methods in IEEE 802.11 wireless access
networks. We evaluated LOLYPOP under a large set of
experimental conditions, limiting the transport latency
to 3 seconds, against a state-of-the-art adaptation
algorithm called FESTIVE. We observed that the average
selected video representation index is up to a
factor of 3 higher than with the baseline approach. We
also observed that LOLYPOP is able to reach points from
a broader region in the QoE space, and thus it is
better adjustable to the user profile or service
provider requirements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ranasinghe:2017:DLS,
author = "Nimesha Ranasinghe and Ellen Yi-Luen Do",
title = "Digital Lollipop: Studying Electrical Stimulation on
the Human Tongue to Simulate Taste Sensations",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2996462",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Among the five primary senses, the sense of taste is
the least explored as a form of digital media applied
in Human--Computer Interface. This article presents an
experimental instrument, the Digital Lollipop, for
digitally simulating the sensation of taste (gustation)
by utilizing electrical stimulation on the human
tongue. The system is capable of manipulating the
properties of electric currents (magnitude, frequency,
and polarity) to formulate different stimuli. To
evaluate the effectiveness of this method, the system
was experimentally tested in two studies. The first
experiment was conducted using separate regions of the
human tongue to record occurrences of basic taste
sensations and their respective intensity levels. The
results indicate occurrences of sour, salty, bitter,
and sweet sensations from different regions of the
tongue. One of the major discoveries of this experiment
was that the sweet taste emerges via an inverse-current
mechanism, which deserves further research in the
future. The second study was conducted to compare
natural and artificial (virtual) sour taste sensations
and examine the possibility of effectively controlling
the artificial sour taste at three intensity levels
(mild, medium, and strong). The proposed method is
attractive since it does not require any chemical
solutions and facilitates further research
opportunities in several directions including
human--computer interaction, virtual reality, food and
beverage, as well as medicine.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Min:2017:FPT,
author = "Xiongkuo Min and Guangtao Zhai and Ke Gu and Xiaokang
Yang",
title = "Fixation Prediction through Multimodal Analysis",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2996463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose to predict human eye
fixation through incorporating both audio and visual
cues. Traditional visual attention models generally
make the most of stimuli's visual features, yet they
bypass all audio information. In the real world,
however, we not only direct our gaze according to
visual saliency, but also are attracted by salient
audio cues. Psychological experiments show that audio
has an influence on visual attention, and subjects tend
to be attracted by the sound sources. Therefore, we
propose fusing both audio and visual information to
predict eye fixation. In our proposed framework, we
first localize the moving, sound-generating objects
through multimodal analysis and generate an audio
attention map. Then, we calculate the spatial and
temporal attention maps using the visual modality.
Finally, the audio, spatial, and temporal attention
maps are fused to generate the final audiovisual
saliency map. The proposed method is applicable to
scenes containing moving, sound-generating objects. We
gather a set of video sequences and collect
eye-tracking data under an audiovisual test condition.
Experiment results show that we can achieve better eye
fixation prediction performance when taking both audio
and visual cues into consideration, especially in some
typical scenes in which object motion and audio are
highly correlated.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chu:2017:POI,
author = "Wei-Ta Chu and Chih-Hao Chiu",
title = "Predicting Occupation from Images by Combining Face
and Body Context Information",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3009911",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Facial images embed age, gender, and other rich
information that is implicitly related to occupation.
In this work, we advocate that occupation prediction
from a single facial image is a doable computer vision
problem. We extract multilevel hand-crafted features
associated with locality-constrained linear coding and
convolutional neural network features as image
occupation descriptors. To avoid the curse of
dimensionality and overfitting, a boost strategy called
multichannel SVM is used to integrate features from
face and body. Intra- and interclass visual variations
are jointly considered in the boosting framework to
further improve performance. In the evaluation, we
verify the effectiveness of predicting occupation from
face and demonstrate promising performance obtained by
combining face and body information. More importantly,
our work further integrates deep features into the
multichannel SVM framework and shows significantly
better performance over the state of the art.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Xu:2017:CSA,
author = "Jingxi Xu and Benjamin W. Wah",
title = "Consistent Synchronization of Action Order with Least
Noticeable Delays in Fast-Paced Multiplayer Online
Games",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3003727",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "When running multiplayer online games on IP networks
with losses and delays, the order of actions may be
changed when compared to the order run on an ideal
network with no delays and losses. To maintain a proper
ordering of events, traditional approaches either use
rollbacks to undo certain actions or local lags to
introduce additional delays. Both may be perceived by
players because their changes are beyond the
just-noticeable-difference (JND) threshold. In this
article, we propose a novel method for ensuring a
strongly consistent completion order of actions, where
strong consistency refers to the same completion order
as well as the same interval between any completion
time and the corresponding ideal reference completion
time under no network delay. We find that small
adjustments within the JND on the duration of an action
would not be perceivable, as long as the duration is
comparable to the network round-trip time. We utilize
this property to control the vector of durations of
actions and formulate the search of the vector as a
multidimensional optimization problem. By using the
property that players are generally more sensitive to
the most prominent delay effect (with the highest
probability of noticeability $P_{notice}$, or the
probability of correctly noticing a change when
compared to the reference), we prove that the optimal
solution occurs when the $P_{notice}$ values of the individual
adjustments are equal. As this search can be done
efficiently in polynomial time ($\approx$ 5ms) with a
small amount of space ($\approx$ 160KB), the search can
be done at runtime to determine the optimal
control. Last, we evaluate our approach on the popular
open-source online shooting game BZFlag.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Schramm:2017:ATS,
author = "Rodrigo Schramm and Helena {De Souza Nunes} and
Cl{\'a}udio Rosito Jung",
title = "Audiovisual Tool for {Solf{\`e}ge} Assessment",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "9:1--9:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3007194",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Solf{\`e}ge is a general technique used in the music
learning process that involves the vocal performance of
melodies, respecting the time and duration of musical
sounds as specified in the music score, properly
associated with the meter-mimicking performed by hand
movement. This article presents an audiovisual approach
for automatic assessment of this relevant musical study
practice. The proposed system combines the gesture of
meter-mimicking (video information) with the melodic
transcription (audio information), where hand movement
works as a metronome, controlling the time flow (tempo)
of the musical piece. Thus, meter-mimicking is used to
align the music score (ground truth) with the sung
melody, allowing assessment even in time-dynamic
scenarios. Audio analysis is applied to achieve the
melodic transcription of the sung notes and the
solf{\`e}ge performances are evaluated by a set of
Bayesian classifiers that were generated from real
evaluations done by expert listeners.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2017:IRS,
author = "Haojun Wu and Yong Wang and Jiwu Huang",
title = "Identification of Reconstructed Speech",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "10:1--10:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3004055",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Both voice conversion and hidden Markov model-- (HMM)
based speech synthesis can be used to produce
artificial voices of a target speaker. They have shown
great negative impacts on speaker verification (SV)
systems. In order to enhance the security of SV
systems, the techniques to detect converted/synthesized
speech should be taken into consideration. During voice
conversion and HMM-based synthesis, speech
reconstruction is applied to transform a set of
acoustic parameters to reconstructed speech. Hence, the
identification of reconstructed speech can be used to
distinguish converted/synthesized speech from human
speech. Several related works on such identification
have been reported, achieving equal error rates (EERs)
lower than 5\% in detecting reconstructed speech.
However, through the cross-database
evaluations on different speech databases, we find that
the EERs of several testing cases are higher than 10\%.
The robustness of detection algorithms to different
speech databases needs to be improved. In this article,
we propose an algorithm to identify the reconstructed
speech. Three different speech databases and two
different reconstruction methods are considered in our
work, which has not been addressed in the reported
works. The high-dimensional data visualization approach
is used to analyze the effect of speech reconstruction
on Mel-frequency cepstral coefficients (MFCC) of speech
signals. The Gaussian mixture model supervectors of
MFCC are used as acoustic features. Furthermore, a set
of commonly used classification algorithms are applied
to identify reconstructed speech. According to the
comparison among different classification methods,
linear discriminant analysis-ensemble classifiers are
chosen in our algorithm. Extensive experimental results
show that EERs lower than 1\% can be achieved by
the proposed algorithm in most cases, outperforming the
reported state-of-the-art identification techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gaj:2017:DCR,
author = "Sibaji Gaj and Aditya Kanetkar and Arijit Sur and
Prabin Kumar Bora",
title = "Drift-Compensated Robust Watermarking Algorithm for
{H.265\slash HEVC} Video Stream",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "11:1--11:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3009910",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "It has been observed in the recent literature that the
drift error due to watermarking degrades the visual
quality of the embedded video. The existing drift error
handling strategies for recent video standards such as
H.264 may not be directly applicable for upcoming
high-definition video standards (such as High
Efficiency Video Coding (HEVC)) due to different
compression architecture. In this article, a compressed
domain watermarking scheme is proposed for H.265/HEVC
bit stream that can handle drift error propagation both
for intra- and inter-prediction processes. Additionally,
the proposed scheme shows adequate robustness against
recompression attack as well as common image processing
attacks while maintaining decent visual quality. A
comprehensive set of experiments has been carried out
to justify the efficacy of the proposed scheme over the
existing literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Dutta:2017:EFC,
author = "Tanima Dutta and Hari Prabhat Gupta",
title = "An Efficient Framework for Compressed Domain
Watermarking in {$P$} Frames of High-Efficiency Video
Coding ({HEVC})-Encoded Video",
journal = j-TOMM,
volume = "13",
number = "1",
pages = "12:1--12:??",
month = jan,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3002178",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Jan 18 17:18:28 MST 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Digital watermarking has received much attention in
recent years as a promising solution to copyright
protection. Video watermarking in compressed domain has
gained importance since videos are stored and
transmitted in a compressed format. This decreases the
overhead to fully decode and re-encode the video for
embedding and extraction of the watermark. High
Efficiency Video Coding (HEVC/H.265) is the latest and
most efficient video compression standard and a
successor to H.264 Advanced Video Coding. In this
article, we propose a robust watermarking framework for
HEVC-encoded video using an informed detector. A readable
watermark is embedded invisibly in P frames for better
perceptual quality. Our framework imposes security and
robustness by selecting appropriate blocks using a
random key and the spatio-temporal characteristics of
the compressed video. A detailed analysis of the
strengths of different compressed domain features is
performed for implementing the watermarking framework.
We experimentally demonstrate the utility of the
proposed work. The results show that the proposed work
effectively limits the increase in video bitrate and
degradation in perceptual quality. The proposed
framework is robust against re-encoding and image
processing attacks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lisanti:2017:MKC,
author = "Giuseppe Lisanti and Svebor Karaman and Iacopo Masi",
title = "Multichannel-Kernel Canonical Correlation Analysis for
Cross-View Person Reidentification",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "13:1--13:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3038916",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we introduce a method to overcome one
of the main challenges of person reidentification in
multicamera networks, namely cross-view appearance
changes. The proposed solution addresses the extreme
variability of person appearance in different camera
views by exploiting multiple feature representations.
For each feature, kernel canonical correlation analysis
with different kernels is employed to learn several
projection spaces in which the appearance correlation
between samples of the same person observed from
different cameras is maximized. An iterative logistic
regression is finally used to select and weight the
contributions of each projection and perform the
matching between the two views. Experimental evaluation
shows that the proposed solution obtains comparable
performance on the VIPeR and PRID 450s datasets and
improves on the PRID and CUHK01 datasets with respect
to the state of the art.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ye:2017:TOM,
author = "Jun Ye and Hao Hu and Guo-Jun Qi and Kien A. Hua",
title = "A Temporal Order Modeling Approach to Human Action
Recognition from Multimodal Sensor Data",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "14:1--14:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3038917",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "From wearable devices to depth cameras, researchers
have exploited various multimodal data to recognize
human actions for applications, such as video gaming,
education, and healthcare. Although many successful
techniques have been presented in the literature, most
current approaches have focused on
statistical or local spatiotemporal features and do not
explicitly explore the temporal dynamics of the sensor
data. However, human action data contain rich temporal
structure information that can characterize the unique
underlying patterns of different action categories.
From this perspective, we propose a novel temporal
order modeling approach to human action recognition.
Specifically, we explore subspace projections to
extract the latent temporal patterns from different
human action sequences. The temporal order between
these patterns is compared, and the index of the
pattern that appears first is used to encode the entire
sequence. This process is repeated multiple times and
produces a compact feature vector representing the
temporal dynamics of the sequence. Human action
recognition can then be efficiently solved by the
nearest neighbor search based on the Hamming distance
between these compact feature vectors. We further
introduce a sequential optimization algorithm to learn
the optimized projections that preserve the pairwise
label similarity of the action sequences. Experimental
results on two public human action datasets demonstrate
the superior performance of the proposed technique in
both accuracy and efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2017:MCL,
author = "Shuai Wang and Yang Cong and Huijie Fan and Baojie Fan
and Lianqing Liu and Yunsheng Yang and Yandong Tang and
Huaici Zhao and Haibin Yu",
title = "Multi-Class Latent Concept Pooling for Computer-Aided
Endoscopy Diagnosis",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "15:1--15:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3051481",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Successful computer-aided diagnosis systems typically
rely on training datasets containing sufficient and
richly annotated images. However, detailed image
annotation is often time consuming and subjective,
especially for medical images, which becomes the
bottleneck for the collection of large datasets and
then building computer-aided diagnosis systems. In this
article, we design a novel computer-aided endoscopy
diagnosis system to deal with the multi-classification
problem of electronic endoscopy medical records (EEMRs)
containing sets of frames, while labels of EEMRs can be
mined from the corresponding text records using an
automatic text-matching strategy without special human
labeling. With unambiguous EEMR labels and ambiguous
frame labels, we propose a simple but effective pooling
scheme called Multi-class Latent Concept Pooling, which
learns a codebook from EEMRs with different classes
step by step and encodes EEMRs based on a soft
weighting strategy. In our method, a computer-aided
diagnosis system can be extended to new unseen classes
with ease and applied to the standard single-instance
classification problem even though detailed annotated
images are unavailable. In order to validate our
system, we collect 1,889 EEMRs with more than 59K
frames and successfully mine labels for 348 of them.
The experimental results show that our proposed system
significantly outperforms the state-of-the-art methods.
Moreover, we apply the learned latent concept codebook
to detect the abnormalities in endoscopy images and
compare it with a supervised learning classifier, and
the evaluation shows that our codebook learning method
can effectively extract the true prototypes related to
different classes from the ambiguous data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Demirbilek:2017:MLB,
author = "Edip Demirbilek and Jean-Charles Gr{\'e}goire",
title = "Machine Learning-Based Parametric Audiovisual Quality
Prediction Models for Real-Time Communications",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "16:1--16:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3051482",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In order to mechanically predict audiovisual quality
in interactive multimedia services, we have developed
machine learning--based no-reference parametric models.
We have compared Decision Trees--based ensemble
methods, Genetic Programming and Deep Learning models
that have one or more hidden layers. We have used the
Institut national de la recherche scientifique (INRS)
audiovisual quality dataset specifically designed to
include ranges of parameters and degradations typically
seen in real-time communications. Decision Trees--based
ensemble methods have outperformed both Deep Learning--
and Genetic Programming--based models in terms of
Root-Mean-Square Error (RMSE) and Pearson correlation
values. We have also trained and developed models on
various publicly available datasets and have compared
our results with those of these original models. Our
studies show that Random Forests--based prediction
models achieve high accuracy for both the INRS
audiovisual quality dataset and other publicly
available comparable datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gokhale:2017:CCN,
author = "Vineet Gokhale and Jayakrishnan Nair and Subhasis
Chaudhuri",
title = "Congestion Control for Network-Aware Telehaptic
Communication",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "17:1--17:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3052821",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Telehaptic applications involve delay-sensitive
multimedia communication between remote locations with
distinct Quality of Service (QoS) requirements for
different media components. These QoS constraints pose
a variety of challenges, especially when the
communication occurs over a shared network, with
unknown and time-varying cross-traffic. In this work,
we propose a transport layer congestion control
protocol for telehaptic applications operating over
shared networks, termed the Dynamic Packetization Module
(DPM). DPM is a lossless, network-aware protocol that
tunes the telehaptic packetization rate based on the
level of congestion in the network. To monitor the
network congestion, we devise a novel network feedback
module, which communicates the end-to-end delays
encountered by the telehaptic packets to the respective
transmitters with negligible overhead. Via extensive
simulations, we show that DPM meets the QoS
requirements of telehaptic applications over a wide
range of network cross-traffic conditions. We also
report qualitative results of a real-time telepottery
experiment with several human subjects, which reveal
that DPM preserves the quality of telehaptic activity
even under heavily congested network scenarios.
Finally, we compare the performance of DPM with several
previously proposed telehaptic communication protocols
and demonstrate that DPM outperforms these protocols.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sobhani:2017:VBA,
author = "Ashkan Sobhani and Abdulsalam Yassine and Shervin
Shirmohammadi",
title = "A Video Bitrate Adaptation and Prediction Mechanism
for {HTTP} Adaptive Streaming",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "18:1--18:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3052822",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The Hypertext Transfer Protocol (HTTP) Adaptive
Streaming (HAS) has now become ubiquitous and accounts
for a large amount of video delivery over the Internet.
But since the Internet is prone to bandwidth
variations, HAS's up and down switching between
different video bitrates to keep up with bandwidth
variations leads to a reduction in Quality of
Experience (QoE). In this article, we propose a video
bitrate adaptation and prediction mechanism based on
Fuzzy logic for HAS players, which takes into
consideration the estimate of available network
bandwidth as well as the predicted buffer occupancy
level in order to proactively and intelligently respond
to current conditions. This leads to two contributions:
First, it allows HAS players to take appropriate
actions, sooner than existing methods, to prevent
playback interruptions caused by buffer underrun,
reducing the ON-OFF traffic phenomena associated with
current approaches and increasing the QoE. Second, it
facilitates fair sharing of bandwidth among competing
players at the bottleneck link. We present the
implementation of our proposed mechanism and provide
both empirical/QoE analysis and performance comparison
with existing work. Our results show that, compared to
existing systems, our system has (1) better fairness
among multiple competing players by almost 50\% on
average and as much as 80\% as indicated by Jain's
fairness index and (2) better perceived quality of
video by almost 8\% on average and as much as 17\%,
according to the estimated Mean Opinion Score (eMOS)
model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Grant:2017:CSU,
author = "Jason M. Grant and Patrick J. Flynn",
title = "Crowd Scene Understanding from Video: a Survey",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "19:1--19:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3052930",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Crowd video analysis has applications in crowd
management, public space design, and visual
surveillance. Example tasks potentially aided by
automated analysis include anomaly detection (such as a
person walking against the grain of traffic or rapid
assembly/dispersion of groups of people), population
and density measurements, and interactions between
groups of people. This survey explores crowd analysis
as it relates to two primary research areas: crowd
statistics and behavior understanding. First, we survey
methods for counting individuals and approximating the
density of the crowd. Second, we showcase research
efforts on behavior understanding as related to crowds.
These works focus on identifying groups, interactions
within small groups, and abnormal activity detection
such as riots and bottlenecks in large crowds. Works
presented in this section also focus on tracking groups
of individuals, either as a single entity or a subset
of individuals within the frame of reference. Finally,
a summary of datasets available for crowd activity
video research is provided.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hussein:2017:VJF,
author = "Fairouz Hussein and Massimo Piccardi",
title = "{V-JAUNE}: a Framework for Joint Action Recognition
and Video Summarization",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "20:1--20:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3063532",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video summarization and action recognition are two
important areas of multimedia video analysis. While
these two areas have been tackled separately to date,
in this article, we present a latent structural SVM
framework to recognize the action and derive the
summary of a video in a joint, simultaneous fashion.
Efficient inference is provided by a submodular score
function that accounts for the action and summary
jointly. In this article, we also define a novel
measure to evaluate the quality of a predicted video
summary against the annotations of multiple annotators.
Quantitative and qualitative results over two
challenging action datasets, the ACE and MSR
DailyActivity3D datasets, show that the proposed joint
approach leads to higher action recognition accuracy
and equivalent or better summary quality than
comparable approaches that perform these tasks
separately.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cizmeci:2017:MSM,
author = "Burak Cizmeci and Xiao Xu and Rahul Chaudhari and
Christoph Bachhuber and Nicolas Alt and Eckehard
Steinbach",
title = "A Multiplexing Scheme for Multimodal Teleoperation",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "21:1--21:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3063594",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article proposes an application-layer
multiplexing scheme for teleoperation systems with
multimodal feedback (video, audio, and haptics). The
available transmission resources are carefully
allocated to avoid delay-jitter for the haptic signal
potentially caused by the size and arrival time of the
video and audio data. The multiplexing scheme gives
high priority to the haptic signal and applies a
preemptive-resume scheduling strategy to stream the
audio and video data. The proposed approach estimates
the available transmission rate in real time and adapts
the video bitrate, data throughput, and force buffer
size accordingly. Furthermore, the proposed scheme
detects sudden transmission rate drops and applies
congestion control to avoid abrupt delay increases and
converge promptly to the altered transmission rate. The
performance of the proposed scheme is measured
objectively in terms of end-to-end signal latencies,
packet rates, and peak signal-to-noise ratio (PSNR) for
visual quality. Moreover, peak-delay and convergence
time measurements are carried out to investigate the
performance of the congestion control mode of the
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Su:2017:DDP,
author = "Zhuo Su and Kun Zeng and Hanhui Li and Xiaonan Luo",
title = "A Dual-Domain Perceptual Framework for Generating
Visual Inconspicuous Counterparts",
journal = j-TOMM,
volume = "13",
number = "2",
pages = "22:1--22:??",
month = may,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3068427",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jun 16 14:48:38 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/tomccap/;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For a given image, it is a challenging task to
generate its corresponding counterpart with visually
inconspicuous modifications. The complexity of this
problem stems from the high correlation between the
editing operations and visual perception. Essentially,
a significant requirement that should be emphasized is
how to make the object modifications hard to detect
visually in the generated counterparts. In this
article, we propose a novel dual-domain perceptual
framework to generate visual inconspicuous
counterparts, which applies the perceptual
bidirectional similarity metric (PBSM) and appearance
similarity metric (ASM) to create the dual-domain
perception error minimization model. The candidate
targets are yielded by the well-known PatchMatch model
with the strokes-based interactions and selective
object library. By the dual-perceptual evaluation
index, all candidate targets are sorted to select
the best result. For demonstration, a series of
objective and subjective measurements are used to
evaluate the performance of our framework.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Singh:2017:SCB,
author = "Priyanka Singh and Balasubramanian Raman and Nishant
Agarwal and Pradeep K. Atrey",
title = "Secure Cloud-Based Image Tampering Detection and
Localization Using {POB} Number System",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3077140",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The benefits of high-end computation infrastructure
facilities provided by cloud-based multimedia systems
are attracting people all around the globe. However,
such cloud-based systems possess security issues as
third party servers become involved in them. Rendering
data in an unreadable form so that no information is
revealed to the cloud data centers will serve as the
best solution to these security issues. One such image
encryption scheme based on a Permutation Ordered Binary
Number System has been proposed in this work. It
distributes the image information in totally random
shares, which can be stored at the cloud data centers.
Further, the proposed scheme authenticates the shares
at the pixel level. If any tampering is done at the
cloud servers, the scheme can accurately identify the
altered pixels via authentication bits and localizes
the tampered area. The tampered portion is also
reflected back in the reconstructed image that is
obtained at the authentic user end. The experimental
results validate the efficacy of the proposed scheme
against various kinds of possible attacks, tested with
a variety of images. The tamper detection accuracy has
been computed on a pixel basis and found to be
satisfactorily high for most of the tampering
scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Thirunarayanan:2017:CSE,
author = "Ishwarya Thirunarayanan and Khimya Khetarpal and
Sanjeev Koppal and Olivier {Le Meur} and John Shea and
Eakta Jain",
title = "Creating Segments and Effects on Comics by Clustering
Gaze Data",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "24:1--24:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3078836",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Traditional comics are increasingly being augmented
with digital effects, such as recoloring, stereoscopy,
and animation. An open question in this endeavor is
identifying where in a comic panel the effects should
be placed. We propose a fast, semi-automatic technique
to identify effects-worthy segments in a comic panel by
utilizing gaze locations as a proxy for the importance
of a region. We take advantage of the fact that comic
artists influence viewer gaze towards narratively
important regions. By capturing gaze locations from
multiple viewers, we can identify important regions and
direct a computer vision segmentation algorithm to
extract these segments. The challenge is that these
gaze data are noisy and difficult to process. Our key
contribution is to leverage a theoretical breakthrough
in the computer networks community towards robust and
meaningful clustering of gaze locations into semantic
regions, without needing the user to specify the number
of clusters. We present a method based on the concept
of relative eigen quality that takes a scanned comic
image and a set of gaze points and produces an image
segmentation. We demonstrate a variety of effects such
as defocus, recoloring, stereoscopy, and animations. We
also investigate the use of artificially generated gaze
locations from saliency models in place of actual gaze
locations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Houle:2017:QEC,
author = "Michael E. Houle and Xiguo Ma and Vincent Oria and
Jichao Sun",
title = "Query Expansion for Content-Based Similarity Search
Using Local and Global Features",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "25:1--25:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3063595",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article presents an efficient and totally
unsupervised content-based similarity search method for
multimedia data objects represented by high-dimensional
feature vectors. The assumption is that the similarity
measure is applicable to feature vectors of arbitrary
length. During the offline process, different sets of
features are selected by a generalized version of the
Laplacian Score in an unsupervised way for individual
data objects in the database. Online retrieval is
performed by ranking the query object in the feature
spaces of candidate objects. Those candidates for which
the query object is ranked highly are selected as the
query results. The ranking scheme is incorporated into
an automated query expansion framework to further
improve the semantic quality of the search result.
Extensive experiments were conducted on several
datasets to show the capability of the proposed method
in boosting effectiveness without losing efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Riegler:2017:ACA,
author = "Michael Riegler and Konstantin Pogorelov and Sigrun
Losada Eskeland and Peter Thelin Schmidt and Zeno
Albisser and Dag Johansen and Carsten Griwodz and
P{\aa}l Halvorsen and Thomas {De Lange}",
title = "From Annotation to Computer-Aided Diagnosis: Detailed
Evaluation of a Medical Multimedia System",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "26:1--26:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3079765",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Holistic medical multimedia systems covering
end-to-end functionality from data collection to aided
diagnosis are highly needed, but rare. In many
hospitals, the potential value of multimedia data
collected through routine examinations is not
recognized. Moreover, the availability of the data is
limited, as the health care personnel may not have
direct access to stored data. However, medical
specialists interact with multimedia content daily
through their everyday work and have an increasing
interest in finding ways to use it to facilitate their
work processes. In this article, we present a novel,
holistic multimedia system aiming to tackle automatic
analysis of video from gastrointestinal (GI) endoscopy.
The proposed system comprises the whole pipeline,
including data collection, processing, analysis, and
visualization. It combines filters using machine
learning, image recognition, and extraction of global
and local image features. The novelty is primarily in
this holistic approach and its real-time performance,
where we automate a complete algorithmic GI screening
process. We built the system in a modular way to make
it easily extendable to analyze various abnormalities,
and we made it efficient in order to run in real time.
The conducted experimental evaluation proves that the
detection and localization accuracy are comparable or
even better than existing systems, but it is by far the
leader in terms of real-time performance and efficient
resource consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2017:EPR,
author = "Xun Yang and Meng Wang and Richang Hong and Qi Tian
and Yong Rui",
title = "Enhancing Person Re-identification in a Self-Trained
Subspace",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "27:1--27:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3089249",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Despite the promising progress made in recent years,
person re-identification (re-ID) remains a challenging
task due to the complex variations in human appearances
from different camera views. For this challenging
problem, a large variety of algorithms have been
developed in the fully supervised setting, requiring
access to a large amount of labeled training data.
However, the main bottleneck for fully supervised re-ID
is the limited availability of labeled training
samples. To address this problem, we propose a
self-trained subspace learning paradigm for person
re-ID that effectively utilizes both labeled and
unlabeled data to learn a discriminative subspace where
person images across disjoint camera views can be
easily matched. The proposed approach first constructs
pseudo-pairwise relationships among unlabeled persons
using the k-nearest neighbors algorithm. Then, with the
pseudo-pairwise relationships, the unlabeled samples
can be easily combined with the labeled samples to
learn a discriminative projection by solving an
eigenvalue problem. In addition, we refine the
pseudo-pairwise relationships iteratively, which
further improves learning performance. A multi-kernel
embedding strategy is also incorporated into the
proposed approach to cope with the non-linearity in a
person's appearance and explore the complementation of
multiple kernels. In this way, the performance of
person re-ID can be greatly enhanced when training data
are insufficient. Experimental results on six widely
used datasets demonstrate the effectiveness of our
approach, and its performance can be comparable to the
reported results of most state-of-the-art fully
supervised methods while using far fewer labeled
data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2017:RHA,
author = "Shih-Yao Lin and Yen-Yu Lin and Chu-Song Chen and
Yi-Ping Hung",
title = "Recognizing Human Actions with Outlier Frames by
Observation Filtering and Completion",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "28:1--28:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3089250",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article addresses the problem of recognizing
partially observed human actions. Videos of actions
acquired in the real world often contain corrupt frames
caused by various factors. These frames may appear
irregularly, and make the actions only partially
observed. They change the appearance of actions and
degrade the performance of pretrained recognition
systems. In this article, we propose an approach to
address the corrupt-frame problem without knowing their
locations and durations in advance. The proposed
approach includes two key components: outlier filtering
and observation completion. The former identifies and
filters out unobserved frames, and the latter fills up
the filtered parts by retrieving coherent alternatives
from training data. Hidden Conditional Random Fields
(HCRFs) are then used to recognize the filtered and
completed actions. Our approach has been evaluated on
three datasets, which contain both fully observed
actions and partially observed actions with either real
or synthetic corrupt frames. The experimental results
show that our approach performs favorably against the
other state-of-the-art methods, especially when corrupt
frames are present.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Karafotias:2017:IER,
author = "Georgios Karafotias and Akiko Teranishi and Georgios
Korres and Friederike Eyssel and Scandar Copti and
Mohamad Eid",
title = "Intensifying Emotional Reactions via Tactile Gestures
in Immersive Films",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "29:1--29:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092840",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The film industry continuously strives to make
visitors' movie experience more immersive and thus,
more captivating. This is realized through larger
screens, sophisticated speaker systems, and high
quality 2D and 3D content. Moreover, a recent trend in
the film industry is to incorporate multiple
interaction modalities, such as 4D film, to simulate
rain, wind, vibration, and heat, in order to intensify
viewers' emotional reactions. In this context, humans'
sense of touch possesses significant potential for
intensifying emotional reactions for the film
experience beyond audio-visual sensory modalities. This
article presents a framework for authoring tactile cues
(tactile gestures as used in this article) and enabling
automatic rendering of said gestures to intensify
emotional reactions in an immersive film experience. To
validate the proposed framework, we conducted an
experimental study where tactile gestures are designed
and evaluated for the ability to intensify four
emotional reactions: high valence-high arousal, high
valence-low arousal, low valence-high arousal, and low
valence-low arousal. Using a haptic jacket,
participants felt tactile gestures that were
synchronized with the audio-visual contents of a film.
Results demonstrated that (1) any tactile feedback
generated a positive user experience; (2) the tactile
feedback intensifies emotional reactions when the
audio-visual stimuli elicit clear emotional responses,
except for low arousal emotional response since tactile
gestures seem to always generate excitement; (3)
purposed tactile gestures do not seem to significantly
outperform randomized tactile gestures for intensifying
specific emotional reactions; and (4) using a haptic
jacket is not distracting for the users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheung:2017:ASU,
author = "Ming Cheung and James She",
title = "An Analytic System for User Gender Identification
through User Shared Images",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "30:1--30:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3095077",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Many social media applications, such as
recommendation, virality prediction, and marketing,
make use of user gender, which may not be explicitly
specified or kept privately. Meanwhile, advanced mobile
devices have become part of our lives and a huge amount
of content is being generated by users every day,
especially images shared by individuals in
social networks. This particular form of user-generated
content is widely accessible to others due to the
sharing nature. When user gender is only accessible to
exclusive parties, these user-shared images prove
to be an easier way to identify user gender. This work
investigated 3,152,344 images by 7,450 users from
Fotolog and Flickr, two image-oriented social networks.
It is observed that users who share visually similar
images are more likely to have the same gender. A
multimedia big data system that utilizes this
phenomenon is proposed for user gender identification
with 79\% accuracy. These findings are useful for
information or services in any social network with
intensive image sharing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Engelbrecht:2017:PDS,
author = "Herman A. Engelbrecht and John S. Gilmore",
title = "{Pithos}: Distributed Storage for Massive Multi-User
Virtual Environments",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "31:1--31:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3105577",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "There has been significant research effort into
peer-to-peer (P2P) massively multi-user virtual
environments (MMVEs). A number of architectures have
been proposed to implement the P2P approach; however,
the development of fully distributed MMVEs has met with
a number of challenges. In this work, we address one of
the key remaining challenges of state consistency and
persistency in P2P MMVEs. Having reviewed state
management and persistency architectures currently
receiving research attention, we have identified
deficiencies such as lack of load balancing,
responsiveness, and scalability. To address these
deficiencies, we present Pithos, a reliable, responsive,
secure, load-balanced, and scalable distributed storage
system, suited to P2P MMVEs. Pithos is designed
specifically for P2P MMVEs, and we show that it
improves the reliability and responsiveness of storage
architectures as compared to existing P2P state
persistency architectures. Pithos is implemented as an
OverSim simulation running on the OMNeT++ network
simulation framework. It is evaluated using up to
10,400 peers, with realistic latency profiles, with up
to 15.8 million storage and retrieval requests that are
generated to store a total of 2.4 million objects. Each
peer in Pithos uses a maximum of 1,950Bps bandwidth to
achieve 99.98\% storage reliability, while the most
reliable overlay storage configuration tested only
achieved 93.65\% reliability, using 2,182Bps bandwidth.
Pithos is also more responsive than overlay storage,
with an average responsiveness of 0.192s, compared with
the average overlay responsiveness of 1.4s when
retrieving objects from storage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2017:SDL,
author = "Jun Zhang and Meng Wang and Liang Lin and Xun Yang and
Jun Gao and Yong Rui",
title = "Saliency Detection on Light Field: a Multi-Cue
Approach",
journal = j-TOMM,
volume = "13",
number = "3",
pages = "32:1--32:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3107956",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Saliency detection has recently received increasing
research interest on using high-dimensional datasets
beyond two-dimensional images. Despite the many
available capturing devices and algorithms, there still
exists a wide spectrum of challenges that need to be
addressed to achieve accurate saliency detection.
Inspired by the success of the light-field technique,
in this article, we propose a new computational scheme
to detect salient regions by integrating multiple
visual cues from light-field images. First, saliency
prior maps are generated from several light-field
features based on superpixel-level intra-cue
distinctiveness, such as color, depth, and flow
inherited from different focal planes and multiple
viewpoints. Then, we introduce the location prior to
enhance the saliency maps. These maps will finally be
merged into a single map using a random-search-based
weighting strategy. Besides, we refine the object
details by employing a two-stage saliency refinement to
obtain the final saliency map. In addition, we present
a more challenging benchmark dataset for light-field
saliency analysis, named HFUT-Lytro, which consists of
255 light fields with a range from 53 to 64 images
generated from each light-field image, therein spanning
multiple occurrences of saliency detection challenges
such as occlusions, cluttered background, and
appearance changes. Experimental results show that our
approach can achieve 0.6--6.7\% relative improvements
over state-of-the-art methods in terms of the F-measure
and Precision metrics, which demonstrates the
effectiveness of the proposed approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ota:2017:ISI,
author = "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and
Francesco G. B. {De Natale}",
title = "Introduction to Special Issue on Deep Learning for
Mobile Multimedia",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "33:1--33:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3088340",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ota:2017:DLM,
author = "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and
Francesco G. B. {De Natale}",
title = "Deep Learning for Mobile Multimedia: a Survey",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "34:1--34:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092831",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Deep Learning (DL) has become a crucial technology for
multimedia computing. It offers a powerful instrument
to automatically produce high-level abstractions of
complex multimedia data, which can be exploited in a
number of applications, including object detection and
recognition, speech-to-text, media retrieval,
multimodal data analysis, and so on. The availability
of affordable large-scale parallel processing
architectures, and the sharing of effective open-source
codes implementing the basic learning algorithms,
caused a rapid diffusion of DL methodologies, bringing
a number of new technologies and applications that
outperform, in most cases, traditional machine learning
technologies. In recent years, the possibility of
implementing DL technologies on mobile devices has
attracted significant attention. Thanks to this
technology, portable devices may become smart objects
capable of learning and acting. The path toward these
exciting future scenarios, however, entails a number
of important research challenges. DL architectures and
algorithms are hardly adapted to the storage and
computation resources of a mobile device. Therefore,
there is a need for new generations of mobile
processors and chipsets, small footprint learning and
inference algorithms, new models of collaborative and
distributed processing, and a number of other
fundamental building blocks. This survey reports the
state of the art in this exciting research area,
looking back to the evolution of neural networks, and
arriving to the most recent results in terms of
methodologies, technologies, and applications for
mobile environments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Seidenari:2017:DAD,
author = "Lorenzo Seidenari and Claudio Baecchi and Tiberio
Uricchio and Andrea Ferracani and Marco Bertini and
Alberto {Del Bimbo}",
title = "Deep Artwork Detection and Retrieval for Automatic
Context-Aware Audio Guides",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "35:1--35:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092832",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we address the problem of creating a
smart audio guide that adapts to the actions and
interests of museum visitors. As an autonomous agent,
our guide perceives the context and is able to interact
with users in an appropriate fashion. To do so, it
understands what the visitor is looking at, if the
visitor is moving inside the museum hall, or if he or
she is talking with a friend. The guide performs
automatic recognition of artworks, and it provides
configurable interface features to improve the user
experience and the enjoyment of multimedia materials
through semi-automatic interaction. Our smart audio
guide is backed by a computer vision system capable of
working in real time on a mobile device, coupled with
audio and motion sensors. We propose the use of a
compact Convolutional Neural Network (CNN) that
performs object classification and localization. Using
the same CNN features computed for these tasks, we
perform also robust artwork recognition. To improve the
recognition accuracy, we perform additional video
processing using shape-based filtering, artwork
tracking, and temporal filtering. The system has been
deployed on an NVIDIA Jetson TK1 and an NVIDIA Shield
Tablet K1 and tested in a real-world environment
(Bargello Museum of Florence).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pouladzadeh:2017:MMF,
author = "Parisa Pouladzadeh and Shervin Shirmohammadi",
title = "Mobile Multi-Food Recognition Using Deep Learning",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "36:1--36:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3063592",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we propose a mobile food recognition
system that uses the picture of the food, taken by the
user's mobile device, to recognize multiple food items
in the same meal, such as steak and potatoes on the
same plate, to estimate the calories and nutrition of
the meal. To speed up and make the process more
accurate, the user is asked to quickly identify the
general area of the food by drawing a bounding circle
on the food picture by touching the screen. The system
then uses image processing and computational
intelligence for food item recognition. The advantage
of recognizing items, instead of the whole meal, is
that the system can be trained with only single item
food images. At the training stage, we first use region
proposal algorithms to generate candidate regions and
extract the convolutional neural network (CNN) features
of all regions. Second, we perform region mining to
select positive regions for each food category using
maximum cover by our proposed submodular optimization
method. At the testing stage, we first generate a set
of candidate regions. For each region, a classification
score is computed based on its extracted CNN features
and predicted food names of the selected regions. Since
fast response is one of the important parameters for
the user who wants to eat the meal, certain heavy
computational parts of the application are offloaded to
the cloud. Hence, the processes of food recognition and
calorie estimation are performed on a cloud server. Our
experiments, conducted with the FooDD dataset, show an
average recall rate of 90.98\%, precision rate of
93.05\%, and accuracy of 94.11\% compared to 50.8\% to
88\% accuracy of other existing food recognition
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
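The region-mining step described in the abstract ("maximum cover by our proposed submodular optimization method") can be illustrated with the standard greedy algorithm for maximum coverage, the usual approximation for such submodular objectives; the region and image sets below are toy data, not the FooDD pipeline.

def greedy_max_cover(regions, k):
    # Greedily pick k regions that together cover the most training images.
    # regions: dict mapping region id -> set of image ids it covers.
    covered, selected = set(), []
    for _ in range(k):
        best = max(regions, key=lambda r: len(regions[r] - covered), default=None)
        if best is None or not (regions[best] - covered):
            break
        selected.append(best)
        covered |= regions.pop(best)
    return selected

# Toy example with three candidate regions covering five food images.
regions = {"r1": {1, 2, 3}, "r2": {3, 4}, "r3": {4, 5}}
print(greedy_max_cover(regions, k=2))   # selects 'r1' then 'r3'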
@Article{Bharati:2017:ETC,
author = "Sailesh Bharati and Hassan Aboubakr Omar and Weihua
Zhuang",
title = "Enhancing Transmission Collision Detection for
Distributed {TDMA} in Vehicular Networks",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "37:1--37:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092833",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The increasing number of road accidents has led to the
evolution of vehicular ad hoc networks (VANETs), which
allow vehicles and roadside infrastructure to
continuously broadcast safety messages, including
necessary information to avoid undesired events on the
road. To support reliable broadcast of safety messages,
distributed time division multiple access (D-TDMA)
protocols are proposed for medium access control in
VANETs. Existing D-TDMA protocols react to a
transmission failure without distinguishing whether the
failure comes from a transmission collision or from a
poor radio channel condition, resulting in degraded
performance. In this article, we demonstrate the
importance, for D-TDMA protocols in vehicular networks,
of differentiating transmission failures due to a poor
channel from those due to a transmission collision. We study the effects
of such a transmission failure differentiation on the
performance of a node when reserving a time slot to
access the transmission channel. Furthermore, we
propose a method for transmission failure
differentiation, employing the concept of deep-learning
techniques, for a node to decide whether to release or
continue using its acquired time slot. The proposed
method is based on the application of a Markov chain
model to estimate the channel state when a transmission
failure occurs. The Markov model parameters are
dynamically updated by each node (i.e., vehicle or
roadside unit) based on information included in the
safety messages that are periodically received from
neighboring nodes. In addition, from the D-TDMA
protocol headers of received messages, a node
approximately determines the error in estimating the
channel state based on the proposed Markov model and
then uses this channel estimation error to further
improve subsequent channel state estimations. Through
mathematical analysis, we show that transmission
failure differentiation, or transmission collision
detection, helps a node to efficiently reserve a time
slot even with a large number of nodes contending for
time slots. Furthermore, through extensive simulations
in a highway scenario, we demonstrate that the proposed
solution significantly improves the performance of
D-TDMA protocols by reducing unnecessary contention on
the available time slots, thus increasing the number of
nodes having unique time slots for successful broadcast
of safety messages.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
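A minimal sketch of the Markov-chain channel-state estimation idea above: a two-state (good/bad) channel belief is propagated through assumed transition probabilities and updated with Bayes' rule after each observed success or failure, so a node can judge whether a failure looks more like a collision than a bad channel. All probabilities here are illustrative, not the paper's parameters.

def update_channel_belief(p_bad, failure, p_bb=0.8, p_gb=0.1,
                          p_fail_bad=0.9, p_fail_good=0.1):
    # One step of belief tracking for a two-state (good/bad) Markov channel.
    # p_bad: current probability the channel is in the bad state.
    # p_bb / p_gb: assumed transitions bad->bad and good->bad.
    # p_fail_bad / p_fail_good: assumed failure likelihoods per state.
    prior_bad = p_bad * p_bb + (1.0 - p_bad) * p_gb          # predict step
    like_bad = p_fail_bad if failure else (1.0 - p_fail_bad)  # observation model
    like_good = p_fail_good if failure else (1.0 - p_fail_good)
    return like_bad * prior_bad / (
        like_bad * prior_bad + like_good * (1.0 - prior_bad))

# A node might release its slot only when a failure looks like a collision,
# i.e., when the bad-channel probability stays low after the update.
belief = 0.2
belief = update_channel_belief(belief, failure=True)
print("P(bad channel) =", round(belief, 2))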
@Article{Vandecasteele:2017:SSC,
author = "Florian Vandecasteele and Karel Vandenbroucke and
Dimitri Schuurman and Steven Verstockt",
title = "{Spott}: On-the-Spot e-Commerce for Television Using
Deep Learning-Based Video Analysis Techniques",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "38:1--38:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092834",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Spott is an innovative second screen mobile multimedia
application which offers viewers relevant information
on objects (e.g., clothing, furniture, food) they see
and like on their television screens. The application
enables interaction between TV audiences and brands, so
producers and advertisers can offer potential consumers
tailored promotions, e-shop items, and/or free samples.
In line with the current views on innovation
management, the technological excellence of the Spott
application is coupled with iterative user involvement
throughout the entire development process. This article
discusses both of these aspects and how they impact
each other. First, we focus on the technological
building blocks that facilitate the (semi-) automatic
interactive tagging process of objects in the video
streams. The majority of these building blocks
extensively make use of novel and state-of-the-art deep
learning concepts and methodologies. We show how these
deep learning based video analysis techniques
facilitate video summarization, semantic keyframe
clustering, and (similar) object retrieval. Second, we
provide insights into user tests that have been
performed to evaluate and optimize the application's
user experience. The lessons learned from these open
field tests have already been an essential input in the
technology development and will further shape the
future modifications to the Spott application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2017:TDC,
author = "Qingchen Zhang and Laurence T. Yang and Xingang Liu
and Zhikui Chen and Peng Li",
title = "A {Tucker} Deep Computation Model for Mobile
Multimedia Feature Learning",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "39:1--39:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3063593",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Recently, the deep computation model, as a tensor deep
learning model, has achieved superior performance for
multimedia feature learning. However, the conventional
deep computation model involves a large number of
parameters. Typically, training a deep computation
model with millions of parameters needs
high-performance servers with large-scale memory and
powerful computing units, limiting the growth of the
model size for multimedia feature learning on common
devices such as portable CPUs and conventional
desktops. To tackle this problem, this article proposes
a Tucker deep computation model by using the Tucker
decomposition to compress the weight tensors in the
fully connected layers for multimedia feature learning.
Furthermore, a learning algorithm based on the
back-propagation strategy is devised to train the
parameters of the Tucker deep computation model.
Finally, the performance of the Tucker deep computation
model is evaluated by comparing with the conventional
deep computation model on two representative multimedia
datasets, that is, CUAVE and SNAE2, in terms of
accuracy drop, parameter reduction, and speedup in the
experiments. Results imply that the Tucker deep
computation model can achieve a large-parameter
reduction and speedup with a small accuracy drop for
multimedia feature learning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
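The Tucker compression step described in the abstract above can be sketched with a truncated higher-order SVD, one standard way to compute a Tucker decomposition. The tensor shape, ranks, and the HOSVD shortcut are illustrative assumptions, not the paper's training procedure (which also learns the factors with back-propagation).

import numpy as np

def unfold(tensor, mode):
    # Mode-n unfolding: move the chosen axis first, flatten the rest.
    return np.moveaxis(tensor, mode, 0).reshape(tensor.shape[mode], -1)

def tucker_compress(weights, ranks):
    # Truncated HOSVD: per-mode factors from leading singular vectors,
    # core obtained by projecting the weight tensor onto all factors.
    factors = []
    for mode, rank in enumerate(ranks):
        u, _, _ = np.linalg.svd(unfold(weights, mode), full_matrices=False)
        factors.append(u[:, :rank])
    core = weights
    for factor in factors:
        # Contract the current leading axis; axes rotate so each mode is hit once.
        core = np.tensordot(core, factor, axes=([0], [0]))
    return core, factors

W = np.random.randn(64, 64, 64)                      # toy tensorized FC weight
core, factors = tucker_compress(W, ranks=(8, 8, 8))
print(W.size, "->", core.size + sum(f.size for f in factors))  # parameter reduction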
@Article{Timmerer:2017:BPA,
author = "Christian Timmerer and Ali C. Begen",
title = "Best Papers of the {2016 ACM Multimedia Systems
(MMSys) Conference and Workshop on Network and
Operating System Support for Digital Audio and Video
(NOSSDAV) 2016}",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "40:1--40:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3084539",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Daronco:2017:DRA,
author = "Stefano D'aronco and Sergio Mena and Pascal Frossard",
title = "Distributed Rate Allocation in Switch-Based Multiparty
Videoconferencing System",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "41:1--41:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092835",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Multiparty videoconferences, or more generally
multiparty video calls, are gaining a lot of popularity
as they offer a rich communication experience. These
applications have, however, large requirements in terms
of both network and computational resources and have to
deal with sets of heterogeneous clients. The multiparty
videoconferencing systems are usually either based on
expensive central nodes, called Multipoint Control
Units (MCU), with transcoding capabilities, or on a
peer-to-peer architecture where users cooperate to
distribute more efficiently the different video
streams. Whereas the first class of systems requires
expensive central hardware, the second one depends
completely on the redistribution capacity of the users,
which sometimes might neither provide sufficient
bandwidth nor be reliable enough. In this work, we
propose an alternative solution where we use a central
node to distribute the video streams, but at the same
time we maintain the hardware complexity and the
computational requirements of this node as low as
possible; for example, it has no video decoding
capabilities. We formulate the rate allocation problem
as an optimization problem that aims at maximizing the
Quality of Service (QoS) of the videoconference. We
propose two different distributed algorithms for
solving the optimization problem: the first algorithm
is able to find an approximate solution of the problem
in a one-shot execution, whereas the second algorithm,
based on Lagrangian relaxation, performs iterative
updates of the optimization variables in order to
gradually increase the value of the objective function.
The two algorithms, though disjoint, nicely
complement each other. If executed in sequence, they
allow us to achieve both a quick approximate rate
reallocation, in case of a sudden change of the system
conditions, and a precise refinement of the variables,
which avoids problems caused by possible faulty
approximate solutions. We have further implemented our
solution in a network simulator where we show that our
rate allocation algorithm is able to properly optimize
users' QoS. We also illustrate the benefits of our
solution in terms of network usage and overall utility
when compared to a baseline heuristic method operating
on the same system architecture.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
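The Lagrangian-relaxation algorithm mentioned above can be illustrated with textbook dual decomposition for a single shared capacity: each stream solves a small subproblem given a link price, and the price is adjusted by subgradient ascent. The logarithmic utilities, weights, and step size are assumptions for the sketch, not the paper's QoS model or switch architecture.

def dual_rate_allocation(capacity, weights, steps=200, lr=0.05):
    # Maximize sum_i w_i * log(r_i) subject to sum_i r_i <= capacity.
    # With a price lam on capacity, each stream's best response is r_i = w_i / lam;
    # the price rises when the link is over-booked and falls otherwise.
    lam = 1.0
    rates = [0.0] * len(weights)
    for _ in range(steps):
        rates = [w / lam for w in weights]                    # per-stream subproblem
        lam = max(1e-6, lam + lr * (sum(rates) - capacity))   # subgradient price update
    return rates

print([round(r, 2) for r in dual_rate_allocation(10.0, [1.0, 2.0, 1.0])])
# converges toward the proportional-fair split [2.5, 5.0, 2.5]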
@Article{Cofano:2017:DPE,
author = "Giuseppe Cofano and Luca {De Cicco} and Thomas Zinner
and Anh Nguyen-Ngoc and Phuoc Tran-Gia and Saverio
Mascolo",
title = "Design and Performance Evaluation of Network-assisted
Control Strategies for {HTTP} Adaptive Streaming",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "42:1--42:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092836",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article investigates several network-assisted
streaming approaches that rely on active cooperation
between video streaming applications and the network.
We build a Video Control Plane that enforces Video
Quality Fairness among concurrent video flows generated
by heterogeneous client devices. For this purpose, a
max-min fairness optimization problem is solved at
runtime. We compare two approaches to actuate the
optimal solution in a Software Defined Networking
network: The first one allocates network bandwidth
slices to video flows, and the second one guides video
players in the video bitrate selection. We assess
performance through several QoE-related metrics, such
as Video Quality Fairness, video quality, and switching
frequency. The impact of client-side adaptation
algorithms is also investigated.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
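The max-min fairness objective solved by the Video Control Plane above can be illustrated with the classical progressive-filling (water-filling) algorithm; the demand caps and capacity below are toy numbers, and the paper's optimization additionally accounts for device heterogeneity and SDN actuation.

def max_min_allocation(capacity, demands):
    # Progressive-filling sketch of max-min fairness.
    # Flows are capped at their demand (e.g., the highest useful video bitrate);
    # spare capacity is repeatedly shared equally among still-unsatisfied flows.
    alloc = {flow: 0.0 for flow in demands}
    remaining = capacity
    active = set(demands)
    while active and remaining > 1e-9:
        share = remaining / len(active)
        for flow in list(active):
            give = min(share, demands[flow] - alloc[flow])
            alloc[flow] += give
            remaining -= give
            if demands[flow] - alloc[flow] < 1e-9:
                active.remove(flow)
    return alloc

# Three players with different top bitrates sharing a 9 Mbps bottleneck.
print(max_min_allocation(9.0, {"phone": 2.0, "laptop": 6.0, "tv": 8.0}))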
@Article{Wisniewski:2017:OAA,
author = "Piotr Wisniewski and Jordi Mongay Batalla and Andrzej
Beben and Piotr Krawiec and Andrzej Chydzinski",
title = "On Optimizing Adaptive Algorithms Based on Rebuffering
Probability",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "43:1--43:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092837",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Traditionally, video adaptive algorithms aim to select
the representation that best fits the current
download rate. In recent years, a number of new
approaches have appeared that take into account the buffer
occupancy and the probability of video rebuffering as
important indicators of the representation to be
selected. We propose an optimization of the existing
algorithm based on rebuffering probability and argue
that the algorithm should avoid the situations when the
client buffer is full and the download is stopped,
since these situations decrease the efficiency of the
algorithm. Reducing full buffer states does not
increase the rebuffering probability thanks to a clever
management of the client buffer, which analyses the
buffer occupancy and downloads higher bitrate
representations only in the case of high buffer
occupancy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
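A minimal sketch of the buffer-aware idea in the abstract above: stay at a throughput-sustainable bitrate, but step up one representation when the buffer is close to full so the client does not idle with a full buffer. The thresholds and bitrate ladder are illustrative assumptions, not the optimized algorithm from the paper.

def pick_representation(bitrates, throughput, buffer_s, buffer_max_s,
                        high_watermark=0.75):
    # Buffer-aware bitrate selection sketch.
    # Below the high watermark, stay at the largest bitrate sustainable by the
    # measured throughput; near a full buffer, step up one level so the client
    # keeps downloading instead of idling with a full buffer.
    sustainable = [b for b in bitrates if b <= throughput]
    choice = max(sustainable) if sustainable else min(bitrates)
    if buffer_s >= high_watermark * buffer_max_s:
        higher = [b for b in bitrates if b > choice]
        if higher:
            choice = min(higher)
    return choice

bitrates = [1.0, 2.5, 5.0, 8.0]   # Mbps ladder (toy values)
print(pick_representation(bitrates, throughput=4.0, buffer_s=28, buffer_max_s=30))
# near-full buffer: steps up from 2.5 to 5.0 Mbps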
@Article{Kleinrouweler:2017:SAP,
author = "Jan Willem Kleinrouweler and Sergio Cabrero and Pablo
Cesar",
title = "An {SDN} Architecture for Privacy-Friendly
Network-Assisted {DASH}",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "44:1--44:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092838",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is the
premier technology for Internet video streaming. DASH
efficiently uses existing HTTP-based delivery
infrastructures to implement adaptive streaming.
However, DASH traffic is bursty in nature. This causes
performance problems when DASH players share a network
connection or in networks with heavy background
traffic. The result is unstable and lower quality
video. In this article, we present the design and
implementation of a so-called DASH Assisting Network
Element (DANE). Our system provides target bitrate
signaling and dynamic traffic control. These two
mechanisms realize proper bandwidth sharing among
clients. Our system is privacy friendly and fully
supports encrypted video streams. To improve the
streaming experience for users who share a network
connection, our system increases the video bitrate and
reduces the number of quality switches. We show this
through evaluations in our Wi-Fi testbed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2017:DAQ,
author = "Cong Wang and Divyashri Bhat and Amr Rizk and Michael
Zink",
title = "Design and Analysis of {QoE}-Aware Quality Adaptation
for {DASH}: a Spectrum-Based Approach",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "45:1--45:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3092839",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The dynamics of the application-layer-based control
loop of dynamic adaptive streaming over HTTP (DASH)
make video bitrate selection for DASH a difficult
problem. In this work, we provide a DASH quality
adaptation algorithm, named SQUAD, that is specifically
tailored to provide a high quality of experience (QoE).
We review and provide new insights into the challenges
for DASH rate estimation. We found that in addition to
the ON-OFF behavior of DASH clients, there exists a
discrepancy in the timescales that form the basis of
the rate estimates across (i) different video segments
and (ii) the rate control loops of DASH and
Transmission Control Protocol (TCP). With these
observations in mind, we design SQUAD aiming to
maximize the average quality bitrate while minimizing
the quality variations. We test our implementation of
SQUAD together with a number of different quality
adaptation algorithms under various conditions in the
Global Environment for Networking Innovation testbed,
as well as in a series of measurements over the public
Internet. Through a measurement study, we show that by
sacrificing little to nothing in average quality
bitrate, SQUAD can provide significantly better QoE in
terms of quality switching and magnitude. In addition,
we show that retransmission of higher-quality segments
that were originally received in low quality is
feasible and improves the QoE.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2017:CAC,
author = "Cong Zhang and Jiangchuan Liu and Haiyang Wang",
title = "Cloud-Assisted Crowdsourced Livecast",
journal = j-TOMM,
volume = "13",
number = "3s",
pages = "46:1--46:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3095755",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:22 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The past two years have witnessed an explosion of a
new generation of livecast services, represented by
Twitch.tv, GamingLive, and Dailymotion, to name but a
few. With such a livecast service, geo-distributed
Internet users can broadcast any event in real-time,
for example, game, cooking, drawing, and so on, to
viewers of interest. Its crowdsourced nature enables
rich interactions among broadcasters and viewers but
also introduces great challenges in accommodating their
large scale and dynamics. To fulfill the demands from
a large number of heterogeneous broadcasters and
geo-distributed viewers, expensive server clusters have
been deployed to ingest and transcode live streams. Yet
our Twitch-based measurement shows that a significant
portion of the unpopular and dynamic broadcasters are
consuming considerable system resources; in particular,
25\% of bandwidth resources and 30\% of computational
capacity are used by the broadcasters who do not have
any viewers at all. In this article, through the
real-world measurement and data analysis, we show that
the public cloud has great potential to address these
scalability challenges. We accordingly present the
design of Cloud-assisted Crowdsourced Livecast (CACL)
and propose a comprehensive set of solutions for
broadcaster partitioning. Our trace-driven evaluations
show that our CACL design can smartly assign ingesting
and transcoding tasks to the elastic cloud virtual
machines, providing flexible and cost-effective system
deployment.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Dao:2017:TCM,
author = "Minh Son Dao",
title = "This is the Table of Contents for the most recent
online-only supplemental issue {TOMM} 13(3s). {Please}
find this supplemental issue in the {ACM Digital
Library} and enjoy reading them!",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "47:1--47:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3143786",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47e",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2017:SRB,
author = "Hong-Bo Zhang and Bineng Zhong and Qing Lei and
Ji-Xiang Du and Jialin Peng and Duansheng Chen and Xiao
Ke",
title = "Sparse Representation-Based Semi-Supervised Regression
for People Counting",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "47:1--47:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106156",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Label imbalance and the insufficiency of labeled
training samples are major obstacles in most methods
for counting people in images or videos. In this work,
a sparse representation-based semi-supervised
regression method is proposed to count people in images
with limited data. The basic idea is to predict the
unlabeled training data, select reliable samples to
expand the labeled training set, and retrain the
regression model. In the algorithm, the initial
regression model, which is learned from the labeled
training data, is used to predict the number of people
in the unlabeled training dataset. Then, the unlabeled
training samples are regarded as an over-complete
dictionary. Each feature of the labeled training data
can be expressed as a sparse linear approximation of
the unlabeled data. In turn, the labels of the labeled
training data can be estimated based on a sparse
reconstruction in feature space. The label confidence
in labeling an unlabeled sample is estimated by
calculating the reconstruction error. The training set
is updated by selecting unlabeled samples with minimal
reconstruction errors, and the regression model is
retrained on the new training set. A co-training style
method is applied during the training process. The
experimental results demonstrate that the proposed
method has a low mean square error and mean absolute
error compared with those of state-of-the-art
people-counting benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
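The sparse-reconstruction step above can be sketched with a small orthogonal matching pursuit used as a stand-in for the paper's sparse coder: a labeled feature is coded over the unlabeled features, its count is reconstructed from the counts predicted for those samples, and the residual norm serves as a confidence. The dictionary, predicted counts, and sparsity level are toy assumptions.

import numpy as np

def omp(D, x, n_nonzero=3):
    # Tiny orthogonal matching pursuit: sparse code of x over the columns of D.
    residual, support = x.copy(), []
    coef = np.zeros(D.shape[1])
    sol = np.zeros(0)
    for _ in range(n_nonzero):
        support.append(int(np.argmax(np.abs(D.T.dot(residual)))))
        sol, *_ = np.linalg.lstsq(D[:, support], x, rcond=None)
        residual = x - D[:, support].dot(sol)
    coef[support] = sol
    return coef, float(np.linalg.norm(residual))

rng = np.random.default_rng(0)
D = rng.normal(size=(16, 40))              # 40 unlabeled features as dictionary atoms
pred_counts = rng.uniform(5, 50, size=40)  # counts predicted for the unlabeled samples
x = D[:, 3] * 0.6 + D[:, 7] * 0.4          # a labeled sample's feature vector (toy)
coef, err = omp(D, x)
print(round(float(coef.dot(pred_counts)), 1), round(err, 3))  # low err -> high confidence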
@Article{Akhtar:2017:COV,
author = "Shahid Akhtar and Andre Beck and Ivica Rimac",
title = "Caching Online Video: Analysis and Proposed
Algorithm",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "48:1--48:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106157",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Online video presents new challenges to traditional
caching with over a thousand-fold increase in the number
of assets, rapidly changing popularity of assets, and much
higher throughput requirements. We propose HiFi, a new
hierarchical filtering algorithm for caching online
video. Our algorithm is designed to optimize hit
rate, replacement rate and cache throughput. It has an
associated implementation complexity comparable to that
of LRU. Our results show that, under typical operator
conditions, HiFi can increase edge cache byte hit rate
by 5\%--24\% over an LRU policy, but more importantly
can increase the RAM or memory byte hit rate by 80\% to
200\% and reduce the replacement rate by more than 100
times! These two factors combined can dramatically
increase throughput for most caches. If SSDs are used
for storage, the much lower replacement rate may also
allow substitution of lower-cost MLC-based SSDs instead
of SLC-based SSDs. We extend previous multi-tier
analytical models for LRU caches to caches with
filtering. We analytically show how HiFi can approach
the performance of an optimal caching policy and how to
tune HiFi to reach as close to optimal performance as
the traffic conditions allow. We develop a realistic
simulation environment for online video using
statistics from operator traces. We show that HiFi
performs within a few percentage points from the
optimal solution which was simulated by Belady's MIN
algorithm under typical operator conditions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
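The hierarchical-filtering idea above (admit an object into the cache only after repeated requests, cutting one-hit wonders and the replacement rate) can be sketched with an LRU cache behind a simple admission filter. This shows the general filtered-caching pattern, not the HiFi algorithm or its analytical tuning.

from collections import OrderedDict

class FilteredLRU:
    # LRU cache with a simple admission filter (cache on second request).
    # A shadow set of recently seen keys filters out one-hit wonders,
    # which lowers the replacement rate of the main cache.
    def __init__(self, capacity, filter_size=10000):
        self.capacity, self.cache = capacity, OrderedDict()
        self.seen = OrderedDict()            # keys seen once, not yet admitted
        self.filter_size = filter_size

    def get(self, key):
        if key in self.cache:
            self.cache.move_to_end(key)      # refresh LRU position -> hit
            return True
        if key in self.seen:                 # second request: admit to cache
            del self.seen[key]
            self.cache[key] = None
            if len(self.cache) > self.capacity:
                self.cache.popitem(last=False)
        else:                                # first request: remember only
            self.seen[key] = None
            if len(self.seen) > self.filter_size:
                self.seen.popitem(last=False)
        return False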
@Article{Dang-Nguyen:2017:MRD,
author = "Duc-Tien Dang-Nguyen and Luca Piras and Giorgio
Giacinto and Giulia Boato and Francesco G. B. {De
Natale}",
title = "Multimodal Retrieval with Diversification and
Relevance Feedback for Tourist Attraction Images",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "49:1--49:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3103613",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we present a novel framework that can
produce a visual description of a tourist attraction by
choosing the most diverse pictures from
community-contributed datasets, which describe
different details of the queried location. The main
strength of the proposed approach is its flexibility,
which permits us to filter out non-relevant images and
to obtain a reliable set of diverse and relevant images
by first clustering similar images according to their
textual descriptions and their visual content and then
extracting images from different clusters according to
a measure of the user's credibility. Clustering is
based on a two-step process, where textual descriptions
are used first and the clusters are then refined
according to the visual features. The degree of
diversification can be further increased by exploiting
users' judgments on the results produced by the
proposed algorithm through a novel approach, where
users provide not only relevance feedback but also
diversity feedback. Experimental results on
the MediaEval 2015 ``Retrieving Diverse Social Images''
dataset show that the proposed framework can achieve
very good performance both in the case of automatic
retrieval of diverse images and in the case of the
exploitation of the users' feedback. The effectiveness
of the proposed approach has also been confirmed by a
small case study involving a number of real users.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{FujiiPontello:2017:MUR,
author = "Luciana {Fujii Pontello} and Pedro H. F. Holanda and
Bruno Guilherme and Jo{\~a}o Paulo V. Cardoso and Olga
Goussevskaia and Ana Paula {Couto Da Silva}",
title = "Mixtape: Using Real-Time User Feedback to Navigate
Large Media Collections",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "50:1--50:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3105969",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this work, we explore the increasing demand for
novel user interfaces to navigate large media
collections. We implement a geometric data structure to
store and retrieve item-to-item similarity information
and propose a novel navigation framework that uses
vector operations and real-time user feedback to direct
the outcome. The framework is scalable to large media
collections and is suitable for computationally
constrained devices. In particular, we implement this
framework in the domain of music. To evaluate the
effectiveness of the navigation process, we propose an
automatic evaluation framework, based on synthetic user
profiles, which allows us to quickly simulate and
compare navigation paths using different algorithms and
datasets. Moreover, we perform a real user study. To do
that, we developed and launched Mixtape, a simple web
application that allows users to create playlists by
providing real-time feedback through liking and
skipping patterns.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yakubu:2017:SSN,
author = "Abukari M. Yakubu and Namunu C. Maddage and Pradeep K.
Atrey",
title = "Securing Speech Noise Reduction in Outsourced
Environment",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "51:1--51:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3105970",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Cloud data centers (CDCs) are becoming a
cost-effective method for processing and storage of
multimedia data including images, video, and audio.
Since CDCs are physically located in different
jurisdictions, and are managed by external parties,
data security is a growing concern. Data encryption at
CDCs is commonly practiced to improve data security.
However, to process the data at CDCs, data must often
be decrypted, which raises security issues. Thus,
there is a growing demand for data processing
techniques in encrypted domain in such an outsourced
environment. In this article, we analyze encrypted
domain speech content processing techniques for noise
reduction. Noise contaminates speech during
transmission or during the acquisition process by
recording. As a result, the quality of the speech
content is degraded. We apply Shamir's secret sharing
as the cryptosystem to encrypt speech data before
uploading it to a CDC. We then propose finite impulse
response digital filters to reduce white and wind noise
in the speech in the encrypted domain. We prove that
our proposed schemes meet the security requirements of
efficiency, accuracy, and checkability for both
semi-honest and malicious adversarial models.
Experimental results show that our proposed filtering
techniques for speech noise reduction in the encrypted
domain produce similar results when compared to
plaintext domain processing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
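Why an FIR filter can run directly on Shamir shares, as the abstract above describes: both secret sharing and FIR filtering are linear, so each server can convolve its own share stream and reconstructing the filtered shares yields the filtered plaintext. The prime field, integer taps, and toy samples below are assumptions for illustration; the paper's filters and fixed-point handling are more involved.

import random

P = 2**31 - 1                                  # prime field for the shares (illustrative)

def share(secret, n=3, k=2):
    # Split one sample into n Shamir shares with threshold k.
    coeffs = [secret % P] + [random.randrange(P) for _ in range(k - 1)]
    return [(x, sum(c * pow(x, i, P) for i, c in enumerate(coeffs)) % P)
            for x in range(1, n + 1)]

def reconstruct(shares):
    # Lagrange interpolation at 0 recovers the secret from k shares.
    total = 0
    for i, (xi, yi) in enumerate(shares):
        num, den = 1, 1
        for j, (xj, _) in enumerate(shares):
            if i != j:
                num = num * (-xj) % P
                den = den * (xi - xj) % P
        total = (total + yi * num * pow(den, P - 2, P)) % P
    return total

taps = [1, 1, 1]                               # integer-tap FIR filter (crude smoother)
signal = [5, 7, 6, 9, 4]                       # toy speech samples
shared = [share(s) for s in signal]            # one list of (x, y) shares per sample
filtered = []
for n in range(len(taps) - 1, len(signal)):
    # Each server x convolves only its own shares; linearity keeps the sharing valid.
    filtered.append([(x, sum(t * shared[n - d][x - 1][1] for d, t in enumerate(taps)) % P)
                     for x in (1, 2, 3)])
print([reconstruct(fs[:2]) for fs in filtered])   # equals plaintext convolution [18, 22, 19]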
@Article{Guerrini:2017:IFR,
author = "Fabrizio Guerrini and Nicola Adami and Sergio Benini
and Alberto Piacenza and Julie Porteous and Marc
Cavazza and Riccardo Leonardi",
title = "Interactive Film Recombination",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "52:1--52:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3103241",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we discuss an innovative media
entertainment application called Interactive
Movietelling. As an offspring of Interactive
Storytelling applied to movies, we propose to integrate
narrative generation through artificial intelligence
(AI) planning with video processing and modeling to
construct filmic variants starting from the baseline
content. The integration is possible thanks to content
description using semantic attributes pertaining to
intermediate-level concepts shared between video
processing and planning levels. The output is a
recombination of segments taken from the input movie
performed so as to convey an alternative plot. User
tests on the prototype proved how promising Interactive
Movietelling might be, even though it was designed at a
proof-of-concept level. Possible improvements that are
suggested here lead to many challenging research
issues.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhou:2017:CCB,
author = "Mingliang Zhou and Yongfei Zhang and Bo Li and Xupeng
Lin",
title = "Complexity Correlation-Based {CTU}-Level Rate Control
with Direction Selection for {HEVC}",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "53:1--53:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3107616",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Rate control is a crucial consideration in
high-efficiency video coding (HEVC). The estimation of
model parameters is very important for coding tree unit
(CTU)-level rate control, as it will significantly
affect bit allocation and thus coding performance.
However, the model parameters in the CTU-level rate
control sometimes fail because of inadequate
consideration of the correlation between model
parameters and complexity characteristics. In this
study, we establish a novel complexity
correlation-based CTU-level rate control for HEVC.
First, we formulate the model parameter estimation
scheme as a multivariable estimation problem; second,
based on the complexity correlation of the neighbouring
CTU, an optimal direction is selected in five
directions for reference CTU set selection during model
parameter estimation to further improve the prediction
accuracy of the complexity of the current CTU. Third,
to improve their precision, the relationship between
the model parameters and the complexity of the
reference CTU set in the optimal direction is
established by using the least squares (LS) method, and the
model parameters are solved via the estimated
complexity of the current CTU. Experimental results
show that the proposed algorithm can significantly
improve the accuracy of the CTU-level rate control and
thus the coding performance; the proposed scheme
consistently outperforms HM 16.0 and other
state-of-the-art algorithms in a variety of testing
configurations. More specifically, up to 8.4\% and on
average 6.4\% BD-Rate reduction is achieved compared to
HM 16.0 and up to 4.7\% and an average of 3.4\% BD-Rate
reduction is achieved compared to other algorithms,
with only a slight complexity overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
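The least-squares step above can be sketched as a linear fit of model parameter versus complexity over the reference CTUs selected along the chosen direction, followed by prediction at the current CTU's estimated complexity. All numbers are illustrative, not taken from the paper.

import numpy as np

# Complexities and observed rate-control model parameters of reference CTUs
# along the selected direction (illustrative values).
complexity = np.array([12.0, 15.5, 14.2, 18.9])
param      = np.array([ 3.1,  3.9,  3.6,  4.8])

# Least-squares fit: param ~= a * complexity + b.
A = np.vstack([complexity, np.ones_like(complexity)]).T
(a, b), *_ = np.linalg.lstsq(A, param, rcond=None)

# Predict the model parameter of the current CTU from its estimated complexity.
current_complexity = 16.3
print(round(float(a * current_complexity + b), 3))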
@Article{Sharrab:2017:MAP,
author = "Yousef O. Sharrab and Nabil J. Sarhan",
title = "Modeling and Analysis of Power Consumption in Live
Video Streaming Systems",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "54:1--54:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115505",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article develops an aggregate power consumption
model for live video streaming systems, including
many-to-many systems. In many-to-one streaming systems,
multiple video sources (i.e., cameras and/or sensors)
stream videos to a monitoring station. We model the
power consumed by the video sources in the capturing,
encoding, and transmission phases and then provide an
overall model in terms of the main capturing and
encoding parameters, including resolution, frame rate,
number of reference frames, motion estimation range,
and quantization. We also analyze the power consumed by
the monitoring station due to receiving, decoding, and
upscaling the received video streams. In addition to
modeling the power consumption, we model the achieved
bitrate of video encoding. We validate the developed
models through extensive experiments using two types of
systems and different video contents. Furthermore, we
analyze many-to-one systems in terms of bitrate, video
quality, and the power consumed by the sources, as well
as that by the monitoring station, considering the
impacts of multiple parameters simultaneously.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ng:2017:WSD,
author = "Pai Chet Ng and James She and Kang Eun Jeon and
Matthias Baldauf",
title = "When Smart Devices Interact With Pervasive Screens: a
Survey",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "55:1--55:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115933",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The meeting of pervasive screens and smart devices has
witnessed the birth of screen-smart device interaction
(SSI), a key enabler of many novel interactive use
cases. Most current surveys focus on direct
human-screen interaction, and to the best of our
knowledge, none have studied state-of-the-art SSI. This
survey identifies three core elements of SSI and
delivers a timely discussion on SSI oriented around the
screen, the smart device, and the interaction modality.
Two evaluation metrics (i.e., interaction latency and
accuracy) have been adopted and refined to match the
evaluation criteria of SSI. The bottlenecks that
hinder the further advancement of the current SSI in
connection with these metrics are studied. Last, future
research challenges and opportunities are highlighted
in the hope of inspiring continuous research efforts to
realize the next generation of SSI.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Franti:2017:MMO,
author = "Pasi Fr{\"a}nti and Radu Mariescu-Istodor and Lahari
Sengupta",
title = "{O-Mopsi}: Mobile Orienteering Game for Sightseeing,
Exercising, and Education",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "56:1--56:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115935",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Location-based games have been around already since
2000 but only recently when PokemonGo came to markets
it became clear that they can reach wide popularity. In
this article, we perform a literature-based analytical
study of what kind of issues location-based game design
faces, and how they can be solved. We study how to use
and verify the location, the role of the games as
exergames, use in education, and study technical and
safety issues. As a case study, we present O-Mopsi game
that combines physical activity with problem solving.
It includes three challenges: (1) navigating to the
next target, (2) deciding the order of targets, (3)
physical movement. All of them are unavoidable and
relevant. For guiding the players, we use three types
of multimedia: images (targets and maps), sound (user
guidance), and GPS (for positioning). We discuss
motivational aspects, analysis of the playing, and
content creation. The quality of experience is
reported based on playing in SciFest Science festivals
during 2011--2016.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Messaoudi:2017:PAG,
author = "Farouk Messaoudi and Adlen Ksentini and Gwendal Simon
and Philippe Bertin",
title = "Performance Analysis of Game Engines on Mobile and
Fixed Devices",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "57:1--57:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115934",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mobile gaming is an emerging concept wherein gamers
are using mobile devices, such as smartphones and tablets,
to play best-selling games. Compared to dedicated gaming
boxes or PCs, these devices still fall short of
executing new, complex 3D video games with rich
immersion. Three novel solutions, relying on cloud
computing infrastructure, namely, computation
offloading, cloud gaming, and client-server
architecture, will represent the next generation of
game engine architecture aiming at improving the gaming
experience. The basis of these aforementioned solutions
is the distribution of the game code over different
devices (including set-top boxes, PCs, and servers). In
order to know how the game code should be distributed,
advanced knowledge of game engines is required.
Consequently, dissecting and analyzing game engine
performance will help us to better understand how to
move in these new directions (i.e., distributing game
code); such an analysis is so far missing in the literature.
Aiming at filling this gap, we propose in this article
to analyze and evaluate one of the famous engines in
the market, that is, ``Unity 3D.'' We begin by
detailing the architecture and the game logic of game
engines. Then, we propose a test-bed to evaluate the
CPU and GPU consumption per frame and per module for
nine representative games on three platforms, namely, a
stand-alone computer, embedded systems, and web
players. Based on the obtained results and
observations, we build a valued graph of each module,
composing the Unity 3D architecture, which reflects the
internal flow and CPU consumption. Finally, we compare
these architectures in terms of CPU consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheung:2017:ECF,
author = "Ming Cheung and Xiaopeng Li and James She",
title = "An Efficient Computation Framework for Connection
Discovery using Shared Images",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "58:1--58:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115951",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the advent and popularity of the social network,
social graphs become essential to improve services and
information relevance to users for many social media
applications to predict follower/followee relationship,
community membership, and so on. However, the social
graphs could be hidden by users due to privacy concerns
or kept by social media. Recently, connections
discovered from user-shared images using
machine-generated labels are proved to be more
accessible alternatives to social graphs. But real-time
discovery is difficult due to high complexity, and many
applications are not possible. This article proposes an
efficient computation framework for connection
discovery using user-shared images, which is suitable
for any image processing and computer vision techniques
for connection discovery on the fly. The framework
includes the architecture of online computation to
facilitate real-time processing, offline computation
for a complete processing, and online/offline
communication. The proposed framework is implemented to
demonstrate its effectiveness by speeding up connection
discovery through user-shared images. By studying 300K+
user-shared images from two popular social networks, it
is proven that the proposed computation framework
reduces runtime by 90\% while achieving accuracy comparable
to that of existing frameworks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2017:DSF,
author = "Xiaopeng Li and Ming Cheung and James She",
title = "A Distributed Streaming Framework for Connection
Discovery Using Shared Videos",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "59:1--59:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3120996",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the advances in mobile devices and the popularity
of social networks, users can share multimedia content
anytime, anywhere. One of the most important types of
emerging content is video, which is commonly shared on
platforms such as Instagram and Facebook. User
connections, which indicate whether two users are
follower/followee or have the same interests, are
essential to improve services and information relevant
to users for many social media applications. But they
are normally hidden due to users' privacy concerns or
are kept confidential by social media sites. Using
user-shared content is an alternative way to discover
user connections. This article proposes to use
user-shared videos for connection discovery with the
Bag of Feature Tagging method and proposes a
distributed streaming computation framework to
facilitate the analytics. Exploiting the uniqueness of
shared videos, the proposed framework is divided into
Streaming processing and Online and Offline
Computation. With experiments using a dataset from
Twitter, it has been proved that the proposed method
using user-shared videos for connection discovery is
feasible. The proposed computation framework
significantly accelerates the analytics, reducing the
processing time to only 32\% for follower/followee
recommendation. It has also been proved that comparable
performance can be achieved with only partial data for
each video, leading to more efficient computation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{DeBoer:2017:SRZ,
author = "Maaike H. T. {De Boer} and Yi-Jie Lu and Hao Zhang and
Klamer Schutte and Chong-Wah Ngo and Wessel Kraaij",
title = "Semantic Reasoning in Zero Example Video Event
Retrieval",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "60:1--60:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131288",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Searching in digital video data for high-level events,
such as a parade or a car accident, is challenging when
the query is textual and lacks visual example images or
videos. Current research in deep neural networks is
highly beneficial for the retrieval of high-level
events using visual examples, but without examples it
is still hard to (1) determine which concepts are
useful to pre-train (Vocabulary challenge) and (2)
which pre-trained concept detectors are relevant for a
certain unseen high-level event (Concept Selection
challenge). In our article, we present our Semantic
Event Retrieval System which (1) shows the importance
of high-level concepts in a vocabulary for the
retrieval of complex and generic high-level events and
(2) uses a novel concept selection method ( i-w2v )
based on semantic embeddings. Our experiments on the
international TRECVID Multimedia Event Detection
benchmark show that a diverse vocabulary including
high-level concepts improves performance on the
retrieval of high-level events in videos and that our
novel method outperforms a knowledge-based concept
selection method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
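Concept selection from semantic embeddings, as described above, can be sketched by ranking pre-trained concept detectors by cosine similarity between the event-query embedding and each concept-name embedding; the toy random vectors stand in for word2vec-style embeddings, and the paper's i-w2v weighting is not reproduced.

import numpy as np

def select_concepts(query_vec, concept_vecs, top_k=3):
    # Rank concept detectors by cosine similarity to the event query embedding.
    def cos(a, b):
        return float(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    scores = {name: cos(query_vec, v) for name, v in concept_vecs.items()}
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# Toy 4-D "embeddings"; in practice these would come from a word2vec-style model.
rng = np.random.default_rng(1)
concepts = {name: rng.normal(size=4)
            for name in ["parade", "marching_band", "car", "kitchen", "crowd"]}
query = concepts["parade"] + 0.3 * concepts["crowd"]   # event: "a parade in town"
print(select_concepts(query, concepts))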
@Article{Guo:2017:EMD,
author = "Jianting Guo and Peijia Zheng and Jiwu Huang",
title = "An Efficient Motion Detection and Tracking Scheme for
Encrypted Surveillance Videos",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "61:1--61:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131342",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Performing detection on surveillance videos
contributes significantly to the goals of safety and
security. However, performing detection on unprotected
surveillance video may compromise the privacy of innocent
people in the video. Therefore, striking a proper
balance between maintaining personal privacy while
enhancing the feasibility of detection is an important
issue. One promising solution to this problem is to
encrypt the surveillance videos and perform detection
on the encrypted videos. Most existing encrypted signal
processing methods focus on still images or small data
volumes; however, because videos are typically much
larger, investigating how to process encrypted videos
is a significant challenge. In this article, we propose
an efficient motion detection and tracking scheme for
encrypted H.264/AVC video bitstreams, which does not
require prior decryption of the encrypted video.
The main idea is to first estimate motion information
from the bitstream structure and codeword length and,
then, propose a region update (RU) algorithm to deal
with the loss and error drifting of motion caused by
the video encryption. The RU algorithm is designed
based on the prior knowledge that the object motion in
the video is continuous in space and time. Compared to
the existing scheme, which is based on video encryption
that occurs at the pixel level, the proposed scheme has
the advantages of requiring only small storage for the
encrypted video and having a low computational cost for
both encryption and detection. Experimental results
show that our scheme performs better regarding
detection accuracy and execution speed. Moreover, the
proposed scheme can work with more than one
format-compliant video encryption method, provided that
the positions of the macroblocks can be extracted from
the encrypted video bitstream. Due to the coupling of
video stream encryption and detection algorithms, our
scheme can be directly connected to the video stream
output (e.g., surveillance cameras) without requiring
any camera modifications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Motamedi:2017:PPF,
author = "Mohammad Motamedi and Philipp Gysel and Soheil
Ghiasi",
title = "{PLACID}: a Platform for {FPGA}-Based Accelerator
Creation for {DCNNs}",
journal = j-TOMM,
volume = "13",
number = "4",
pages = "62:1--62:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131289",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Dec 23 10:49:23 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Deep Convolutional Neural Networks (DCNNs) exhibit
remarkable performance in a number of pattern
recognition and classification tasks. Modern DCNNs
involve many millions of parameters and billions of
operations. Inference using such DCNNs, if implemented
as software running on an embedded processor, results
in considerable execution time and energy consumption,
which is prohibitive in many mobile applications.
Field-programmable gate array (FPGA)-based acceleration
of DCNN inference is a promising approach to improve
both energy consumption and classification throughput.
However, the engineering effort required for
development and verification of an optimized FPGA-based
architecture is significant. In this article, we
present PLACID, an automated PLatform for Accelerator
CreatIon for DCNNs. PLACID uses an analytical approach
to characterization and exploration of the
implementation space. PLACID enables generation of an
accelerator with the highest throughput for a given
DCNN on a specific target FPGA platform. Subsequently,
it generates an RTL-level architecture in Verilog,
which can be passed to commercial tools for FPGA
implementation. PLACID is fully automated, and reduces
the accelerator design time from a few months down to a
few hours. Experimental results show that architectures
synthesized by PLACID yield 2$ \times $ higher
throughput density than the best competing approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Akputu:2018:ERU,
author = "Oryina Kingsley Akputu and Kah Phooi Seng and Yunli
Lee and Li-Minn Ang",
title = "Emotion Recognition Using Multiple Kernel Learning
toward E-learning Applications",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131287",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Adaptive Educational Hypermedia (AEH) e-learning
models aim to personalize educational content and
learning resources based on the needs of an individual
learner. The Adaptive Hypermedia Architecture (AHA) is
a specific implementation of the AEH model that
exploits the cognitive characteristics of learner
feedback to adapt resources accordingly. However,
besides cognitive feedback, the learning realm generally
includes both the affective and emotional feedback of
the learner, which is often neglected in the design of
e-learning models. This article aims to explore the
potential of utilizing affect or emotion recognition
research in AEH models. To this end, an emotion recognition
framework, referred to as Multiple Kernel Learning Decision
Tree Weighted Kernel Alignment (MKLDT-WFA), is proposed. The MKLDT-WFA has two merits
over classical MKL. First, the WFA component only
preserves the relevant kernel weights to reduce
redundancy and improve the discrimination for emotion
classes. Second, training via the decision tree reduces
the misclassification issues associated with the
SimpleMKL. The proposed work has been evaluated on
different emotion datasets, and the results confirm its
good performance. Finally, a conceptual
Emotion-based E-learning Model (EEM) incorporating the proposed
emotion recognition framework is outlined for future
work.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2018:LLP,
author = "Kai Li and Guo-Jun Qi and Kien A. Hua",
title = "Learning Label Preserving Binary Codes for Multimedia
Retrieval: a General Approach",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152126",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Learning-based hashing has been researched extensively
in the past few years due to its great potential in
fast and accurate similarity search among huge volumes
of multimedia data. In this article, we present a novel
multimedia hashing framework, called Label Preserving
Multimedia Hashing (LPMH) for multimedia similarity
search. In LPMH, a general optimization method is used
to learn the joint binary codes of multiple media types
by explicitly preserving semantic label information.
Compared with existing hashing methods which are
typically developed under and thus restricted to some
specific objective functions, the proposed optimization
strategy is not tied to any specific loss function and
can easily incorporate bit balance constraints to
produce well-balanced binary codes. Specifically, our
formulation leads to a set of Binary Integer
Programming (BIP) problems that have exact solutions
both with and without bit balance constraints. These
problems can be solved extremely fast and the solution
can easily scale up to large-scale datasets. In the
hash function learning stage, the boosted decision
trees algorithm is utilized to learn multiple
media-specific hash functions that can map
heterogeneous data sources into a homogeneous Hamming
space for cross-media retrieval. We have
comprehensively evaluated the proposed method using a
range of large-scale datasets in both single-media and
cross-media retrieval tasks. The experimental results
demonstrate that LPMH is competitive with
state-of-the-art methods in both speed and accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ceballos:2018:IEC,
author = "Rodrigo Ceballos and Beatrice Ionascu and Wanjoo Park
and Mohamad Eid",
title = "Implicit Emotion Communication: {EEG} Classification
and Haptic Feedback",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152128",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Today, ubiquitous digital communication systems do not
have an intuitive, natural way of communicating
emotion, which, in turn, affects the degree to which
humans can emotionally connect and interact with one
another. To address this problem, a more natural,
intuitive, and implicit emotion communication system
was designed and created that employs asymmetry-based
EEG emotion classification for detecting the emotional
state of the sender and haptic feedback (in the form of
tactile gestures) for displaying emotions for a
receiver. Emotions are modeled in terms of valence
(positive/negative emotions) and arousal (intensity of
the emotion). Performance analysis shows that the
proposed EEG subject-dependent emotion classification
model with Free Asymmetry features allows for more
flexible feature-generation schemes than other existing
algorithms and attains an average accuracy of 92.5\%
for valence and 96.5\% for arousal, outperforming
previous-generation schemes in high feature space. As
for the haptic feedback, a tactile gesture authoring
tool and a haptic jacket were developed to design
tactile gestures that can intensify emotional reactions
in terms of valence and arousal. Experimental study
demonstrated that subject-independent emotion
transmission through tactile gestures is effective for
the arousal dimension of an emotion but is less
effective for valence. Consistency in subject-dependent
responses for both valence and arousal suggests that
personalized tactile gestures would be more
effective.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2018:DAQ,
author = "Jiyan Wu and Bo Cheng and Yuan Yang and Ming Wang and
Junliang Chen",
title = "Delay-Aware Quality Optimization in Cloud-Assisted
Video Streaming System",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152116",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Cloud-assisted video streaming has emerged as a new
paradigm to optimize multimedia content distribution
over the Internet. This article investigates the
problem of streaming cloud-assisted real-time video to
multiple destinations (e.g., cloud video conferencing,
multi-player cloud gaming, etc.) over lossy
communication networks. The user diversity and network
dynamics result in the delay differences among multiple
destinations. This research proposes the Differentiated
cloud-Assisted VIdeo Streaming (DAVIS) framework, which
proactively leverages such delay differences in video
coding and transmission optimization. First, we
analytically formulate the optimization problem of
joint coding and transmission to maximize received
video quality. Second, we develop a quality
optimization framework that integrates the video
representation selection and FEC (Forward Error
Correction) packet interleaving. The proposed DAVIS is
able to effectively perform differentiated quality
optimization for multiple destinations by taking
advantage of the delay differences in cloud-assisted
video streaming systems. We conduct the performance
evaluation through extensive experiments with the
Amazon EC2 instances and Exata emulation platform.
Evaluation results show that DAVIS outperforms the
reference cloud-assisted streaming solutions in video
quality and delay performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jiang:2018:DBC,
author = "Shuhui Jiang and Yue Wu and Yun Fu",
title = "Deep Bidirectional Cross-Triplet Embedding for Online
Clothing Shopping",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152114",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we address the cross-domain (i.e.,
street and shop) clothing retrieval problem and
investigate its real-world applications for online
clothing shopping. It is a challenging problem due to
the large discrepancy between street and shop domain
images. We focus on learning an effective
feature-embedding model to generate robust and
discriminative feature representation across domains.
Existing triplet embedding models achieve promising
results by finding an embedding metric in which the
distance between negative pairs is larger than the
distance between positive pairs plus a margin. However,
existing methods do not address the challenges in the
cross-domain clothing retrieval scenario sufficiently.
First, the intradomain and cross-domain data
relationships need to be considered simultaneously.
Second, the numbers of matched and nonmatched
cross-domain pairs are unbalanced. To address these
challenges, we propose a deep cross-triplet embedding
algorithm together with a cross-triplet sampling
strategy. The extensive experimental evaluations
clearly demonstrate the effectiveness of the proposed
algorithms. Furthermore, we investigate two novel
online shopping applications, clothing try-on and
accessory recommendation, based on a unified
cross-domain clothing retrieval framework.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2018:DFI,
author = "Peisong Wang and Qinghao Hu and Zhiwei Fang and
Chaoyang Zhao and Jian Cheng",
title = "{DeepSearch}: a Fast Image Search Framework for Mobile
Devices",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152127",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Content-based image retrieval (CBIR) is one of the
most important applications of computer vision. In
recent years, there have been many important advances
in the development of CBIR systems, especially
Convolutional Neural Networks (CNNs) and other
deep-learning techniques. On the other hand, current
CNN-based CBIR systems suffer from high computational
complexity of CNNs. This problem becomes more severe as
mobile applications become more and more popular. The
current practice is to deploy the entire CBIR system
on the server side, while the client side only serves as
an image provider. This architecture can increase the
computational burden on the server side, which needs to
process thousands of requests per second. Moreover,
sending images carries the risk of personal
information leakage. As the need for mobile search
expands, concerns about privacy are growing. In this
article, we propose a fast image search framework,
named DeepSearch, which makes complex image search
based on CNNs feasible on mobile phones. To cope with
the huge computational cost of CNN models, we present a tensor
Block Term Decomposition (BTD) approach as well as a
nonlinear response reconstruction method to accelerate
the CNNs involved in object detection and feature
extraction. The extensive experiments on the ImageNet
dataset and Alibaba Large-scale Image Search Challenge
dataset show that the proposed accelerating approach
BTD can significantly speed up the CNN models and
make CNN-based image search practical on
common smartphones.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2018:RMV,
author = "Sicong Liu and Silvestro Roberto Poccia and K.
Sel{\c{c}}uk Candan and Maria Luisa Sapino and Xiaolan
Wang",
title = "Robust Multi-Variate Temporal Features of
Multi-Variate Time Series",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152123",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Many applications generate and/or consume
multi-variate temporal data, and experts often lack the
means to adequately and systematically search for and
interpret multi-variate observations. In this article,
we first observe that multi-variate time series often
carry localized multi-variate temporal features that
are robust against noise. We then argue that these
multi-variate temporal features can be extracted by
simultaneously considering, at multiple scales,
temporal characteristics of the time series along with
external knowledge, including variate relationships
that are known a priori. Relying on these observations,
we develop data models and algorithms to detect robust
multi-variate temporal (RMT) features that can be
indexed for efficient and accurate retrieval and can be
used for supporting data exploration and analysis
tasks. Experiments confirm that the proposed RMT
algorithm is highly effective and efficient in
identifying robust multi-scale temporal features of
multi-variate time series.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Guo:2018:OEL,
author = "Dan Guo and Wengang Zhou and Houqiang Li and Meng
Wang",
title = "Online Early-Late Fusion Based on Adaptive {HMM} for
Sign Language Recognition",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152121",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In sign language recognition (SLR) with multimodal
data, a sign word can be represented by multiple
features, among which there exist an intrinsic property
and a mutually complementary relationship.
To fully explore those relationships, we propose an
online early-late fusion method based on the adaptive
Hidden Markov Model (HMM). In terms of the intrinsic
property, we discover that inherent latent change
states of each sign are related not only to the number
of key gestures and body poses but also to their
translation relationships. We propose an adaptive HMM
method to obtain the hidden state number of each sign
by affinity propagation clustering. For the
complementary relationship, we propose an online
early-late fusion scheme. The early fusion (feature
fusion) is dedicated to preserving useful information
to achieve a better complementary score, while the late
fusion (score fusion) uncovers the significance of
those features and aggregates them in a weighted
manner. Different from classical fusion methods, the
fusion is query adaptive. For different queries, after
feature selection (including the combined feature), the
fusion weight is inversely proportional to the area
under the curve of the normalized query score list for
each selected feature. The whole fusion process is
effective and efficient. Experiments verify the
effectiveness of the method on signer-independent SLR with a large
vocabulary. Whether compared across different dataset sizes
or with different SLR models, our method demonstrates
consistent and promising performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2018:JEA,
author = "Huei-Fang Yang and Bo-Yao Lin and Kuang-Yu Chang and
Chu-Song Chen",
title = "Joint Estimation of Age and Expression by Combining
Scattering and Convolutional Networks",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "9:1--9:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152118",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article tackles the problem of joint estimation
of human age and facial expression. This is an
important yet challenging problem because expressions
can alter face appearances in a similar manner to human
aging. Different from previous approaches that deal
with the two tasks independently, our approach trains a
convolutional neural network (CNN) model that unifies
ordinal regression and multi-class classification in a
single framework. We demonstrate experimentally that
our method performs more favorably against
state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2018:EHD,
author = "Shao Huang and Weiqiang Wang and Shengfeng He and
Rynson W. H. Lau",
title = "Egocentric Hand Detection Via Dynamic Region Growing",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "10:1--10:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152129",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Egocentric videos, which mainly record the activities
carried out by the users of wearable cameras, have
drawn much research attention in recent years. Due to
their lengthy content, a large number of ego-related
applications have been developed to abstract the
captured videos. As the users are accustomed to
interacting with the target objects using their own
hands, and their hands usually appear within their
visual fields during the interaction, an egocentric
hand detection step is involved in tasks like gesture
recognition, action recognition, and social interaction
understanding. In this work, we propose a dynamic
region-growing approach for hand region detection in
egocentric videos, by jointly considering hand-related
motion and egocentric cues. We first determine seed
regions that most likely belong to the hand, by
analyzing the motion patterns across successive frames.
The hand regions can then be located by extending from
the seed regions, according to the scores computed for
the adjacent superpixels. These scores are derived from
four egocentric cues: contrast, location, position
consistency, and appearance continuity. We discuss how
to apply the proposed method in real-life scenarios,
where multiple hands irregularly appear and disappear
from the videos. Experimental results on public
datasets show that the proposed method achieves
superior performance compared with the state-of-the-art
methods, especially in complicated scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wen:2018:VBR,
author = "Jiqing Wen and James She and Xiaopeng Li and Hui Mao",
title = "Visual Background Recommendation for Dance
Performances Using Deep Matrix Factorization",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "11:1--11:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3152463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The stage background is one of the most important
features for a dance performance, as it helps to create
the scene and atmosphere. In conventional dance
performances, the background images are usually
selected or designed by professional stage designers
according to the theme and the style of the dance. In
new media dance performances, the stage effects are
usually generated by media editing software. Selecting
or producing a dance background is quite challenging
and is generally carried out by skilled technicians.
The goal of the research reported in this article is to
ease this process. Instead of searching for background
images from the sea of available resources, dancers are
recommended images that they are more likely to use.
This work proposes the idea of a novel system to
recommend images based on content-based social
computing. The core part of the system is a
probabilistic prediction model to predict a dancer's
interests in candidate images through social platforms.
Different from traditional collaborative filtering or
content-based models, the model proposed here
effectively combines a dancer's social behaviors
(rating action, click action, etc.) with the visual
content of images shared by the dancer using deep
matrix factorization (DMF). With the help of such a
system, dancers can select from the recommended images
and set them as the backgrounds of their dance
performances through a media editor. According to the
experimental results, the proposed DMF model outperforms
the previous methods, and its advantage is even more
pronounced when the dataset is very sparse.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pan:2018:AFP,
author = "Zhaoqing Pan and Jianjun Lei and Yajuan Zhang and Fu
Lee Wang",
title = "Adaptive Fractional-Pixel Motion Estimation Skipped
Algorithm for Efficient {HEVC} Motion Estimation",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "12:1--12:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3159170",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "High-Efficiency Video Coding (HEVC) efficiently
addresses the storage and transmission problems of
high-definition videos, especially for 4K videos. The
variable-size Prediction Units (PUs)--based Motion
Estimation (ME) contributes a significant compression
rate to the HEVC encoder and also generates a huge
computational load. Meanwhile, the high encoding
complexity prevents widespread adoption of the HEVC
encoder in multimedia systems. In this article, an
adaptive fractional-pixel ME skipped scheme is proposed
for low-complexity HEVC ME. First, based on the
property of the variable-size PUs--based ME process and
the video content partition relationship among
variable-size PUs, all inter-PU modes during a coding
unit encoding process are classified into root-type PU
mode and children-type PU modes. Then, according to the
ME result of the root-type PU mode, the
fractional-pixel ME of its children-type PU modes is
adaptively skipped. Simulation results show that,
compared to the original ME in HEVC reference software,
the proposed algorithm reduces ME encoding time by an
average of 63.22\% while encoding efficiency
performance is maintained.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zheng:2018:DLC,
author = "Zhedong Zheng and Liang Zheng and Yi Yang",
title = "A Discriminatively Learned {CNN} Embedding for Person
Reidentification",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "13:1--13:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3159171",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we revisit two popular convolutional
neural networks in person re-identification (re-ID):
verification and identification models. The two models
have their respective advantages and limitations due to
different loss functions. Here, we shed light on how to
combine the two models to learn more discriminative
pedestrian descriptors. Specifically, we propose a
Siamese network that simultaneously computes the
identification loss and verification loss. Given a pair
of training images, the network predicts the identities
of the two input images and whether they belong to the
same identity. Our network learns a discriminative
embedding and a similarity measurement at the same
time, thus making full use of the re-ID annotations.
Our method can be easily applied on different
pretrained networks. Albeit simple, the learned
embedding improves the state-of-the-art performance on
two public person re-ID benchmarks. Further, we show
that our architecture can also be applied to image
retrieval. The code is available at
\url{https://github.com/layumi/2016_person_re-ID}.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sun:2018:RPP,
author = "Weiwei Sun and Jiantao Zhou and Shuyuan Zhu and Yuan
Yan Tang",
title = "Robust Privacy-Preserving Image Sharing over Online
Social Networks {(OSNs)}",
journal = j-TOMM,
volume = "14",
number = "1",
pages = "14:1--14:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3165265",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jan 16 18:18:12 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Sharing images online has become extremely easy and
popular due to the ever-increasing adoption of mobile
devices and online social networks (OSNs). The privacy
issues arising from image sharing over OSNs have
received significant attention in recent years. In this
article, we consider the problem of designing a secure,
robust, high-fidelity, storage-efficient image-sharing
scheme over Facebook, a representative OSN that is
widely accessed. To accomplish this goal, we first
conduct an in-depth investigation of the manipulations
that Facebook performs on uploaded images. Assisted
by such knowledge, we propose a DCT-domain image
encryption/decryption framework that is robust against
these lossy operations. As verified theoretically and
experimentally, superior performance in terms of data
privacy, quality of the reconstructed images, and
storage cost can be achieved.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Berretti:2018:IAS,
author = "Stefano Berretti",
title = "Improved Audio Steganalytic Feature and Its
Applications in Audio Forensics",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "43:1--43:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3190575",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Digital multimedia steganalysis has attracted wide
attention over the past decade. Currently, there are
many algorithms for detecting image steganography.
However, little research has been devoted to audio
steganalysis. Since the statistical properties of image
and audio files are quite different, features that are
effective in image steganalysis may not be effective
for audio. In this article, we design an improved audio
steganalytic feature set derived from both the time and
Mel-frequency domains for detecting some typical
steganography in the time domain, including LSB
matching, Hide4PGP, and Steghide. The experimental
results, evaluated on different audio sources,
including various music and speech clips of different
complexity, have shown that the proposed features
significantly outperform the existing ones. Moreover,
we use the proposed features to detect and further
identify some typical audio operations that would
probably be used in audio tampering. The extensive
experimental results have shown that the proposed
features also outperform the related forensic methods,
especially when the length of the audio clip is small,
such as audio clips with 800 samples. This is very
important in real forensic situations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gupta:2018:AGM,
author = "Abhinav Gupta and Divya Singhal",
title = "Analytical Global Median Filtering Forensics Based on
Moment Histograms",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "44:1--44:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3176650",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Median filtering forensics in images has gained wide
attention from researchers in recent years because of
its inherent nature of preserving visual traces.
Although many forensic methods are developed for median
filtering detection, the probability of detection decreases
under JPEG compression at low-quality factors and for
low-resolution images. Feature set reduction is
also a challenging issue among existing detectors. In
this article, a 19-dimensional feature set is
analytically derived from image skewness and kurtosis
histograms. This new feature set is exploited for the
purpose of global median filtering forensics and
verified with exhaustive experimental results. The
efficacy of the method is tested on six popular
databases (UCID, BOWS2, BOSSBase, NRCS, RAISE, and DID)
and it is found that the new feature set uncovers filtering
traces under moderate and low JPEG post-compression and
low-resolution operation. Our proposed method yields the
lowest probability of error and the largest area under the
ROC curve for most of the test cases in comparison with
previous approaches. Some novel test cases are
introduced to thoroughly assess the benefits and
limitations of the proposed method. The obtained
results indicate that the proposed method would provide
an important tool to the field of passive image
forensics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2018:MSH,
author = "Min Huang and Song-Zhi Su and Hong-Bo Zhang and
Guo-Rong Cai and Dongying Gong and Donglin Cao and
Shao-Zi Li",
title = "Multifeature Selection for {$3$D} Human Action
Recognition",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "45:1--45:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177757",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In mainstream approaches for 3D human action
recognition, depth and skeleton features are combined
to improve recognition accuracy. However, this strategy
results in high feature dimensions and low
discrimination due to redundant feature vectors. To
address this drawback, a multi-feature selection approach
for 3D human action recognition is proposed in this
paper. First, three novel single-modal features are
proposed to describe depth appearance, depth motion,
and skeleton motion. Second, a classification entropy
of random forest is used to evaluate the discrimination
of the depth appearance-based features. Finally, one of
the three features is selected to recognize the sample
according to the discrimination evaluation.
Experimental results show that the proposed
multi-feature selection approach significantly
outperforms other approaches based on single-modal
feature and feature fusion.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mazaheri:2018:LMC,
author = "Amir Mazaheri and Boqing Gong and Mubarak Shah",
title = "Learning a Multi-Concept Video Retrieval Model with
Multiple Latent Variables",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "46:1--46:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3176647",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Effective and efficient video retrieval has become a
pressing need in the ``big video'' era. The objective
of this work is to provide a principled model for
computing the ranking scores of a video in response to
one or more concepts, where the concepts could be
directly supplied by users or inferred by the system
from the user queries. Indeed, how to deal with
multi-concept queries has become a central component in
modern video retrieval systems that accept text
queries. However, it has been long overlooked and
simply implemented by weighted averaging of the
corresponding concept detectors' scores. Our approach,
which can be considered as a latent ranking SVM,
integrates the advantages of various recent works in
text and image retrieval, such as choosing ranking over
structured prediction, modeling inter-dependencies
between querying concepts, and so on. Videos consist of
shots, and we use latent variables to account for the
mutually complementary cues within and across shots.
Concept labels of shots are scarce and noisy. We
introduce a simple and effective technique to make our
model robust to outliers. Our approach gives superior
performance when it is tested on not only the queries
seen at training but also novel queries, some of which
consist of more concepts than the queries used for
training.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tulilaulu:2018:DM,
author = "Aurora Tulilaulu and Matti Nelimarkka and Joonas
Paalasmaa and Daniel Johnson and Dan Ventura and Petri
Myllys and Hannu Toivonen",
title = "Data Musicalization",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "47:1--47:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3184742",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Data musicalization is the process of automatically
composing music based on given data as an approach to
perceptualizing information artistically. The aim of
data musicalization is to evoke subjective experiences
in relation to the information rather than merely to
convey unemotional information objectively. This
article is written as a tutorial for readers interested
in data musicalization. We start by providing a
systematic characterization of musicalization
approaches, based on their inputs, methods, and
outputs. We then illustrate data musicalization
techniques with examples from several applications: one
that perceptualizes physical sleep data as music,
several that artistically compose music inspired by the
sleep data, one that musicalizes on-line chat
conversations to provide a perceptualization of
liveliness of a discussion, and one that uses
musicalization in a gamelike mobile application that
allows its users to produce music. We additionally
provide a number of electronic samples of music
produced by the different musicalization
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cornia:2018:PMA,
author = "Marcella Cornia and Lorenzo Baraldi and Giuseppe Serra
and Rita Cucchiara",
title = "Paying More Attention to Saliency: Image Captioning
with Saliency and Context Attention",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "48:1--48:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177745",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Image captioning has been recently gaining a lot of
attention thanks to the impressive achievements shown
by deep captioning architectures, which combine
Convolutional Neural Networks to extract image
representations and Recurrent Neural Networks to
generate the corresponding captions. At the same time,
a significant research effort has been dedicated to the
development of saliency prediction models, which can
predict human eye fixations. Even though saliency
information could be useful to condition an image
captioning architecture, by providing an indication of
what is salient and what is not, research is still
struggling to incorporate these two techniques. In this
work, we propose an image captioning approach in which
a generative recurrent neural network can focus on
different parts of the input image during the
generation of the caption, by exploiting the
conditioning given by a saliency prediction model on
which parts of the image are salient and which are
contextual. We show, through extensive quantitative and
qualitative experiments on large-scale datasets, that
our model achieves superior performance with respect to
captioning baselines with and without saliency and to
different state-of-the-art approaches combining
saliency and captioning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wen:2018:CEE,
author = "Longyin Wen and Honggang Qi and Siwei Lyu",
title = "Contrast Enhancement Estimation for Digital Image
Forensics",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "49:1--49:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183518",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Inconsistency in contrast enhancement can be used to
expose image forgeries. In this work, we describe a new
method to estimate contrast enhancement operations from
a single image. Our method takes advantage of the
nature of contrast enhancement as a mapping between
pixel values and the distinct characteristics it
introduces to the image pixel histogram. Our method
recovers the original pixel histogram and the contrast
enhancement simultaneously from a single image with an
iterative algorithm. Unlike previous works, our method
is robust in the presence of additive noise
perturbations that are used to hide the traces of
contrast enhancement. Furthermore, we also develop an
effective method to detect image regions that have undergone
contrast enhancement transformations different
from the rest of the image, and we use this method to
detect composite images. We perform extensive
experimental evaluations to demonstrate the efficacy
and efficiency of our method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jiang:2018:DMP,
author = "Yu-Gang Jiang and Minjun Li and Xi Wang and Wei Liu
and Xian-Sheng Hua",
title = "{DeepProduct}: Mobile Product Search With Portable
Deep Features",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "50:1--50:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3184745",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Features extracted by deep networks have been popular
in many visual search tasks. This article studies deep
network structures and training schemes for mobile
visual search. The goal is to learn an effective yet
portable feature representation that is suitable for
bridging the domain gap between mobile user photos and
(mostly) professionally taken product images while
keeping the computational cost acceptable for
mobile-based applications. The technical contributions
are twofold. First, we propose an alternative of the
contrastive loss popularly used for training deep
Siamese networks, namely robust contrastive loss, where
we relax the penalty on some positive and negative
pairs to alleviate overfitting. Second, a simple
multitask fine-tuning scheme is leveraged to train the
network, which not only utilizes knowledge from the
provided training photo pairs but also harnesses
additional information from the large ImageNet dataset
to regularize the fine-tuning process. Extensive
experiments on challenging real-world datasets
demonstrate that both the robust contrastive loss and
the multitask fine-tuning scheme are effective, leading
to very promising results with a time cost suitable for
mobile product search scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ahmad:2018:EDM,
author = "Kashif Ahmad and Mohamed Lamine Mekhalfi and Nicola
Conci and Farid Melgani and Francesco {De Natale}",
title = "Ensemble of Deep Models for Event Recognition",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "51:1--51:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3199668",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we address the problem of recognizing
an event from a single related picture. Given the large
number of event classes and the limited information
contained in a single shot, the problem is known to be
particularly hard. To achieve a reliable detection, we
propose a combination of multiple classifiers, and we
compare three alternative strategies to fuse the
results of each classifier, namely: (i) induced order
weighted averaging operators, (ii) genetic algorithms,
and (iii) particle swarm optimization. Each method is
aimed at determining the optimal weights to be assigned
to the decision scores yielded by different deep
models, according to the relevant optimization
strategy. Experimental tests have been performed on
three event recognition datasets, evaluating the
performance of various deep models, both alone and
selectively combined. Experimental results demonstrate
that the proposed approach outperforms traditional
multiple classifier solutions based on uniform
weighting, and outperforms recent state-of-the-art
approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2018:UER,
author = "Wei Hu and Mozhdeh Seifi and Erik Reinhard",
title = "Over- and Under-Exposure Reconstruction of a Single
Plenoptic Capture",
journal = j-TOMM,
volume = "14",
number = "2",
pages = "52:1--52:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3199514",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Light field images, for example, taken with plenoptic
cameras, offer interesting post-processing
opportunities, including depth-of-field management,
depth estimation, viewpoint selection, and 3D image
synthesis. Like most capture devices, however,
plenoptic cameras have a limited dynamic range, so that
over- and under-exposed areas in plenoptic images are
commonplace. We therefore present a straightforward and
robust plenoptic reconstruction technique based on the
observation that vignetting causes peripheral views to
receive less light than central views. Thus,
corresponding pixels in different views can be used to
reconstruct illumination, especially in areas where
information missing in one view is present in another.
Our algorithm accurately reconstructs under- and
over-exposed regions (known as declipping),
additionally affording an increase in peak luminance by
up to two f-stops, and a comparable lowering of the
noise floor. The key advantages of this approach are
that no hardware modifications are necessary to improve
the dynamic range, that no multiple exposure techniques
are required, and therefore that no ghosting or other
artifacts are introduced.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Skorin-Kapov:2018:GES,
author = "Lea Skorin-Kapov and Mart{\'\i}n Varela and Tobias
Ho{\ss}feld and Kuan-Ta Chen",
title = "Guest Editorial: Special Issue on {``QoE Management
for Multimedia Services''}",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "28:1--28:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3192332",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Skorin-Kapov:2018:SEC,
author = "Lea Skorin-Kapov and Mart{\'\i}n Varela and Tobias
Ho{\ss}feld and Kuan-Ta Chen",
title = "A Survey of Emerging Concepts and Challenges for {QoE}
Management of Multimedia Services",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "29:1--29:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3176648",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Quality of Experience (QoE) has received much
attention over the past years and has become a
prominent issue for delivering services and
applications. A significant amount of research has been
devoted to understanding, measuring, and modelling QoE
for a variety of media services. The next logical step
is to actively exploit that accumulated knowledge to
improve and manage the quality of multimedia services,
while at the same time ensuring efficient and
cost-effective network operations. Moreover, with many
different players involved in the end-to-end service
delivery chain, identifying the root causes of QoE
impairments and finding effective solutions for meeting
the end users' requirements and expectations in terms
of service quality is a challenging and complex
problem. In this article, we survey state-of-the-art
findings and present emerging concepts and challenges
related to managing QoE for networked multimedia
services. Going beyond a number of previously published
survey articles addressing the topic of QoE management,
we address QoE management in the context of ongoing
developments, such as the move to softwarized networks,
the exploitation of big data analytics and machine
learning, and the steady rise of new and immersive
services (e.g., augmented and virtual reality). We
address the implications of such paradigm shifts in
terms of new approaches in QoE modeling and the need
for novel QoE monitoring and management
infrastructures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhu:2018:MIV,
author = "Yi Zhu and Sharath Chandra Guntuku and Weisi Lin and
Gheorghita Ghinea and Judith A. Redi",
title = "Measuring Individual Video {QoE}: a Survey, and
Proposal for Future Directions Using Social Media",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "30:1--30:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183512",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The next generation of multimedia services has to be
optimized in a personalized way, taking user factors
into account for the evaluation of individual
experience. Previous works have investigated the
influence of user factors mostly in a controlled
laboratory environment which often includes a limited
number of users and fails to reflect real-life
environments. Social media, especially Facebook, provide
an interesting alternative for Internet-based
subjective evaluation. In this article, we develop (and
open-source) a Facebook application, named YouQ, as
an experimental platform for studying individual
experience for videos. Our results show that subjective
experiments based on YouQ can produce reliable results
as compared to a controlled laboratory experiment.
Additionally, YouQ has the ability to collect user
information automatically from Facebook, which can be
used for modeling individual experience.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Petrangeli:2018:QEC,
author = "Stefano Petrangeli and Jeroen {Van Der Hooft} and Tim
Wauters and Filip {De Turck}",
title = "Quality of Experience-Centric Management of Adaptive
Video Streaming Services: Status and Challenges",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "31:1--31:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3165266",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Video streaming applications currently dominate
Internet traffic. Particularly, HTTP Adaptive Streaming
(HAS) has emerged as the dominant standard for
streaming videos over the best-effort Internet, thanks
to its capability of matching the video quality to the
available network resources. In HAS, the video client
is equipped with a heuristic that dynamically decides
the most suitable quality to stream the content, based
on information such as the perceived network bandwidth
or the video player buffer status. The goal of this
heuristic is to optimize the quality as perceived by
the user, the so-called Quality of Experience (QoE).
Despite the many advantages brought by the adaptive
streaming principle, optimizing users' QoE is far from
trivial. Current heuristics are still suboptimal when
sudden bandwidth drops occur, especially in wireless
environments, thus leading to freezes in the video
playout, the main factor influencing users' QoE. This
issue is aggravated in the case of live events, where the
player buffer has to be kept as small as possible in
order to reduce the playout delay between the user and
the live signal. In light of the above, in recent
years, several works have been proposed with the aim of
extending the classical purely client-based structure
of adaptive video streaming, in order to fully optimize
users' QoE. In this article, a survey is presented of
research works on this topic together with a
classification based on where the optimization takes
place. This classification goes beyond client-based
heuristics to investigate the usage of server- and
network-assisted architectures and of new application
and transport layer protocols. In addition, we outline
the major challenges currently arising in the field of
multimedia delivery, which are going to be of extreme
relevance in future years.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bhat:2018:SNA,
author = "Divyashri Bhat and Amr Rizk and Michael Zink and Ralf
Steinmetz",
title = "{SABR}: Network-Assisted Content Distribution for
{QoE}-Driven {ABR} Video Streaming",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "32:1--32:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183516",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "State-of-the-art software-defined wide area networks
(SD-WANs) provide the foundation for flexible and
highly resilient networking. In this work, we design,
implement, and evaluate a novel architecture (denoted
as SABR) that leverages the benefits of
software-defined networking (SDN) to provide
network-assisted adaptive bitrate streaming. With
clients retaining full control of their streaming
algorithms, we clearly show that by this network
assistance, both the clients and the content providers
benefit significantly in terms of quality of experience
(QoE) and content origin offloading. SABR utilizes
information on available bandwidths per link and
network cache contents to guide video streaming clients
with the goal of improving the viewer's QoE. In
addition, SABR uses SDN capabilities to dynamically
program flows to optimize the utilization of content
delivery network caches. Backed by our study of
SDN-assisted streaming, we discuss the change in the
requirements for network-to-player APIs that enable
flexible video streaming. We illustrate the difficulty
of the problem and the impact of SDN-assisted streaming
on QoE metrics using various well-established player
algorithms. We evaluate SABR together with
state-of-the-art dynamic adaptive streaming over HTTP
(DASH) quality adaptation algorithms through a series
of experiments performed on a real-world, SDN-enabled
testbed network with minimal modifications to an
existing DASH client. In addition, we compare the
performance of different caching strategies in
combination with SABR. Our trace-based measurements
show the substantial improvement in cache hit rates and
QoE metrics in conjunction with SABR, indicating a rich
design space for jointly optimized SDN-assisted caching
architectures for adaptive bitrate video streaming
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Burger:2018:GAV,
author = "Valentin Burger and Thomas Zinner and Lam Dinh-Xuan
and Florian Wamser and Phuoc Tran-Gia",
title = "A Generic Approach to Video Buffer Modeling Using
Discrete-Time Analysis",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "33:1--33:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183511",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The large share of traffic in the Internet generated
by video streaming services puts high loads on access
and aggregation networks, resulting in high costs for
the content delivery infrastructure. To reduce the
bandwidth consumed while maintaining a high playback
quality, video players use policies that control and
limit the buffer level by using thresholds for pausing
and continuing the video download. This allows shaping
the bandwidth consumed by video streams and limiting
the traffic wasted in case of playback abortion.
Especially in mobile scenarios, where the throughput
can be highly variable, the buffer policy can have a
high impact on the probability of interruptions during
video playback. To find the optimal setting for the
buffer policy in each network condition, the
relationship between the parameters of the buffer
policy, the network throughput dynamics, and the
corresponding video playback behavior needs to be
understood. To this end, we model the video buffer as
a GI/GI/1 queue with pq-policy using discrete-time
analysis. By studying the stochastic properties of the
buffer-level distribution, we are able to accurately
evaluate the impact of network and video bitrate
dynamics on the video playback quality based on the
buffer policy. We find a fundamental relationship
between the bandwidth variation and the expected
interarrival time of segments, meaning that
disproportionately more bandwidth is necessary to
prevent stalling events for high bandwidth variation.
The proposed model further allows us to optimize the
trade-off between the traffic wasted in case of video
abortion and video streaming quality experienced by the
user.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Siekkinen:2018:CYS,
author = "Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen and
Leonardo Favario and Enrico Masala",
title = "Can You See What {I} See? {Quality}-of-Experience
Measurements of Mobile Live Video Broadcasting",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "34:1--34:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3165279",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Broadcasting live video directly from mobile devices
is rapidly gaining popularity with applications like
Periscope and Facebook Live. The quality of experience
(QoE) provided by these services comprises many
factors, such as quality of transmitted video, video
playback stalling, end-to-end latency, and impact on
battery life, and they are not yet well understood. In
this article, we examine mainly the Periscope service
through a comprehensive measurement study and compare
it in some aspects to Facebook Live. We shed light on
the usage of Periscope through analysis of crawled data
and then investigate the aforementioned QoE factors
through statistical analyses as well as controlled
small-scale measurements using a couple of different
smartphones and both versions, Android and iOS, of the
two applications. We report a number of findings
including the discrepancy in latency between the two
most commonly used protocols, RTMP and HLS, surprising
surges in bandwidth demand caused by the Periscope
app's chat feature, substantial variations in video
quality, poor adaptation of video bitrate to available
upstream bandwidth at the video broadcaster side, and
significant power consumption caused by the
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bruneau-Queyreix:2018:PNS,
author = "Joachim Bruneau-Queyreix and Jordi Mongay Batalla and
Mathias Lacaud and Daniel Negru",
title = "{PMS}: a Novel Scale-Adaptive and Quality-Adaptive
Hybrid {P2P\slash} Multisource Solution for Live
Streaming",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "35:1--35:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183515",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Single-source HTTP adaptive streaming solutions (HAS)
have become the de facto solutions to deliver live
video over the Internet. By avoiding video stalling
events that are mainly caused by the lack of throughput
at the client or the server side, HAS solutions increase the
end users' quality of experience (QoE). We propose to
pragmatically extend HAS with our MS-Stream solution
that simultaneously utilizes several servers. MS-Stream
aims at offering high QoE for live content delivery by
exploiting expanded bandwidth and link diversity in
distributed heterogeneous infrastructures. By
leveraging end users' connectivity capacities, we
further extend the QoE and scalability capabilities of
our proposal by exposing a hybrid P2P/multisource
live-streaming solution (P2P/MS-Stream (PMS)),
achieving trade-offs between the system's scale and the
end users' QoE. We propose a distributed quality
adaptation algorithm run by every peer, along with a
local method for optimizing the usage of the server
infrastructure made available. Large-scale evaluations
conducted with 300 peers located in France permit
validating our approach and algorithms over flash crowd
events and allow us to conclude that PMS can reach the
optimal trade-offs between QoE and system scale.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Floris:2018:QAO,
author = "Alessandro Floris and Arslan Ahmad and Luigi Atzori",
title = "{QoE}-Aware {OTT-ISP} Collaboration in Service
Management: Architecture and Approaches",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "36:1--36:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183517",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "It is a matter of fact that quality of experience
(QoE) has become one of the key factors determining
whether a new multimedia service will be successfully
accepted by the final users. Accordingly, several QoE
models have been developed with the aim of capturing
the perception of the user by considering as many
influencing factors as possible. However, when it comes
to adopting these models in the management of the
services and networks, it frequently happens that no
single provider has access to all of the tools to
either measure all influencing factors parameters or
control over the delivered quality. In particular, it
often happens to the over-the-top (OTT) and Internet
service providers (ISPs), which act with complementary
roles in the service delivery over the Internet. On the
basis of this consideration, in this article we first
highlight the importance of a possible OTT-ISP
collaboration for a joint service management in terms
of technical and economic aspects. Then we propose a
general reference architecture for a possible
collaboration and information exchange among them.
Finally, we define three different approaches, namely
joint venture, customer lifetime value based, and QoE
fairness based. The first aims to maximize the revenue
by providing better QoE to customers paying more. The
second aims to maximize the profit by providing better
QoE to the most profitable customers (MPCs). The third
aims to maximize QoE fairness among all customers.
Finally, we conduct simulations to compare the three
approaches in terms of QoE provided to the users,
profit generated for the providers, and QoE fairness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yan:2018:GES,
author = "Yan Yan and Liqiang Nie and Rita Cucchiara",
title = "Guest Editorial: Special Section on {``Multimedia
Understanding via Multimodal Analytics''}",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "37:1--37:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3192334",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tiwari:2018:MMS,
author = "Akanksha Tiwari and Christian {Von Der Weth} and Mohan
S. Kankanhalli",
title = "Multimodal Multiplatform Social Media Event
Summarization",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "38:1--38:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115433",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Social media platforms are turning into important news
sources since they provide real-time information from
different perspectives. However, high volume, dynamism,
noise, and redundancy exhibited by social media data
make it difficult to comprehend the entire content.
Recent works emphasize summarizing the content of
either a single social media platform or a single
modality (either textual or visual). However, each
platform has its own unique characteristics and user
base, which brings to light different aspects of
real-world events. This makes it critical as well as
challenging to combine textual and visual data from
different platforms. In this article, we propose
summarization of real-world events with data stemming
from different platforms and multiple modalities. We
present the use of a Markov Random Fields based
similarity measure to link content across multiple
platforms. This measure also enables the linking of
content across time, which is useful for tracking the
evolution of long-running events. For the final content
selection, summarization is modeled as a subset
selection problem. To handle the complexity of the
optimal subset selection, we propose the use of
submodular objectives. Facets such as coverage,
novelty, and significance are modeled as submodular
objectives in a multimodal social media setting. We
conduct a series of quantitative and qualitative
experiments to illustrate the effectiveness of our
approach compared to alternative methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2018:SAM,
author = "Anran Wang and Jianfei Cai and Jiwen Lu and Tat-Jen
Cham",
title = "Structure-Aware Multimodal Feature Fusion for {RGB-D}
Scene Classification and Beyond",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "39:1--39:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115932",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "While convolutional neural networks (CNNs) have been
excellent for object recognition, the greater spatial
variability in scene images typically means that the
standard full-image CNN features are suboptimal for
scene classification. In this article, we investigate a
framework allowing greater spatial flexibility, in
which the Fisher vector (FV)-encoded distribution of
local CNN features, obtained from a multitude of region
proposals per image, is considered instead. The CNN
features are computed from an augmented pixel-wise
representation consisting of multiple modalities of
RGB, HHA, and surface normals, as extracted from RGB-D
data. More significantly, we make two postulates: (1)
component sparsity, i.e., that only a small variety of region
proposals and their corresponding FV GMM components
contribute to scene discriminability, and (2) modal
nonsparsity, i.e., that features from all modalities are
encouraged to coexist. In our proposed feature fusion
framework, these are implemented through regularization
terms that apply group lasso to GMM components and
exclusive group lasso across modalities. By learning
and combining regressors for both proposal-based FV
features and global CNN features, we are able to
achieve state-of-the-art scene classification
performance on the SUNRGBD Dataset and NYU Depth
Dataset V2. Moreover, we further apply our feature
fusion framework on an action recognition task to
demonstrate that our framework can be generalized for
other multimodal well-structured features. In
particular, for action recognition, we enforce
interpart sparsity to choose more discriminative body
parts, and intermodal nonsparsity to make informative
features from both appearance and motion modalities
coexist. Experimental results on the JHMDB and MPII
Cooking Datasets show that our feature fusion is also
very effective for action recognition, achieving very
competitive performance compared with the state of the
art.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2018:ICD,
author = "Cheng Wang and Haojin Yang and Christoph Meinel",
title = "Image Captioning with Deep Bidirectional {LSTMs} and
Multi-Task Learning",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "40:1--40:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3115432",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Generating a novel and descriptive caption of an image
is drawing increasing interests in computer vision,
natural language processing, and multimedia
communities. In this work, we propose an end-to-end
trainable deep bidirectional LSTM (Bi-LSTM (Long
Short-Term Memory)) model to address the problem. By
combining a deep convolutional neural network (CNN) and
two separate LSTM networks, our model is capable of
learning long-term visual-language interactions by
making use of history and future context information at
high-level semantic space. We also explore deep
multimodal bidirectional models, in which we increase
the depth of nonlinearity transition in different ways
to learn hierarchical visual-language embeddings. Data
augmentation techniques such as multi-crop,
multi-scale, and vertical mirror are proposed to
prevent overfitting in training deep models. To
understand how our models ``translate'' an image to a
sentence, we visualize and qualitatively analyze the
evolution of Bi-LSTM internal states over time. The
effectiveness and generality of proposed models are
evaluated on four benchmark datasets: Flickr8K,
Flickr30K, MSCOCO, and Pascal1K. We
demonstrate that Bi-LSTM models achieve highly
competitive performance on both caption generation and
image-sentence retrieval even without integrating an
additional mechanism (e.g., object detection, attention
model). Our experiments also show that multi-task
learning is beneficial for increasing model generality and
improving performance. We also demonstrate that transfer
learning of the Bi-LSTM model significantly
outperforms previous methods on the Pascal1K dataset.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2018:TPA,
author = "Zhenguang Liu and Yingjie Xia and Qi Liu and Qinming
He and Chao Zhang and Roger Zimmermann",
title = "Toward Personalized Activity Level Prediction in
Community Question Answering {Websites}",
journal = j-TOMM,
volume = "14",
number = "2s",
pages = "41:1--41:??",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3187011",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue May 29 08:39:06 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Community Question Answering (CQA) websites have
become valuable knowledge repositories. Millions of
internet users resort to CQA websites to seek answers
to their encountered questions. CQA websites provide
information far beyond a search on a site such as
Google due to (1) the plethora of high-quality answers,
and (2) the capability to post new questions to
the communities of domain experts. While most research
efforts have been made to identify experts or to
preliminarily detect potential experts of CQA websites,
there has been a remarkable shift toward investigating
how to keep the engagement of experts. Experts are
usually the major contributors of high-quality answers
and questions of CQA websites. Consequently, keeping
the expert communities active is vital to improving the
lifespan of these websites. In this article, we present
an algorithm termed PALP to predict the activity level
of expert users of CQA websites. To the best of our
knowledge, PALP is the first approach to address a
personalized activity level prediction model for CQA
websites. Furthermore, it takes into consideration user
behavior change over time and focuses specifically on
expert users. Extensive experiments on the Stack
Overflow website demonstrate the competitiveness of
PALP over existing methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Abdallah:2018:AHD,
author = "Maha Abdallah",
title = "Aesthetic Highlight Detection in Movies Based on
Synchronization of Spectators' Reactions",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "68:1--68:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3175497",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Detection of aesthetic highlights is a challenge for
understanding the affective processes taking place
during movie watching. In this article, we study
spectators' responses to movie aesthetic stimuli in a
social context. Moreover, we look for uncovering the
emotional component of aesthetic highlights in movies.
Our assumption is that synchronized spectators'
physiological and behavioral reactions occur during
these highlights because: (i) aesthetic choices of
filmmakers are made to elicit specific emotional
reactions (e.g., special effects, empathy, and
compassion toward a character) and (ii) watching a
movie together causes spectators' affective reactions
to be synchronized through emotional contagion. We
compare different approaches to estimation of
synchronization among multiple spectators' signals,
such as pairwise, group, and overall synchronization
measures to detect aesthetic highlights in movies. The
results show that the unsupervised architecture relying
on synchronization measures is able to capture
different properties of spectators' synchronization and
detect aesthetic highlights based on both spectators'
electrodermal and acceleration signals. We discover
that pairwise synchronization measures perform the most
accurately independently of the category of the
highlights and movie genres. Moreover, we observe that
electrodermal signals have more discriminative power
than acceleration signals for highlight detection.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bai:2018:ADA,
author = "Yalong Bai and Kuiyuan Yang and Tao Mei and Wei-Ying
Ma and Tiejun Zhao",
title = "Automatic Data Augmentation from Massive {Web} Images
for Deep Visual Recognition",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "69:1--69:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3204941",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Large-scale image datasets and deep convolutional
neural networks (DCNNs) are the two primary driving
forces for the rapid progress in generic object
recognition tasks in recent years. While many
network architectures have been continuously designed
to pursue lower error rates, few efforts are devoted to
enlarging existing datasets due to high labeling costs
and unfair comparison issues. In this article, we aim
to achieve lower error rates by augmenting existing
datasets in an automatic manner. Our method leverages
both the web and DCNN, where the web provides massive
images with rich contextual information, and DCNN
replaces humans to automatically label images under the
guidance of web contextual information. Experiments
show that our method can automatically scale up
existing datasets significantly from billions of web
pages with high accuracy. The performance on object
recognition tasks and transfer learning tasks has been
significantly improved by using the automatically
augmented datasets, which demonstrates that more
supervisory information has been automatically gathered
from the web. Both the dataset and models trained on
the dataset have been made publicly available.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tan:2018:UCD,
author = "Min Tan and Jun Yu and Zhou Yu and Fei Gao and Yong
Rui and Dacheng Tao",
title = "User-Click-Data-Based Fine-Grained Image Recognition
via Weakly Supervised Metric Learning",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "70:1--70:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209666",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We present a novel fine-grained image recognition
framework using user click data, which can bridge the
semantic gap in distinguishing categories that are
visually similar. As the query set in click data is
usually large-scale and redundant, we first propose a
click-feature-based query-merging approach to merge
queries with similar semantics and construct a compact
click feature. Afterward, we utilize this compact click
feature and convolutional neural network (CNN)-based
deep visual feature to jointly represent an image.
Finally, with the combined feature, we employ the
metric-learning-based template-matching scheme for
efficient recognition. Considering the heavy noise in
the training data, we introduce a reliability variable
to characterize the image reliability, and propose a
weakly-supervised metric and template learning with
smooth assumption and click prior (WMTLSC) method to
jointly learn the distance metric, object templates,
and image reliability. Extensive experiments are
conducted on a public Clickture-Dog dataset and our
newly established Clickture-Bird dataset. It is shown
that the click-data-based query merging helps
generate a highly compact (the dimension is reduced
to 0.9\%) and dense click feature for images, which
greatly improves the computational efficiency. Also,
introducing this click feature into CNN feature further
boosts the recognition accuracy. The proposed framework
performs much better than previous state-of-the-art methods in
fine-grained recognition tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bentaleb:2018:OSO,
author = "Abdelhak Bentaleb and Ali C. Begen and Roger
Zimmermann",
title = "{ORL--SDN}: Online Reinforcement Learning for
{SDN}-Enabled {HTTP} Adaptive Streaming",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "71:1--71:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3219752",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In designing an HTTP adaptive streaming (HAS) system,
the bitrate adaptation scheme in the player is a key
component to ensure a good quality of experience (QoE)
for viewers. We propose a new online reinforcement
learning optimization framework, called ORL-SDN,
targeting HAS players running in a software-defined
networking (SDN) environment. We leverage SDN to
facilitate the orchestration of the adaptation schemes
for a set of HAS players. To reach a good level of QoE
fairness in a large population of players, we cluster
them based on a perceptual quality index. We formulate
the adaptation process as a Partially Observable Markov
Decision Process and solve the per-cluster optimization
problem using an online Q-learning technique that
leverages model predictive control and parallelism via
aggregation to avoid a per-cluster suboptimal selection
and to accelerate the convergence to an optimum. This
framework achieves maximum long-term revenue by
selecting the optimal representation for each cluster
under time-varying network conditions. The results show
that ORL-SDN delivers substantial improvements in
viewer QoE, presentation quality stability, fairness,
and bandwidth utilization over well-known adaptation
schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kong:2018:EVE,
author = "Lingchao Kong and Rui Dai",
title = "Efficient Video Encoding for Automatic Video Analysis
in Distributed Wireless Surveillance Systems",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "72:1--72:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226036",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In many distributed wireless surveillance
applications, compressed videos are used for performing
automatic video analysis tasks. The accuracy of object
detection, which is essential for various video
analysis tasks, can be reduced due to video quality
degradation caused by lossy compression. This article
introduces a video encoding framework with the
objective of boosting the accuracy of object detection
for wireless surveillance applications. The proposed
video encoding framework is based on systematic
investigation of the effects of lossy compression on
object detection. It has been found that current
standardized video encoding schemes cause temporal
domain fluctuation for encoded blocks in stable
background areas and spatial texture degradation for
encoded blocks in dynamic foreground areas of a raw
video, both of which degrade the accuracy of object
detection. Two measures, the sum-of-absolute frame
difference (SFD) and the degradation of texture in 2D
transform domain (TXD), are introduced to depict the
temporal domain fluctuation and the spatial texture
degradation in an encoded video, respectively. The
proposed encoding framework is designed to suppress
unnecessary temporal fluctuation in stable background
areas and preserve spatial texture in dynamic
foreground areas based on the two measures, and it
introduces new mode decision strategies for both intra-
and interframes to improve the accuracy of object
detection while maintaining an acceptable rate
distortion performance. Experimental results show that,
compared with traditional encoding schemes, the
proposed scheme improves the performance of object
detection and results in lower bit rates and
significantly reduced complexity with comparable
quality in terms of PSNR and SSIM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2018:ICA,
author = "Anqi Wang and Haifeng Hu and Liang Yang",
title = "Image Captioning with Affective Guiding and Selective
Attention",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "73:1--73:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226037",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Image captioning is an increasingly important problem
associated with artificial intelligence, computer
vision, and natural language processing. Recent works
revealed that it is possible for a machine to generate
meaningful and accurate sentences for images. However,
most existing methods ignore latent emotional
information in an image. In this article, we propose a
novel image captioning model with Affective Guiding and
Selective Attention Mechanism named AG-SAM. In our
method, we aim to bridge the affective gap between
image captioning and the emotional response elicited by
the image. First, we introduce into AG-SAM affective
components that capture higher-level concepts encoded in
images. Hence, our language model can be adapted
to generate sentences that are more passionate and
emotive. In addition, a selective gate acting on the
attention mechanism controls the degree of how much
visual information AG-SAM needs. Experimental results
have shown that our model outperforms most existing
methods, clearly reflecting an association between
images and emotional components that is usually ignored
in existing works.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sikora:2018:SAS,
author = "Marjan Sikora and Mladen Russo and Jurica Derek and
Ante Jurcevi{\'c}",
title = "Soundscape of an Archaeological Site Recreated with
Audio Augmented Reality",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "74:1--74:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3230652",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article investigates the use of an audio
augmented reality (AAR) system to recreate the
soundscape of a medieval archaeological site. The aim
of our work was to explore whether it is possible to
enhance a tourist's archaeological experience, which is
often derived from only scarce remains. We developed a
smartphone-based AAR system, which uses location and
orientation sensors to synthesize the soundscape of a
site and plays it to the user via headphones. We
recreated the ancient soundscape of a medieval
archaeological site in Croatia and tested it in situ on
two groups of participants using the soundwalk method.
One test group performed the soundwalk while listening
to the recreated soundscape using the AAR system, while
the second control group did not use the AAR equipment.
We measured the experiences of the participants using
two methods: the standard soundwalk questionnaire and
affective computing equipment for detecting the
emotional state of participants. The results of both
test methods show that participants who were listening
to the ancient soundscape using our AAR system
experienced higher arousal than those visiting the site
without AAR.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kirchhoffer:2018:PDV,
author = "Heiner Kirchhoffer and Detlev Marpe and Heiko Schwarz
and Thomas Wiegand",
title = "Properties and Design of Variable-to-Variable Length
Codes",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "75:1--75:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3230653",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "For the entropy coding of independent and identically
distributed (i.i.d.) binary sources,
variable-to-variable length (V2V) codes are an
interesting alternative to arithmetic coding. Such a
V2V code translates variable length words of the source
into variable length code words by employing two
prefix-free codes. In this article, several properties
of V2V codes are studied, and new concepts are
developed. In particular, it is shown that the
redundancy of a V2V code cannot be zero for a binary
i.i.d. source $X$ with $0 < p_X(1) < 0.5$.
Furthermore, the concept of prime and composite V2V
codes is proposed, and it is shown why composite V2V
codes can be disregarded in the search for particular
classes of minimum redundancy codes. Moreover, a
canonical representation for V2V codes is proposed,
which identifies V2V codes that have the same average
code length function. It is shown how these concepts
can be employed to greatly reduce the complexity of a
search for minimum redundancy (size-limited) V2V
codes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kiess:2018:SCA,
author = "Johannes Kiess and Stephan Kopf and Benjamin Guthier
and Wolfgang Effelsberg",
title = "A Survey on Content-Aware Image and Video
Retargeting",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "76:1--76:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231598",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This survey introduces the current state of the art in
image and video retargeting and describes important
ideas and technologies that have influenced the recent
work. Retargeting is the process of adapting an image
or video from one screen resolution to another to fit
different displays, for example, when watching a wide
screen movie on a normal television screen or a mobile
device. As there has been considerable work done in
this field already, this survey provides an overview of
the techniques. It is meant to be a starting point for
new research in the field. We include explanations of
basic terms and operators, as well as the basic
workflow of the different methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cecil:2018:NBV,
author = "J. Cecil and Avinash Gupta and M. Pirela-Cruz and
Parmesh Ramanathan",
title = "A Network-Based Virtual Reality Simulation Training
Approach for Orthopedic Surgery",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "77:1--77:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3232678",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The focus of this article is on the adoption of
immersive and haptic simulators for training of medical
residents in a surgical process called Less Invasive
Stabilization System (LISS) plating surgery. LISS
surgery is an orthopedic surgical procedure to treat
fractures of the femur bone. Development of such
simulators is a complex task which involves multiple
systems, technologies, and human experts. Emerging Next
Generation Internet technologies were used to develop
the standalone on-line haptic-based simulator
accessible to the students 24/7. A standalone immersive
surgical simulator was also developed using HTC Vive.
Expert surgeons played an important role in developing
the simulator system; use cases of the target surgical
processes were built using a modeling language called
the engineering Enterprise Modeling Language (eEML). A
detailed study comparing the
haptic-based simulator and the immersive simulator is
also presented. The outcomes of this study
underscore the potential of using such simulators in
surgical training.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Dong:2018:LMK,
author = "Husheng Dong and Ping Lu and Chunping Liu and Yi Ji
and Shengrong Gong",
title = "Learning Multiple Kernel Metrics for Iterative Person
Re-Identification",
journal = j-TOMM,
volume = "14",
number = "3",
pages = "78:1--78:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3234929",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:44 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In person re-identification most metric learning
methods learn from training data only once, and then
they are deployed for testing. Although impressive
performance has been achieved, the discriminative
information from successfully identified test samples
is ignored. In this work, we present a novel
re-identification framework termed Iterative Multiple
Kernel Metric Learning (IMKML). Specifically, there are
two main modules in IMKML. In the first module,
multiple metrics are learned via a new derived Kernel
Marginal Nullspace Learning (KMNL) algorithm. Taking
advantage of learning a discriminative nullspace from
neighborhood manifold, KMNL can well tackle the Small
Sample Size (SSS) problem in re-identification distance
metric learning. The second module is to construct a
pseudo training set by performing re-identification on
the testing set. The pseudo training set, which
consists of the test image pairs that are highly
probable correct matches, is then inserted into the
labeled training set to retrain the metrics. By
iteratively alternating between the two modules, many
more samples will be involved for training and
significant performance gains can be achieved.
Experiments on four challenging datasets, including
VIPeR, PRID450S, CUHK01, and Market-1501, show that the
proposed method performs favorably against the
state-of-the-art approaches, especially on the lower
ranks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Abdallah:2018:ISI,
author = "Maha Abdallah and Kuan-Ta Chen and Carsten Griwodz and
Cheng-Hsin Hsu",
title = "Introduction to the Special Issue on Delay-Sensitive
Video Computing in the Cloud",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "53:1--53:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3214698",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Abdallah:2018:DSV,
author = "Maha Abdallah and Carsten Griwodz and Kuan-Ta Chen and
Gwendal Simon and Pin-Chun Wang and Cheng-Hsin Hsu",
title = "Delay-Sensitive Video Computing in the Cloud: a
Survey",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "54:1--54:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3212804",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "While cloud servers provide a tremendous amount of
resources for networked video applications, most
successful stories of cloud-assisted video applications
are presentational video services, such as YouTube and
Netflix. This article surveys recent advances in
delay-sensitive video computations in the cloud, which
are crucial to cloud-assisted conversational video
services, such as cloud gaming, Virtual Reality (VR),
Augmented Reality (AR), and telepresence. Supporting
conversational video services with cloud resources is
challenging because most cloud servers are far away
from the end users while these services incur the
following stringent requirements: high bandwidth, short
delay, and high heterogeneity. In this article, we
cover the literature with a top-down approach: from
applications and experience, to architecture and
management, and to optimization in and outside of the
cloud. We also point out major open challenges, hoping
to stimulate more research activities in this emerging
and exciting direction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2018:CES,
author = "Yusen Li and Yunhua Deng and Xueyan Tang and Wentong
Cai and Xiaoguang Liu and Gang Wang",
title = "Cost-Efficient Server Provisioning for Cloud Gaming",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "55:1--55:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3190838",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Cloud gaming has gained significant popularity
recently due to many important benefits such as removal
of device constraints, instant-on, and cross-platform.
The properties of intensive resource demands and
dynamic workloads make cloud gaming appropriate to be
supported by an elastic cloud platform. Facing a large
user population, a fundamental problem is how to
provide satisfactory cloud gaming service at modest
cost. We observe that the software storage cost could
be substantial compared to the server running cost in
cloud gaming using elastic cloud resources. Therefore,
in this article, we address the server provisioning
problem for cloud gaming to optimize both the server
running cost and the software storage cost. We find
that the distribution of game software among servers
and the selection of server types both trigger
tradeoffs between the software storage cost and the
server running cost in cloud gaming. We formulate the
problem with a stochastic model and employ queueing
theory to conduct a solid theoretical analysis of the
system behaviors under different request dispatching
policies. We then propose several classes of algorithms
to approximate the optimal solution. The proposed
algorithms are evaluated by extensive experiments using
real-world parameters. The results show that the
proposed Ordered and Genetic algorithms are
computationally efficient, nearly cost-optimal, and
highly robust to dynamic changes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Slivar:2018:GCD,
author = "Ivan Slivar and Mirko Suznjevic and Lea Skorin-Kapov",
title = "Game Categorization for Deriving {QoE}-Driven Video
Encoding Configuration Strategies for Cloud Gaming",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "56:1--56:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132041",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Cloud gaming has been recognized as a promising shift
in the online game industry, with the aim of
implementing the ``on demand'' service concept that has
achieved market success in other areas of digital
entertainment such as movies and TV shows. The concepts
of cloud computing are leveraged to render the game
scene as a video stream that is then delivered to
players in real-time. The main advantage of this
approach is the capability of delivering high-quality
graphics games to any type of end user device; however,
at the cost of high bandwidth consumption and strict
latency requirements. A key challenge faced by cloud
game providers lies in configuring the video encoding
parameters so as to maximize player Quality of
Experience (QoE) while meeting bandwidth availability
constraints. In this article, we tackle one aspect of
this problem by addressing the following research
question: Is it possible to improve service adaptation
based on information about the characteristics of the
game being streamed? To answer this question, two main
challenges need to be addressed: the need for different
QoE-driven video encoding (re-)configuration strategies
for different categories of games, and how to determine
a relevant game categorization to be used for assigning
appropriate configuration strategies. We investigate
these problems by conducting two subjective laboratory
studies with a total of 80 players and three different
games. Results indicate that different strategies
should likely be applied for different types of games,
and show that existing game classifications are not
necessarily suitable for differentiating game types in
this context. We thus further analyze objective video
metrics of collected game play video traces as well as
player actions per minute and use this as input data
for clustering of games into two clusters. Subjective
results verify that different video encoding
configuration strategies may be applied to games
belonging to different clusters.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Claypool:2018:GID,
author = "Mark Claypool",
title = "Game Input with Delay --- Moving Target Selection with a
Game Controller Thumbstick",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "57:1--57:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3187288",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Hosting interactive video-based services, such as
computer games, in the Cloud poses particular
challenges given user sensitivity to delay. A better
understanding of the impact of delay on player-game
interactions can help design cloud systems and games
that accommodate delays inherent in cloud systems.
Previous top-down studies of delay using full-featured
games have helped understand the impact of delay, but
often do not generalize or lend themselves to analytic
modeling. Bottom-up studies isolating user input and
delay can better generalize and be used in models, but
have yet to be applied to cloud-hosted computer games.
In order to better understand delay impact in
cloud-hosted computer games, we conduct a large
bottom-up user study centered on a fundamental game
interaction-selecting a moving target with user input
impeded by delay. Our work builds a custom game that
controls both the target speed and input delay and has
players select the target using a game controller
analog thumbstick. Analysis of data from over 50 users
shows target selection time exponentially increases
with delay and target speed and is well-fit by an
exponential model that includes a delay and target
speed interaction term. A comparison with two previous
studies, both using a mouse instead of a thumbstick,
suggests the model's relationship between selection
time, delay, and target speed holds more broadly,
providing a foundation for a potential law explaining
moving target selection with delay encountered in
cloud-hosted games.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hou:2018:NHC,
author = "Xueshi Hou and Yao Lu and Sujit Dey",
title = "Novel Hybrid-Cast Approach to Reduce Bandwidth and
Latency for Cloud-Based Virtual Space",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "58:1--58:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3205864",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, we explore the possibility of
enabling cloud-based virtual space applications for
better computational scalability and easy access from
any end device, including future lightweight wireless
head-mounted displays. In particular, we investigate
virtual space applications such as virtual classroom
and virtual gallery, in which the scenes and activities
are rendered in the cloud, with multiple views captured
and streamed to each end device. A key challenge is the
high bandwidth requirement to stream all the user
views, leading to high operational cost and potential
large delay in a bandwidth-restricted wireless network.
We propose a novel hybrid-cast approach to save
bandwidth in a multi-user streaming scenario. We
identify and broadcast the common pixels shared by
multiple users, while unicasting the residual pixels
for each user. We formulate the problem of minimizing
the total bitrate needed to transmit the user views
using hybrid-casting and describe our approach. A
common view extraction approach and a smart grouping
algorithm are proposed and developed to achieve our
hybrid-cast approach. Simulation results show that the
hybrid-cast approach can significantly reduce total
bitrate by up to 55\% and avoid congestion-related
latency, compared to the traditional cloud-based approach
of transmitting all the views as individual unicast
streams, hence addressing the bandwidth challenges of
the cloud, with additional benefits in cost and
delay.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2018:CBC,
author = "Chang Liu and Wei Tsang Ooi and Jinyuan Jia and Lei
Zhao",
title = "{Cloud Baking}: Collaborative Scene Illumination for
Dynamic {Web$3$D} Scenes",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "59:1--59:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3206431",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We propose Cloud Baking, a collaborative rendering
architecture for dynamic Web3D scenes. In our
architecture, the cloud renderer renders the scene with
the global illumination (GI) information in a GI map;
the web-based client renderer renders the scene with
ambient lighting only and blends it with the GI map
received from the cloud for the final scene. This
approach allows the users to interact with the web
scene and change the scene dynamically through the web
interface end, yet move the computationally heavy tasks
of global illumination computation to the cloud. A
challenge we face is the interaction delay that causes
the frames rendered on the cloud and the client to go
out of sync. We propose to use 3D warping and a
hole-filling algorithm designed for the GI map to predict
the late GI map. We show both quantitatively and
visually the quality of the GI map produced using our
method. Our prediction algorithm allows us to further
reduce the frequency at which the GI map is computed
and sent from the server, reducing both computational
needs and bandwidth usage.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cesar:2018:BPA,
author = "Pablo Cesar and Cheng-Hsin Hsu and Chun-Ying Huang and
Pan Hui",
title = "Best Papers of the {ACM Multimedia Systems (MMSys)
Conference 2017} and the {ACM Workshop on Network and
Operating System Support for Digital Audio and Video
(NOSSDAV) 2017}",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "60:1--60:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3214700",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zahran:2018:AAS,
author = "Ahmed H. Zahran and Jason J. Quinlan and K. K.
Ramakrishnan and Cormac J. Sreenan",
title = "{ASAP}: Adaptive Stall-Aware Pacing for Improved
{DASH} Video Experience in Cellular Networks",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "61:1--61:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3219750",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The dramatic growth of video traffic represents a
practical challenge for cellular network operators in
providing a consistent streaming Quality of Experience
(QoE) to their users. Satisfying this objective has
so-far proved elusive, due to the inherent
characteristics of wireless networks and varying
channel conditions as well as variability in the video
bitrate that can degrade streaming performance. In this
article, we propose stall-aware pacing as a novel MPEG
DASH video traffic management solution that reduces
playback stalls and seeks to maintain a consistent QoE
for cellular users, even those with diverse channel
conditions. These goals are achieved by leveraging both
network and client state information to optimize the
pacing of individual video flows. We evaluate the
performance of two versions of stall-aware pacing
techniques extensively, including stall-aware pacing
(SAP) and adaptive stall-aware pacing (ASAP), using
real video content and clients, operating over a
simulated LTE network. We implement state-of-the-art
client adaptation and traffic management strategies for
direct comparisons with SAP and ASAP. Our results,
using a heavily loaded base station, show that SAP
reduces the number of stalls and the average stall
duration per session by up to 95\%. Additionally, SAP
ensures that clients with good channel conditions do
not dominate available wireless resources, evidenced by
a reduction of up to 40\% in the standard deviation of
the QoE metric across clients. We also show that ASAP
achieves additional performance gains by adaptively
pacing video streams based on the application buffer
state.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhou:2018:EOP,
author = "Chao Zhou and Zhenhua Li and Joe Osgood and Yao Liu",
title = "On the Effectiveness of Offset Projections for $
360$-Degree Video Streaming",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "62:1--62:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209660",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "A new generation of video streaming technology,
360-degree video, promises greater immersiveness than
standard video streams. This level of immersiveness is
similar to that produced by virtual reality
devices: users can control the field of view using head
movements rather than needing to manipulate external
devices. Although 360-degree video could revolutionize
the streaming experience, its large-scale adoption is
hindered by a number of factors: 360-degree video
streams have larger bandwidth requirements and require
faster responsiveness to user inputs, and users may be
more sensitive to lower quality streams. In this
article, we review standard approaches toward
360-degree video encoding and compare these to families
of approaches that distort the spherical surface to
allow oriented concentrations of the 360-degree view.
We refer to these distorted projections as offset
projections. Our measurement studies show that most
types of offset projections produce rendered views with
better quality than their nonoffset equivalents when
view orientations are within 40 or 50 degrees of the
offset orientation. Offset projections complicate
adaptive 360-degree video streaming because they
require a combination of bitrate and view orientation
adaptations. We estimate that this combination of
streaming adaptation in two dimensions can cause over
57\% extra segments to be downloaded compared to an
ideal downloading strategy, wasting 20\% of the total
downloading bandwidth.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bahirat:2018:DEM,
author = "Kanchan Bahirat and Chengyuan Lai and Ryan P. Mcmahan
and Balakrishnan Prabhakaran",
title = "Designing and Evaluating a Mesh Simplification
Algorithm for Virtual Reality",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "63:1--63:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209661",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "With the increasing accessibility of the mobile
head-mounted displays (HMDs), mobile virtual reality
(VR) systems are finding applications in various areas.
However, mobile HMDs are highly constrained with
limited graphics processing units (GPUs) and low
processing power and onboard memory. Hence, VR
developers must be cognizant of the number of polygons
contained within their virtual environments to avoid
rendering at low frame rates and inducing simulator
sickness. The most robust and rapid approach to keeping
the overall number of polygons low is to use mesh
simplification algorithms to create low-poly versions
of pre-existing, high-poly models. Unfortunately, most
existing mesh simplification algorithms cannot
adequately handle meshes with lots of boundaries or
nonmanifold meshes, which are common attributes of many
3D models. In this article, we present QEM$_{4VR}$, a
high-fidelity mesh simplification algorithm
specifically designed for VR. This algorithm addresses
the deficiencies of prior quadric error metric (QEM)
approaches by leveraging the insight that the most
relevant boundary edges lie along curvatures while
linear boundary edges can be collapsed. Additionally,
our algorithm preserves key surface properties, such as
normals, texture coordinates, colors, and materials, as
it preprocesses 3D models and generates their low-poly
approximations offline. We evaluated the effectiveness
of our QEM$_{4VR}$ algorithm by comparing its
simplified-mesh results to those of prior QEM
variations in terms of geometric approximation error,
texture error, progressive approximation errors, frame
rate impact, and perceptual quality measures. We found
that QEM$_{4VR}$ consistently yielded simplified meshes
with less geometric approximation error and texture
error than the prior QEM variations. It afforded better
frame rates than QEM variations with boundary
preservation constraints that create unnecessary lower
bounds on overall polygon count reduction. Our
evaluation revealed that QEM$_{4VR}$ did not fare well
in terms of existing perceptual distance measurements,
but human-based inspections demonstrate that these
algorithmic measurements are not suitable substitutes
for actual human perception. In turn, we present a
user-based methodology for evaluating the perceptual
qualities of mesh simplification algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2018:ELV,
author = "Junjue Wang and Brandon Amos and Anupam Das and
Padmanabhan Pillai and Norman Sadeh and Mahadev
Satyanarayanan",
title = "Enabling Live Video Analytics with a Scalable and
Privacy-Aware Framework",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "64:1--64:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209659",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "We show how to build the components of a
privacy-aware, live video analytics ecosystem from the
bottom up, starting with OpenFace, our new open-source
face recognition system that approaches
state-of-the-art accuracy. Integrating OpenFace with
interframe tracking, we build RTFace, a mechanism for
denaturing video streams that selectively blurs faces
according to specified policies at full frame rates.
This enables privacy management for live video
analytics while providing a secure approach for
handling retrospective policy exceptions. Finally, we
present a scalable, privacy-aware architecture for
large camera networks using RTFace and show how it can
be an enabler for a vibrant ecosystem and marketplace
of privacy-aware video streams and analytics
services.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gudmundsson:2018:PWS,
author = "Gylfi {\Thorn}{\'o}r Gudmundsson and Bj{\"o}rn
{\Thorn}{\'o}r J{\'o}nsson and Laurent Amsaleg and
Michael J. Franklin",
title = "Prototyping a {Web}-Scale Multimedia Retrieval Service
Using {Spark}",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "65:1--65:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209662",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The world has experienced phenomenal growth in data
production and storage in recent years, much of which
has taken the form of media files. At the same time,
computing power has become abundant with multi-core
machines, grids, and clouds. Yet it remains a challenge
to harness the available power and move toward
gracefully searching and retrieving from web-scale
media collections. Several researchers have
experimented with using automatically distributed
computing frameworks, notably Hadoop and Spark, for
processing multimedia material, but mostly using small
collections on small computing clusters. In this
article, we describe a prototype of a (near) web-scale
throughput-oriented MM retrieval service using the
Spark framework running on the AWS cloud service. We
present retrieval results using up to 43 billion SIFT
feature vectors from the public YFCC 100M collection,
making this the largest high-dimensional feature vector
collection reported in the literature. We also present
a publicly available demonstration retrieval system,
running on our own servers, where the implementation of
the Spark pipelines can be observed in practice using
standard image benchmarks, and downloaded for research
purposes. Finally, we describe a method to evaluate
retrieval quality of the ever-growing high-dimensional
index of the prototype, without actually indexing a
web-scale media collection.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ma:2018:CUB,
author = "Ming Ma and Lei Zhang and Jiangchuan Liu and Zhi Wang
and Haitian Pang and Lifeng Sun and Weihua Li and
Guangling Hou and Kaiyan Chu",
title = "Characterizing User Behaviors in Mobile Personal
Livecast: Towards an Edge Computing-assisted Paradigm",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "66:1--66:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3219751",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Mobile personal livecast (MPL) services are emerging
and have received great attention recently. In MPL,
numerous and geo-distributed ordinary people broadcast
their video contents to worldwide viewers. Different
from conventional social networking services like
Twitter and Facebook, which have a tolerance for
interaction delay, the interactions (e.g., chat
messages) in a personal livecast must be in real-time
with low feedback latency. These unique characteristics
inspire us to: (1) investigate how the relationships
(e.g., social links and geo-locations) between viewers
and broadcasters influence the user behaviors, which
has yet to be explored in depth; and (2) explore
insights to benefit the improvement of system
performance. In this article, we carry out extensive
measurements of a representative MPL system, with a
large-scale dataset containing 11M users. In the
current costly and limited cloud-based MPL system,
which faces a scalability problem, we find: (1)
the long content uploading distances between
broadcasters and cloud ingesting servers result in an
impaired system QoS, including a high broadcast latency
and frequent buffering events; and (2) most of the
broadcasters in MPL are geographically locally popular
(the majority of the views come from the same region of
the broadcaster), which consume vast computation and
bandwidth resources of the clouds and Content Delivery
Networks. Fortunately, the emergence of edge computing,
which provides cloud-computing capabilities at the edge
of the mobile network, naturally sheds new light on the
MPL system; i.e., localized ingesting, transcoding, and
delivering locally popular live content is possible.
Based on these critical observations, we propose an
edge-assisted MPL system that collaboratively utilizes
the core-cloud and abundant edge computing resources to
improve the system efficiency and scalability. In our
framework, we consider a dynamic broadcaster assignment
to minimize the broadcast latency while keeping the
resource lease cost low. We formulate the broadcaster
scheduling as a stable matching with migration problem
to solve it effectively. Compared with the current pure
cloud-based system, our edge-assisted delivery approach
reduces the broadcast latency by about 35\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Huang:2018:UBA,
author = "Lei Huang and Bowen Ding and Aining Wang and Yuedong
Xu and Yipeng Zhou and Xiang Li",
title = "User Behavior Analysis and Video Popularity Prediction
on a Large-Scale {VoD} System",
journal = j-TOMM,
volume = "14",
number = "3s",
pages = "67:1--67:??",
month = aug,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226035",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Understanding streaming user behavior is crucial to
the design of large-scale Video-on-Demand (VoD)
systems. In this article, we begin with the measurement
of individual viewing behavior from two aspects: the
temporal characteristics and user interest. We observe
that active users spend more hours on each active day,
and their daily request time distribution is more
scattered than that of the less active users, while the
inter-view time distribution differs negligibly between
two groups. The common interest in popular videos and
the latest uploaded videos is observed in both groups.
We then investigate the predictability of video
popularity as a collective user behavior through early
views. In the light of the limitations of classical
approaches, the Autoregressive-Moving-Average (ARMA)
model is employed to forecast the popularity dynamics
of individual videos at fine-grained time scales, thus
achieving much higher prediction accuracy. When applied
to video caching, the ARMA-assisted Least Frequently
Used (LFU) algorithm can outperform the Least Recently
Used (LRU) by 11--16\%, the well-tuned LFU by 6--13\%,
and is only 2--4\% inferior to the offline LFU
in terms of hit ratio.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2018:JHA,
author = "Junfeng Zhang and Haifeng Hu",
title = "Joint Head Attribute Classifier and Domain-Specific
Refinement Networks for Face Alignment",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "79:1--79:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241059",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "In this article, a two-stage refinement network is
proposed for facial landmarks detection on
unconstrained conditions. Our model can be divided into
two modules, namely the Head Attribute Classifier (HAC)
module and the Domain-Specific Refinement (DSR) module.
Given an input facial image, HAC adopts multi-task
learning mechanism to detect the head pose and obtain
an initial shape. Based on the obtained head pose, DSR
designs three different CNN-based refinement networks
trained by specific domain, respectively, and
automatically selects the most approximate network for
the landmarks refinement. Different from existing
two-stage models, HAC combines head pose prediction
with facial landmarks estimation to improve the
accuracy of head pose prediction, as well as obtaining
a robust initial shape. Moreover, an adaptive
sub-network training strategy applied in the DSR module
can effectively solve the issue of traditional
multi-view methods that an improperly selected
sub-network may result in alignment failure. The
extensive experimental results on two public datasets,
AFLW and 300W, confirm the validity of our model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{PascottiValem:2018:USL,
author = "Lucas {Pascotti Valem} and Carlos {Renan De Oliveira}
and Daniel Carlos {Guimar{\~a}es Pedronette} and
Jurandy Almeida",
title = "Unsupervised Similarity Learning through Rank
Correlation and {kNN} Sets",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "80:1--80:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241053",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The increasing amount of multimedia data collections
available today evinces the pressing need for methods
capable of indexing and retrieving this content.
Despite the continuous advances in multimedia features
and representation models, establishing an effective
measure for comparing different multimedia objects
remains a challenging task. While supervised and
semi-supervised techniques made relevant advances on
similarity learning tasks, scenarios where labeled data
are non-existent require different strategies. In such
situations, unsupervised learning has been established
as a promising solution, capable of considering the
contextual information and the dataset structure for
computing new similarity/dissimilarity measures. This
article extends a recent unsupervised learning
algorithm that uses an iterative re-ranking strategy to
take advantage of different k-Nearest Neighbors (kNN)
sets and rank correlation measures. Two novel
approaches are proposed for computing the kNN sets and
their corresponding top-k lists. The proposed
approaches were validated in conjunction with various
rank correlation measures, yielding superior
effectiveness results in comparison with previous
works. In addition, we also evaluate the ability of the
method in considering different multimedia objects,
conducting an extensive experimental evaluation on
various image and video datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2018:TLD,
author = "Hui-Yin Wu and Francesca Pal{\`u} and Roberto Ranon
and Marc Christie",
title = "Thinking Like a Director: Film Editing Patterns for
Virtual Cinematographic Storytelling",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "81:1--81:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241057",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article introduces Film Editing Patterns (FEP), a
language to formalize film editing practices and
stylistic choices found in movies. FEP constructs are
constraints, expressed over one or more shots from a
movie sequence, that characterize changes in
cinematographic visual properties, such as shot sizes,
camera angles, or layout of actors on the screen. We
present the vocabulary of the FEP language, introduce
its usage in analyzing styles from annotated film data,
and describe how it can support users in the creative
design of film sequences in 3D. More specifically, (i)
we define the FEP language, (ii) we present an
application to craft filmic sequences from 3D animated
scenes that uses FEPs as a high-level means to select
cameras and perform cuts between cameras that follow
best practices in cinema, and (iii) we evaluate the
benefits of FEPs by performing user experiments in
which professional filmmakers and amateurs had to
create cinematographic sequences. The evaluation
suggests that users generally appreciate the idea of
FEPs, and that it can effectively help novice and
moderately experienced users in crafting film sequences
with little training.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yu:2018:SPI,
author = "Tuo Yu and Haiming Jin and Wai-Tian Tan and Klara
Nahrstedt",
title = "{SKEPRID}: Pose and Illumination Change-Resistant
Skeleton-Based Person Re-Identification",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "82:1--82:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3243217",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Currently, the surveillance camera-based person
re-identification is still challenging because of
diverse factors such as people's changing poses and
various illumination. The various poses make it hard to
conduct feature matching across images, and the
illumination changes make color-based features
unreliable. In this article, we present SKEPRID, a
skeleton-based person re-identification method that
handles strong pose and illumination changes jointly.
To reduce the impacts of pose changes on
re-identification, we estimate the joints' positions of
a person based on the deep learning technique and thus
make it possible to extract features on specific body
parts with high accuracy. Based on the skeleton
information, we design a set of local color
comparison-based cloth-type features, which are
resistant to various lighting conditions. Moreover, to
better evaluate SKEPRID, we build the PO8LI
dataset, which has large pose and illumination
diversity. Our experimental results show that SKEPRID
outperforms state-of-the-art approaches in the case of
strong pose and illumination variation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Fan:2018:UPR,
author = "Hehe Fan and Liang Zheng and Chenggang Yan and Yi
Yang",
title = "Unsupervised Person Re-identification: Clustering and
Fine-tuning",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "83:1--83:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3243316",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "The superiority of deeply learned pedestrian
representations has been reported in very recent
literature of person re-identification (re-ID). In this
article, we consider the more pragmatic issue of
learning a deep feature with no or only a few labels.
We propose a progressive unsupervised learning (PUL)
method to transfer pretrained deep representations to
unseen domains. Our method is easy to implement and can
be viewed as an effective baseline for unsupervised
re-ID feature learning. Specifically, PUL iterates
between (1) pedestrian clustering and (2) fine-tuning
of the convolutional neural network (CNN) to improve
the initialization model trained on the irrelevant
labeled dataset. Since the clustering results can be
very noisy, we add a selection operation between the
clustering and fine-tuning. At the beginning, when the
model is weak, the CNN is fine-tuned on a small number of
reliable examples located near the cluster centroids
in the feature space. As the model becomes stronger, in
subsequent iterations, more images are adaptively
selected as CNN training samples. Progressively,
pedestrian clustering and the CNN model are improved
simultaneously until algorithm convergence. This
process is naturally formulated as self-paced learning.
We then point out promising directions that may lead to
further improvement. Extensive experiments on three
large-scale re-ID datasets demonstrate that PUL outputs
discriminative features that improve the re-ID
accuracy. Our code has been released at
https://github.com/hehefan/Unsupervised-Person-Re-identification-Clustering-and-Fine-tuning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lin:2018:REN,
author = "Xiaodan Lin and Xiangui Kang",
title = "Robust Electric Network Frequency Estimation with Rank
Reduction and Linear Prediction",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "84:1--84:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241058",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "This article deals with the problem of Electric
Network Frequency (ENF) estimation where Signal to
Noise Ratio (SNR) is an essential challenge. By
exploiting the low-rank structure of the ENF signal
from the audio spectrogram, we propose an approach
based on robust principal component analysis to get rid
of the interference from speech contents and some of
the background noise, which in our case can be regarded
as sparse in nature. Weighted linear prediction is
enforced on the low-rank signal subspace to gain
accurate ENF estimation. The performance of the
proposed scheme is analyzed and evaluated as a function
of SNR, and the Cram{\'e}r-Rao Lower Bound (CRLB) is
approached at an SNR level above -10 dB. Experiments on
real datasets have demonstrated the advantages of the
proposed method over state-of-the-art work in terms of
estimation accuracy. Specifically, the proposed scheme
can effectively capture the ENF fluctuations along the
time axis using small numbers of signal observations
while preserving sufficient frequency precision.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2018:PMB,
author = "Yue Li and Gaobo Yang and Yapei Zhu and Xiangling Ding
and Rongrong Gong",
title = "Probability Model-Based Early Merge Mode Decision for
Dependent Views Coding in {$3$D-HEVC}",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "85:1--85:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3267128",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "As a 3D extension to the High Efficiency Video Coding
(HEVC) standard, 3D-HEVC was developed to improve the
coding efficiency of multiview videos. It inherits the
prediction modes from HEVC, yet both Motion Estimation
(ME) and Disparity Estimation (DE) are required for
dependent views coding. This improves coding efficiency
at the expense of a huge computational cost. In this
article, an early Merge mode decision approach is
proposed for dependent texture views and dependent
depth maps coding in 3D-HEVC based on priori and
posterior probability models. First, the priori
probability model is established by exploiting the
hierarchical and inter-view correlations from those
previously encoded blocks. Second, the posterior
probability model is built by using the Coded Block
Flag (CBF) of the current coding block. Finally, the
joint priori and posterior probability model is adopted
to early terminate the Merge mode decision for both
dependent texture views and dependent depth maps
coding. Experimental results show that the proposed
approach saves 45.2\% and 30.6\% encoding time on
average for dependent texture views and dependent depth
maps coding, respectively, while maintaining a
negligible loss of coding efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Santos:2018:HAS,
author = "Joel A. F. {Dos Santos} and D{\'e}bora C.
Muchaluat-Saade and C{\'e}cile Roisin and Nabil
Laya{\"\i}da",
title = "A Hybrid Approach for Spatio-Temporal Validation of
Declarative Multimedia Documents",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "86:1--86:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3267127",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Declarative multimedia documents represent the
description of multimedia applications in terms of
media items and relationships among them. Relationships
specify how media items are dynamically arranged in
time and space during runtime. Although a declarative
approach usually facilitates the authoring task,
authors can still make mistakes due to incorrect use of
language constructs or inconsistent or missing
relationships in a document. In order to properly
support multimedia application authoring, it is
important to provide tools with validation
capabilities. Document validation can indicate possible
inconsistencies in a given document to an author so
that it can be revised before deployment. Although very
useful, multimedia validation tools are not often
provided by authoring tools. This work proposes a
multimedia validation approach that relies on a formal
model called Simple Hypermedia Model (SHM). SHM is used
for representing a document for the purpose of
validation. An SHM document is validated using a hybrid
approach based on two complementary techniques. The
first one captures the document's spatio-temporal
layout in terms of its state throughout its execution
by means of a rewrite theory, and validation is
performed through model-checking. The second one
captures the document's layout in terms of intervals
and event occurrences by means of Satisfiability Modulo
Theories (SMT) formulas, and validation is performed
through SMT solving. Due to different characteristics
of both approaches, each validation technique
complements the other in terms of expressiveness of SHM
and tests to be checked. We briefly present validation
tools that use our approach. They were evaluated with
real NCL documents and by usability tests.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2018:ICS,
author = "Jie Wu and Haifeng Hu and Yi Wu",
title = "Image Captioning via Semantic Guidance Attention and
Consensus Selection Strategy",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "87:1--87:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3271485",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Recently, a series of attempts have incorporated
spatial attention mechanisms into the task of image
captioning, which achieves a remarkable improvement in
the quality of generative captions. However, the
traditional spatial attention mechanism adopts latent
and delayed semantic representations to decide which
area should be paid more attention to, resulting in
inaccurate semantic guidance and the introduction of
redundant information. In order to optimize the spatial
attention mechanism, we propose the Semantic Guidance
Attention (SGA) mechanism in this article.
Specifically, SGA utilizes semantic word
representations to provide an intuitive semantic
guidance that focuses accurately on semantic-related
regions. Moreover, we reduce the difficulty of
generating fluent sentences by updating the attention
information in time. At the same time, the beam search
algorithm is widely used to predict words during
sequence generation. This algorithm generates a
sentence according to the probabilities of words, so it
is easy to push out a generic sentence and discard some
distinctive captions. In order to overcome this
limitation, we design the Consensus Selection (CS)
strategy to choose the most descriptive and informative
caption, which is selected by the semantic similarity
of captions instead of the probabilities of words. The
consensus caption is determined by selecting the one
with the highest cumulative semantic similarity with
respect to the reference captions. Our proposed model
(SGA-CS) is validated on Flickr30k and MSCOCO, which
shows that SGA-CS outperforms state-of-the-art
approaches. To the best of our knowledge, SGA-CS is the first
attempt to jointly produce semantic attention guidance
and select descriptive captions for image captioning
tasks, achieving one of the best performance ratings
among any cross-entropy training methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Strezoski:2018:OLS,
author = "Gjorgji Strezoski and Marcel Worring",
title = "{OmniArt}: a Large-scale Artistic Benchmark",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "88:1--88:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3273022",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "Baselines are the starting point of any quantitative
multimedia research, and benchmarks are essential for
pushing those baselines further. In this article, we
present baselines for the artistic domain with a new
benchmark dataset featuring over 2 million images with
rich structured metadata dubbed OmniArt. OmniArt
contains annotations for dozens of attribute types and
features semantic context information through concepts,
IconClass labels, color information, and (limited)
object-level bounding boxes. For our dataset we
establish and present baseline scores on multiple tasks
such as artist attribution, creation period estimation,
type, style, and school prediction. In addition to our
metadata related experiments, we explore the color
spaces of art through different types and evaluate a
transfer learning object recognition pipeline.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Koch:2018:CYU,
author = "Christian Koch and Moritz Lode and Denny Stohr and Amr
Rizk and Ralf Steinmetz",
title = "Collaborations on {YouTube}: From Unsupervised
Detection to the Impact on Video and Channel
Popularity",
journal = j-TOMM,
volume = "14",
number = "4",
pages = "89:1--89:??",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241054",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:45 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
abstract = "YouTube is the most popular platform for streaming of
user-generated videos. Nowadays, professional YouTubers
are organized in so-called multichannel networks
(MCNs). These networks offer services such as brand
deals, equipment, and strategic advice in exchange for
a share of the YouTubers' revenues. A dominant strategy
to gain more subscribers and, hence, revenue is
collaborating with other YouTubers. Yet, collaborations
on YouTube have not been studied in a detailed
quantitative manner. To close this gap, first, we
collect a YouTube dataset covering video statistics
over 3 months for 7,942 channels. Second, we design a
framework for collaboration detection given a
previously unknown number of persons featured in
YouTube videos. We denote this framework, for the
detection and analysis of collaborations in YouTube
videos using a Deep Neural Network (DNN)-based
approach, as CATANA. Third, we analyze about 2.4 years
of video content and use CATANA to answer research
questions guiding YouTubers and MCNs for efficient
collaboration strategies. Thereby, we focus on (1)
collaboration frequency and partner selectivity, (2)
the influence of MCNs on channel collaborations, (3)
collaborating channel types, and (4) the impact of
collaborations on video and channel popularity. Our
results show that collaborations are in many cases
significantly beneficial regarding viewers and newly
attracted subscribers for both collaborating channels,
often showing more than 100\% popularity growth
compared with noncollaboration videos.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2019:EQA,
author = "Wei Zhang",
title = "Efficient {QoE}-Aware Scheme for Video Quality
Switching Operations in Dynamic Adaptive Streaming",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "17:1--17:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3269494",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3269494",
abstract = "Dynamic Adaptive Streaming over HTTP (DASH) is a
popular over-the-top video content distribution
technique that adapts the streaming session according
to the user's network condition typically in terms of
downlink bandwidth. This video quality adaptation can
be achieved by scaling the frame quality, spatial
resolution or frame rate. Despite the flexibility on
the video quality scaling methods, each of these
quality scaling dimensions has varying effects on the
Quality of Experience (QoE) for end users. Furthermore,
in video streaming, the changes in motion over time
along with the scaling method employed have an
influence on QoE, hence the need to carefully tailor
scaling methods to suit streaming applications and
content type. In this work, we investigate an
intelligent DASH approach for the latest video coding
standard H.265 and propose a heuristic QoE-aware
cost-efficient adaptation scheme that does not switch
unnecessarily to the highest quality level but rather
stays temporarily at an intermediate quality level in
certain streaming scenarios. Such an approach achieves
a comparable and consistent level of quality under
impaired network conditions as commonly found in
Internet and mobile networks while reducing bandwidth
requirements and quality switching overhead. The
rationale is based on our empirical experiments, which
show that an increase in bitrate does not necessarily
mean noticeable improvement in QoE. Furthermore, our
work demonstrates that the Signal-to-Noise Ratio (SNR)
and the spatial resolution scalability types are the
best fit for our proposed algorithm. Finally, we
demonstrate an innovative interaction between quality
scaling methods and the polarity of switching
operations. The proposed QoE-aware scheme is
implemented and empirical results show that it is able
to reduce bandwidth requirements by up to 41\% whilst
achieving equivalent QoE compared with a representative
DASH reference implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yahia:2019:HBF,
author = "Mariem {Ben Yahia} and Yannick {Le Louedec} and
Gwendal Simon and Loutfi Nuaymi and Xavier Corbillon",
title = "{HTTP/2}-based Frame Discarding for Low-Latency
Adaptive Video Streaming",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "18:1--18:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3280854",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3280854",
abstract = "In this article, we propose video delivery schemes
ensuring around 1s delivery latency with Dynamic
Adaptive Streaming over HTTP (DASH), which is a
standard version of HTTP Live Streaming (HLS), so as to
benefit from the video representation switching between
successive video segments. We also propose HTTP/2-based
algorithms to apply video frame discarding policies
inside a video segment when a selected DASH
representation does not match with the available
network resources. The current solutions with small
buffers suffer from rebuffering events. Rebuffering not
only impacts the Quality of Experience (QoE) but also
increases the delivery delay between the displayed and
the original video streams. In this work, we completely
eliminate rebuffering events by developing optimal and
practical video frame discarding algorithms to meet the
1s latency constraint. In all our algorithms, we
request the video frames individually through HTTP/2
multiple streams, and we selectively drop the least
meaningful video frames thanks to HTTP/2 stream
resetting feature. Our simulations show that the
proposed algorithms eliminate rebuffering while
ensuring an acceptable video quality with at least a
Peak Signal to Noise Ratio (PSNR) of 35dB compared to
25dB of the basic First In First Out (FIFO) algorithm.
We also quantify and qualify the resulting temporal
distortion of the video segments per algorithm. A
large number of missing video frames results in a
temporal fluidity break known as video jitter. The
displayed video looks like a series of snapshots. We
show that both the optimal Integer Linear Program (ILP)
and practical algorithms decrease the frequency and
duration of the jitters. For example, practical
algorithms reduce the number of crashed displayed
videos (presenting one jitter longer than 1,350ms) by
22\% compared to the basic FIFO algorithm. We also show
that requesting video frames separately with HTTP/2
slightly increases the overhead from 4.34\% to
5.76\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2019:SRC,
author = "Xianguo Li and Yemei Sun and Yanli Yang and Changyun
Miao",
title = "Symmetrical Residual Connections for Single Image
Super-Resolution",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "19:1--19:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3282445",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3282445",
abstract = "Single-image super-resolution (SISR) methods based on
convolutional neural networks (CNN) have shown great
potential in the literature. However, most deep CNN
models do not have direct access to subsequent layers,
seriously hindering the information flow. Furthermore,
they fail to make full use of the hierarchical features
from different low-level layers, thereby resulting in
relatively low accuracy. In this article, we present a
new SISR CNN, called SymSR, which incorporates
symmetrical nested residual connections to improve both
the accuracy and the execution speed. SymSR takes a
larger image region for contextual spreading. It
symmetrically combines multiple short paths for the
forward propagation to improve the accuracy and for the
backward propagation of gradient flow to accelerate the
convergence speed. Extensive experiments based on open
challenge datasets show the effectiveness of
symmetrical residual connections. Compared with four
other state-of-the-art super-resolution CNN methods,
SymSR is superior in both accuracy and runtime.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yu:2019:DCM,
author = "Yi Yu and Suhua Tang and Francisco Raposo and Lei
Chen",
title = "Deep Cross-Modal Correlation Learning for Audio and
Lyrics in Music Retrieval",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "20:1--20:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3281746",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3281746",
abstract = "Deep cross-modal learning has successfully
demonstrated excellent performance in cross-modal
multimedia retrieval, with the aim of learning joint
representations between different data modalities.
Unfortunately, little research focuses on cross-modal
correlation learning where temporal structures of
different data modalities, such as audio and lyrics,
should be taken into account. Stemming from the
characteristic of temporal structures of music in
nature, we are motivated to learn the deep sequential
correlation between audio and lyrics. In this work, we
propose a deep cross-modal correlation learning
architecture involving two-branch deep neural networks
for audio modality and text modality (lyrics). Data in
different modalities are converted to the same
canonical space where intermodal canonical correlation
analysis is utilized as an objective function to
calculate the similarity of temporal structures. This
is the first study that uses deep architectures for
learning the temporal correlation between audio and
lyrics. A pretrained Doc2Vec model followed by fully
connected layers is used to represent lyrics. Two
significant contributions are made in the audio branch,
as follows: (i) We propose an end-to-end network to
learn cross-modal correlation between audio and lyrics,
where feature extraction and correlation learning are
simultaneously performed and joint representation is
learned by considering temporal structures. (ii) As for
feature extraction, we further represent an
audio signal by a short sequence of local summaries
(VGG16 features) and apply a recurrent neural network
to compute a compact feature that better learns the
temporal structures of music audio. Experimental
results, using audio to retrieve lyrics or using lyrics
to retrieve audio, verify the effectiveness of the
proposed deep correlation learning architectures in
cross-modal music retrieval.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Sun:2019:ERF,
author = "Jia Sun and Di Huang and Yunhong Wang and Liming
Chen",
title = "Expression Robust {$3$D} Facial Landmarking via
Progressive Coarse-to-Fine Tuning",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "21:1--21:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3282833",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3282833",
abstract = "Facial landmarking is a fundamental task in automatic
machine-based face analysis. The majority of existing
techniques for such a problem are based on 2D images;
however, they suffer from illumination and pose
variations that may largely degrade landmarking
performance. The emergence of 3D data theoretically
provides an alternative to overcome these weaknesses in
the 2D domain. This article proposes a novel approach
to 3D facial landmarking, which combines both the
advantages of feature-based methods as well as
model-based ones in a progressive three-stage
coarse-to-fine manner (initial, intermediate, and fine
stages). For the initial stage, a few fiducial
landmarks (i.e., the nose tip and two inner eye
corners) are robustly detected through curvature
analysis, and these points are further exploited to
initialize the subsequent stage. For the intermediate
stage, a statistical model is learned in the feature
space of three normal components of the facial
point-cloud rather than the smooth original
coordinates, namely Active Normal Model (ANM). For the
fine stage, cascaded regression is employed to locally
refine the landmarks according to their geometry
attributes. The proposed approach can accurately
localize dozens of fiducial points on each 3D face
scan, greatly surpassing the feature-based ones, and it
also improves the state of the art of the model-based
ones in two aspects: sensitivity to initialization and
deficiency in discrimination. The proposed method is
evaluated on the BU-3DFE, Bosphorus, and BU-4DFE
databases, and competitive results are achieved in
comparison with counterparts in the literature, clearly
demonstrating its effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Peng:2019:CGC,
author = "Yuxin Peng and Jinwei Qi",
title = "{CM-GANs}: Cross-modal Generative Adversarial Networks
for Common Representation Learning",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "22:1--22:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3284750",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3284750",
abstract = "It is known that the inconsistent distributions and
representations of different modalities, such as image
and text, cause the heterogeneity gap, which makes it
very challenging to correlate heterogeneous data and
measure their similarities. Recently, generative
adversarial networks (GANs) have been proposed and have
shown their strong ability to model data distribution
and learn discriminative representation. It has also
been shown that adversarial learning can be fully
exploited to learn discriminative common
representations for bridging the heterogeneity gap.
Inspired by this, we aim to effectively correlate
large-scale heterogeneous data of different modalities
with the power of GANs to model cross-modal joint
distribution. In this article, we propose Cross-modal
Generative Adversarial Networks (CM-GANs) with the
following contributions. First, a cross-modal GAN
architecture is proposed to model joint distribution
over the data of different modalities. The
inter-modality and intra-modality correlation can be
explored simultaneously in generative and
discriminative models. Both compete with each other to
promote cross-modal correlation learning. Second, the
cross-modal convolutional autoencoders with
weight-sharing constraint are proposed to form the
generative model. They not only exploit the cross-modal
correlation for learning the common representations but
also preserve reconstruction information for capturing
the semantic consistency within each modality. Third, a
cross-modal adversarial training mechanism is proposed,
which uses two kinds of discriminative models to
simultaneously conduct intra-modality and
inter-modality discrimination. They can mutually boost
to make the generated common representations more
discriminative by the adversarial training process. In
summary, our proposed CM-GAN approach can use GANs to
perform cross-modal common representation learning by
which the heterogeneous data can be effectively
correlated. Extensive experiments are conducted to
verify the performance of CM-GANs on cross-modal
retrieval compared with 13 state-of-the-art methods on
4 cross-modal datasets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pala:2019:RFM,
author = "Pietro Pala and Stefano Berretti",
title = "Reconstructing {$3$D} Face Models by Incremental
Aggregation and Refinement of Depth Frames",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "23:1--23:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3287309",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3287309",
abstract = "Face recognition from two-dimensional (2D) still
images and videos is quite successful even with ``in
the wild'' conditions. In contrast, less consolidated
results are available for the cases in which face data
come from non-conventional cameras, such as infrared or
depth. In this article, we investigate this latter
scenario assuming that a low-resolution depth camera is
used to perform face recognition in an uncooperative
context. To this end, we propose, first, to
automatically select, from the depth sequence of the
camera, a set of frames that provide a good view of the
face in terms of pose and distance. Then, we
design a progressive refinement approach to reconstruct
a higher-resolution model from the selected
low-resolution frames. This process accounts for the
anisotropic error of the existing points in the current
3D model and the points in a newly acquired frame so
that the refinement step can progressively adjust the
point positions in the model using a Kalman-like
estimation. The quality of the reconstructed model is
evaluated by considering the error between the
reconstructed models and their corresponding
high-resolution scans used as ground truth. In
addition, we performed face recognition using the
reconstructed models as probes against a gallery of
reconstructed models and a gallery with high-resolution
scans. The obtained results confirm the possibility of
effectively using the reconstructed models for the face
recognition task.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2019:OCT,
author = "Han Hu and Yichao Jin and Yonggang Wen and Cedric
Westphal",
title = "Orchestrating Caching, Transcoding and Request Routing
for Adaptive Video Streaming Over {ICN}",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "24:1--24:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3289184",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3289184",
abstract = "Information-centric networking (ICN) has been touted
as a revolutionary solution for the future of the
Internet, which will be dominated by video traffic.
This work investigates the challenge of distributing
video content of adaptive bitrate (ABR) over ICN. In
particular, we use the in-network caching capability of
ICN routers to serve users; in addition, with the help
of named functions, we enable ICN routers to transcode
videos to lower-bitrate versions to improve the cache
hit ratio. Mathematically, we formulate this design
challenge into a constrained optimization problem,
which aims to maximize the cache hit ratio for service
providers and minimize the service delay for end users.
We design a two-step iterative algorithm to find the
optimum. First, given a content management scheme, we
minimize the service delay via optimally configuring
the routing scheme. Second, we maximize the cache hits
for a given routing policy. Finally, we rigorously
prove its convergence. Through extensive simulations,
we verify the convergence and the performance gains
over other algorithms. We also find that more resources
should be allocated to ICN routers with a heavier
request rate, and the routing scheme favors the
shortest path to schedule more traffic.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yuan:2019:DLT,
author = "Bo Yuan and Xinbo Gao and Zhenxing Niu and Qi Tian",
title = "Discovering Latent Topics by {Gaussian} Latent
{Dirichlet} Allocation and Spectral Clustering",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "25:1--25:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3290047",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3290047",
abstract = "Today, diversifying the retrieval results of a certain
query will improve customers' search efficiency.
Showing the multiple aspects of information provides
users an overview of the object, which helps them fast
target their demands. To discover aspects, research
focuses on generating image clusters from initially
retrieved results. As an effective approach, latent
Dirichlet allocation (LDA) has been proved to have good
performance on discovering high-level topics. However,
traditional LDA is designed to process textual words,
and it needs the input as discrete data. When we apply
this algorithm to process continuous visual images, a
common solution is to quantize the continuous features
into discrete form by a bag-of-visual-words algorithm.
During this process, quantization error inevitably
leads to information loss. To construct a
topic model with complete visual information, this work
applies Gaussian latent Dirichlet allocation (GLDA) on
the diversity issue of image retrieval. In this model,
traditional multinomial distribution is substituted
with Gaussian distribution to model continuous visual
features. In addition, we propose a two-phase spectral
clustering strategy, called dual spectral clustering,
to generate clusters from region level to image level.
The experiments on the challenging landmarks of the
DIV400 database show that our proposal improves
relevance and diversity by about 10\% compared to
traditional topic models.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{He:2019:ICV,
author = "Chen He and Haifeng Hu",
title = "Image Captioning With Visual-Semantic Double
Attention",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "26:1--26:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3292058",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3292058",
abstract = "In this article, we propose a novel Visual-Semantic
Double Attention (VSDA) model for image captioning. In
our approach, VSDA consists of two parts: a modified
visual attention model is used to extract sub-region
image features, and then a new SEmantic Attention (SEA)
model is proposed to distill semantic features.
Traditional attribute-based models always neglect the
distinctive importance of each attribute word and fuse
all of them into recurrent neural networks, resulting
in abundant irrelevant semantic features. In contrast,
at each timestep, our model selects the most relevant
word that aligns with the current context. In other words,
the real power of VSDA lies in the ability of not only
leveraging semantic features but also eliminating the
influence of irrelevant attribute words to make the
semantic guidance more precise. Furthermore, our
approach addresses the problem that visual attention
models cannot help generate non-visual words.
Considering that visual and semantic features are
complementary to each other, our model can leverage
both of them to strengthen the generations of visual
and non-visual words. Extensive experiments are
conducted on famous datasets: MS COCO and Flickr30k.
The results show that VSDA outperforms other methods
and achieves promising performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2019:MII,
author = "Ruoyu Liu and Yao Zhao and Shikui Wei and Liang Zheng
and Yi Yang",
title = "Modality-Invariant Image-Text Embedding for
Image-Sentence Matching",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "27:1--27:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3300939",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3300939",
abstract = "Performing direct matching among different modalities
(like image and text) can benefit many tasks in
computer vision, multimedia, information retrieval, and
information fusion. Most existing works focus on
class-level image-text matching, called cross-modal
retrieval, which attempts to propose a uniform model
for matching images with all types of texts, for
example, tags, sentences, and articles (long texts).
Although cross-modal retrieval alleviates the
heterogeneous gap among visual and textual information,
it can provide only a rough correspondence between two
modalities. In this article, we propose a more precise
image-text embedding method, image-sentence matching,
which can provide heterogeneous matching in the
instance level. The key issue for image-text embedding
is how to make the distributions of the two modalities
consistent in the embedding space. To address this
problem, some previous works on the cross-modal
retrieval task have attempted to pull close their
distributions by employing adversarial learning.
However, the effectiveness of adversarial learning on
image-sentence matching has not been demonstrated, and
an effective method is still lacking. Inspired by previous
works, we propose to learn a modality-invariant
image-text embedding for image-sentence matching by
involving adversarial learning. On top of the triplet
loss--based baseline, we design a modality
classification network with an adversarial loss, which
classifies an embedding into either the image or text
modality. In addition, the multi-stage training
procedure is carefully designed so that the proposed
network not only imposes the image-text similarity
constraints by ground-truth labels, but also enforces
the image and text embedding distributions to be
similar by adversarial learning. Experiments on two
public datasets (Flickr30k and MSCOCO) demonstrate that
our method yields stable accuracy improvement over the
baseline model and that our results compare favorably
to the state-of-the-art methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ma:2019:PFC,
author = "Ruijun Ma and Haifeng Hu and Weixuan Wang and Jia Xu
and Zhengming Li",
title = "Photorealistic Face Completion with Semantic Parsing
and Face Identity-Preserving Features",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "28:1--28:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3300940",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3300940",
abstract = "Tremendous progress on deep learning has shown
exciting potential for a variety of face completion
tasks. However, most learning-based methods are limited
to handle general or structure specified face images
(e.g., well-aligned faces). In this article, we propose
a novel face completion algorithm, called Learning and
Preserving Face Completion Network (LP-FCN), which
simultaneously parses face images and extracts face
identity-preserving (FIP) features. By tackling these
two tasks in a mutually boosting way, the LP-FCN can
guide an identity preserving inference and ensure pixel
faithfulness of completed faces. In addition, we adopt
a global discriminator and a local discriminator to
distinguish real images from synthesized ones. By
training with a combined identity preserving, semantic
parsing and adversarial loss, the LP-FCN encourages the
completion results to be semantically valid and
visually consistent for more complicated image
completion tasks. Experiments show that our approach
obtains similar visual quality, but achieves better
performance on unaligned face completion and
fine-detailed synthesis compared to the state-of-the-art
methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Lokoc:2019:ISS,
author = "Jakub Lokoc and Gregor Kovalc{\'\i}k and Bernd
M{\"u}nzer and Klaus Sch{\"o}ffmann and Werner Bailer
and Ralph Gasser and Stefanos Vrochidis and Phuong Anh
Nguyen and Sitapa Rujikietgumjorn and Kai Uwe Barthel",
title = "Interactive Search or Sequential Browsing? {A}
Detailed Analysis of the {Video Browser Showdown
2018}",
journal = j-TOMM,
volume = "15",
number = "1",
pages = "29:1--29:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3295663",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3295663",
abstract = "This work summarizes the findings of the 7th iteration
of the Video Browser Showdown (VBS) competition
organized as a workshop at the 24th International
Conference on Multimedia Modeling in Bangkok. The
competition focuses on video retrieval scenarios in
which the searched scenes were either previously
observed or described by another person (i.e., an
example shot is not available). During the event, nine
teams competed with their video retrieval tools in
providing access to a shared video collection with 600
hours of video content. Evaluation objectives, rules,
scoring, tasks, and all participating tools are
described in the article. In addition, we provide some
insights into how the different teams interacted with
their video browsers, which was made possible by a
novel interaction logging mechanism introduced for this
iteration of the VBS. The results collected at the VBS
evaluation server confirm that searching for one
particular scene in the collection when given a limited
time is still a challenging task for many of the
approaches that were showcased during the event. Given
only a short textual description, finding the correct
scene is even harder. In ad hoc search with multiple
relevant scenes, the tools were mostly able to find at
least one scene, whereas recall was the issue for many
teams. The logs also reveal that even though recent
exciting advances in machine learning narrow the
classical semantic gap problem, user-centric interfaces
are still required to mediate access to specific
content. Finally, open challenges and lessons learned
are presented for future VBS events.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2019:ESI,
author = "Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb
{El Saddik}",
title = "Editorial to Special Issue on Deep Learning for
Intelligent Multimedia Analytics",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "1:1--1:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3292059",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3292059",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2019:DLB,
author = "Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb
{El Saddik}",
title = "Deep Learning-Based Multimedia Analytics: a Review",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "2:1--2:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3279952",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3279952",
abstract = "The multimedia community has witnessed the rise of
deep learning-based techniques in analyzing multimedia
content more effectively. In the past decade, the
convergence of deep-learning and multimedia analytics
has boosted the performance of several traditional
tasks, such as classification, detection, and
regression, and has also fundamentally changed the
landscape of several relatively new areas, such as
semantic segmentation, captioning, and content
generation. This article aims to review the development
path of major tasks in multimedia analytics and take a
look into future directions. We start by summarizing
the fundamental deep techniques related to multimedia
analytics, especially in the visual domain, and then
review representative high-level tasks powered by
recent advances. Moreover, the performance review of
popular benchmarks gives a pathway to technology
advancement and helps identify both milestone works and
future directions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Xie:2019:CAN,
author = "Hongtao Xie and Shancheng Fang and Zheng-Jun Zha and
Yating Yang and Yan Li and Yongdong Zhang",
title = "Convolutional Attention Networks for Scene Text
Recognition",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "3:1--3:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231737",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231737",
abstract = "In this article, we present Convoluitional Attention
Networks (CAN) for unconstrained scene text
recognition. Recent dominant approaches for scene text
recognition are mainly based on Convolutional Neural
Networks (CNN) and Recurrent Neural Networks (RNN),
where the CNN encodes images and the RNN generates
character sequences. Our CAN is different from these
methods; our CAN is completely built on CNN and
includes an attention mechanism. The distinctive
characteristics of our method include (i) CAN follows
an encoder-decoder architecture, in which the encoder is a
deep two-dimensional CNN and the decoder is a
one-dimensional CNN; (ii) the attention mechanism is
applied in every convolutional layer of the decoder,
and we propose a novel spatial attention method using
average pooling; and (iii) position embeddings are
equipped in both a spatial encoder and a sequence
decoder to give our networks a sense of location. We
conduct experiments on standard datasets for scene text
recognition, including Street View Text, IIIT5K, and
ICDAR datasets. The experimental results validate the
effectiveness of different components and show that our
convolutional-based method achieves state-of-the-art or
competitive performance over prior works, even without
the use of RNN.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2019:SAD,
author = "Zhineng Chen and Shanshan Ai and Caiyan Jia",
title = "Structure-Aware Deep Learning for Product Image
Classification",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "4:1--4:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231742",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231742",
abstract = "Automatic product image classification is a task of
crucial importance with respect to the management of
online retailers. Motivated by recent advancements of
deep Convolutional Neural Networks (CNN) on image
classification, in this work we revisit the problem in
the context of product images with the existence of a
predefined categorical hierarchy and attributes, aiming
to leverage the hierarchy and attributes to improve
classification accuracy. With these structure-aware
clues, we argue that more advanced deep models could be
developed beyond the flat one-versus-all classification
performed by conventional CNNs. To this end, novel
efforts of this work include a salient-sensitive CNN
that gazes into the product foreground by inserting a
dedicated spatial attention module; a multiclass
regression-based refinement that is expected to predict
more accurately by merging prediction scores from
multiple preceding CNNs, each corresponding to a
distinct classifier in the hierarchy; and a multitask
deep learning architecture that effectively explores
correlations among categories and attributes for
categorical label prediction. Experimental results on
nearly 1 million real-world product images largely
validate the effectiveness of the proposed efforts
individually and jointly, from which performance gains
are observed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Jiang:2019:DPR,
author = "Shuqiang Jiang and Gongwei Chen and Xinhang Song and
Linhu Liu",
title = "Deep Patch Representations with Shared Codebook for
Scene Classification",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "5:1--5:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231738",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231738",
abstract = "Scene classification is a challenging problem.
Compared with object images, scene images are more
abstract, as they are composed of objects. Object and
scene images have different characteristics with
different scales and composition structures. How to
effectively integrate the local mid-level semantic
representations including both object and scene
concepts needs to be investigated, which is an
important aspect for scene classification. In this
article, the idea of a shared codebook is introduced
by organically integrating deep learning, concept
feature, and local feature encoding techniques. More
specifically, the shared local feature codebook is
generated from the combined ImageNet1K and Places365
concepts (Mixed1365) using convolutional neural
networks. As the Mixed1365 features cover all the
semantic information including both object and scene
concepts, we can extract a shared codebook from the
Mixed1365 features, which only contain a subset of the
whole 1,365 concepts with the same codebook size. The
shared codebook can not only provide complementary
representations without additional codebook training
but also be adaptively extracted toward different scene
classification tasks. A method of fusing the encoded
features with both the original codebook and the shared
codebook is proposed for scene classification. In this
way, more comprehensive and representative image
features can be generated for classification. Extensive
experiments conducted on two public datasets
validate the effectiveness of the proposed method.
Besides, some useful observations are also revealed to
show the advantage of shared codebook.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2019:VCR,
author = "Rui-Wei Zhao and Qi Zhang and Zuxuan Wu and Jianguo Li
and Yu-Gang Jiang",
title = "Visual Content Recognition by Exploiting Semantic
Feature Map with Attention and Multi-task Learning",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "6:1--6:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231739",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231739",
abstract = "Recent studies have shown that spatial relationships
among objects are very important for visual
recognition, since they can provide rich clues on
object contexts within the images. In this article, we
introduce a novel method to learn the Semantic Feature
Map (SFM) with attention-based deep neural networks for
image and video classification in an end-to-end manner,
aiming to explicitly model the spatial object contexts
within the images. In particular, we explicitly apply
the designed gate units to the extracted object
features for important objects selection and noise
removal. These selected object features are then
organized into the proposed SFM, which is a compact and
discriminative representation with the spatial
information among objects preserved. Finally, we employ
either Fully Convolutional Networks (FCN) or Long-Short
Term Memory (LSTM) as the classifiers on top of the SFM
for content recognition. A novel multi-task learning
framework with image classification loss, object
localization loss, and grid labeling loss is also
introduced to help better learn the model parameters.
We conduct extensive evaluations and comparative
studies to verify the effectiveness of the proposed
approach on Pascal VOC 2007/2012 and MS-COCO benchmarks
for image classification. In addition, the experimental
results also show that the SFMs learned from the image
domain can be successfully transferred to CCV and FCVID
benchmarks for video classification.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2019:CMF,
author = "Xueliang Liu and Meng Wang and Zheng-Jun Zha and
Richang Hong",
title = "Cross-Modality Feature Learning via Convolutional
Autoencoder",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "7:1--7:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231740",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231740",
abstract = "Learning robust and representative features across
multiple modalities has been a fundamental problem in
machine learning and multimedia fields. In this
article, we propose a novel MUltimodal Convolutional
AutoEncoder (MUCAE) approach to learn representative
features from visual and textual modalities. For each
modality, we integrate the convolutional operation into
an autoencoder framework to learn a joint
representation from the original image and text
content. We optimize the convolutional autoencoders of
different modalities jointly by exploiting the
correlation between the hidden representations from the
convolutional autoencoders, in particular by minimizing
both the reconstructing error of each modality and the
correlation divergence between the hidden feature of
different modalities. Compared to the conventional
solutions relying on hand-crafted features, the
proposed MUCAE approach encodes features from image
pixels and text characters directly and produces more
representative and robust features. We evaluate MUCAE
on cross-media retrieval as well as unimodal
classification tasks over real-world large-scale
multimedia databases. Experimental results have shown
that MUCAE performs better than the state-of-the-art
methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2019:DCN,
author = "Jiawei Liu and Zheng-Jun Zha and Xuejin Chen and Zilei
Wang and Yongdong Zhang",
title = "Dense {$3$D}-Convolutional Neural Network for Person
Re-Identification in Videos",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "8:1--8:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3231741",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3231741",
abstract = "Person re-identification aims at identifying a certain
pedestrian across non-overlapping multi-camera networks
at different times and places. Existing person
re-identification approaches mainly focus on matching
pedestrians in images; however, little attention has
been paid to re-identifying pedestrians in videos.
Compared to images, video clips contain motion patterns
of pedestrians, which are crucial to person
re-identification. Moreover, consecutive video frames
present pedestrian appearance with different body poses
and from different viewpoints, providing valuable
information toward addressing the challenges of pose
variation, occlusion, viewpoint change, and so on.
In this article, we propose a Dense 3D-Convolutional
Network (D3DNet) to jointly learn spatio-temporal and
appearance representation for person re-identification
in videos. The D3DNet consists of multiple
three-dimensional (3D) dense blocks and transition
layers. The 3D dense blocks enlarge the receptive
fields of visual neurons in both spatial and temporal
dimensions, leading to discriminative appearance
representation as well as short-term and long-term
motion patterns of pedestrians without the requirement
of an additional motion estimation module. Moreover, we
formulate a loss function consisting of an
identification loss and a center loss to minimize
intra-class variance and maximize inter-class variance
simultaneously, toward addressing the challenge of
large intra-class variance and small inter-class
variance. Extensive experiments on two real-world video
datasets of person identification, i.e., MARS and
iLIDS-VID, have shown the effectiveness of the proposed
approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2019:DSM,
author = "Liang Zhao and Zhikui Chen and Laurence T. Yang and M.
Jamal Deen and Z. Jane Wang",
title = "Deep Semantic Mapping for Heterogeneous Multimedia
Transfer Learning Using Co-Occurrence Data",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "9:1--9:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241055",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3241055",
abstract = "Transfer learning, which focuses on finding a
favorable representation for instances of different
domains based on auxiliary data, can mitigate the
divergence between domains through knowledge transfer.
Recently, increasing efforts on transfer learning have
employed deep neural networks (DNN) to learn more
robust and higher level feature representations to
better tackle cross-media disparities. However, only a
few articles consider the correction and semantic
matching between multi-layer heterogeneous domain
networks. In this article, we propose a deep semantic
mapping model for heterogeneous multimedia transfer
learning (DHTL) using co-occurrence data. More
specifically, we integrate the DNN with canonical
correlation analysis (CCA) to derive a deep correlation
subspace as the joint semantic representation for
associating data across different domains. In the
proposed DHTL, a multi-layer correlation matching
network across domains is constructed, in which the CCA
is combined to bridge each pair of domain-specific
hidden layers. To train the network, a joint objective
function is defined and the optimization processes are
presented. When the deep semantic representation is
achieved, the shared features of the source domain are
transferred for task learning in the target domain.
Extensive experiments for three multimedia recognition
applications demonstrate that the proposed DHTL can
effectively find deep semantic representations for
heterogeneous domains, and it is superior to the
several existing state-of-the-art methods for deep
transfer learning.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hossain:2019:ADL,
author = "M. Shamim Hossain and Syed Umar Amin and Mansour
Alsulaiman and Ghulam Muhammad",
title = "Applying Deep Learning for Epilepsy Seizure Detection
and Brain Mapping Visualization",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "10:1--10:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3241056",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3241056",
abstract = "Deep Convolutional Neural Network (CNN) has achieved
remarkable results in computer vision tasks for
end-to-end learning. We evaluate here the power of a
deep CNN to learn robust features from raw
Electroencephalogram (EEG) data to detect seizures.
Seizures are hard to detect, as they vary both inter-
and intra-patient. In this article, we use a deep CNN
model for seizure detection task on an open-access EEG
epilepsy dataset collected at the Boston Children's
Hospital. Our deep learning model is able to extract
spectral, temporal features from EEG epilepsy data and
use them to learn the general structure of a seizure
that is less sensitive to variations. For cross-patient
EEG data, our method produced an overall sensitivity of
90.00\%, specificity of 91.65\%, and overall accuracy
of 98.05\% for the whole dataset of 23 patients. The
system can detect seizures with an accuracy of 99.46\%.
Thus, it can be used as an excellent cross-patient
seizure classifier. The results show that our model
performs better than the previous state-of-the-art
models for patient-specific and cross-patient seizure
detection task. The method gave an overall accuracy of
99.65\% for patient-specific data. The system can also
visualize the spatial orientation of band power
features. We use correlation maps to relate spectral
amplitude features to the output in the form of images.
By using the results from our deep learning model, this
visualization method can be used as an effective
multimedia tool for producing quick and relevant brain
mapping images that can be used by medical experts for
further investigation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Alameda-Pineda:2019:SSM,
author = "Xavier Alameda-Pineda and Miriam Redi and Mohammad
Soleymani and Nicu Sebe and Shih-Fu Chang and Samuel
Gosling",
title = "Special Section on Multimodal Understanding of Social,
Affective, and Subjective Attributes",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "11:1--11:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3292061",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3292061",
abstract = "Multimedia scientists have largely focused their
research on the recognition of tangible properties of
data such as objects and scenes. Recently, the field
has started evolving toward the modeling of more
complex properties. For example, the understanding of
social, affective, and subjective attributes of visual
data has attracted the attention of many research teams
at the crossroads of computer vision, multimedia, and
social sciences. These intangible attributes include,
for example, visual beauty, video popularity, or user
behavior. Multiple, diverse challenges arise when
modeling such properties from multimedia data. The
sections concern technical aspects such as reliable
ground-truth collection, the effective learning of
subjective properties, or the impact of context on
subjective perception; see Refs. [2] and [3].",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2019:VPI,
author = "Chuan-Shen Hu and Yi-Tsung Hsieh and Hsiao-Wei Lin and
Mei-Chen Yeh",
title = "{Virtual Portraitist}: an Intelligent Tool for Taking
Well-Posed Selfies",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "12:1--12:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3288760",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3288760",
abstract = "Smart photography carries the promise of quality
improvement and functionality extension in making
aesthetically appealing pictures. In this article, we
focus on self-portrait photographs and introduce new
methods that guide a user in how to best pose while
taking a selfie. While most of the current solutions
use a post processing procedure to beautify a picture,
the developed tool enables a novel function of
recommending a good look before the photo is captured.
Given an input face image, the tool automatically
estimates the pose-based aesthetic score, finds the
most attractive angle of the face, and suggests how the
pose should be adjusted. The recommendation results are
determined adaptively to the appearance and initial
pose of the input face. We apply a data mining approach
to find distinctive, frequent itemsets and association
rules from online profile pictures, upon which the
aesthetic estimation and pose recommendation methods
are developed. A simulated and a real image set are
used for experimental evaluation. The results show the
proposed aesthetic estimation method can effectively
select user-favorable photos. Moreover, the
recommendation performance for the vertical adjustment
is moderately related to the degree of conformity among
the professional photographers' recommendations. This
study echoes the trend of instant photo sharing, in
which a user takes a picture and then immediately
shares it on a social network without engaging in
tedious editing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Okada:2019:MDG,
author = "Shogo Okada and Laurent Son Nguyen and Oya Aran and
Daniel Gatica-Perez",
title = "Modeling Dyadic and Group Impressions with Intermodal
and Interperson Features",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "13:1--13:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3265754",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3265754",
abstract = "This article proposes a novel feature-extraction
framework for inferring impression personality traits,
emergent leadership skills, communicative competence,
and hiring decisions. The proposed framework extracts
multimodal features, describing each participant's
nonverbal activities. It captures intermodal and
interperson relationships in interactions and captures
how the target interactor generates nonverbal behavior
when other interactors also generate nonverbal
behavior. The intermodal and interperson patterns are
identified as frequent co-occurring events based on
clustering from multimodal sequences. The proposed
framework is applied to the SONVB corpus, which is an
audiovisual dataset collected from dyadic job
interviews, and the ELEA audiovisual data corpus, which
is a dataset collected from group meetings. We evaluate
the framework on a binary classification task involving
15 impression variables from the two data corpora. The
experimental results show that the model trained with
co-occurrence features is more accurate than previous
models for 14 out of 15 traits.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhao:2019:PER,
author = "Sicheng Zhao and Amir Gholaminejad and Guiguang Ding
and Yue Gao and Jungong Han and Kurt Keutzer",
title = "Personalized Emotion Recognition by Personality-Aware
High-Order Learning of Physiological Signals",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "14:1--14:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3233184",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3233184",
abstract = "Due to the subjective responses of different subjects
to physical stimuli, emotion recognition methodologies
from physiological signals are increasingly becoming
personalized. Existing works mainly focused on modeling
the involved physiological corpus of each subject,
without considering the psychological factors, such as
interest and personality. The latent correlation among
different subjects has also been rarely examined. In
this article, we propose to investigate the influence
of personality on emotional behavior in a hypergraph
learning framework. Assuming that each vertex is a
compound tuple (subject, stimuli), multi-modal
hypergraphs can be constructed based on the personality
correlation among different subjects and on the
physiological correlation among corresponding stimuli.
To reveal the different importance of vertices,
hyperedges, and modalities, we learn the weights for
each of them. As the hypergraphs connect different
subjects on the compound vertices, the emotions of
multiple subjects can be simultaneously recognized. In
this way, the constructed hypergraphs are
vertex-weighted multi-modal multi-task ones. The
estimated factors, referred to as emotion relevance,
are employed for emotion recognition. We carry out
extensive experiments on the ASCERTAIN dataset and the
results demonstrate the superiority of the proposed
method, as compared to the state-of-the-art emotion
recognition approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Trabelsi:2019:UDS,
author = "Rim Trabelsi and Jagannadan Varadarajan and Le Zhang
and Issam Jabri and Yong Pei and Fethi Smach and Ammar
Bouallegue and Pierre Moulin",
title = "Understanding the Dynamics of Social Interactions: a
Multi-Modal Multi-View Approach",
journal = j-TOMM,
volume = "15",
number = "1s",
pages = "15:1--15:??",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3300937",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3300937",
abstract = "In this article, we deal with the problem of
understanding human-to-human interactions as a
fundamental component of social events analysis.
Inspired by the recent success of multi-modal visual
data in many recognition tasks, we propose a novel
approach to model dyadic interaction by means of
features extracted from synchronized 3D skeleton
coordinates, depth, and Red Green Blue (RGB) sequences.
From skeleton data, we extract new view-invariant
proxemic features, named Unified Proxemic Descriptor
(UProD), which is able to incorporate intrinsic and
extrinsic distances between two interacting subjects. A
novel key frame selection method is introduced to
identify salient instants of the interaction sequence
based on the joints' energy. From Red Green Blue Depth
(RGBD) videos, more holistic CNN features are extracted
by applying an adaptive pre-trained Convolutional
Neural Network (CNN) on optical flow frames. For
better understanding the dynamics of interactions, we
expand the boundaries of dyadic interaction analysis
by proposing a fundamentally new model for a previously
untreated problem: discerning the active from
the passive interactor. Extensive experiments have been
carried out on four multi-modal and multi-view
interactions datasets. The experimental results
demonstrate the superiority of our proposed techniques
against the state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gan:2019:MSF,
author = "Tian Gan and Junnan Li and Yongkang Wong and Mohan S.
Kankanhalli",
title = "A Multi-sensor Framework for Personal Presentation
Analytics",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "30:1--30:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3300941",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3300941",
abstract = "Presentation has been an effective method for
delivering information to an audience for many years.
Over the past few decades, technological advancements
have revolutionized the way humans deliver
presentations. Conventionally, the quality of a
presentation is usually evaluated through painstaking
manual analysis with experts. Although the expert
feedback is effective in assisting users to improve
their presentation skills, manual evaluation suffers
from high cost and is often not available to most
individuals. In this work, we propose a novel
multi-sensor self-quantification system for
presentations, which is designed based on a newly
proposed assessment rubric. We present our analytics
model with conventional ambient sensors (i.e., static
cameras and Kinect sensor) and the emerging wearable
egocentric sensors (i.e., Google Glass). In addition,
we performed a cross-correlation analysis of the speaker's
vocal behavior and body language. The proposed
framework is evaluated on a new presentation dataset,
namely, NUS Multi-Sensor Presentation dataset, which
consists of 51 presentations covering a diverse range
of topics. To validate the efficacy of the proposed
system, we have conducted a series of user studies with
the speakers and an interview with an English
communication expert, which reveals positive and
promising feedback.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Tang:2019:RVL,
author = "Pengjie Tang and Hanli Wang and Qinyu Li",
title = "Rich Visual and Language Representation with
Complementary Semantics for Video Captioning",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "31:1--31:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3303083",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3303083",
abstract = "It is interesting and challenging to translate a video
to natural description sentences based on the video
content. In this work, an advanced framework is built
to generate sentences with coherence and rich semantic
expressions for video captioning. A long short term
memory (LSTM) network with an improved factored way is
first developed, which takes inspiration from LSTM
with a conventional factored way and a common practice
to feed multi-modal features into LSTM at the first
time step for visual description. Then, the
incorporation of the LSTM network with the proposed
improved factored way and un-factored way is exploited,
and a voting strategy is utilized to predict candidate
words. In addition, for robust and abstract visual and
language representation, residuals are employed to
enhance the gradient signals that are learned from the
residual network (ResNet), and a deeper LSTM network is
constructed. Furthermore, three convolutional neural
network-based features extracted from GoogLeNet,
ResNet101, and ResNet152 are fused to capture more
comprehensive and complementary visual information.
Experiments are conducted on two benchmark datasets,
including MSVD and MSR-VTT2016, and competitive
performances are obtained by the proposed techniques as
compared to other state-of-the-art methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2019:MLS,
author = "Chen Shen and Zhongming Jin and Wenqing Chu and
Rongxin Jiang and Yaowu Chen and Guo-Jun Qi and
Xian-Sheng Hua",
title = "Multi-level Similarity Perception Network for Person
Re-identification",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "32:1--32:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309881",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3309881",
abstract = "In this article, we propose a novel deep Siamese
architecture based on a convolutional neural network
(CNN) and multi-level similarity perception for the
person re-identification (re-ID) problem. According to
the distinct characteristics of diverse feature maps,
we effectively apply different similarity constraints
to both low-level and high-level feature maps during
training stage. Due to the introduction of appropriate
similarity comparison mechanisms at different levels,
the proposed approach can adaptively learn
discriminative local and global feature
representations, respectively, while the former is more
sensitive in localizing part-level prominent patterns
relevant to re-identifying people across cameras.
Meanwhile, a novel strong activation pooling strategy
is utilized on the last convolutional layer for
abstract local-feature aggregation to pursue more
representative feature representations. Based on this,
we propose final feature embedding by simultaneously
encoding original global features and discriminative
local features. In addition, our framework has two
other benefits: First, classification constraints can
be easily incorporated into the framework, forming a
unified multi-task network with similarity constraints.
Second, as similarity-comparable information has been
encoded in the network's learning parameters via
back-propagation, pairwise input is not necessary at
test time. That means we can extract features of each
gallery image and build an index in an off-line manner,
which is essential for large-scale real-world
applications. Experimental results on multiple
challenging benchmarks demonstrate that our method
achieves splendid performance compared with the current
state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Miao:2019:DLS,
author = "Yu Miao and Haiwei Dong and Jihad Mohamad {Al Jaam}
and Abdulmotaleb {El Saddik}",
title = "A Deep Learning System for Recognizing Facial
Expression in Real-Time",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "33:1--33:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311747",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311747",
abstract = "This article presents an image-based real-time facial
expression recognition system that is able to recognize
the facial expressions of several subjects on a webcam
at the same time. Our proposed methodology combines a
supervised transfer learning strategy and a joint
supervision method with center loss, which is crucial
for facial tasks. A newly proposed Convolutional Neural
Network (CNN) model, MobileNet, which offers both
accuracy and speed, is deployed both offline and in a
real-time framework that enables fast and accurate
real-time output. Evaluations on two publicly
available datasets, JAFFE and CK+, are carried out.
An accuracy of 95.24\% is reached on the JAFFE
dataset, while an accuracy of 96.92\% is achieved on
the 6-class CK+ dataset, which contains only the last
frames of image sequences. Finally, the average
run-time cost for recognition in the real-time
implementation is around 3.57 ms/frame on an NVIDIA
Quadro K4200 GPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mesfin:2019:UET,
author = "Gebremariam Mesfin and Nadia Hussain and Alexandra
Covaci and Gheorghita Ghinea",
title = "Using Eye Tracking and Heart-Rate Activity to Examine
Crossmodal Correspondences {QoE} in {Mulsemedia}",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "34:1--34:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3303080",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3303080",
abstract = "Different senses provide us with information of
various levels of precision and enable us to construct
a more precise representation of the world. Rich
multisensory simulations are thus beneficial for
comprehension, memory reinforcement, or retention of
information. Crossmodal mappings refer to the
systematic associations often made between different
sensory modalities (e.g., high pitch is matched with
angular shapes) and govern multisensory processing. A
great deal of research effort has been put into
exploring cross-modal correspondences in the field of
cognitive science. However, the possibilities they open
in the digital world have been relatively unexplored.
Multiple sensorial media (mulsemedia) provides a highly
immersive experience to the users and enhances their
Quality of Experience (QoE) in the digital world. Thus,
we consider that studying the plasticity and the
effects of cross-modal correspondences in a mulsemedia
setup can bring interesting insights about improving
the human computer dialogue and experience. In our
experiments, we exposed users to videos with certain
visual dimensions (brightness, color, and shape), and
we investigated whether the pairing with a cross-modal
matching sound (high and low pitch) and the
corresponding auto-generated vibrotactile effects
(produced by a haptic vest) lead to an enhanced QoE.
For this, we captured the eye gaze and the heart rate
of users while experiencing mulsemedia, and we asked
them to fill in a set of questions targeting their
enjoyment and perception at the end of the experiment.
Results showed differences in eye-gaze patterns and
heart rate between the experimental and the control
group, indicating changes in participants' engagement
when videos were accompanied by matching cross-modal
sounds (this effect was the strongest for the video
displaying angular shapes and high-pitch audio) and
transitively generated cross-modal vibrotactile
effects.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cheung:2019:DOC,
author = "Ming Cheung and James She and Weiwei Sun and Jiantao
Zhou",
title = "Detecting Online Counterfeit-goods Seller using
Connection Discovery",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "35:1--35:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311785",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311785",
abstract = "With the advancement of social media and mobile
technology, any smartphone user can easily become a
seller on social media and e-commerce platforms, such
as Instagram and Carousell in Hong Kong or Taobao in
China. A seller shows images of their products and
annotates their images with suitable tags that can be
searched easily by others. Those images could be taken
by the seller, or the seller could use images shared by
other sellers. Among sellers, some sell counterfeit
goods, and these sellers may use disguising tags and
language, which make detecting them a difficult task.
This article proposes a framework to detect counterfeit
sellers by using deep learning to discover connections
among sellers from their shared images. Based on 473K
shared images from Taobao, Instagram, and Carousell, it
is proven that the proposed framework can detect
counterfeit sellers. The framework is 30\% better than
approaches using object recognition in detecting
counterfeit sellers. To the best of our knowledge, this
is the first work to detect online counterfeit sellers
from their shared images.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yarnagula:2019:QMC,
author = "Hema Kumar Yarnagula and Parikshit Juluri and Sheyda
Kiani Mehr and Venkatesh Tamarapalli and Deep Medhi",
title = "{QoE} for Mobile Clients with {Segment-aware Rate
Adaptation Algorithm (SARA)} for {DASH} Video
Streaming",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "36:1--36:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311749",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311749",
abstract = "Dynamic adaptive streaming over HTTP (DASH) is widely
used for video streaming on mobile devices. Ensuring a
good quality of experience (QoE) for mobile video
streaming is essential, as it severely impacts both the
network and content providers' revenue. Thus, a good
rate adaptation algorithm at the client end that
provides high QoE is critically important. Recently, a
segment size-aware rate adaptation (SARA) algorithm was
proposed for DASH clients. However, its performance on
mobile clients has not been investigated so far. The
main contributions of this article are twofold: (1) We
discuss SARA's implementation for mobile clients to
improve the QoE in mobile video streaming, one that
accurately predicts the download time for the next
segment and makes an informed bitrate selection, and
(2) we developed a new parametric QoE model to compute
a cumulative score that helps in fair comparison of
different adaptation algorithms. Based on our
subjective and objective evaluation, we observed that
SARA for mobile clients outperforms others by 17\% on
average, in terms of the Mean Opinion Score, while
achieving, on average, a 76\% improvement in terms of
the interruption ratio. The score obtained from our new
parametric QoE model also demonstrates that the SARA
algorithm for mobile clients gives a better QoE among
all the algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Atrey:2019:WMD,
author = "Pradeep K. Atrey and Bakul Trehan and Mukesh K.
Saini",
title = "Watch Me from Distance {(WMD)}: a Privacy-Preserving
Long-Distance Video Surveillance System",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "37:1--37:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3312574",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3312574",
abstract = "Preserving the privacy of people in video surveillance
systems is quite challenging, and a significant amount
of research has been done to solve this problem in
recent times. The majority of existing techniques are
based on detecting bodily cues, such as the face and/or
silhouette, and obscuring them so that people in the
videos cannot be identified. We observe that merely
hiding bodily cues is not enough to protect the
identities of the individuals in the videos. An
adversary, who has prior
contextual knowledge about the surveilled area, can
identify people in the video by exploiting the implicit
inference channels such as behavior, place, and time.
This article presents an anonymous surveillance system,
called Watch Me from Distance (WMD), which advocates
for outsourcing of surveillance video monitoring
(similar to call centers) to the long-distance sites
where professional security operators watch the video
and alert the local site when any suspicious or
abnormal event takes place. We find that long-distance
monitoring helps in decoupling the contextual knowledge
of security operators. Since security operators at the
remote site could turn into adversaries, a trust
computation model to determine the credibility of the
operators is presented as an integral part of the
proposed system. The feasibility study and experiments
suggest that the proposed system provides more robust
measures of privacy yet maintains surveillance
effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hsu:2019:LMC,
author = "Chih-Fan Hsu and Yu-Shuen Wang and Chin-Laung Lei and
Kuan-Ta Chen",
title = "Look at Me! {Correcting} Eye Gaze in Live Video
Communication",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "38:1--38:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311784",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311784",
abstract = "Although live video communication is widely used, it
is generally less engaging than face-to-face
communication because of limitations on social,
emotional, and haptic feedback. Missing eye contact is
one such problem caused by the physical deviation
between the screen and camera on a device. Manipulating
video frames to correct eye gaze is a solution to this
problem. In this article, we introduce a system to
rotate the eyeball of a local participant before the
video frame is sent to the remote side. It adopts a
warping-based convolutional neural network to relocate
pixels in eye regions. To improve visual quality, we
minimize the L2 distance between the ground truths and
warped eyes. We also present several newly designed
loss functions to help network training. These new loss
functions are designed to preserve the shape of eye
structures and minimize color changes around the
periphery of eye regions. To evaluate the presented
network and loss functions, we objectively and
subjectively compared results generated by our system
and the state-of-the-art, DeepWarp, in relation to two
datasets. The experimental results demonstrated the
effectiveness of our system. In addition, we showed
that our system can perform eye-gaze correction in real
time on a consumer-level laptop. Because of the quality
and efficiency of the system, gaze correction by
postprocessing through this system is a feasible
solution to the problem of missing eye contact in video
communication.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ahmad:2019:HDF,
author = "Kashif Ahmad and Nicola Conci",
title = "How Deep Features Have Improved Event Recognition in
Multimedia: a Survey",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "39:1--39:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3306240",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3306240",
abstract = "Event recognition is one of the areas in multimedia
that is attracting great attention of researchers.
Being applicable in a wide range of applications, from
personal to collective events, a number of interesting
solutions for event recognition using multimedia
information sources have been proposed. On the other
hand, following its immense success in classification,
object recognition, and detection, deep learning has
been shown to perform well in event recognition tasks
as well. Thus, a large portion of the
literature on event analysis relies nowadays on deep
learning architectures. In this article, we provide an
extensive overview of the existing literature in this
field, analyzing how deep features and deep learning
architectures have changed the performance of event
recognition frameworks. The literature on event-based
analysis of multimedia contents can be categorized into
four groups, namely (i) event recognition in single
images; (ii) event recognition in personal photo
collections; (iii) event recognition in videos; and
(iv) event recognition in audio recordings. In this
article, we extensively review different
deep-learning-based frameworks for event recognition in
these four domains. Furthermore, we also review some
benchmark datasets made available to the scientific
community to validate novel event recognition
pipelines. In the final part of the manuscript, we also
provide a detailed discussion on basic insights
gathered from the literature review, and identify
future trends and challenges.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2019:ACV,
author = "Yadang Chen and Chuanyan Hao and Alex X. Liu and Enhua
Wu",
title = "Appearance-consistent Video Object Segmentation Based
on a Multinomial Event Model",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "40:1--40:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321507",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321507",
abstract = "In this study, we propose an effective and efficient
algorithm for unconstrained video object segmentation,
which is achieved in a Markov random field (MRF). In
the MRF graph, each node is modeled as a superpixel and
labeled as either foreground or background during the
segmentation process. The unary potential is computed
for each node by learning a transductive SVM classifier
under supervision by a few labeled frames. The pairwise
potential is used for the spatial-temporal smoothness.
In addition, a high-order potential based on the
multinomial event model is employed to enhance the
appearance consistency throughout the frames. To
minimize this intractable feature, we also introduce a
more efficient technique that simply extends the
original MRF structure. The proposed approach was
evaluated in experiments with different measures and
the results based on a benchmark demonstrated its
effectiveness compared with other state-of-the-art
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Roberto:2019:DLS,
author = "Pierdicca Roberto and Frontoni Emanuele and Zingaretti
Primo and Mancini Adriano and Loncarski Jelena and
Paolanti Marina",
title = "Design, Large-Scale Usage Testing, and Important
Metrics for Augmented Reality Gaming Applications",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "41:1--41:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311748",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311748",
abstract = "Augmented Reality (AR) offers the possibility to
enrich the real world with digitally mediated content,
thereby increasing the quality of many everyday
experiences. While in some research areas such as
cultural heritage, tourism, or medicine there is a
strong technological investment, AR for game purposes
struggles to become a widespread commercial
application. In this article, a novel framework for AR
kid games is proposed, already developed by the authors
for other AR applications such as Cultural Heritage and
Arts. In particular, the framework includes different
layers such as the development of a series of AR kid
puzzle games in an intermediate structure which can be
used as a standard for different applications
development, the development of a smart configuration
tool, together with general guidelines and long-life
usage tests and metrics. The proposed application is
designed for augmenting the puzzle experience, but can
be easily extended to other AR gaming applications.
Once the user has assembled the real puzzle, AR
functionality within the mobile application can be
unlocked, bringing to life puzzle characters, creating
a seamless game that merges AR interactions with the
puzzle reality. The main goals and benefits of this
framework can be seen in the development of a novel set
of AR tests and metrics in the pre-release phase (in
order to help the commercial launch and developers),
and in the release phase by introducing the measures
for long-life app optimization, usage tests, and hints
on final users, together with a measure to design policy,
providing a method for automatic testing of quality and
popularity improvements. Moreover, smart configuration
tools that enable multi-app and eventually also
multi-user development have been proposed as part of
the general framework, facilitating the serialization
of the applications. Results were obtained from a
large-scale user test with about 4 million users on a
set of eight gaming applications, providing the
scientific community a workflow for implicit
quantitative analysis in AR gaming. Different data
analytics developed on the data collected by the
framework prove that the proposed approach is
affordable and reliable for long-life testing and
optimization.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Siarohin:2019:IIM,
author = "Aliaksandr Siarohin and Gloria Zen and Cveta
Majtanovic and Xavier Alameda-Pineda and Elisa Ricci
and Nicu Sebe",
title = "Increasing Image Memorability with Neural Style
Transfer",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "42:1--42:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3311781",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3311781",
abstract = "Recent works in computer vision and multimedia have
shown that image memorability can be automatically
inferred exploiting powerful deep-learning models. This
article advances the state of the art in this area by
addressing a novel and more challenging issue: ``Given
an arbitrary input image, can we make it more
memorable?'' To tackle this problem, we introduce an
approach based on an editing-by-applying-filters
paradigm: given an input image, we propose to
automatically retrieve a set of ``style seeds,'' i.e.,
a set of style images that, applied to the input image
through a neural style transfer algorithm, provide the
highest increase in memorability. We show the
effectiveness of the proposed approach with experiments
on the publicly available LaMem dataset, performing
both a quantitative evaluation and a user study. To
demonstrate the flexibility of the proposed framework,
we also analyze the impact of different implementation
choices, such as using different state-of-the-art
neural style transfer methods. Finally, we show several
qualitative results to provide additional insights on
the link between image style and memorability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Do:2019:SDC,
author = "Thanh-Toan Do and Tuan Hoang and Dang-Khoa Le Tan and
Huu Le and Tam V. Nguyen and Ngai-Man Cheung",
title = "From Selective Deep Convolutional Features to Compact
Binary Representations for Image Retrieval",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "43:1--43:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3314051",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3314051",
abstract = "In the large-scale image retrieval task, the two most
important requirements are the discriminability of
image representations and the efficiency in computation
and storage of representations. Regarding the former
requirement, Convolutional Neural Network is proven to
be a very powerful tool to extract highly
discriminative local descriptors for effective image
search. Additionally, to further improve the
discriminative power of the descriptors, recent works
adopt fine-tuned strategies. In this article, taking a
different approach, we propose a novel, computationally
efficient, and competitive framework. Specifically, we
first propose various strategies to compute masks,
namely, SIFT-masks, SUM-mask, and MAX-mask, to select a
representative subset of local convolutional features
and eliminate redundant features. Our in-depth analyses
demonstrate that the proposed masking schemes are
effective in addressing the burstiness drawback and
improving retrieval accuracy. Second, we propose to employ recent
embedding and aggregating methods that can
significantly boost the feature discriminability.
Regarding the computation and storage efficiency, we
include a hashing module to produce very compact binary
image representations. Extensive experiments on six
image retrieval benchmarks demonstrate that our
proposed framework achieves state-of-the-art
retrieval performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2019:LCS,
author = "Liquan Shen and Ping An and Guorui Feng",
title = "Low-Complexity Scalable Extension of the
High-Efficiency Video Coding {(SHVC)} Encoding System",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "44:1--44:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3313185",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3313185",
abstract = "The scalable extension of the high-efficiency video
coding (SHVC) system adopts a hierarchical
quadtree-based coding unit (CU) that is suitable for
various texture and motion properties of videos.
Currently, the test model of SHVC identifies the
optimal CU size by performing an exhaustive quadtree
depth-level search, which achieves a high compression
efficiency at a heavy cost in terms of the
computational complexity. However, many interactive
multimedia applications, such as remote monitoring and
video surveillance, which are sensitive to time delays,
have insufficient computational power for coding
high-definition (HD) and ultra-high-definition (UHD)
videos. Therefore, it is important, yet challenging, to
optimize the SHVC coding procedure and accelerate video
coding. In this article, we propose a fast CU quadtree
depth-level decision algorithm for inter-frames on
enhancement layers that is based on an analysis of
inter-layer, spatial, and temporal correlations. When
motion/texture properties of coding regions can be
identified early, a fast algorithm can be designed for
adapting CU depth-level decision procedures to video
contents and avoiding unnecessary computations during
CU depth-level traversal. The proposed algorithm
determines the motion activity level at the treeblock
size of the hierarchical quadtree by utilizing motion
vectors from its corresponding blocks at the base
layer. Based on the motion activity level, neighboring
encoded CUs that have larger correlations are
preferentially selected to predict the optimal depth
level of the current treeblock. Finally, two
parameters, namely, the motion activity level and the
predicted CU depth level, are used to identify a subset
of candidate CU depth levels and adaptively optimize CU
depth-level decision processes. The experimental
results demonstrate that the proposed scheme can run
approximately three times faster than the most recent
SHVC reference software, with a negligible loss of
compression efficiency. The proposed scheme is
efficient for all types of scalable video sequences
under various coding conditions and outperforms
state-of-the-art fast SHVC and HEVC algorithms. Our
scheme is a suitable candidate for interactive HD/UHD
video applications that are expected to operate in
real-time and power-constrained scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hu:2019:CAA,
author = "Jun Hu and Shengsheng Qian and Quan Fang and Xueliang
Liu and Changsheng Xu",
title = "{A$^2$ CMHNE}: Attention-Aware Collaborative
Multimodal Heterogeneous Network Embedding",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "45:1--45:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321506",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321506",
abstract = "Network representation learning is playing an
important role in network analysis due to its
effectiveness in a variety of applications. However,
most existing network embedding models focus on
homogeneous networks and neglect the diverse properties
such as different types of network structures and
associated multimedia content information. In this
article, we learn node representations for multimodal
heterogeneous networks, which contain multiple types of
nodes and/or links as well as multimodal content such
as texts and images. We propose a novel attention-aware
collaborative multimodal heterogeneous network
embedding method (A$^2$ CMHNE), where an
attention-based collaborative representation learning
approach is proposed to promote the collaboration of
structure-based embedding and content-based embedding,
and generate the robust node representation by
introducing an attention mechanism that enables
informative embedding integration. In experiments, we
compare our model with existing network embedding
models on two real-world datasets. Our method improves
node classification performance by 5\% and 9\%
compared with five state-of-the-art embedding methods
on one benchmark (the M10 dataset) and on a multi-modal
heterogeneous network dataset (the WeChat dataset),
respectively. Experimental results
demonstrate the effectiveness of our proposed method on
both node classification and link prediction tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hosny:2019:RCI,
author = "Khalid M. Hosny and Mohamed M. Darwish",
title = "Resilient Color Image Watermarking Using Accurate
Quaternion Radial Substituted {Chebyshev} Moments",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "46:1--46:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3325193",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3325193",
abstract = "In this work, a new quaternion-based method for color
image watermarking is proposed. In this method, a novel
set of quaternion radial substituted Chebyshev moments
(QRSCMs) is presented for robust geometrically
invariant image watermarking. An efficient
computational method is proposed for highly accurate,
fast, and numerically stable QRSCMs in polar
coordinates. The proposed watermarking method consists
of three stages. In the first stage, the Arnold
transform is used to improve the security of the
watermarking scheme by scrambling the binary watermark.
In the second stage, the proposed accurate and stable
QRSCMs of the host color image are computed. In the
third stage, the encrypted binary watermark is embedded
into the host image by employing the quantization
technique on selected-magnitude QRSCMs where the
watermarked color image is obtained by adding the
original host color image to the compensation image.
Then, the binary watermark can be extracted directly
without using the original image from the magnitudes of
QRSCMs. Numerical experiments are performed where the
performance of proposed method is compared with the
existing quaternion moment-based watermarking methods.
The comparison clearly shows that the proposed method
is very efficient in terms of the visual
imperceptibility capability and the robustness under
different attacks compared to the existing quaternion
moment-based watermarking algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Mou:2019:AVG,
author = "Wenxuan Mou and Hatice Gunes and Ioannis Patras",
title = "Alone versus In-a-group: a Multi-modal Framework for
Automatic Affect Recognition",
journal = j-TOMM,
volume = "15",
number = "2",
pages = "47:1--47:??",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321509",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:46 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321509",
abstract = "Recognition and analysis of human affect has been
researched extensively within the field of computer
science in the past two decades. However, most of the
past research in automatic analysis of human affect has
focused on the recognition of affect displayed by
people in individual settings and little attention has
been paid to the analysis of the affect expressed in
group settings. In this article, we first analyze the
affect expressed by each individual in terms of arousal
and valence dimensions in both individual and group
videos and then propose methods to recognize the
contextual information, i.e., whether a person is alone
or in-a-group by analyzing their face and body
behavioral cues. For affect analysis, we first devise
affect recognition models separately in individual and
group videos and then introduce a cross-condition
affect recognition model that is trained by combining
the two different types of data. We conduct a set of
experiments on two datasets that contain both
individual and group videos. Our experiments show that
(1) the proposed Volume Quantized Local Zernike Moments
Fisher Vector outperforms other unimodal features in
affect analysis; (2) the temporal learning model,
Long-Short Term Memory Networks, works better than the
static learning model, Support Vector Machine; (3)
decision fusion helps to improve affect recognition,
indicating that body behaviors carry emotional
information that is complementary rather than redundant
to the emotion content in facial behaviors; and (4) it
is possible to predict the context, i.e., whether a
person is alone or in-a-group, using their non-verbal
behavioral cues.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Hong:2019:ASS,
author = "Richang Hong",
title = "Advanced Stereo Seam Carving by Considering Occlusions
on Both Sides",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "69:1--69:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321513",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321513",
abstract = "Stereo image retargeting plays a significant role in
the field of image processing; it aims at keeping
major objects as prominent as possible when the
resolution of an image is changed, while maintaining
disparity and depth information at the same time. Some
seam carving methods have been proposed to
preserve the geometric consistency of the images.
However, the regions of occlusion on both sides are not
considered properly. In this article, we propose a
solution to this problem. A new strategy of seam
finding is designed by considering occluded and
occluding regions on both of the input images, and
leaving geometric consistency in both images intact. We
also introduce the methods of line segment detection
and superpixel segmentation to further improve the
quality of the images. Imaging effects are optimized in
the process and visual comfort, which is also
influenced by other factors, can be boosted as well.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2019:SET,
author = "Yun Zhang and Na Li and Sam Kwong and Gangyi Jiang and
Huanqiang Zeng",
title = "Statistical Early Termination and Early Skip Models
for Fast Mode Decision in {HEVC INTRA} Coding",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "70:1--70:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321510",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321510",
abstract = "In this article, statistical Early Termination (ET)
and Early Skip (ES) models are proposed for fast Coding
Unit (CU) and prediction mode decision in HEVC INTRA
coding, in which three categories of ET and ES
sub-algorithms are included. First, the CU ranges of
the current CU are recursively predicted based on the
texture and CU depth of the spatial neighboring CUs.
Second, the statistical model based ET and ES schemes
are proposed and applied to optimize the CU and INTRA
prediction mode decision, in which the coding
complexities over different decision layers are jointly
minimized subject to acceptable rate-distortion
degradation. Third, the mode correlations among the
INTRA prediction modes are exploited to early terminate
the full rate-distortion optimization in each CU
decision layer. Extensive experiments are performed to
evaluate the coding performance of each sub-algorithm
and the overall algorithm. Experimental results reveal
that the overall proposed algorithm can achieve a
complexity reduction of 45.47\% to 74.77\% (58.09\% on
average), while the overall Bj{\o}ntegaard delta bit
rate increase and Bj{\o}ntegaard delta peak
signal-to-noise ratio degradation are 2.29\% and -0.11
dB, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Gupta:2019:SGM,
author = "Abhinav Gupta and Divya Singhal",
title = "A Simplistic Global Median Filtering Forensics Based
on Frequency Domain Analysis of Image Residuals",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "71:1--71:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321508",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321508",
abstract = "Sophisticated image forgeries have made digital image
forensics an active area of research. In this area,
many researchers have addressed the problem of median
filtering forensics. Existing median filtering
detectors are adequate to classify median filtered
images in uncompressed mode and in compressed mode at
high-quality factors. Despite that, the field is
lacking a robust method to detect median filtering in
low-resolution images compressed with low-quality
factors. In this article, a novel feature set (four
feature dimensions), based on first-order statistics of
frequency contents of median filtered residuals (MFRs)
of original and median filtered images, has been
proposed. The proposed feature set outperforms
handcrafted features-based state-of-the-art detectors
in terms of feature set dimensions and detection
results obtained for low-resolution images at all
quality factors. Also, the results reveal the efficacy
of the proposed method over a deep-learning-based
median filtering detector. Comprehensive results expose the
efficacy of the proposed detector to detect median
filtering against other similar manipulations.
Additionally, generalization ability tests on
cross-database images support the cross-validation
results on four different databases. Thus, our proposed
detector meets the current challenges in the field, to
a great extent.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2019:HVO,
author = "Kan Wu and Guanbin Li and Haofeng Li and Jianjun Zhang
and Yizhou Yu",
title = "Harvesting Visual Objects from {Internet} Images via
Deep-Learning-Based Objectness Assessment",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "72:1--72:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3318463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3318463",
abstract = "The collection of internet images has been growing at
an astonishing speed. These images undoubtedly contain
rich visual information that can be useful in
many applications, such as visual media creation and
data-driven image synthesis. In this article, we focus
on the methodologies for building a visual object
database from a collection of internet images. Such
a database is built to contain a large number of
high-quality visual objects that can help with various
data-driven image applications. Our method is based on
dense proposal generation and objectness-based
re-ranking. A novel deep convolutional neural network
is designed for the inference of proposal objectness,
the probability of a proposal containing optimally
located foreground object. In our work, the objectness
is quantitatively measured in regard of completeness
and fullness, reflecting two complementary features of
an optimal proposal: a complete foreground and
relatively small background. Our experiments indicate
that object proposals re-ranked according to the output
of our network generally achieve higher performance
than those produced by other state-of-the-art methods.
As a concrete example, a database of over 1.2 million
visual objects has been built using the proposed
method, and has been successfully used in various
data-driven image applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yuan:2019:SSP,
author = "Yuan Yuan and Jie Fang and Xiaoqiang Lu and Yachuang
Feng",
title = "Spatial Structure Preserving Feature Pyramid Network
for Semantic Image Segmentation",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "73:1--73:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321512",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321512",
abstract = "Recently, progress on semantic image segmentation is
substantial, benefiting from the rapid development of
Convolutional Neural Networks. Semantic image
segmentation approaches proposed lately have been
mostly based on Fully convolutional Networks (FCNs).
However, these FCN-based methods use large receptive
fields and too many pooling layers to depict the
discriminative semantic information of the images.
Specifically, on the one hand, convolutional kernels
with large receptive fields smooth the detailed edges,
since too much contextual information is used to depict
the ``center pixel.'' On the other hand, the pooling
layer increases the receptive field by zooming out the
latest feature maps, which loses much detailed information of
the image, especially in the deeper layers of the
network. These operations often cause low spatial
resolution inside deep layers, which leads to spatially
fragmented prediction. To address this problem, we
exploit the inherent multi-scale and pyramidal
hierarchy of deep convolutional networks to extract the
feature maps with different resolutions and take full
advantage of these feature maps via a gradually
stacked fusion scheme. Specifically, for two adjacent
convolutional layers, we upsample the features from the
deeper layer with a stride of 2 and then stack them on
the features from the shallower layer. Then, a
convolutional layer with $1 \times 1$ kernels is
applied to fuse these stacked features. The fused
feature preserves the spatial structure information of
the image; meanwhile, it owns strong discriminative
capability for pixel classification. Additionally, to
further preserve the spatial structure information and
regional connectivity of the predicted category label
map, we propose a novel loss term for the network. In
detail, two graph-model-based spatial affinity matrices
are proposed, which are used to depict the pixel-level
relationships in the input image and predicted category
label map respectively, and then their cosine distance
is backward propagated to the network. The proposed
architecture, called spatial structure preserving
feature pyramid network, significantly improves the
spatial resolution of the predicted category label map
for semantic image segmentation. The proposed method
achieves state-of-the-art results on three public and
challenging datasets for semantic image segmentation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhang:2019:MFA,
author = "Junxuan Zhang and Haifeng Hu and Xinlong Lu",
title = "Moving Foreground-Aware Visual Attention and Key
Volume Mining for Human Action Recognition",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "74:1--74:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321511",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321511",
abstract = "Recently, many deep learning approaches have shown
remarkable progress on human action recognition.
However, it remains unclear how to extract the useful
information in videos since only video-level labels are
available in the training phase. To address this
limitation, many efforts have been made to improve the
performance of action recognition by applying the
visual attention mechanism in the deep learning model.
In this article, we propose a novel deep model called
Moving Foreground Attention (MFA) that enhances the
performance of action recognition by guiding the model
to focus on the discriminative foreground targets. In
our work, MFA detects the moving foreground through a
proposed variance-based algorithm. Meanwhile, an
unsupervised proposal is utilized to mine the
action-related key volumes and generate corresponding
correlation scores. Based on these scores, a newly
proposed stochastic-out scheme is exploited to train
the MFA. Experiment results show that action
recognition performance can be significantly improved
by using our proposed techniques, and our model
achieves state-of-the-art performance on UCF101 and
HMDB51.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{More:2019:PLA,
author = "Amit More and Subhasis Chaudhuri",
title = "A Pseudo-likelihood Approach for Geo-localization of
Events from Crowd-sourced Sensor-Metadata",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "75:1--75:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3321701",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3321701",
abstract = "Events such as live concerts, protest marches, and
exhibitions are often video recorded by many people at
the same time, typically using smartphone devices. In
this work, we address the problem of geo-localizing
such events from crowd-generated data. Traditional
approaches for solving such a problem using multiple
video sequences of the event would require highly
complex computer vision (CV) methods, which are
computationally intensive and are not robust in
environments where visual data are collected through a
crowd-sourced medium. In the present work, we approach
the problem in a probabilistic framework using only the
sensor metadata obtained from smartphones. We model the
event location and camera locations and orientations
(camera parameters) as the hidden states in a Hidden
Markov Model. The sensor metadata from GPS and the
digital compass from user smartphones are used as the
observations associated with the hidden states of the
model. We have used a suitable potential function to
capture the complex interaction between the hidden
states (i.e., event location and camera parameters).
The non-Gaussian densities involved in the model, such
as the potential function involving hidden states, make
the maximum-likelihood estimation intractable. We
propose a pseudo-likelihood-based approach to maximize
the approximate-likelihood, which provides a tractable
solution to the problem. The experimental results on
the simulated as well as real data show correct event
geo-localization using the proposed method. When
compared with several baselines, the proposed method
shows superior performance. The overall computation
time required is much smaller, since only the sensor
metadata are used instead of visual data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shah:2019:PCB,
author = "Mohsin Shah and Weiming Zhang and Honggang Hu and
Nenghai Yu",
title = "{Paillier} Cryptosystem based Mean Value Computation
for Encrypted Domain Image Processing Operations",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "76:1--76:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3325194",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3325194",
abstract = "Due to its large storage facility and high-end
computing capability, cloud computing has received
great attention as a huge amount of personal multimedia
data and computationally expensive tasks can be
outsourced to the cloud. However, the cloud, being a
semi-trusted third party, is prone to information
leakage, raising privacy risks. Signal processing in
the encrypted domain has emerged as a new research
paradigm on privacy-preserving processing over
outsourced data by semi-trusted cloud. In this article,
we propose a solution for non-integer mean value
computation in the homomorphic encrypted domain without
any interactive protocol between the client and the
service provider. Using the proposed solution, various
image processing operations, such as local smoothing
filter, un-sharp masking, and histogram equalization,
can be performed in the encrypted domain at the cloud
server without any privacy concerns. Our experimental
results from standard test images reveal that these
image processing operations can be performed without
pre-processing, without client-server interactive
protocol, and without any error between the encrypted
domain and the plain domain.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yue:2019:SRS,
author = "Guanghui Yue and Chunping Hou and Tianwei Zhou",
title = "Subtitle Region Selection of {S$3$D} Images in
Consideration of Visual Discomfort and Viewing Habit",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "77:1--77:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3325197",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3325197",
abstract = "Subtitles, serving as a linguistic approximation of
the visual content, are an essential element in
stereoscopic advertising and the film industry. Due
to the vergence-accommodation conflict, the
stereoscopic 3D (S3D) subtitle inevitably causes visual
discomfort. To ensure a good viewing experience, the
subtitle region should be carefully arranged.
Unfortunately, very few works have been dedicated to
this area. In this article, we propose a method for S3D
subtitle region selection in consideration of visual
discomfort and viewing habit. First, we divide the
disparity map into multiple depth layers according to
the disparity value. The preferential processed depth
layer is determined by considering the disparity value
of the foremost object. Second, the optimal region and
coarse disparity value for S3D subtitle insertion are
chosen by convolving the selective depth layer with the
mean filter. Specifically, the viewing habit is
considered during the region selection. Finally, after
region selection, the disparity value of the subtitle
is further modified by using the just noticeable depth
difference (JNDD) model. Given that there is no public
database reported for the evaluation of S3D subtitle
insertion, we collect 120 S3D images as the test
platform. Both objective and subjective experiments are
conducted to evaluate the comfort degree of the
inserted subtitle. Experimental results demonstrate
that the proposed method can obtain promising
performance in improving the viewing experience of the
inserted subtitle.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2019:LCB,
author = "Yehao Li and Yingwei Pan and Ting Yao and Hongyang
Chao and Yong Rui and Tao Mei",
title = "Learning Click-Based Deep Structure-Preserving
Embeddings with Visual Attention",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "78:1--78:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328994",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3328994",
abstract = "One fundamental problem in image search is to learn
the ranking functions (i.e., the similarity between
query and image). Recent progress on this topic has
evolved through two paradigms: the text-based model and
image ranker learning. The former relies on image
surrounding texts, making the similarity sensitive to
the quality of textual descriptions. The latter may
suffer from the robustness problem when human-labeled
query-image pairs cannot represent user search intent
precisely. We demonstrate in this article that the
preceding two limitations can be well mitigated by
learning a cross-view embedding that leverages click
data. Specifically, a novel click-based Deep
Structure-Preserving Embeddings with visual Attention
(DSPEA) model is presented, which consists of two
components: deep convolutional neural networks followed
by image embedding layers for learning visual
embedding, and a deep neural network for generating
query semantic embedding. Meanwhile, visual attention
is incorporated at the top of the convolutional neural
network to reflect the relevant regions of the image to
the query. Furthermore, considering the high dimension
of the query space, a new click-based representation on
a query set is proposed for alleviating this sparsity
problem. The whole network is end-to-end trained by
optimizing a large margin objective that combines
cross-view ranking constraints with in-view
neighborhood structure preservation constraints. On a
large-scale click-based image dataset with 11.7 million
queries and 1 million images, our model is shown to be
powerful for keyword-based image search with superior
performance over several state-of-the-art methods and
achieves, to date, the best reported NDCG@25 of
52.21\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Cao:2019:SOG,
author = "Tengfei Cao and Changqiao Xu and Mu Wang and Zhongbai
Jiang and Xingyan Chen and Lujie Zhong and Luigi
Alfredo Grieco",
title = "Stochastic Optimization for Green Multimedia Services
in Dense {$5$G} Networks",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "79:1--79:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328996",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3328996",
abstract = "The manyfold capacity magnification promised by dense
5G networks will make possible the provisioning of
broadband multimedia services, including virtual
reality, augmented reality, and mobile immersive video,
to name a few. These new applications will coexist with
classic ones and contribute to the exponential growth
of multimedia services in mobile networks. At the same
time, the different requirements of new and old
services pose new challenges to the effective usage of
5G resources. In response to these challenges, a novel
Stochastic Optimization framework for Green Multimedia
Services named SOGMS is proposed herein that targets
the maximization of system throughput and the
minimization of energy consumption in data delivery. In
particular, Lyapunov optimization is leveraged to face
this optimization objective, which is formulated and
decomposed into three tractable subproblems. For each
subproblem, a distinct algorithm is conceived, namely
quality of experience--based admission control,
cooperative resource allocation, and multimedia
services scheduling. Finally, extensive simulations are
carried out to evaluate the proposed method against
state-of-the-art solutions in dense 5G networks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wu:2019:PAT,
author = "Jie Wu and Haifeng Hu and Liang Yang",
title = "Pseudo-{$3$D} Attention Transfer Network with
Content-aware Strategy for Image Captioning",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "80:1--80:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3336495",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3336495",
abstract = "In this article, we propose a novel Pseudo-3D
Attention Transfer network with Content-aware Strategy
(P3DAT-CAS) for the image captioning task. Our model is
composed of three parts: the Pseudo-3D Attention (P3DA)
network, the P3DA-based Transfer (P3DAT) network, and
the Content-aware Strategy (CAS). First, we propose
P3DA to take full advantage of three-dimensional (3D)
information in convolutional feature maps and capture
more details. Most existing attention-based models only
extract the 2D spatial representation from
convolutional feature maps to decide which area should
be paid more attention to. However, convolutional
feature maps are 3D and different channel features can
detect diverse semantic attributes associated with
images. P3DA is proposed to combine 2D spatial maps
with 1D semantic-channel attributes and generate more
informative captions. Second, we design the transfer
network to maintain and transfer the key previous
attention information. The traditional attention-based
approaches only utilize the current attention
information to predict words directly, whereas the
transfer network is able to learn long-term attention
dependencies and explore global modeling patterns.
Finally, we present CAS to provide a more relevant and
distinct caption for each image. The captioning model
trained by maximum likelihood estimation may generate
the captions that have a weak correlation with image
contents, resulting in the cross-modal gap between
vision and linguistics. However, CAS is helpful to
convey the meaningful visual contents accurately.
P3DAT-CAS is evaluated on Flickr30k and MSCOCO, and it
achieves very competitive performance among the
state-of-the-art models.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2019:DSS,
author = "Min Wang and Wengang Zhou and Qi Tian and Houqiang
Li",
title = "Deep Scalable Supervised Quantization by
Self-Organizing Map",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "81:1--81:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328995",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3328995",
abstract = "Approximate Nearest Neighbor (ANN) search is an
important research topic in multimedia and computer
vision fields. In this article, we propose a new deep
supervised quantization method by Self-Organizing Map
to address this problem. Our method integrates the
Convolutional Neural Networks and Self-Organizing Map
into a unified deep architecture. The overall training
objective optimizes supervised quantization loss as
well as classification loss. With the supervised
quantization objective, we minimize the differences on
the maps between similar image pairs and maximize the
differences on the maps between dissimilar image pairs.
By optimization, the deep architecture can
simultaneously extract deep features and quantize the
features into suitable nodes in self-organizing map. To
make the proposed deep supervised quantization method
scalable for large datasets, instead of constructing a
larger self-organizing map, we propose to divide the
input space into several subspaces and construct
self-organizing map in each subspace. The
self-organizing maps in all the subspaces implicitly
construct a large self-organizing map, which costs less
memory and training time than directly constructing a
self-organizing map with equal size. The experiments on
several public standard datasets prove the superiority
of our approaches over the existing ANN search methods.
Besides, as a by-product, our deep architecture can be
directly applied to visualization with little
modification, and promising performance is demonstrated
in the experiments.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ozcelik:2019:CDA,
author = "Ihsan Mert Ozcelik and Cem Ersoy",
title = "Chunk Duration-Aware {SDN}-Assisted {DASH}",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "82:1--82:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3337681",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3337681",
abstract = "Although Dynamic Adaptive Streaming over HTTP (DASH)
is the pillar of multimedia content delivery
mechanisms, its purely client-based adaptive video
bitrate mechanisms have quality-of-experience fairness
and stability problems in the presence of multiple
DASH clients and highly fluctuating background traffic
on the same shared bottleneck link. Varying chunk
duration among different titles of multiple video
providers exacerbates this problem. With the help of
the global network view provided by the
software-defined networking paradigm, we propose a
centralized joint optimization module-assisted adaptive
video bitrate mechanism that takes diversity of chunk
sizes among different content into account. Our system
collects possible video bitrate levels and chunk
duration from DASH clients and simply calculates the
optimal video bitrates per client based on the
available capacity and chunk duration of each client's
selected content while not invading users' privacy. By
continuously following the background traffic flows, it
asynchronously updates the target video bitrate levels
to avoid both buffer stall events and network
underutilization issues rather than bandwidth slicing,
which brings about scalability problems in practice. It
also guarantees fair startup delays for video sessions
with various chunk duration. Our experiments clearly
show that our proposed approach, by considering
diversity of chunk duration and background traffic
fluctuations, can provide a significantly better and
fairer quality of experience in terms of structural
similarity--based video quality and startup delay
compared to both purely client-based and
state-of-the-art software-defined networking--based
adaptive bitrate mechanisms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhuang:2019:RCI,
author = "Naifan Zhuang and Guo-Jun Qi and The Duc Kieu and Kien
A. Hua",
title = "Rethinking the Combined and Individual Orders of
Derivative of States for Differential Recurrent Neural
Networks: Deep Differential Recurrent Neural Networks",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "83:1--83:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3337928",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3337928",
abstract = "Due to its special gating scheme, Long Short-Term
Memory (LSTM) has shown greater potential to process
complex sequential information than the traditional
Recurrent Neural Network (RNN). The conventional LSTM,
however, fails to take into consideration the impact of
salient spatio-temporal dynamics present in the
sequential input data. This problem was first addressed
by the differential Recurrent Neural Network (dRNN),
which uses a differential gating scheme known as
Derivative of States (DoS). DoS uses higher orders of
internal state derivatives to analyze the change in
information gain originated from the salient motions
between the successive frames. The weighted combination
of several orders of DoS is then used to modulate the
gates in dRNN. While each individual order of DoS is
good at modeling a certain level of salient
spatio-temporal sequences, the sum of all the orders of
DoS could distort the detected motion patterns. To
address this problem, we propose to control the LSTM
gates via individual orders of DoS. To fully utilize
the different orders of DoS, we further propose to
stack multiple levels of LSTM cells in an increasing
order of state derivatives. The proposed model
progressively builds up the ability of the LSTM gates
to detect salient dynamical patterns in deeper stacked
layers modeling higher orders of DoS; thus, the
proposed LSTM model is termed deep differential
Recurrent Neural Network (d$^2$RNN). The effectiveness
of the proposed model is demonstrated on three publicly
available human activity datasets: NUS-HGA,
Violent-Flows, and UCF101. The proposed model
outperforms both LSTM and non-LSTM based
state-of-the-art algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Wang:2019:EBD,
author = "Zhangcheng Wang and Ya Li and Richang Hong and Xinmei
Tian",
title = "Eigenvector-Based Distance Metric Learning for Image
Classification and Retrieval",
journal = j-TOMM,
volume = "15",
number = "3",
pages = "84:1--84:??",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3340262",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Oct 2 10:12:47 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3340262",
abstract = "Distance metric learning has been widely studied in
multifarious research fields. The mainstream approaches
learn a Mahalanobis metric or learn a linear
transformation. Recent related works propose learning a
linear combination of base vectors to approximate the
metric. In this way, fewer variables need to be
determined, which is efficient when facing
high-dimensional data. Nevertheless, such works obtain
base vectors using additional data from related domains
or randomly generate base vectors. However, obtaining
base vectors from related domains requires extra time
and additional data, and random vectors introduce
randomness into the learning process, which requires
sufficient random vectors to ensure the stability of
the algorithm. Moreover, the random vectors cannot
capture the rich information of the training data,
leading to a degradation in performance. Considering
these drawbacks, we propose a novel distance metric
learning approach by introducing base vectors
explicitly learned from training data. Given a specific
task, we can make a sparse approximation of its
objective function using the top eigenvalues and
corresponding eigenvectors of a predefined integral
operator on the reproducing kernel Hilbert space.
Because the process of generating eigenvectors simply
refers to the training data of the considered task, our
proposed method does not require additional data and
can reflect the intrinsic information of the input
features. Furthermore, the explicitly learned
eigenvectors do not result in randomness, and we can
extend our method to any kernel space without changing
the objective function. We only need to learn the
coefficients of these eigenvectors, and the only
hyperparameter that we need to determine is the number
of eigenvectors that we utilize. Additionally, an
optimization algorithm is proposed to efficiently solve
this problem. Extensive experiments conducted on
several datasets demonstrate the effectiveness of our
proposed method.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Pala:2020:ISI,
author = "Pietro Pala and Liming Chen and Di Huang and Xiaoming
Liu and Stefanos Zafeiriou",
title = "Introduction to the Special Issue on Face Analysis
Applications",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--2",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3359624",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3359624",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2020:UTB,
author = "Zhen-Hua Feng and Josef Kittler and Bill Christmas and
Xiao-Jun Wu",
title = "A Unified Tensor-based Active Appearance Model",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--22",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3338841",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3338841",
abstract = "Appearance variations result in many difficulties in
face image analysis. To deal with this challenge, we
present a Unified Tensor-based Active Appearance Model
(UT-AAM) for jointly modelling the geometry and texture
information of 2D faces. For each \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shamai:2020:SFP,
author = "Gil Shamai and Ron Slossberg and Ron Kimmel",
title = "Synthesizing Facial Photometries and Corresponding
Geometries Using Generative Adversarial Networks",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--24",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3337067",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3337067",
abstract = "Artificial data synthesis is currently a well-studied
topic with useful applications in data science,
computer vision, graphics, and many other fields.
Generating realistic data is especially challenging,
since human perception is highly sensitive to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:UNC,
author = "Xueping Wang and Yunhong Wang and Weixin Li",
title = "{U-Net} Conditional {GANs} for Photo-Realistic and
Identity-Preserving Facial Expression Synthesis",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355397",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3355397",
abstract = "Facial expression synthesis (FES) is a challenging
task since the expression changes are highly non-linear
and depend on the facial appearance. Person identity
should also be well preserved in the synthesized face.
In this article, we present a novel U- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2020:EFA,
author = "Zhiwei Liu and Xiangyu Zhu and Ming Tang and Zhen Lei
and Jinqiao Wang",
title = "Efficient Face Alignment with Fast Normalization and
Contour Fitting Loss",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--16",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3338842",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3338842",
abstract = "Face alignment is a key component of numerous face
analysis tasks. In recent years, most existing methods
have focused on designing high-performance face
alignment systems and paid less attention to
efficiency. However, more face alignment systems are
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duan:2020:VAA,
author = "Huiyu Duan and Xiongkuo Min and Yi Fang and Lei Fan
and Xiaokang Yang and Guangtao Zhai",
title = "Visual Attention Analysis and Prediction on Human
Faces for Children with Autism Spectrum Disorder",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3337066",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3337066",
abstract = "The focus of this article is to analyze and predict
the visual attention of children with Autism Spectrum
Disorder (ASD) when looking at human faces. Social
difficulties are the hallmark features of ASD and will
lead to atypical visual attention toward \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duan:2020:FEM,
author = "Mingxing Duan and Kenli Li and Xiangke Liao and Keqin
Li and Qi Tian",
title = "Features-Enhanced Multi-Attribute Estimation with
Convolutional Tensor Correlation Fusion Network",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--23",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355542",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3355542",
abstract = "To achieve robust facial attribute estimation, a
hierarchical prediction system referred to as tensor
correlation fusion network (TCFN) is proposed for
attribute estimation. The system includes feature
extraction, correlation excavation among facial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2020:ISI,
author = "Sicheng Zhao and Dhiraj Joshi and Mohammad Soleymani
and Qiang Ji",
title = "Introduction to the Special Issue on Affective
Computing for Large-scale Heterogeneous Multimedia
Data",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--2",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365845",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3365845",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2020:ACL,
author = "Sicheng Zhao and Shangfei Wang and Mohammad Soleymani
and Dhiraj Joshi and Qiang Ji",
title = "Affective Computing for Large-scale Heterogeneous
Multimedia Data: a Survey",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--32",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3363560",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3363560",
abstract = "The wide popularity of digital photography and social
networks has generated a rapidly growing volume of
multimedia data (i.e., images, music, and videos),
resulting in a great demand for managing, retrieving,
and understanding these data. Affective \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hong:2020:CSF,
author = "Xiaopeng Hong and Wei Peng and Mehrtash Harandi and
Ziheng Zhou and Matti Pietik{\"a}inen and Guoying
Zhao",
title = "Characterizing Subtle Facial Movements via
{Riemannian} Manifold",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--24",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3342227",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3342227",
abstract = "Characterizing subtle facial movements from videos is
one of the most intensive topics in computer vision
research. It is, however, challenging, since (1) the
intensity of subtle facial muscle movement is usually
low, (2) the duration may be transient, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2020:PSB,
author = "Junjie Zhu and Yuxuan Wei and Yifan Feng and Xibin
Zhao and Yue Gao",
title = "Physiological Signals-based Emotion Recognition via
High-order Correlation Learning",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--18",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3332374",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3332374",
abstract = "Emotion recognition by physiological signals is an
effective way to discern the inner state of human
beings and therefore has been widely adopted in many
user-centered applications. The majority of current
state-of-the-art methods focus on exploring \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{She:2020:LDS,
author = "Dongyu She and Ming Sun and Jufeng Yang",
title = "Learning Discriminative Sentiment Representation from
Strongly- and Weakly Supervised {CNNs}",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--19",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3326335",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3326335",
abstract = "Visual sentiment analysis is attracting increasing
attention with the rapidly growing amount of images
uploaded to social networks. Learning rich visual
representations often requires training deep
convolutional neural networks (CNNs) on massive
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2020:HCR,
author = "Liang Li and Xinge Zhu and Yiming Hao and Shuhui Wang
and Xingyu Gao and Qingming Huang",
title = "A Hierarchical {CNN-RNN} Approach for Visual Emotion
Classification",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--17",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3359753",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3359753",
abstract = "Visual emotion classification predicts the emotional
reactions of people to given visual content.
Psychological studies show that human emotions are
affected by various visual stimuli from low level to
high level, including contrast, color, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2020:ASC,
author = "Liang Yang and Yuexue Wang and Junhua Gu and Xiaochun
Cao and Xiao Wang and Di Jin and Guiguang Ding and
Jungong Han and Weixiong Zhang",
title = "Autonomous Semantic Community Detection via Adaptively
Weighted Low-rank Approximation",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--22",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355393",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3355393",
abstract = "Identification of semantic community structures is
important for understanding the interactions and
sentiments of different groups of people and predicting
the social emotion. A robust community detection method
needs to autonomously determine the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hou:2020:SDE,
author = "Yuxin Hou and Hongxun Yao and Xiaoshuai Sun and Haoran
Li",
title = "{Soul Dancer}: Emotion-Based Human Action Generation",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--19",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3340463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3340463",
abstract = "Body language is one of the most common ways of
expressing human emotion. In this article, we make the
first attempt to generate an action video with a
specific emotion from a single person image. The goal
of the emotion-based action generation task \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2020:ACA,
author = "Shenghong Hu and Min Xu and Haimin Zhang and Chunxia
Xiao and Chao Gui",
title = "Affective Content-aware Adaptation Scheme on {QoE}
Optimization of Adaptive Streaming over {HTTP}",
journal = j-TOMM,
volume = "15",
number = "3s",
pages = "1--18",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3328997",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jan 23 07:04:18 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3328997",
abstract = "The article presents a novel affective content-aware
adaptation scheme (ACAA) to optimize Quality of
Experience (QoE) for dynamic adaptive video streaming
over HTTP (DASH). Most of the existing DASH adaptation
schemes conduct video bit-rate adaptation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nie:2020:HHG,
author = "Weizhi Nie and Weijie Wang and Anan Liu and Yuting Su
and Jie Nie",
title = "{HGAN}: Holistic Generative Adversarial Networks for
Two-dimensional Image-based Three-dimensional Object
Retrieval",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--24",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3344684",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3344684",
abstract = "In this article, we propose a novel method to address
the two-dimensional (2D) image-based 3D object
retrieval problem. First, we extract a set of virtual
views to represent each 3D object. Then, a
soft-attention model is utilized to find the weight of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Li:2020:IVR,
author = "Mading Li and Jiaying Liu and Xiaoyan Sun and Zhiwei
Xiong",
title = "Image\slash Video Restoration via Multiplanar
Autoregressive Model and Low-Rank Optimization",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--23",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3341728",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3341728",
abstract = "In this article, we introduce an image/video
restoration approach by utilizing the high-dimensional
similarity in images/videos. After grouping similar
patches from neighboring frames, we propose to build a
multiplanar autoregressive (AR) model to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zhong:2020:SDM,
author = "Sheng-Hua Zhong and Yuantian Wang and Tongwei Ren and
Mingjie Zheng and Yan Liu and Gangshan Wu",
title = "Steganographer Detection via Multi-Scale Embedding
Probability Estimation",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--23",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3352691",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3352691",
abstract = "Steganographer detection aims to identify the guilty
user who utilizes steganographic methods to hide secret
information in the spread of multimedia data,
especially image data, from a large number of innocent
users on social networks. A true embedding \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{AlvesdeAlmeida:2020:RPS,
author = "Marcos {Alves de Almeida} and Carolina {Coimbra
Vieira} and Pedro Olmo Stancioli {Vaz De Melo} and
Renato {Martins Assun{\c{c}}{\~a}o}",
title = "Random Playlists Smoothly Commuting Between Styles",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--20",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3361742",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3361742",
abstract = "Someone enjoys listening to playlists while commuting.
He wants a different playlist of n songs each day, but
always starting from Locked Out of Heaven, a Bruno Mars
song. The list should progress in smooth transitions
between successive and randomly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "104",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Ye:2020:SCM,
author = "Zhaoda Ye and Yuxin Peng",
title = "Sequential Cross-Modal Hashing Learning via
Multi-scale Correlation Mining",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--20",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3356338",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3356338",
abstract = "Cross-modal hashing aims to map heterogeneous
multimedia data into a common Hamming space through
hash function, and achieves fast and flexible
cross-modal retrieval. Most existing cross-modal
hashing methods learn hash function by mining the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "105",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2020:EIH,
author = "Shiguang Liu and Ziqing Huang",
title = "Efficient Image Hashing with Geometric Invariant
Vector Distance for Copy Detection",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--22",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3355394",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3355394",
abstract = "Image hashing is an efficient technique of multimedia
security for content protection. It maps an image into
a content-based compact code for denoting the image
itself. While most existing algorithms focus on
improving the classification between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "106",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2020:LAB,
author = "Zhandong Liu and Wengang Zhou and Houqiang Li",
title = "{AB-LSTM}: Attention-based Bidirectional {LSTM} Model
for Scene Text Detection",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--23",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3356728",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3356728",
abstract = "Detection of scene text in arbitrary shapes is a
challenging task in the field of computer vision. Most
existing scene text detection methods exploit the
rectangle/quadrangular bounding box to denote the
detected text, which fails to accurately fit text
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "107",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Bhowmik:2020:EDA,
author = "Deepayan Bhowmik and Charith Abhayaratne",
title = "Embedding Distortion Analysis in Wavelet-domain
Watermarking",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--24",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3357333",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357333",
abstract = "Imperceptibility and robustness are two complementary
fundamental requirements of any watermarking algorithm.
Low-strength watermarking yields high imperceptibility,
but exhibits poor robustness. High-strength
watermarking schemes achieve good \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "108",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shen:2020:VRS,
author = "Ling Shen and Richang Hong and Haoran Zhang and Xinmei
Tian and Meng Wang",
title = "Video Retrieval with Similarity-Preserving Deep
Temporal Hashing",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--16",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3356316",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3356316",
abstract = "Despite the fact that remarkable progress has been
made in recent years, Content-based Video Retrieval
(CBVR) is still an appealing research topic due to
increasing search demands in the Internet era of big
data. This article aims to explore an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "109",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{VanderHooft:2020:TBA,
author = "Jeroen {Van der Hooft} and Maria {Torres Vega} and
Stefano Petrangeli and Tim Wauters and Filip {De
Turck}",
title = "Tile-based Adaptive Streaming for Virtual Reality
Video",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--24",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3362101",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3362101",
abstract = "The increasing popularity of head-mounted devices and
360${}^\circ $ video cameras allows content providers
to provide virtual reality (VR) video streaming over
the Internet, using a two-dimensional representation of
the immersive content combined with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "110",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Filho:2020:DPV,
author = "Roberto Iraja {Tavares Da Costa Filho} and Marcelo
{Caggiani Luizelli} and Stefano Petrangeli and Maria
{Torres Vega} and Jeroen {Van der Hooft} and Tim
Wauters and Filip {De Turck} and Luciano {Paschoal
Gaspary}",
title = "Dissecting the Performance of {VR} Video Streaming
through the {VR-EXP} Experimentation Platform",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--23",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3360286",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3360286",
abstract = "To cope with the massive bandwidth demands of Virtual
Reality (VR) video streaming, both the scientific
community and the industry have been proposing
optimization techniques such as viewport-aware
streaming and tile-based adaptive bitrate heuristics.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "111",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Zheng:2020:ULH,
author = "Yunpeng Zheng and Xuelong Li and Xiaoqiang Lu",
title = "Unsupervised Learning of Human Action Categories in
Still Images with Deep Representations",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--20",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3362161",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3362161",
abstract = "In this article, we propose a novel method for
unsupervised learning of human action categories in
still images. In contrast to previous methods, the
proposed method explores distinctive information of
actions directly from unlabeled image databases,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "112",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Xing:2020:ICC,
author = "Meng Xing and Zhiyong Feng and Yong Su and Jianhai
Zhang",
title = "An Image Cues Coding Approach for {$3$D} Human Pose
Estimation",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--20",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3368066",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3368066",
abstract = "Although Deep Convolutional Neural Networks (DCNNs)
facilitate the evolution of 3D human pose estimation,
ambiguity remains the most challenging problem in such
tasks. Inspired by the Human Perception Mechanism
(HPM), we propose an image-to-pose coding \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "113",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Liu:2020:EEA,
author = "Jinhuan Liu and Xuemeng Song and Liqiang Nie and Tian
Gan and Jun Ma",
title = "An End-to-End Attention-Based Neural Model for
Complementary Clothing Matching",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--16",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3368071",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3368071",
abstract = "In modern society, people tend to prefer fashionable
and decent outfits that can meet more than basic
physiological needs. In fact, a proper outfit usually
relies on good matching among complementary fashion
items (e.g., the top, bottom, and shoes) that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "114",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Kua:2020:ACA,
author = "Jonathan Kua and Grenville Armitage and Philip Branch
and Jason But",
title = "Adaptive Chunklets and {AQM} for Higher-Performance
Content Streaming",
journal = j-TOMM,
volume = "15",
number = "4",
pages = "1--24",
month = jan,
year = "2020",
DOI = "https://doi.org/10.1145/3344381",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 11 08:35:19 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3344381",
abstract = "Commercial streaming services such as Netflix and
YouTube use proprietary HTTP-based adaptive streaming
(HAS) techniques to deliver content to consumers
worldwide. MPEG recently developed Dynamic Adaptive
Streaming over HTTP (DASH) as a unifying \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "115",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Chen:2020:LLF,
author = "Bin Chen and Lingyan Ruan and Miu-Ling Lam",
title = "{LFGAN}: {$4$D} Light Field Synthesis from a Single
{RGB} Image",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "2:1--2:20",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3366371",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3366371",
abstract = "We present a deep neural network called the light
field generative adversarial network (LFGAN) that
synthesizes a 4D light field from a single 2D RGB
image. We generate light fields using a single image
super-resolution (SISR) technique based on two
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ding:2020:AEU,
author = "Yuhang Ding and Hehe Fan and Mingliang Xu and Yi
Yang",
title = "Adaptive Exploration for Unsupervised Person
Re-identification",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "3:1--3:19",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3369393",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3369393",
abstract = "Due to domain bias, directly deploying a deep person
re-identification (re-ID) model trained on one dataset
often achieves considerably poor accuracy on another
dataset. In this article, we propose an Adaptive
Exploration (AE) method to address the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bentaleb:2020:DDQ,
author = "Abdelhak Bentaleb and Praveen Kumar Yadav and Wei
Tsang Ooi and Roger Zimmermann",
title = "{DQ-DASH}: a Queuing Theory Approach to Distributed
Adaptive Video Streaming",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "4:1--4:24",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3371040",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3371040",
abstract = "The significant popularity of HTTP adaptive video
streaming (HAS), such as Dynamic Adaptive Streaming
over HTTP (DASH), over the Internet has led to a stark
increase in user expectations in terms of video quality
and delivery robustness. This situation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2020:RHR,
author = "Xin Huang and Yuxin Peng and Zhang Wen",
title = "{RCE-HIL}: Recognizing Cross-media Entailment with
Heterogeneous Interactive Learning",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "5:1--5:21",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365003",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3365003",
abstract = "Entailment recognition is an important paradigm of
reasoning that judges if a hypothesis can be inferred
from given premises. However, previous efforts mainly
concentrate on text-based reasoning as recognizing
textual entailment (RTE), where the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2020:CRT,
author = "Miaopeng Li and Zimeng Zhou and Xinguo Liu",
title = "Cross Refinement Techniques for Markerless Human
Motion Capture",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "6:1--6:18",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3372207",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3372207",
abstract = "This article presents a global 3D human pose
estimation method for markerless motion capture. Given
two calibrated images of a person, it first obtains the
2D joint locations in the images using a pre-trained 2D
Pose CNN, then constructs the 3D pose \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Illahi:2020:CGF,
author = "Gazi Karam Illahi and Thomas {Van Gemert} and Matti
Siekkinen and Enrico Masala and Antti Oulasvirta and
Antti Yl{\"a}-J{\"a}{\"a}ski",
title = "Cloud Gaming with Foveated Video Encoding",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "7:1--7:24",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3369110",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3369110",
abstract = "Cloud gaming enables playing high-end games,
originally designed for PC or game console setups, on
low-end devices such as netbooks and smartphones, by
offloading graphics rendering to GPU-powered cloud
servers. However, transmitting the high-resolution
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nguyen:2020:ETS,
author = "Duc V. Nguyen and Huyen T. T. Tran and Truong Cong
Thang",
title = "An Evaluation of Tile Selection Methods for
Viewport-Adaptive Streaming of 360-Degree Video",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "8:1--8:24",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3373359",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3373359",
abstract = "360-degree video has become increasingly popular
nowadays. For effective transmission of
bandwidth-intensive 360-degree video over networks,
viewport-adaptive streaming has been introduced. In
this article, we evaluate, for the first time, ten
existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2020:LSS,
author = "Zhenguo Yang and Zehang Lin and Peipei Kang and
Jianming Lv and Qing Li and Wenyin Liu",
title = "Learning Shared Semantic Space with Correlation
Alignment for Cross-Modal Event Retrieval",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "9:1--9:22",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3374754",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3374754",
abstract = "In this article, we propose to learn shared semantic
space with correlation alignment (S$^3$CA) for
multimodal data representations, which aligns nonlinear
correlations of multimodal data distributions in deep
neural networks designed for heterogeneous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:JSH,
author = "Junfeng Zhang and Haifeng Hu and Guobin Shen",
title = "Joint Stacked Hourglass Network and Salient Region
Attention Refinement for Robust Face Alignment",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "10:1--10:18",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3374760",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3374760",
abstract = "Facial landmark detection aims to locate keypoints for
facial images, which typically suffer from variations
caused by arbitrary pose, diverse facial expressions,
and partial occlusion. In this article, we propose a
coarse-to-fine framework that joins a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tasaka:2020:CSM,
author = "Shuji Tasaka",
title = "Causal Structures of Multidimensional {QoE} in
Haptic-Audiovisual Communications: {Bayesian}
Modeling",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "11:1--11:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3375922",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3375922",
abstract = "This article proposes a methodology for building and
verifying plausible models that can express causation
in multidimensional QoE for haptic-audiovisual
interactive communications. For the modeling, we
utilize subjective experimental data of five-point
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Punn:2020:IUN,
author = "Narinder Singh Punn and Sonali Agarwal",
title = "Inception {U-Net} Architecture for Semantic
Segmentation to Identify Nuclei in Microscopy Cell
Images",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "12:1--12:15",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3376922",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3376922",
abstract = "With the increasing applications of deep learning in
biomedical image analysis, in this article we introduce
an inception U-Net architecture for automating nuclei
detection in microscopy cell images of varying size and
modality to help unlock faster \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chaudhary:2020:IRC,
author = "Chandramani Chaudhary and Poonam Goyal and Navneet
Goyal and Yi-Ping Phoebe Chen",
title = "Image Retrieval for Complex Queries Using Knowledge
Embedding",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "13:1--13:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3375786",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3375786",
abstract = "With the increase in popularity of image-based
applications, users are retrieving images using more
sophisticated and complex queries. We present three
types of complex queries, namely, long, ambiguous, and
abstract. Each type of query has its own \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Luo:2020:STS,
author = "Guoliang Luo and Zhigang Deng and Xin Zhao and
Xiaogang Jin and Wei Zeng and Wenqiang Xie and Hyewon
Seo",
title = "Spatio-temporal Segmentation Based Adaptive
Compression of Dynamic Mesh Sequences",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "14:1--14:24",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377475",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377475",
abstract = "With the recent advances in data acquisition
techniques, the compression of various dynamic mesh
sequence data has become an important topic in the
computer graphics community. In this article, we
present a new spatio-temporal segmentation-based
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pan:2020:FLB,
author = "Zhaoqing Pan and Xiaokai Yi and Yun Zhang and Hui Yuan
and Fu Lee Wang and Sam Kwong",
title = "Frame-level Bit Allocation Optimization Based on Video
Content Characteristics for {HEVC}",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "15:1--15:20",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3380827",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3380827",
abstract = "Rate control plays an important role in high
efficiency video coding (HEVC), and bit allocation is
the foundation of rate control. The video content
characteristics are significant for bit allocation, and
modeling an accurate relationship between video
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ainam:2020:EAF,
author = "Jean-Paul Ainam and Ke Qin and Guisong Liu and
Guangchun Luo and Brighter Agyemang",
title = "Enforcing Affinity Feature Learning through
Self-attention for Person Re-identification",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "16:1--16:22",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377352",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377352",
abstract = "Person re-identification is the task of recognizing an
individual across heterogeneous non-overlapping camera
views. It has become a crucial capability needed by
many applications in public space video surveillance.
However, it remains a challenging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2020:DLA,
author = "Mengyan Li and Zhaoyu Zhang and Guochen Xie and Jun
Yu",
title = "A Deep Learning Approach for Face Hallucination Guided
by Facial Boundary Responses",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "17:1--17:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377874",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377874",
abstract = "Face hallucination is a domain-specific
super-resolution (SR) problem of learning a mapping
between a low-resolution (LR) face image and its
corresponding high-resolution (HR) image. Tremendous
progress on deep learning has shown exciting potential
for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2020:EDL,
author = "Zan Gao and Yinming Li and Shaohua Wan",
title = "Exploring Deep Learning for View-Based {$3$D} Model
Retrieval",
journal = j-TOMM,
volume = "16",
number = "1",
pages = "18:1--18:21",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377876",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Apr 6 09:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377876",
abstract = "In recent years, view-based 3D model retrieval has
become one of the research focuses in the field of
computer vision and machine learning. In fact, the 3D
model retrieval algorithm consists of feature
extraction and similarity measurement, and the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:ISI,
author = "Shengping Zhang and Huiyu Zhou and Dong Xu and M. Emre
Celebi and Thierry Bouwmans",
title = "Introduction to the Special Issue on Multimodal
Machine Learning for Human Behavior Analysis",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "19:1--19:2",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381917",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3381917",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2020:RVT,
author = "Changyong Guo and Zhaoxin Zhang and Jinjiang Li and
Xuesong Jiang and Jun Zhang and Lei Zhang",
title = "Robust Visual Tracking Using Kernel Sparse Coding on
Multiple Covariance Descriptors",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "20:1--20:22",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3360308",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3360308",
abstract = "In this article, we aim to improve the performance of
visual tracking by combining different features of
multiple modalities. The core idea is to use covariance
matrices as feature descriptors and then use sparse
coding to encode different features. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:CSO,
author = "Zhaoxin Zhang and Changyong Guo and Fanzhi Meng and
Taizhong Xu and Junkai Huang",
title = "{CovLets}: a Second-Order Descriptor for Modeling
Multiple Features",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "21:1--21:14",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357525",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357525",
abstract = "State-of-the-art techniques for image and video
classification take a bottom-up approach where local
features are aggregated into a global final
representation. Existing frameworks (i.e., bag of words
or Fisher vectors) are specifically designed to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Meng:2020:ARU,
author = "Quanling Meng and Heyan Zhu and Weigang Zhang and
Xuefeng Piao and Aijie Zhang",
title = "Action Recognition Using Form and Motion Modalities",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "22:1--22:16",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3350840",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3350840",
abstract = "Action recognition has attracted increasing interest
in computer vision due to its potential applications in
many vision systems. One of the main challenges in
action recognition is to extract powerful features from
videos. Most existing approaches \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shamsolmoali:2020:AAM,
author = "Pourya Shamsolmoali and Masoumeh Zareapoor and Huiyu
Zhou and Jie Yang",
title = "{AMIL}: Adversarial Multi-instance Learning for Human
Pose Estimation",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "23:1--23:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3355612",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3355612",
abstract = "Human pose estimation has an important impact on a
wide range of applications, from human-computer
interface to surveillance and content-based video
retrieval. For human pose estimation, joint
obstructions and overlapping upon human bodies result
in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhuang:2020:MAR,
author = "Yueting Zhuang and Dejing Xu and Xin Yan and Wenzhuo
Cheng and Zhou Zhao and Shiliang Pu and Jun Xiao",
title = "Multichannel Attention Refinement for Video Question
Answering",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "24:1--24:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3366710",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3366710",
abstract = "Video Question Answering (VideoQA) is the extension of
image question answering (ImageQA) in the video domain.
Methods are required to give the correct answer after
analyzing the provided video and question in this task.
Compared to ImageQA, the most \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Grigorev:2020:DDD,
author = "Aleksei Grigorev and Shaohui Liu and Zhihong Tian and
Jianxin Xiong and Seungmin Rho and Jiang Feng",
title = "Delving Deeper in Drone-Based Person Re-Id by
Employing Deep Decision Forest and Attributes Fusion",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "25:1--25:15",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3360050",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3360050",
abstract = "Deep learning has revolutionized the field of computer
vision and image processing. Its ability to extract the
compact image representation has taken the person
re-identification (re-id) problem to a new level.
However, in most cases, researchers are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2020:SPG,
author = "Zhaoju Li and Zongwei Zhou and Nan Jiang and Zhenjun
Han and Junliang Xing and Jianbin Jiao",
title = "Spatial Preserved Graph Convolution Networks for
Person Re-identification",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "26:1--26:14",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3362988",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3362988",
abstract = "Person Re-identification is a very challenging task
due to inter-class ambiguity caused by similar
appearances, and large intra-class diversity caused by
viewpoints, illuminations, and poses. To address these
challenges, in this article, a graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2020:AAC,
author = "Hui Chen and Guiguang Ding and Zijia Lin and Sicheng
Zhao and Xiaopeng Gu and Wenyuan Xu and Jungong Han",
title = "{ACMNet}: Adaptive Confidence Matching Network for
Human Behavior Analysis via Cross-modal Retrieval",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "27:1--27:21",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3362065",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3362065",
abstract = "Cross-modality human behavior analysis has attracted
much attention from both academia and industry. In this
article, we focus on the cross-modality image-text
retrieval problem for human behavior analysis, which
can learn a common latent space for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:MSS,
author = "Anran Zhang and Xiaolong Jiang and Baochang Zhang and
Xianbin Cao",
title = "Multi-scale Supervised Attentive Encoder--Decoder
Network for Crowd Counting",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "28:1--28:20",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3356019",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3356019",
abstract = "Crowd counting is a popular topic with widespread
applications. Currently, the biggest challenge to crowd
counting is large-scale variation in objects. In this
article, we focus on overcoming this challenge by
proposing a novel Attentive Encoder-Decoder \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tanveer:2020:ISI,
author = "M. Tanveer and P. Khanna and M. Prasad and C. T. Lin",
title = "Introduction to the Special Issue on Computational
Intelligence for Biomedical Data and Imaging",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "29:1--29:4",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381919",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3381919",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tanveer:2020:MLT,
author = "M. Tanveer and B. Richhariya and R. U. Khan and A. H.
Rashid and P. Khanna and M. Prasad and C. T. Lin",
title = "Machine Learning Techniques for the Diagnosis of
{Alzheimer}'s Disease: a Review",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "30:1--30:35",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3344998",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3344998",
abstract = "Alzheimer's disease is an incurable neurodegenerative
disease primarily affecting the elderly population.
Efficient automated techniques are needed for early
diagnosis of Alzheimer's. Many novel approaches are
proposed by researchers for classification \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yadav:2020:EDA,
author = "Shweta Yadav and Pralay Ramteke and Asif Ekbal and
Sriparna Saha and Pushpak Bhattacharyya",
title = "Exploring Disorder-Aware Attention for Clinical Event
Extraction",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "31:1--31:21",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3372328",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3372328",
abstract = "Event extraction is one of the crucial tasks in
biomedical text mining that aims to extract specific
information concerning incidents embedded in the texts.
In this article, we propose a deep learning framework
that aims to identify the attributes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tripathi:2020:CNC,
author = "Suvidha Tripathi and Satish Kumar Singh",
title = "Cell Nuclei Classification in Histopathological Images
using {Hybrid OLConvNet}",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "32:1--32:22",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3345318",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3345318",
abstract = "Computer-aided histopathological image analysis for
cancer detection is a major research challenge in the
medical domain. Automatic detection and classification
of nuclei for cancer diagnosis impose a lot of
challenges in developing state-of-the-art \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2020:DSS,
author = "Nengjun Zhu and Jian Cao and Kunwei Shen and Xiaosong
Chen and Siji Zhu",
title = "A Decision Support System with Intelligent
Recommendation for Multi-disciplinary Medical
Treatment",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "33:1--33:23",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3352573",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3352573",
abstract = "Recent years have witnessed an emerging trend for
improving disease treatment by forming
multi-disciplinary medical teams. The collaboration
among specialists from multiple medical domains has
been shown to be significantly helpful for designing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:RFS,
author = "Qingyong Wang and Yun Zhou and Weiping Ding and Zhiguo
Zhang and Khan Muhammad and Zehong Cao",
title = "Random Forest with Self-Paced Bootstrap Learning in
Lung Cancer Prognosis",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "34:1--34:12",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3345314",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3345314",
abstract = "Training gene expression data with supervised learning
approaches can provide an alarm sign for early
treatment of lung cancer to decrease death rates.
However, the samples of gene features involve lots of
noises in a realistic environment. In this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Saini:2020:TEB,
author = "Naveen Saini and Sriparna Saha and Pushpak
Bhattacharyya and Himanshu Tuteja",
title = "Textual Entailment-Based Figure Summarization for
Biomedical Articles",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "35:1--35:24",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357334",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357334",
abstract = "This article proposes a novel unsupervised approach
(FigSum++) for automatic figure summarization in
biomedical scientific articles using a multi-objective
evolutionary algorithm. The problem is treated as an
optimization problem where relevant \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tong:2020:PND,
author = "Chao Tong and Baoyu Liang and Mengze Zhang and
Rongshan Chen and Arun Kumar Sangaiah and Zhigao Zheng
and Tao Wan and Chenyang Yue and Xinyi Yang",
title = "Pulmonary Nodule Detection Based on {ISODATA}-Improved
Faster {RCNN} and {$3$D-CNN} with Focal Loss",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "36:1--36:9",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365445",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3365445",
abstract = "The early diagnosis of pulmonary cancer can
significantly improve the survival rate of patients,
where pulmonary nodules detection in computed
tomography images plays an important role. In this
article, we propose a novel pulmonary nodule detection
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Agrawal:2020:HWB,
author = "Utkarsh Agrawal and Jatin Arora and Rahul Singh and
Deepak Gupta and Ashish Khanna and Aditya Khamparia",
title = "Hybrid Wolf--Bat Algorithm for Optimization of
Connection Weights in Multi-layer Perceptron",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "37:1--37:20",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3350532",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3350532",
abstract = "In a neural network, the weights act as parameters to
determine the output(s) from a set of inputs. The
weights are used to find the activation values of nodes
of a layer from the values of the previous layer.
Finding the ideal set of these weights for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rout:2020:ICA,
author = "Ranjeet Kumar Rout and Sk. Sarif Hassan and Sanchit
Sindhwani and Hari Mohan Pandey and Saiyed Umer",
title = "Intelligent Classification and Analysis of Essential
Genes Using Quantitative Methods",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "38:1--38:21",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3343856",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3343856",
abstract = "Essential genes are considered to be the genes
required to sustain life of different organisms. These
genes encode proteins that maintain central metabolism,
DNA replications, translation of genes, and basic
cellular structure, and mediate the transport
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:ABM,
author = "Hongyi Zhang and Haoke Zhang and Sandeep Pirbhulal and
Wanqing Wu and Victor Hugo C. {De Albuquerque}",
title = "Active Balancing Mechanism for Imbalanced Medical Data
in Deep Learning-Based Classification Models",
journal = j-TOMM,
volume = "16",
number = "1s",
pages = "39:1--39:15",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357253",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Apr 30 10:35:21 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357253",
abstract = "Imbalanced data always has a serious impact on a
predictive model, and most under-sampling techniques
consume more time and suffer from loss of samples
containing critical information during imbalanced data
processing, especially in the biomedical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Vellingiri:2020:SCB,
author = "Shanthi Vellingiri and Ryan P. McMahan and
Balakrishnan Prabhakaran",
title = "{SCeVE}: a Component-based Framework to Author Mixed
Reality Tours",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "40:1--40:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377353",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377353",
abstract = "Authoring a collaborative, interactive Mixed Reality
(MR) tour requires flexible design and development of
various software modules for tasks such as managing
geographically distributed participants, adaptable
travel and virtual camera techniques, data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2020:BDC,
author = "Jiaying Liu and Sijie Song and Chunhui Liu and Yanghao
Li and Yueyu Hu",
title = "A Benchmark Dataset and Comparison Study for
Multi-modal Human Action Analytics",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "41:1--41:24",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3365212",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3365212",
abstract = "Large-scale benchmarks provide a solid foundation for
the development of action analytics. Most of the
previous activity benchmarks focus on analyzing actions
in RGB videos. There is a lack of large-scale and
high-quality benchmarks for multi-modal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duan:2020:EFE,
author = "Mingxing Duan and Kenli Li and Aijia Ouyang and Khin
Nandar Win and Keqin Li and Qi Tian",
title = "{EGroupNet}: a Feature-enhanced Network for Age
Estimation with Novel Age Group Schemes",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "42:1--42:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3379449",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3379449",
abstract = "Although age estimation is easily affected by smiling,
race, gender, and other age-related attributes, most of
the researchers did not pay attention to the
correlations among these attributes. Moreover, many
researchers perform age estimation from a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Baez-Suarez:2020:SSS,
author = "Abraham B{\'a}ez-Su{\'a}rez and Nolan Shah and Juan
Arturo Nolazco-Flores and Shou-Hsuan S. Huang and
Omprakash Gnawali and Weidong Shi",
title = "{SAMAF}: Sequence-to-sequence Autoencoder Model for
Audio Fingerprinting",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "43:1--43:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3380828",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3380828",
abstract = "Audio fingerprinting techniques were developed to
index and retrieve audio samples by comparing a
content-based compact signature of the audio instead of
the entire audio sample, thereby reducing memory and
computational expense. Different techniques \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mettes:2020:SIB,
author = "Pascal Mettes and Dennis C. Koelma and Cees G. M.
Snoek",
title = "Shuffled {ImageNet} Banks for Video Event Detection
and Search",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "44:1--44:21",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377875",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377875",
abstract = "This article aims for the detection and search of
events in videos, where video examples are either
scarce or even absent during training. To enable such
event detection and search, ImageNet concept banks have
been shown to be effective. Rather than \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Noori:2020:HAR,
author = "Farzan Majeed Noori and Michael Riegler and Md Zia
Uddin and Jim Torresen",
title = "Human Activity Recognition from Multiple Sensors Data
Using Multi-fusion Representations and {CNNs}",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "45:1--45:19",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377882",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3377882",
abstract = "With the emerging interest in the ubiquitous sensing
field, it has become possible to build assistive
technologies for persons during their daily life
activities to provide personalized feedback and
services. For instance, it is possible to detect an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rossi:2020:DUB,
author = "Silvia Rossi and Cagri Ozcinar and Aljosa Smolic and
Laura Toni",
title = "Do Users Behave Similarly in {VR}? {Investigation} of
the User Influence on the System Design",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "46:1--46:26",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381846",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3381846",
abstract = "With the overarching goal of developing user-centric
Virtual Reality (VR) systems, a new wave of studies
focused on understanding how users interact in VR
environments has recently emerged. Despite the intense
efforts, however, current literature still \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:LLF,
author = "Xiao Wang and Wu Liu and Jun Chen and Xiaobo Wang and
Chenggang Yan and Tao Mei",
title = "Listen, Look, and Find the One: Robust Person Search
with Multimodality Index",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "47:1--47:20",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3380549",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3380549",
abstract = "Person search with one portrait, which attempts to
search the targets in arbitrary scenes using one
portrait image at a time, is an essential yet
unexplored problem in the multimedia field. Existing
approaches, which predominantly depend on the visual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Luo:2020:FFI,
author = "Xiaofan Luo and Fukoeng Wong and Haifeng Hu",
title = "{FIN}: Feature Integrated Network for Object
Detection",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "48:1--48:18",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381086",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3381086",
abstract = "Multi-layer detection is a widely used method in the
field of object detection. It extracts multiple feature
maps with different resolutions from the backbone
network to detect objects of different scales, which
can effectively cope with the problem of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Akpinar:2020:PPP,
author = "Kutalmis Akpinar and Kien A. Hua",
title = "{PPNet}: Privacy Protected {CDN--ISP} Collaboration
for {QoS}-aware Multi-{CDN} Adaptive Video Streaming",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "49:1--49:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3379983",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3379983",
abstract = "Software-defined networking introduces opportunities
to optimize the Internet Service Provider's network and
to improve client experience for the Video-on-Demand
applications. Recent studies on SDN frameworks show
that traffic engineering methods allow \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tanwar:2020:CPP,
author = "Vishesh Kumar Tanwar and Balasubramanian Raman and
Amitesh Singh Rajput and Rama Bhargava",
title = "{CryptoLesion}: a Privacy-preserving Model for Lesion
Segmentation Using Whale Optimization over Cloud",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "50:1--50:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3380743",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3380743",
abstract = "The low-cost, accessing flexibility, agility, and
mobility of cloud infrastructures have attracted
medical organizations to store their high-resolution
data in encrypted form. Besides storage, these
infrastructures provide various image processing
services for plain (non-encrypted) images. Meanwhile,
the privacy and security of uploaded data depend upon
the reliability of the service provider(s). The
enforcement of privacy-policy laws in health-care
organizations, which prohibit disclosing their
patients' sensitive and private medical information,
restricts them from utilizing these services. To address
these privacy concerns for melanoma detection, we
propose CryptoLesion, a privacy-preserving model for
segmenting lesion region using whale optimization
algorithm (WOA) over the cloud in the encrypted domain
(ED). The user's image is encrypted using a permutation
ordered binary number system and a random stumble
matrix. The task of segmentation is accomplished by
dividing an encrypted image into a pre-defined number
of clusters whose optimal centroids are obtained by WOA
in ED, followed by the assignment of each pixel of an
encrypted image to the unique centroid. The qualitative
and quantitative analysis of CryptoLesion is evaluated
over publicly available datasets provided in The
International Skin Imaging Collaboration Challenges in
2016, 2017, 2018, and PH2 dataset. The segmented
results obtained by CryptoLesion are found to be
comparable with the winners of respective challenges.
CryptoLesion is proved to be secure from a
probabilistic viewpoint and various cryptographic
attacks. To the best of our knowledge, CryptoLesion is
the first step towards lesion segmentation in ED.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2020:DPC,
author = "Zhedong Zheng and Liang Zheng and Michael Garrett and
Yi Yang and Mingliang Xu and Yi-Dong Shen",
title = "Dual-path Convolutional Image-Text Embeddings with
Instance Loss",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "51:1--51:23",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3383184",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3383184",
abstract = "Matching images and sentences demands a fine
understanding of both modalities. In this article, we
propose a new system to discriminatively embed the
image and text to a shared visual-textual space. In
this field, most existing works apply the ranking
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2020:MPA,
author = "Xiaowen Huang and Shengsheng Qian and Quan Fang and
Jitao Sang and Changsheng Xu",
title = "Meta-path Augmented Sequential Recommendation with
Contextual Co-attention Network",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "52:1--52:24",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3382180",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3382180",
abstract = "It is critical to comprehensively and efficiently
learn user preferences for an effective sequential
recommender system. Existing sequential recommendation
methods mainly focus on modeling local preference from
users' historical behaviors, which largely \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2020:IMC,
author = "Lingxiang Wu and Min Xu and Shengsheng Qian and
Jianwei Cui",
title = "Image to Modern {Chinese} Poetry Creation via a
Constrained Topic-aware Model",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "53:1--53:21",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381858",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3381858",
abstract = "Artificial creativity has attracted increasing
research attention in the field of multimedia and
artificial intelligence. Despite the promising work on
poetry/painting/music generation, creating modern
Chinese poetry from images, which can significantly
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2020:RLV,
author = "Zhili Zhou and Q. M. Jonathan Wu and Yimin Yang and
Xingming Sun",
title = "Region-Level Visual Consistency Verification for
Large-Scale Partial-Duplicate Image Search",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "54:1--54:25",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3383582",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3383582",
abstract = "Most recent large-scale image search approaches build
on a bag-of-visual-words model, in which local features
are quantized and then efficiently matched between
images. However, the limited discriminability of local
features and the BOW quantization \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{He:2020:STS,
author = "Jiale He and Gaobo Yang and Xin Liu and Xiangling
Ding",
title = "Spatio-temporal Saliency-based Motion Vector
Refinement for Frame Rate Up-conversion",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "55:1--55:18",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3382506",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3382506",
abstract = "A spatio-temporal saliency-based frame rate
up-conversion (FRUC) approach is proposed, which
achieves better quality of interpolated frames and
invalidates existing texture variation-based FRUC
detectors. A spatio-temporal saliency model is designed
to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gelli:2020:LVE,
author = "Francesco Gelli and Tiberio Uricchio and Xiangnan He
and Alberto {Del Bimbo} and Tat-Seng Chua",
title = "Learning Visual Elements of Images for Discovery of
Brand Posts",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "56:1--56:21",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3385413",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3385413",
abstract = "Online Social Network Sites have become a primary
platform for brands and organizations to engage their
audience by sharing image and video posts on their
timelines. Different from traditional advertising,
these posts are not restricted to the products
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2020:HRR,
author = "Xian-Hua Han and Yinqiang Zheng and Jiande Sun and
Yen-Wei Chen",
title = "Hyperspectral Reconstruction with Redundant Camera
Spectral Sensitivity Functions",
journal = j-TOMM,
volume = "16",
number = "2",
pages = "57:1--57:15",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3386313",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 16 10:45:32 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3386313",
abstract = "High-resolution hyperspectral (HS) reconstruction has
recently achieved significant progress, among which
the method based on the fusion of the RGB and HS images
of the same scene can greatly improve the
reconstruction performance compared with those
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2020:ISI,
author = "Honghao Gao and Yudong Zhang",
title = "Introduction to the Special Issue on Smart
Communications and Networking for Future Video
Surveillance",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "58:1--58:2",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398382",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3398382",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jiang:2020:SDM,
author = "Yizhang Jiang and Xiaoqing Gu and Dingcheng Ji and
Pengjiang Qian and Jing Xue and Yuanpeng Zhang and
Jiaqi Zhu and Kaijian Xia and Shitong Wang",
title = "Smart Diagnosis: a Multiple-Source Transfer {TSK}
Fuzzy System for {EEG} Seizure Identification",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "59:1--59:21",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3340240",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3340240",
abstract = "To effectively identify electroencephalogram (EEG)
signals in multiple-source domains, a multiple-source
transfer learning-based Takagi-Sugeno-Kang (TSK) fuzzy
system (FS), called MST-TSK, is proposed, which
combines multiple-source transfer learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:DBD,
author = "Shui-Hua Wang and Yu-Dong Zhang",
title = "{DenseNet-201}-Based Deep Neural Network with
Composite Learning Factor and Precomputation for
Multiple Sclerosis Classification",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "60:1--60:19",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3341095",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3341095",
abstract = "(Aim) Multiple sclerosis is a neurological condition
that may cause neurologic disability. Convolutional
neural network can achieve good results, but tuning
hyperparameters of a CNN needs expert knowledge and is
difficult and time-consuming. To identify \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xia:2020:CDB,
author = "Kaijian Xia and Hongsheng Yin and Yong Jin and Shi Qiu
and Hongru Zhao",
title = "Cross-Domain Brain {CT} Image Smart Segmentation via
Shared Hidden Space Transfer {FCM} Clustering",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "61:1--61:21",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357233",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357233",
abstract = "Clustering is an important issue in brain medical
image segmentation. Original medical images used for
clinical diagnosis are often insufficient for
clustering in the current domain. As there are
sufficient medical images in the related domains,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2020:STD,
author = "Yonggang Li and Chunping Liu and Yi Ji and Shengrong
Gong and Haibao Xu",
title = "Spatio-Temporal Deep Residual Network with
Hierarchical Attentions for Video Event Recognition",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "62:1--62:21",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3378026",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3378026",
abstract = "Event recognition in surveillance video has gained
extensive attention from the computer vision community.
This process still faces enormous challenges due to the
tiny inter-class variations that are caused by various
facets, such as severe occlusion, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Si:2020:MLT,
author = "Wen Si and Cong Liu and Zhongqin Bi and Meijing Shan",
title = "Modeling Long-Term Dependencies from Videos Using Deep
Multiplicative Neural Networks",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "63:1--63:19",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3357797",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3357797",
abstract = "Understanding temporal dependencies of videos is
fundamental for vision problems, but deep
learning-based models are still insufficient in this
field. In this article, we propose a novel deep
multiplicative neural network (DMNN) for learning
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2020:PCA,
author = "Suguo Zhu and Xiaoxian Yang and Jun Yu and Zhenying
Fang and Meng Wang and Qingming Huang",
title = "Proposal Complementary Action Detection",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "64:1--64:12",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3361845",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3361845",
abstract = "Temporal action detection not only requires correct
classification but also needs to detect the start and
end times of each action accurately. However,
traditional approaches always employ sliding windows or
actionness to predict the actions, and it is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2020:NTF,
author = "Chenxi Huang and Yisha Lan and Guokai Zhang and Gaowei
Xu and Landu Jiang and Nianyin Zeng and Jenhong Tan and
E. Y. K. Ng and Yongqiang Cheng and Ningzhi Han and
Rongrong Ji and Yonghong Peng",
title = "A New Transfer Function for Volume Visualization of
Aortic Stent and Its Application to Virtual Endoscopy",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "65:1--65:14",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3373358",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3373358",
abstract = "Aortic stent has been widely used in restoring
vascular stenosis and assisting patients with
cardiovascular disease. The effective visualization of
aortic stent is considered to be critical to ensure the
effectiveness and functions of the aortic stent
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zink:2020:IBP,
author = "Michael Zink and Laura Toni and Ali C. Begen",
title = "Introduction to the Best Papers from the {ACM
Multimedia Systems (MMSys) 2019 and Co-Located
Workshops}",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "66:1--66:2",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398384",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3398384",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:PLB,
author = "Rui-Xiao Zhang and Ming Ma and Tianchi Huang and
Haitian Pang and Xin Yao and Chenglei Wu and Lifeng
Sun",
title = "A Practical Learning-based Approach for Viewer
Scheduling in the Crowdsourced Live Streaming",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "67:1--67:22",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3397226",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3397226",
abstract = "Scheduling viewers effectively among different Content
Delivery Network (CDN) providers is challenging owing
to the extreme diversity in the crowdsourced live
streaming (CLS) scenarios. Abundant algorithms have
been proposed in recent years, which, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Altamimi:2020:QFD,
author = "Sa'di Altamimi and Shervin Shirmohammadi",
title = "{QoE}-Fair {DASH} Video Streaming Using Server-side
Reinforcement Learning",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "68:1--68:21",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3397227",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3397227",
abstract = "To design an optimal adaptive video streaming method,
video service providers need to consider both the
efficiency and the fairness of the Quality of
Experience (QoE) of their users. In Reference [8], we
proposed a server-side QoE-fair rate adaptation
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bentaleb:2020:PAA,
author = "Abdelhak Bentaleb and Christian Timmerer and Ali C.
Begen and Roger Zimmermann",
title = "Performance Analysis of {ACTE}: a Bandwidth Prediction
Method for Low-latency Chunked Streaming",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "69:1--69:24",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3387921",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3387921",
abstract = "HTTP adaptive streaming with chunked transfer encoding
can offer low-latency streaming without sacrificing the
coding efficiency. This allows media segments to be
delivered while still being packaged. However,
conventional schemes often make widely \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pham:2020:ESR,
author = "Stefan Pham and Patrick Heeren and Calvin Schmidt and
Daniel Silhavy and Stefan Arbanowski",
title = "Evaluation of Shared Resource Allocation Using {SAND}
for {ABR} Streaming",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "70:1--70:18",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3388926",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3388926",
abstract = "Adaptive bitrate media streaming clients adjust the
quality of media content depending on the current
network conditions. The shared resource allocation
(SRA) feature defined in MPEG-SAND (server and network
assisted DASH) allows servers to allocate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gutterman:2020:RRT,
author = "Craig Gutterman and Katherine Guo and Sarthak Arora
and Trey Gilliland and Xiaoyang Wang and Les Wu and
Ethan Katz-Bassett and Gil Zussman",
title = "{Requet}: Real-Time {QoE} Metric Detection for
Encrypted {YouTube} Traffic",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "71:1--71:28",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394498",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3394498",
abstract = "As video traffic dominates the Internet, it is
important for operators to detect video quality of
experience (QoE) to ensure adequate support for video
traffic. With wide deployment of end-to-end encryption,
traditional deep packet inspection-based traffic
monitoring approaches are becoming ineffective. This
poses a challenge for network operators to monitor user
QoE and improve upon their experience. To resolve this
issue, we develop and present a system for REal-time
QUality of experience metric detection for Encrypted
Traffic --- Requet --- which is suitable for network
middlebox deployment. Requet uses a detection algorithm
that we develop to identify video and audio chunks from
the IP headers of encrypted traffic. Features extracted
from the chunk statistics are used as input to a
machine learning algorithm to predict QoE metrics,
specifically buffer warning (low buffer, high buffer),
video state (buffer increase, buffer decay, steady,
stall), and video resolution. We collect a large
YouTube dataset consisting of diverse video assets
delivered over various WiFi and LTE network conditions
to evaluate the performance. We compare Requet with a
baseline system based on previous work and show that
Requet outperforms the baseline system in accuracy of
predicting buffer low warning, video state, and video
resolution by $ 1.12 \times $, $ 1.53 \times $, and $
3.14 \times $, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2020:ATL,
author = "Xinjue Hu and Jingming Shan and Yu Liu and Lin Zhang
and Shervin Shirmohammadi",
title = "An Adaptive Two-Layer Light Field Compression Scheme
Using {GNN}-Based Reconstruction",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "72:1--72:23",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3395620",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3395620",
abstract = "As a new form of volumetric media, Light Field (LF)
can provide users with a true six degrees of freedom
immersive experience because LF captures the scene with
photo-realism, including aperture-limited changes in
viewpoint. But uncompressed LF data is too large for
network transmission, which is the reason why LF
compression has become an important research topic. One
of the more recent approaches for LF compression is to
reduce the angular resolution of the input LF during
compression and to use LF reconstruction to recover the
discarded viewpoints during decompression. Following
this approach, we propose a new LF reconstruction
algorithm based on Graph Neural Networks; we show that
it can achieve higher compression and better quality
compared to existing reconstruction methods, although
suffering from the same problem as those methods ---
the inability to deal effectively with high-frequency
image components. To solve this problem, we propose an
adaptive two-layer compression architecture that
separates high-frequency and low-frequency components
and compresses each with a different strategy so that
the performance can become robust and controllable.
Experiments with multiple datasets show that our
proposed scheme is capable of providing a decompression
quality of above 40 dB, and can significantly improve
compression efficiency compared with similar LF
reconstruction schemes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Claypool:2020:IMD,
author = "Mark Claypool and Andy Cockburn and Carl Gutwin",
title = "The Impact of Motion and Delay on Selecting Game
Targets with a Mouse",
journal = j-TOMM,
volume = "16",
number = "2s",
pages = "73:1--73:24",
month = jul,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3390464",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sun Jul 19 08:56:56 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/3390464",
abstract = "All real-time computer games, particularly networked
computer games, have a delay from when a player starts
an action (e.g., clicking the mouse) until the game
renders the result (e.g., firing a projectile). This
delay can degrade both player \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Anonymous:2020:TCO,
author = "Anonymous",
title = "Table of Contents: Online Supplement Volume 16, Number
1s",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "74:1--74:5",
month = sep,
year = "2020",
DOI = "https://doi.org/10.1145/3409367",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:45:43 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3409367",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Yang:2020:CLR,
author = "Liang Yang and Haifeng Hu and Songlong Xing and
Xinlong Lu",
title = "Constrained {LSTM} and Residual Attention for Image
Captioning",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "75:1--75:18",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3386725",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3386725",
abstract = "Visual structure and syntactic structure are essential
in images and texts, respectively. Visual structure
depicts both entities in an image and their
interactions, whereas syntactic structure in texts can
reflect the part-of-speech constraints between
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2020:DTN,
author = "Donghuo Zeng and Yi Yu and Keizo Oyama",
title = "Deep Triplet Neural Networks with Cluster-{CCA} for
Audio-Visual Cross-Modal Retrieval",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "76:1--76:23",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3387164",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3387164",
abstract = "Cross-modal retrieval aims to retrieve data in one
modality by a query in another modality, which has been
a very interesting research issue in the fields of
multimedia, information retrieval, computer vision, and
databases. Most existing works focus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Su:2020:MVG,
author = "Yu-Ting Su and Wen-Hui Li and Wei-Zhi Nie and An-An
Liu",
title = "Multi-View Graph Matching for {$3$D} Model Retrieval",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "77:1--77:20",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3387920",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3387920",
abstract = "3D model retrieval has been widely utilized in
numerous domains, such as computer-aided design,
digital entertainment, and virtual reality. Recently,
many graph-based methods have been proposed to address
this task by using multi-view information of 3D
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fan:2020:RAN,
author = "Hehe Fan and Linchao Zhu and Yi Yang and Fei Wu",
title = "Recurrent Attention Network with Reinforced Generator
for Visual Dialog",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "78:1--78:16",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3390891",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3390891",
abstract = "In Visual Dialog, an agent has to parse temporal
context in the dialog history and spatial context in
the image to hold a meaningful dialog with humans. For
example, to answer ``what is the man on her left
wearing?'' the agent needs to (1) analyze the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2020:ABM,
author = "Feiran Huang and Kaimin Wei and Jian Weng and Zhoujun
Li",
title = "Attention-Based Modality-Gated Networks for Image-Text
Sentiment Analysis",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "79:1--79:19",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3388861",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3388861",
abstract = "Sentiment analysis of social multimedia data has
attracted extensive research interest and has been
applied to many tasks, such as election prediction and
product evaluation. Sentiment analysis of one modality
(e.g., text or image) has been broadly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:PSE,
author = "Shangfei Wang and Longfei Hao and Qiang Ji",
title = "Posed and Spontaneous Expression Distinction Using
Latent Regression {Bayesian} Networks",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "80:1--80:18",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3391290",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3391290",
abstract = "Facial spatial patterns can help distinguish between
posed and spontaneous expressions, but this information
has not been thoroughly leveraged by current studies.
We present several latent regression Bayesian networks
(LRBNs) to capture the patterns \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2020:UNA,
author = "Fangyu Liu and R{\'e}mi Lebret and Didier Orel and
Philippe Sordet and Karl Aberer",
title = "Upgrading the Newsroom: an Automated Image Selection
System for News Articles",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "81:1--81:28",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3396520",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3396520",
abstract = "We propose an automated image selection system to
assist photo editors in selecting suitable images for
news articles. The system fuses multiple textual
sources extracted from news articles and accepts
multilingual inputs. It is equipped with char-level
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2020:FSM,
author = "Chenlei Lv and Zhongke Wu and Xingce Wang and Mingquan
Zhou",
title = "{$3$D} Facial Similarity Measurement and Its
Application in Facial Organization",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "82:1--82:20",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3397765",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3397765",
abstract = "We propose a novel framework for 3D facial similarity
measurement and its application in facial organization.
The construction of the framework is based on Kendall
shape space theory. Kendall shape space is a quotient
space that is constructed by shape \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2020:ICJ,
author = "Jin Yuan and Lei Zhang and Songrui Guo and Yi Xiao and
Zhiyong Li",
title = "Image Captioning with a Joint Attention Mechanism by
Visual Concept Samples",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "83:1--83:22",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394955",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3394955",
abstract = "The attention mechanism has been established as an
effective method for generating caption words in image
captioning; it explores one noticed subregion in an
image to predict a related caption word. However, even
though the attention mechanism could \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2020:IMP,
author = "Xun Wang and Yan Tian and Xuran Zhao and Tao Yang and
Judith Gelernter and Jialei Wang and Guohua Cheng and
Wei Hu",
title = "Improving Multiperson Pose Estimation by Mask-aware
Deep Reinforcement Learning",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "84:1--84:18",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3397340",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3397340",
abstract = "Research on single-person pose estimation based on
deep neural networks has recently witnessed progress in
both accuracy and execution efficiency. However,
multiperson pose estimation is still a challenging
topic, partially because the object regions \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2020:LJS,
author = "Shenming Feng and Haifeng Hu",
title = "Learning Joint Structure for Human Pose Estimation",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "85:1--85:17",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3392302",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3392302",
abstract = "Recently, tremendous progress has been achieved on
human pose estimation with the development of
convolutional neural networks (CNNs). However, current
methods still suffer from severe occlusion, back view,
and large pose variation due to the lack of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2020:SSI,
author = "Feng Lin and Bin Li and Wengang Zhou and Houqiang Li
and Yan Lu",
title = "Single-stage Instance Segmentation",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "86:1--86:19",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3387926",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3387926",
abstract = "Albeit the highest accuracy of object detection is
generally acquired by multi-stage detectors, like R-CNN
and its extension approaches, the single-stage object
detectors also achieve remarkable performance with
faster execution and higher scalability. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jiang:2020:FSF,
author = "Shuqiang Jiang and Weiqing Min and Yongqiang Lyu and
Linhu Liu",
title = "Few-shot Food Recognition via Multi-view
Representation Learning",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "87:1--87:20",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3391624",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3391624",
abstract = "This article considers the problem of few-shot
learning for food recognition. Automatic food
recognition can support various applications, e.g.,
dietary assessment and food journaling. Most existing
works focus on food recognition with large numbers of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ho:2020:SGD,
author = "Trang-Thi Ho and John Jethro Virtusio and Yung-Yao
Chen and Chih-Ming Hsu and Kai-Lung Hua",
title = "Sketch-guided Deep Portrait Generation",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "88:1--88:18",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3396237",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3396237",
abstract = "Generating a realistic human class image from a sketch
is a unique and challenging problem considering that
the human body has a complex structure that must be
preserved. Additionally, input sketches often lack
important details that are crucial in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Srivastava:2020:DAI,
author = "Gargi Srivastava and Rajeev Srivastava",
title = "Design, Analysis, and Implementation of Efficient
Framework for Image Annotation",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "89:1--89:24",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3386249",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3386249",
abstract = "In this article, a general framework of image
annotation is proposed by involving salient object
detection (SOD), feature extraction, feature selection,
and multi-label classification. For SOD,
Augmented-Gradient Vector Flow (A-GVF) is proposed,
which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2020:KAN,
author = "Dongyang Zhang and Jie Shao and Heng Tao Shen",
title = "Kernel Attention Network for Single Image
Super-Resolution",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "90:1--90:15",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398685",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3398685",
abstract = "Recently, attention mechanisms have shown a developing
tendency toward convolutional neural network (CNN), and
some representative attention mechanisms, i.e., channel
attention (CA) and spatial attention (SA) have been
fully applied to single image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2020:BIQ,
author = "Yutao Liu and Ke Gu and Xiu Li and Yongbing Zhang",
title = "Blind Image Quality Assessment by Natural Scene
Statistics and Perceptual Characteristics",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "91:1--91:91",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3414837",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3414837",
abstract = "Opinion-unaware blind image quality assessment (OU
BIQA) refers to establishing a blind quality prediction
model without using the expensive subjective quality
scores, which is a highly promising direction in the
BIQA research. In this article, we focus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Francis:2020:UTF,
author = "Jobin Francis and Baburaj M. and Sudhish N. George",
title = "A Unified Tensor Framework for Clustering and
Simultaneous Reconstruction of Incomplete Imaging
Data",
journal = j-TOMM,
volume = "16",
number = "3",
pages = "92:1--92:24",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.1145/3399806",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Sep 5 18:46:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3399806",
abstract = "Incomplete observations in the data are always
troublesome to data clustering algorithms. In fact,
most of the well-received techniques are not designed
to encounter such imperative scenarios. Hence,
clustering of images under incomplete samples is an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sharma:2021:ISI,
author = "Suraj Sharma and Xuyun Zhang and Hesham El-Sayed and
Zhiyuan Tan",
title = "Introduction to the Special Issue on Privacy and
Security in Evolving {Internet of Multimedia Things}",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "93:1--93:3",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3423955",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3423955",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:LBO,
author = "Xiaolong Xu and Qihe Huang and Yiwen Zhang and
Shancang Li and Lianyong Qi and Wanchun Dou",
title = "An {LSH}-based Offloading Method for {IoMT} Services
in Integrated Cloud-Edge Environment",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "94:1--94:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408319",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408319",
abstract = "Benefiting from the massive available data provided by
Internet of multimedia things (IoMT), enormous
intelligent services requiring information of various
types to make decisions are emerging. Generally, the
IoMT devices are equipped with limited \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gati:2021:DPT,
author = "Nicholaus J. Gati and Laurence T. Yang and Jun Feng
and Yijun Mo and Mamoun Alazab",
title = "Differentially Private Tensor Train Deep Computation
for {Internet of Multimedia Things}",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "95:1--95:20",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3421276",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3421276",
abstract = "The significant growth of the Internet of Things (IoT)
takes a key and active role in healthcare, smart homes,
smart manufacturing, and wearable gadgets. Due to
complexity and difficulty in processing multimedia
data, the IoT based scheme, namely \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2021:FBS,
author = "Haoran Liang and Jun Wu and Xi Zheng and Mengshi Zhang
and Jianhua Li and Alireza Jolfaei",
title = "Fog-based Secure Service Discovery for {Internet of
Multimedia Things}: a Cross-blockchain Approach",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "96:1--96:23",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3415151",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3415151",
abstract = "The Internet of Multimedia Things (IoMT) has become
the backbone of innumerable multimedia applications in
various fields. The wide application of IoMT not only
makes our life convenient but also brings challenges to
service discovery. Service discovery \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2021:ASI,
author = "Zhihan Lv and Liang Qiao and Houbing Song",
title = "Analysis of the Security of {Internet of Multimedia
Things}",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "97:1--97:16",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398201",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3398201",
abstract = "To study the security performance of the Internet of
multimedia things on the privacy protection of user
identity, behavior trajectory, and preference under the
new information technology industry wave, in this
study, aiming at the problems of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sahoo:2021:SAD,
author = "Kshira Sagar Sahoo and Deepak Puthal",
title = "{SDN}-Assisted {DDoS} Defense Framework for the
{Internet of Multimedia Things}",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "98:1--98:18",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394956",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3394956",
abstract = "The Internet of Things is visualized as a fundamental
networking model that bridges the gap between the cyber
and real-world entity. Uniting the real-world object
with virtualization technology is opening further
opportunities for innovation in nearly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Namasudra:2021:SMU,
author = "Suyel Namasudra and Rupak Chakraborty and Abhishek
Majumder and Nageswara Rao Moparthi",
title = "Securing Multimedia by Using {DNA}-Based Encryption in
the Cloud Computing Environment",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "99:1--99:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3392665",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3392665",
abstract = "Today, the size of a multimedia file is increasing day
by day from gigabytes to terabytes or even petabytes,
mainly because of the evolution of a large amount of
real-time data. As most of the multimedia files are
transmitted through the internet, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fang:2021:PPM,
author = "Liming Fang and Changchun Yin and Juncen Zhu and
Chunpeng Ge and M. Tanveer and Alireza Jolfaei and
Zehong Cao",
title = "Privacy Protection for Medical Data Sharing in Smart
Healthcare",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "100:1--100:18",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408322",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408322",
abstract = "In virtue of advances in smart networks and the cloud
computing paradigm, smart healthcare is transforming.
However, there are still challenges, such as storing
sensitive data in untrusted and controlled
infrastructure and ensuring the secure \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2021:DHC,
author = "A. K. Singh",
title = "Data Hiding: Current Trends, Innovation and Potential
Challenges",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "101:1--101:16",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3382772",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3382772",
abstract = "With the widespread growth of digital information and
improved internet technologies, the demand for improved
information security techniques has significantly
increased due to privacy leakage, identity theft,
illegal copying, and data distribution. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2021:MLM,
author = "Hezhen Hu and Wengang Zhou and Xingze Li and Ning Yan
and Houqiang Li",
title = "{MV2Flow}: Learning Motion Representation for Fast
Compressed Video Action Recognition",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "102:1--102:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3422360",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3422360",
abstract = "In video action recognition, motion is a very crucial
clue, which is usually represented by optical flow.
However, optical flow is computationally expensive to
obtain, which becomes the bottleneck for the efficiency
of traditional action recognition \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cui:2021:SSI,
author = "Chaoran Cui and Peiguang Lin and Xiushan Nie and Muwei
Jian and Yilong Yin",
title = "Social-sensed Image Aesthetics Assessment",
journal = j-TOMM,
volume = "16",
number = "3s",
pages = "103:1--103:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3414843",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 22 06:57:30 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3414843",
abstract = "Image aesthetics assessment aims to endow computers
with the ability to judge the aesthetic values of
images, and its potential has been recognized in a
variety of applications. Most previous studies perform
aesthetics assessment purely based on image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sharma:2021:TCO,
author = "Suraj Sharma",
title = "Table of Contents: Online Supplement Volume 16, Number
3s",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "117e-1:117e-2",
month = jan,
year = "2021",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 10:01:20 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J961",
}
@Article{Shao:2021:EBR,
author = "Huiru Shao and Jing Li and Jia Zhang and Hui Yu and
Jiande Sun",
title = "Eye-based Recognition for User Identification on
Mobile Devices",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "117:1--117:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3399659",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3399659",
abstract = "User identification is becoming more and more
important for Apps on mobile devices. However, the
identity recognition based on eyes, e.g., iris
recognition, is rarely used on mobile devices compared
with those based on face and fingerprint due to its
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:NKT,
author = "Zuquan Liu and Guopu Zhu and Yuan-Gen Wang and
Jianquan Yang and Sam Kwong",
title = "A Novel $ (t, s, k, n)$-Threshold Visual Secret
Sharing Scheme Based on Access Structure Partition",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "118:1--118:21",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418212",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3418212",
abstract = "Visual secret sharing (VSS) is a new technique for
sharing a binary image into multiple shadows. For VSS,
the original image can be reconstructed from the
shadows in any qualified set, but cannot be
reconstructed from those in any forbidden set. In most
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "118",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Becattini:2021:DPA,
author = "Federico Becattini and Tiberio Uricchio and Lorenzo
Seidenari and Lamberto Ballan and Alberto {Del Bimbo}",
title = "Am {I} Done? {Predicting} Action Progress in Videos",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "119:1--119:24",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3402447",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3402447",
abstract = "In this article, we deal with the problem of
predicting action progress in videos. We argue that
this is an extremely important task, since it can be
valuable for a wide range of interaction applications.
To this end, we introduce a novel approach, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "119",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ruan:2021:CDI,
author = "Weijian Ruan and Chao Liang and Yi Yu and Zheng Wang
and Wu Liu and Jun Chen and Jiayi Ma",
title = "Correlation Discrepancy Insight Network for Video
Re-identification",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "120:1--120:21",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3402666",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3402666",
abstract = "Video-based person re-identification (ReID) aims at
re-identifying a specified person sequence from videos
that were captured by disjoint cameras. Most existing
works on this task ignore the quality discrepancy
across frames by using all video frames to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "120",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2021:SSI,
author = "Xin Yang and Yu Qiao and Shaozhe Chen and Shengfeng He
and Baocai Yin and Qiang Zhang and Xiaopeng Wei and
Rynson W. H. Lau",
title = "Smart Scribbles for Image Matting",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "121:1--121:21",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408323",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408323",
abstract = "Image matting is an ill-posed problem that usually
requires additional user input, such as trimaps or
scribbles. Drawing a fine trimap requires a large
amount of user effort, while using scribbles can hardly
obtain satisfactory alpha mattes for non-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "121",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2021:DID,
author = "Chenggang Yan and Zhisheng Li and Yongbing Zhang and
Yutao Liu and Xiangyang Ji and Yongdong Zhang",
title = "Depth Image Denoising Using Nuclear Norm and Learning
Graph Model",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "122:1--122:17",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3404374",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3404374",
abstract = "Depth image denoising is increasingly becoming the hot
research topic nowadays, because it reflects the
three-dimensional scene and can be applied in various
fields of computer vision. But the depth images
obtained from depth camera usually contain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "122",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2021:MAS,
author = "Lin Zhu and Xiurong Jiang and Jianing Li and Yuanhong
Hao and Yonghong Tian",
title = "Motion-Aware Structured Matrix Factorization for
Foreground Detection in Complex Scenes",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "123:1--123:23",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3407188",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3407188",
abstract = "Foreground detection is one of the key steps in
computer vision applications. Many foreground and
background models have been proposed and achieved
promising performance in static scenes. However, due to
challenges such as dynamic background, irregular
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "123",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wei:2021:CNL,
author = "Yang Wei and Zhuzhu Wang and Bin Xiao and Ximeng Liu
and Zheng Yan and Jianfeng Ma",
title = "Controlling Neural Learning Network with Multiple
Scales for Image Splicing Forgery Detection",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "124:1--124:22",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408299",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408299",
abstract = "The guarantee of social stability comes from many
aspects of life, and image information security as one
of them is being subjected to various malicious
attacks. As a means of information attack, image
splicing forgery refers to copying some areas of an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "124",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2021:VRS,
author = "Kun Zeng and Jiangchuan Hu and Yongyi Gong and
Kanoksak Wattanachote and Runpeng Yu and Xiaonan Luo",
title = "Vertical Retargeting for Stereoscopic Images via
Stereo Seam Carving",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "125:1--125:22",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408295",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408295",
abstract = "Vertical retargeting for stereoscopic images using
seam manipulation-based approaches has remained an open
challenge over the years. Even though horizontal
retargeting had attracted a huge amount of interest,
its seam coupling strategies were not \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "125",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tian:2021:PIC,
author = "Tao Tian and Hanli Wang and Sam Kwong and C.-C. Jay
Kuo",
title = "Perceptual Image Compression with Block-Level Just
Noticeable Difference Prediction",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "126:1--126:15",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408320",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408320",
abstract = "A block-level perceptual image compression framework
is proposed in this work, including a block-level just
noticeable difference (JND) prediction model and a
preprocessing scheme. Specifically speaking,
block-level JND values are first deduced by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "126",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{He:2021:MFU,
author = "Xin He and Qiong Liu and You Yang",
title = "Make Full Use of Priors: Cross-View Optimized Filter
for Multi-View Depth Enhancement",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "127:1--127:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408293",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408293",
abstract = "Multi-view video plus depth (MVD) is the promising and
widely adopted data representation for future 3D visual
applications and interactive media. However,
compression distortions on depth videos impede the
development of such applications, and filters
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:AAB,
author = "Xiaoxiao Liu and Qingyang Xu",
title = "Adaptive Attention-based High-level Semantic
Introduction for Image Caption",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "128:1--128:22",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409388",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3409388",
abstract = "There have been several attempts to integrate a
spatial visual attention mechanism into an image
caption model and introduce semantic concepts as the
guidance of image caption generation. High-level
semantic information consists of the abstractedness
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "128",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{UlFazal:2021:EIC,
author = "Muhammad Abu {Ul Fazal} and Sam Ferguson and Andrew
Johnston",
title = "Evaluation of Information Comprehension in Concurrent
Speech-based Designs",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "129:1--129:19",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409463",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3409463",
abstract = "In human-computer interaction, particularly in
multimedia delivery, information is communicated to
users sequentially, whereas users are capable of
receiving information from multiple sources
concurrently. This mismatch indicates that a sequential
mode \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "129",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2021:LDA,
author = "Yucheng Zhu and Guangtao Zhai and Xiongkuo Min and
Jiantao Zhou",
title = "Learning a Deep Agent to Predict Head Movement in
360-Degree Images",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "130:1--130:23",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410455",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3410455",
abstract = "Virtual reality adequately stimulates senses to trick
users into accepting the virtual environment. To create
a sense of immersion, high-resolution images are
required to satisfy human visual system, and low
latency is essential for smooth operations, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "130",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nie:2021:MMI,
author = "Weizhi Nie and Qi Liang and Yixin Wang and Xing Wei
and Yuting Su",
title = "{MMFN}: Multimodal Information Fusion Networks for
{$3$D} Model Classification and Retrieval",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "131:1--131:22",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410439",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3410439",
abstract = "In recent years, research into 3D shape recognition in
the field of multimedia and computer vision has
attracted wide attention. With the rapid development of
deep learning, various deep models have achieved
state-of-the-art performance based on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "131",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2021:GRC,
author = "Zhongying Zhao and Yonghao Yang and Chao Li and
Liqiang Nie",
title = "{GuessUNeed}: Recommending Courses via Neural
Attention Network and Course Prerequisite Relation
Embeddings",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "132:1--132:17",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410441",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3410441",
abstract = "Massive Open Online Courses, offering millions of
high-quality courses from prestigious universities and
prominent experts, are picking up momentum in
popularity. Although users enrolling on MOOCs have free
access to abundant knowledge, they may easily
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "132",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2021:KDE,
author = "Yi Huang and Xiaoshan Yang and Junyu Gao and Jitao
Sang and Changsheng Xu",
title = "Knowledge-driven Egocentric Multimodal Activity
Recognition",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "133:1--133:133",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409332",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3409332",
abstract = "Recognizing activities from egocentric multimodal data
collected by wearable cameras and sensors, is gaining
interest, as multimodal methods always benefit from the
complementarity of different modalities. However, since
high-dimensional videos contain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:PBS,
author = "Yaoyu Li and Hantao Yao and Tianzhu Zhang and
Changsheng Xu",
title = "Part-based Structured Representation Learning for
Person Re-identification",
journal = j-TOMM,
volume = "16",
number = "4",
pages = "134:1--134:22",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3412384",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Feb 10 10:15:11 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3412384",
abstract = "Person re-identification aims to match person of
interest under non-overlapping camera views. Therefore,
how to generate a robust and discriminative
representation is crucial for person re-identification.
Mining local clues from human body parts to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "134",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jin:2021:MTL,
author = "Xin Jin and Jianfeng Xu and Kazuyuki Tasaka and Zhibo
Chen",
title = "Multi-task Learning-based All-in-one Collaboration
Framework for Degraded Image Super-resolution",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "21:1--21:21",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3417333",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3417333",
abstract = "In this article, we address the degraded image
super-resolution problem in a multi-task learning (MTL)
manner. To better share representations between
multiple tasks, we propose an all-in-one collaboration
framework (ACF) with a learnable ``junction'' unit
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tran:2021:CQM,
author = "Huyen T. T. Tran and Nam Pham Ngoc and Tobias
Ho{\ss}feld and Michael Seufert and Truong Cong Thang",
title = "Cumulative Quality Modeling for {HTTP} Adaptive
Streaming",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "22:1--22:24",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3423421",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3423421",
abstract = "HTTP Adaptive Streaming has become the de facto choice
for multimedia delivery. However, the quality of
adaptive video streaming may fluctuate strongly during
a session due to throughput fluctuations. So, it is
important to evaluate the quality of a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:SVM,
author = "Tong Xu and Peilun Zhou and Linkang Hu and Xiangnan He
and Yao Hu and Enhong Chen",
title = "Socializing the Videos: a Multimodal Approach for
Social Relation Recognition",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "23:1--23:23",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3416493",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3416493",
abstract = "As a crucial task for video analysis, social relation
recognition for characters not only provides
semantically rich description of video content but also
supports intelligent applications, e.g., video
retrieval and visual question answering. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2021:RSI,
author = "Xuehu Yan and Lintao Liu and Longlong Li and Yuliang
Lu",
title = "Robust Secret Image Sharing Resistant to Noise in
Shares",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "24:1--24:22",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3419750",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3419750",
abstract = "A secret image is split into shares in the generation
phase of secret image sharing (SIS) for a threshold. In
the recovery phase, the secret image is recovered when
any or more shares are collected, and each collected
share is generally assumed to be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:ANM,
author = "Mingliang Xu and Qingfeng Li and Jianwei Niu and Hao
Su and Xiting Liu and Weiwei Xu and Pei Lv and Bing
Zhou and Yi Yang",
title = "{ART-UP}: a Novel Method for Generating
Scanning-Robust Aesthetic {QR} Codes",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "25:1--25:23",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418214",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3418214",
abstract = "Quick response (QR) codes are usually scanned in
different environments, so they must be robust to
variations in illumination, scale, coverage, and camera
angles. Aesthetic QR codes improve the visual quality,
but subtle changes in their appearance may \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2021:CIR,
author = "Peihao Yang and Linghe Kong and Meikang Qiu and Xue
Liu and Guihai Chen",
title = "Compressed Imaging Reconstruction with Sparse Random
Projection",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "26:1--26:25",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447431",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447431",
abstract = "As the Internet of Things thrives, monitors and
cameras produce tons of image data every day. To
efficiently process these images, many compressed
imaging frameworks are proposed. A compressed imaging
framework comprises two parts, image signal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qi:2021:GNT,
author = "Lei Qi and Lei Wang and Jing Huo and Yinghuan Shi and
Yang Gao",
title = "{GreyReID}: a Novel Two-stream Deep Framework with
{RGB}-grey Information for Person Re-identification",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "27:1--27:22",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3419439",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3419439",
abstract = "In this article, we observe that most false positive
images (i.e., different identities with query images)
in the top ranking list usually have the similar color
information with the query image in person
re-identification (Re-ID). Meanwhile, when we use
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chehabeddine:2021:BMH,
author = "Said Chehabeddine and Muhammad Hassan Jamil and Wanjoo
Park and Dianne L. Sefo and Peter M. Loomer and Mohamad
Eid",
title = "{Bi}-manual Haptic-based Periodontal Simulation with
Finger Support and Vibrotactile Feedback",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "28:1--28:17",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3421765",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3421765",
abstract = "The rise of virtual reality and haptic technologies
has created exciting new applications in medical
training and education. In a dental simulation, haptic
technology can create the illusion of substances
(teeth, gingiva, bone, etc.) by providing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:MHP,
author = "Jianshu Li and Jian Zhao and Congyan Lang and Yidong
Li and Yunchao Wei and Guodong Guo and Terence Sim and
Shuicheng Yan and Jiashi Feng",
title = "Multi-human Parsing with a Graph-based Generative
Adversarial Model",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "29:1--29:21",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418217",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3418217",
abstract = "Human parsing is an important task in human-centric
image understanding in computer vision and multimedia
systems. However, most existing works on human parsing
mainly tackle the single-person scenario, which
deviates from real-world applications where \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cinar:2021:IJB,
author = "Yusuf Cinar and Peter Pocta and Desmond Chambers and
Hugh Melvin",
title = "Improved Jitter Buffer Management for {WebRTC}",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "30:1--30:20",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3410449",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3410449",
abstract = "This work studies the jitter buffer management
algorithm for Voice over IP in WebRTC. In particular,
it details the core concepts of WebRTC's jitter buffer
management. Furthermore, it investigates how jitter
buffer management algorithm behaves under \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Czekierda:2021:AOO,
author = "Lukasz Czekierda and Krzysztof Zieli{\'n}ski and
S{\l}awomir Zieli{\'n}ski",
title = "Automated Orchestration of Online Educational
Collaboration in Cloud-based Environments",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "31:1--31:26",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3412381",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3412381",
abstract = "Integrated collaboration environments (ICEs) are
widely used by corporations to increase productivity by
fostering groupwide and interpersonal collaboration. In
this article, we discuss the enhancements of such
environment needed to build an educational \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kieu:2021:BLD,
author = "My Kieu and Andrew D. Bagdanov and Marco Bertini",
title = "Bottom-up and Layerwise Domain Adaptation for
Pedestrian Detection in Thermal Images",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "32:1--32:19",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418213",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3418213",
abstract = "Pedestrian detection is a canonical problem for safety
and security applications, and it remains a challenging
problem due to the highly variable lighting conditions
in which pedestrians must be detected. This article
investigates several domain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:MIH,
author = "Wenjie Wang and Ling-Yu Duan and Hao Jiang and
Peiguang Jing and Xuemeng Song and Liqiang Nie",
title = "{Market$2$Dish}: Health-aware Food Recommendation",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "33:1--33:19",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3418211",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3418211",
abstract = "With the rising incidence of some diseases, such as
obesity and diabetes, the healthy diet is arousing
increasing attention. However, most existing
food-related research efforts focus on recipe
retrieval, user-preference-based food recommendation,
cooking \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:ADA,
author = "Yiding Liu and Siyu Yang and Bin Li and Wengang Zhou
and Jizheng Xu and Houqiang Li and Yan Lu",
title = "Affinity Derivation for Accurate Instance
Segmentation",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "34:1--34:20",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3407090",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3407090",
abstract = "Affinity, which represents whether two pixels belong
to a same instance, is an equivalent representation to
the instance segmentation labels. Conventional works do
not make an explicit exploration on the affinity. In
this article, we present two instance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2021:CLG,
author = "Yi Yu and Abhishek Srivastava and Simon Canales",
title = "Conditional {LSTM-GAN} for Melody Generation from
Lyrics",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "35:1--35:20",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3424116",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3424116",
abstract = "Melody generation from lyrics has been a challenging
research issue in the field of artificial intelligence
and music, which enables us to learn and discover
latent relationships between interesting lyrics and
accompanying melodies. Unfortunately, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2021:AWE,
author = "Xin Yang and Xuemeng Song and Fuli Feng and Haokun Wen
and Ling-Yu Duan and Liqiang Nie",
title = "Attribute-wise Explainable Fashion Compatibility
Modeling",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "36:1--36:21",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3425636",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3425636",
abstract = "With the boom of the fashion market and people's daily
needs for beauty, clothing matching has gained
increased research attention. In a sense, tackling this
problem lies in modeling the human notions of the
compatibility between fashion items, i.e., \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:SSL,
author = "Zhixin Li and Lan Lin and Canlong Zhang and Huifang Ma
and Weizhong Zhao and Zhiping Shi",
title = "A Semi-supervised Learning Approach Based on Adaptive
Weighted Fusion for Automatic Image Annotation",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "37:1--37:23",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3426974",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3426974",
abstract = "To learn a well-performed image annotation model, a
large number of labeled samples are usually required.
Although the unlabeled samples are readily available
and abundant, it is a difficult task for humans to
annotate large numbers of images manually. In
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:DVV,
author = "Yanwei Liu and Jinxia Liu and Antonios Argyriou and
Siwei Ma and Liming Wang and Zhen Xu",
title = "$ 360$-Degree {VR} Video Watermarking Based on
Spherical Wavelet Transform",
journal = j-TOMM,
volume = "17",
number = "1",
pages = "38:1--38:23",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3425605",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:40:21 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3425605",
abstract = "Similar to conventional video, the increasingly
popular 360 virtual reality (VR) video requires
copyright protection mechanisms. The classic approach
for copyright protection is the introduction of a
digital watermark into the video sequence. Due to the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:IBM,
author = "Yang Wang and Meng Fang and Joey Tianyi Zhou and
Tingting Mu and Dacheng Tao",
title = "Introduction to Big Multimodal Multimedia Data with
Deep Analytics",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "1:1--1:3",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447530",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447530",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:ZSC,
author = "Xing Xu and Jialin Tian and Kaiyi Lin and Huimin Lu
and Jie Shao and Heng Tao Shen",
title = "Zero-shot Cross-modal Retrieval by Assembling
{AutoEncoder} and Generative Adversarial Network",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "3:1--3:17",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3424341",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3424341",
abstract = "Conventional cross-modal retrieval models mainly
assume the same scope of the classes for both the
training set and the testing set. This assumption
limits their extensibility on zero-shot cross-modal
retrieval (ZS-CMR), \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fu:2021:DGL,
author = "Sichao Fu and Weifeng Liu and Weili Guan and Yicong
Zhou and Dapeng Tao and Changsheng Xu",
title = "Dynamic Graph Learning Convolutional Networks for
Semi-supervised Classification",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "4:1--4:13",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3412846",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3412846",
abstract = "Over the past few years, graph representation learning
(GRL) has received widespread attention on the feature
representations of the non-Euclidean data. As a typical
model of GRL, graph convolutional networks \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:DNP,
author = "Zhao Zhang and Jiahuan Ren and Haijun Zhang and Zheng
Zhang and Guangcan Liu and Shuicheng Yan",
title = "{DLRF-Net}: a Progressive Deep Latent Low-Rank Fusion
Network for Hierarchical Subspace Discovery",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "5:1--5:24",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3402030",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3402030",
abstract = "Low-rank coding-based representation learning is
powerful for discovering and recovering the subspace
structures in data, which has obtained an impressive
performance; however, it still cannot obtain deep
hidden \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:GMM,
author = "Yi Zhang and Miaomiao Li and Siwei Wang and Sisi Dai
and Lei Luo and En Zhu and Huiying Xu and Xinzhong Zhu
and Chaoyun Yao and Haoran Zhou",
title = "{Gaussian} Mixture Model Clustering with Incomplete
Data",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "6:1--6:14",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408318",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408318",
abstract = "Gaussian mixture model (GMM) clustering has been
extensively studied due to its effectiveness and
efficiency. Though demonstrating promising performance
in various applications, it cannot effectively address
the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:ROR,
author = "Jing Zhang and Jiaqi Guo and Yonggong Ren",
title = "Robust Ordinal Regression: User Credit Grading with
Triplet Loss-Based Sampling",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "7:1--7:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408303",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408303",
abstract = "With the development of social media sites, user
credit grading, which served as an important and
fashionable problem, has attracted substantial
attention from a slew of developers and operators of
mobile applications. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:EIE,
author = "Xin Xu and Shiqin Wang and Zheng Wang and Xiaolong
Zhang and Ruimin Hu",
title = "Exploring Image Enhancement for Salient Object
Detection in Low Light Images",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "8:1--8:19",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3414839",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3414839",
abstract = "Low light images captured in a non-uniform
illumination environment usually are degraded with the
scene depth and the corresponding environment lights.
This degradation results in severe object information
loss in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:LSI,
author = "Yanchun Li and Jianglian Cao and Zhetao Li and
Sangyoon Oh and Nobuyoshi Komuro",
title = "Lightweight Single Image Super-resolution with Dense
Connection Distillation Network",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "9:1--9:17",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3414838",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3414838",
abstract = "Single image super-resolution attempts to reconstruct
a high-resolution (HR) image from its corresponding
low-resolution (LR) image, which has been a research
hotspot in computer vision and image processing for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:SDM,
author = "Yang Wang",
title = "Survey on Deep Multi-modal Data Analytics:
Collaboration, Rivalry, and Fusion",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "10:1--10:25",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408317",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408317",
abstract = "With the development of web technology, multi-modal or
multi-view data has surged as a major stream for big
data, where each modal/view encodes individual property
of data objects. Often, different modalities are
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:ISI,
author = "Yang Wang and Meng Fang and Joey Tianyi Zhou and
Tingting Mu and Dacheng Tao",
title = "Introduction to the Special Issue on Fine-grained
Visual Computing",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "11:1--11:3",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447532",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447532",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2021:AEN,
author = "Yutao Hu and Xuhui Liu and Baochang Zhang and Jungong
Han and Xianbin Cao",
title = "Alignment Enhancement Network for Fine-grained Visual
Categorization",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "12:1--12:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446208",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446208",
abstract = "Fine-grained visual categorization (FGVC) aims to
automatically recognize objects from different
sub-ordinate categories. Despite attracting
considerable attention from both academia and industry,
it remains a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guan:2021:UPS,
author = "Weili Guan and Zhaozheng Chen and Fuli Feng and
Weifeng Liu and Liqiang Nie",
title = "Urban Perception: Sensing Cities via a Deep
Interactive Multi-task Learning Framework",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "13:1--13:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3424115",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3424115",
abstract = "Social scientists have shown evidence that visual
perceptions of urban attributes, such as safe, wealthy,
and beautiful perspectives of the given cities, are
highly correlated to the residents' behaviors and
quality \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lu:2021:CIC,
author = "Huimin Lu and Rui Yang and Zhenrong Deng and Yonglin
Zhang and Guangwei Gao and Rushi Lan",
title = "{Chinese} Image Captioning via Fuzzy Attention-based
{DenseNet-BiLSTM}",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "14:1--14:18",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3422668",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3422668",
abstract = "Chinese image description generation tasks usually
have some challenges, such as single-feature
extraction, lack of global information, and lack of
detailed description of the image content. To address
these limitations, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiao:2021:WSS,
author = "Junsheng Xiao and Huahu Xu and Honghao Gao and Minjie
Bian and Yang Li",
title = "A Weakly Supervised Semantic Segmentation Network by
Aggregating Seed Cues: The Multi-Object Proposal
Generation Perspective",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "15:1--15:19",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3419842",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3419842",
abstract = "Weakly supervised semantic segmentation under
image-level annotations is effectiveness for real-world
applications. The small and sparse discriminative
regions obtained from an image classification network
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:RMR,
author = "Chao Zhang and Xiaopei Wu and Jianchao Lu and Xi Zheng
and Alireza Jolfaei and Quan Z. Sheng and Dongjin Yu",
title = "{RICA-MD}: a Refined {ICA} Algorithm for Motion
Detection",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "17:1--17:17",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3416492",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3416492",
abstract = "With the rapid development of various computing
technologies, the constraints of data processing
capabilities gradually disappeared, and more data can
be simultaneously processed to obtain better \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rahman:2021:MMP,
author = "MD Abdur Rahman and M. Shamim Hossain and Nabil A.
Alrajeh and B. B. Gupta",
title = "A Multimodal, Multimedia Point-of-Care Deep Learning
Framework for {COVID-19} Diagnosis",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "18:1--18:24",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3421725",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3421725",
abstract = "In this article, we share our experiences in designing
and developing a suite of deep neural network-(DNN)
based COVID-19 case detection and recognition
framework. Existing pathological tests such as
RT-PCR-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:SFF,
author = "Yidong Li and Wenhua Liu and Yi Jin and Yuanzhouhan
Cao",
title = "{SPGAN}: Face Forgery Using Spoofing Generative
Adversarial Networks",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "19:1--19:20",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3432817",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3432817",
abstract = "Current face spoof detection schemes mainly rely on
physiological cues such as eye blinking, mouth
movements, and micro-expression changes, or textural
attributes of the face images [9]. But none of these
methods \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qi:2021:CAW,
author = "Lianyong Qi and Houbing Song and Xuyun Zhang and
Gautam Srivastava and Xiaolong Xu and Shui Yu",
title = "Compatibility-Aware {Web} {API} Recommendation for
Mashup Creation via Textual Description Mining",
journal = j-TOMM,
volume = "17",
number = "1s",
pages = "20:1--20:19",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3417293",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Apr 17 08:50:01 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3417293",
abstract = "With the ever-increasing prosperity of web Application
Programming Interface (API) sharing platforms, it is
becoming an economic and efficient way for software
developers to design their interested mashups \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Krishnan:2021:SEQ,
author = "Prabhakar Krishnan and Kurunandan Jain and Pramod
George Jose and Krishnashree Achuthan and Rajkumar
Buyya",
title = "{SDN} Enabled {QoE} and Security Framework for
Multimedia Applications in {5G} Networks",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "39:1--39:29",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3377390",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3377390",
abstract = "The technologies for real-time multimedia transmission
and immersive 3D gaming applications are rapidly
emerging, posing challenges in terms of performance,
security, authentication, data privacy, and encoding.
The communication channel for these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kumar:2021:ESE,
author = "S. Sambath Kumar and M. Nandhini",
title = "Entropy Slicing Extraction and Transfer Learning
Classification for Early Diagnosis of {Alzheimer}
Diseases with {sMRI}",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "40:1--40:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3383749",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3383749",
abstract = "Alzheimer's Disease (AD) is an irreversible
neurogenerative disorder that undergoes progressive
decline in memory and cognitive function and is
characterized by structural brain Magnetic Resonance
Images (sMRI). In recent years, sMRI data has played a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:TTF,
author = "Xiaolong Xu and Zijie Fang and Lianyong Qi and Xuyun
Zhang and Qiang He and Xiaokang Zhou",
title = "{TripRes}: Traffic Flow Prediction Driven Resource
Reservation for Multimedia {IoV} with Edge Computing",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "41:1--41:21",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3401979",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3401979",
abstract = "The Internet of Vehicles (IoV) connects vehicles,
roadside units (RSUs) and other intelligent objects,
enabling data sharing among them, thereby improving the
efficiency of urban traffic and safety. Currently,
collections of multimedia content, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2021:FDI,
author = "Wei Liang and Jing Long and Kuan-Ching Li and Jianbo
Xu and Nanjun Ma and Xia Lei",
title = "A Fast Defogging Image Recognition Algorithm Based on
Bilateral Hybrid Filtering",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "42:1--42:16",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3391297",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3391297",
abstract = "With the rapid advancement of video and image
processing technologies in the Internet of Things, it
is urgent to address the issues in real-time
performance, clarity, and reliability of image
recognition technology for a monitoring system in foggy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tong:2021:IPP,
author = "Chao Tong and Mengze Zhang and Chao Lang and Zhigao
Zheng",
title = "An Image Privacy Protection Algorithm Based on
Adversarial Perturbation Generative Networks",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "43:1--43:14",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3381088",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3381088",
abstract = "Today, users of social platforms upload a large number
of photos. These photos contain personal private
information, including user identity information, which
is easily gleaned by intelligent detection algorithms.
To thwart this, in this work, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fu:2021:FAA,
author = "Yunfei Fu and Hongchuan Yu and Chih-Kuo Yeh and
Tong-Yee Lee and Jian J. Zhang",
title = "Fast Accurate and Automatic Brushstroke Extraction",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "44:1--44:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3429742",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3429742",
abstract = "Brushstrokes are viewed as the artist's
``handwriting'' in a painting. In many applications
such as style learning and transfer, mimicking
painting, and painting authentication, it is highly
desired to quantitatively and accurately identify
brushstroke \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{K:2021:AML,
author = "Mythili K. and Manish Narwaria",
title = "Assessment of Machine Learning-Based Audiovisual
Quality Predictors: Why Uncertainty Matters",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "45:1--45:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3430376",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3430376",
abstract = "Quality assessment of audiovisual (AV) signals is
important from the perspective of system design,
optimization, and management of a modern multimedia
communication system. However, automatic prediction of
AV quality via the use of computational models
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hama:2021:EUM,
author = "Kenta Hama and Takashi Matsubara and Kuniaki Uehara
and Jianfei Cai",
title = "Exploring Uncertainty Measures for Image-caption
Embedding-and-retrieval Task",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "46:1--46:19",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3425663",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3425663",
abstract = "With the significant development of black-box machine
learning algorithms, particularly deep neural networks,
the practical demand for reliability assessment is
rapidly increasing. On the basis of the concept that
``Bayesian deep learning knows what it \ldots{}''",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nguyen:2021:ISV,
author = "Phuong-Anh Nguyen and Chong-Wah Ngo",
title = "Interactive Search vs. Automatic Search: an Extensive
Study on Video Retrieval",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "47:1--47:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3429457",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3429457",
abstract = "This article conducts user evaluation to study the
performance difference between interactive and
automatic search. Particularly, the study aims to
provide empirical insights of how the performance
landscape of video search changes, with tens of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:TAE,
author = "Yang Li and Guangcan Liu and Yubao Sun and Qingshan
Liu and Shengyong Chen",
title = "{$3$D} Tensor Auto-encoder with Application to Video
Compression",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "48:1--48:18",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3431768",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3431768",
abstract = "Auto-encoder has been widely used to compress
high-dimensional data such as the images and videos.
However, the traditional auto-encoder network needs to
store a large number of parameters. Namely, when the
input data is of dimension n, the number of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mehrabi:2021:MTC,
author = "Abbas Mehrabi and Matti Siekkinen and Teemu
K{\"a}m{\"a}r{\"a}inen and Antti
Yl{\"a}-J{\"a}{\"a}ski",
title = "Multi-Tier {CloudVR}: Leveraging Edge Computing in
Remote Rendered Virtual Reality",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "49:1--49:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3429441",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3429441",
abstract = "The availability of high bandwidth with low-latency
communication in 5G mobile networks enables remote
rendered real-time virtual reality (VR) applications.
Remote rendering of VR graphics in a cloud removes the
need for local personal computer for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2021:ARO,
author = "Lu Sun and Hussein {Al Osman} and Jochen Lang",
title = "An Augmented Reality Online Assistance Platform for
Repair Tasks",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "50:1--50:23",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3429285",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3429285",
abstract = "Our augmented reality online assistance platform
enables an expert to specify 6DoF movements of a
component and apply the geometrical and physical
constraints in real-time. We track the real components
on the expert's side to monitor the operations of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2021:SAM,
author = "Meiqi Zhao and Jianmin Zheng and Elvis S. Liu",
title = "Server Allocation for Massively Multiplayer Online
Cloud Games Using Evolutionary Optimization",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "51:1--51:23",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433027",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3433027",
abstract = "In recent years, Massively Multiplayer Online Games
(MMOGs) are becoming popular, partially due to their
sophisticated graphics and broad virtual world, and
cloud gaming is demanded more than ever especially when
entertaining with light and portable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wei:2021:ISS,
author = "Haiyang Wei and Zhixin Li and Feicheng Huang and
Canlong Zhang and Huifang Ma and Zhongzhi Shi",
title = "Integrating Scene Semantic Knowledge into Image
Captioning",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "52:1--52:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3439734",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3439734",
abstract = "Most existing image captioning methods use only the
visual information of the image to guide the generation
of captions, lack the guidance of effective scene
semantic information, and the current visual attention
mechanism cannot adjust the focus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gupta:2021:VSB,
author = "Shikha Gupta and Krishan Sharma and Dileep Aroor
Dinesh and Veena Thenkanidiyoor",
title = "Visual Semantic-Based Representation Learning Using
Deep {CNNs} for Scene Recognition",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "53:1--53:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3436494",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3436494",
abstract = "In this work, we address the task of scene recognition
from image data. A scene is a spatially correlated
arrangement of various visual semantic contents also
known as concepts, e.g., ``chair,'' ``car,'' ``sky,''
etc. Representation learning using visual \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2021:PCN,
author = "Chun-ying Huang and Yun-chen Cheng and Guan-zhang
Huang and Ching-ling Fan and Cheng-hsin Hsu",
title = "On the Performance Comparisons of Native and
Clientless Real-Time Screen-Sharing Technologies",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "54:1--54:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3437881",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3437881",
abstract = "Real-time screen-sharing provides users with
ubiquitous access to remote applications, such as
computer games, movie players, and desktop applications
(apps), anywhere and anytime. In this article, we study
the performance of different screen-sharing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2021:ACG,
author = "Xin Yang and Zongliang Ma and Letian Yu and Ying Cao
and Baocai Yin and Xiaopeng Wei and Qiang Zhang and
Rynson W. H. Lau",
title = "Automatic Comic Generation with Stylistic Multi-page
Layouts and Emotion-driven Text Balloon Generation",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "55:1--55:19",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3440053",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3440053",
abstract = "In this article, we propose a fully automatic system
for generating comic books from videos without any
human intervention. Given an input video along with its
subtitles, our approach first extracts informative
keyframes by analyzing the subtitles and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sharma:2021:HQF,
author = "Prasen Kumar Sharma and Sujoy Ghosh and Arijit Sur",
title = "High-quality Frame Recurrent Video De-raining with
Multi-contextual Adversarial Network",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "56:1--56:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3444974",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3444974",
abstract = "In this article, we address the problem of rain-streak
removal in the videos. Unlike the image, challenges in
video restoration comprise temporal consistency besides
spatial enhancement. The researchers across the world
have proposed several effective \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lan:2021:STR,
author = "Xiangyuan Lan and Zifei Yang and Wei Zhang and Pong C.
Yuen",
title = "Spatial-temporal Regularized Multi-modality
Correlation Filters for Tracking with Re-detection",
journal = j-TOMM,
volume = "17",
number = "2",
pages = "57:1--57:16",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3430257",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jun 5 07:35:45 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3430257",
abstract = "The development of multi-spectrum image sensing
technology has brought great interest in exploiting the
information of multiple modalities (e.g., RGB and
infrared modalities) for solving computer vision
problems. In this article, we investigate how to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2021:ISI,
author = "Amit Kumar Singh and Zhihan Lv and Hoon Ko",
title = "Introduction to the Special Issue on {Recent Trends in
Medical Data Security for e-Health Applications}",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "58:1--58:3",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3459601",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3459601",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2021:SHD,
author = "A. K. Singh and A. Anand and Z. Lv and H. Ko and A.
Mohan",
title = "A Survey on Healthcare Data: a Security Perspective",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "59:1--59:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3422816",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3422816",
abstract = "With the remarkable development of internet
technologies, the popularity of smart healthcare has
regularly come to the fore. Smart healthcare uses
advanced technologies to transform the traditional
                 medical system in an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2021:SPP,
author = "Hongjiao Wu and Ashutosh Dhar Dwivedi and Gautam
Srivastava",
title = "Security and Privacy of Patient Information in Medical
Systems Based on Blockchain Technology",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "60:1--60:17",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3408321",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3408321",
abstract = "The essence of ``blockchain'' is a shared database in
which information stored is un-falsifiable, traceable,
open, and transparent. Therefore, to improve the
security of private information in medical systems,
                 this article \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:OBR,
author = "Ting Wang and Xiangjun Ji and Aiguo Song and Kurosh
Madani and Amine Chohra and Huimin Lu and Ramon
Monero",
title = "Output-Bounded and {RBFNN}-Based Position Tracking and
Adaptive Force Control for Security Tele-Surgery",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "61:1--61:15",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3394920",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3394920",
abstract = "In security e-health brain neurosurgery, one of the
important processes is to move the electrocoagulation
to the appropriate position in order to excavate the
                 diseased tissue. However, it has been problematic
                 for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alkhariji:2021:SPD,
author = "Lamya Alkhariji and Nada Alhirabi and Mansour Naser
Alraja and Mahmoud Barhamgi and Omer Rana and Charith
Perera",
title = "Synthesising Privacy by Design Knowledge Toward
Explainable {Internet of Things} Application Designing
in Healthcare",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "62:1--62:29",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3434186",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3434186",
abstract = "Privacy by Design (PbD) is the most common approach
followed by software developers who aim to reduce risks
within their application designs, yet it remains
                 commonplace for developers to retain little \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tanveer:2021:PLT,
author = "M. Tanveer and Tarun Gupta and Miten Shah and {For the
Alzheimer's Disease Neuroimaging Initiative}",
title = "Pinball Loss Twin Support Vector Clustering",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "63:1--63:23",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3409264",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3409264",
abstract = "Twin Support Vector Clustering (TWSVC) is a clustering
algorithm inspired by the principles of Twin Support
Vector Machine (TWSVM). TWSVC has already outperformed
other traditional plane based clustering algorithms.
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sahu:2021:LMP,
author = "Amiya Kumar Sahu and Suraj Sharma and Deepak Puthal",
title = "Lightweight Multi-party Authentication and Key
Agreement Protocol in {IoT}-based E-Healthcare
Service",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "64:1--64:20",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3398039",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3398039",
abstract = "Internet of Things (IoT) is playing a promising role
in e-healthcare applications in the recent decades;
nevertheless, security is one of the crucial challenges
in the current field of study. Many healthcare devices
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rajput:2021:SBS,
author = "Amitesh Singh Rajput and Vishesh Kumar Tanwar and
Balasubramanian Raman",
title = "-Score-Based Secure Biomedical Model for Effective
Skin Lesion Segmentation Over {eHealth} Cloud",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "65:1--65:19",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3430806",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3430806",
abstract = "This study aims to process the private medical data
over eHealth cloud platform. The current pandemic
                 situation, caused by COVID-19, has made us realize the
                 importance of automatic remotely operated independent
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2021:EEB,
author = "Ashima Singh and Arwinder Dhillon and Neeraj Kumar and
M. Shamim Hossain and Ghulam Muhammad and Manoj Kumar",
title = "{eDiaPredict}: an Ensemble-based Framework for
Diabetes Prediction",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "66:1--66:26",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3415155",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3415155",
abstract = "Medical systems incorporate modern computational
intelligence in healthcare. Machine learning techniques
are applied to predict the onset and reoccurrence of
the disease, identify biomarkers for survivability
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Amato:2021:SPV,
author = "Flora Amato and Valentina Casola and Giovanni
Cozzolino and Alessandra {De Benedictis} and Nicola
Mazzocca and Francesco Moscato",
title = "A Security and Privacy Validation Methodology for
e-Health Systems",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "67:1--67:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3412373",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3412373",
abstract = "e-Health applications enable one to acquire, process,
and share patient medical data to improve diagnosis,
treatment, and patient monitoring. Despite the
undeniable benefits brought by the digitization of
                 health systems, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kasyap:2021:PPD,
author = "Harsh Kasyap and Somanath Tripathy",
title = "Privacy-preserving Decentralized Learning Framework
for Healthcare System",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "68:1--68:24",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3426474",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3426474",
abstract = "Clinical trials and drug discovery would not be
effective without the collaboration of institutions.
Earlier, it has been at the cost of individual's
privacy. Several pacts and compliances have been
                 enforced to avoid data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shamsolmoali:2021:ISI,
author = "Pourya Shamsolmoali and Ruili Wang and A. H. Sadka",
title = "Introduction to the Special Issue on {Advanced
Approaches for Multiple Instance Learning on Multimedia
Applications}",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "69:1--69:2",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3459603",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3459603",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ji:2021:MPG,
author = "Ruyi Ji and Zeyu Liu and Libo Zhang and Jianwei Liu
and Xin Zuo and Yanjun Wu and Chen Zhao and Haofeng
Wang and Lin Yang",
title = "Multi-peak Graph-based Multi-instance Learning for
Weakly Supervised Object Detection",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "70:1--70:21",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3432861",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3432861",
abstract = "Weakly supervised object detection (WSOD), aiming to
detect objects with only image-level annotations, has
become one of the research hotspots over the past few
years. Recently, much effort has been devoted to
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ding:2021:MSA,
author = "Yaoling Ding and Liehuang Zhu and An Wang and Yuan Li
and Yongjuan Wang and Siu Ming Yiu and Keke Gai",
title = "A Multiple Sieve Approach Based on Artificial
Intelligent Techniques and Correlation Power Analysis",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "71:1--71:21",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433165",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3433165",
abstract = "Side-channel analysis achieves key recovery by
analyzing physical signals generated during the
operation of cryptographic devices. Power consumption
is one kind of these signals and can be regarded as a
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ji:2021:MIM,
author = "Wanting Ji and Ruili Wang",
title = "A Multi-instance Multi-label Dual Learning Approach
for Video Captioning",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "72:1--72:18",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446792",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446792",
abstract = "Video captioning is a challenging task in the field of
multimedia processing, which aims to generate
informative natural language descriptions/captions to
describe video contents. Previous video captioning
                 approaches \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zareapoor:2021:EAN,
author = "Masoumeh Zareapoor and Jie Yang",
title = "Equivariant Adversarial Network for Image-to-image
Translation",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "73:1--73:14",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458280",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458280",
abstract = "Image-to-Image translation aims to learn an image from
a source domain to a target domain. However, there are
three main challenges, such as lack of paired datasets,
multimodality, and diversity, that are associated
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mohammed:2021:MAF,
author = "Mazin Abed Mohammed and Mohamed Elhoseny and Karrar
Hameed Abdulkareem and Salama A. Mostafa and Mashael S.
Maashi",
title = "A Multi-agent Feature Selection and Hybrid
Classification Model for {Parkinson}'s Disease
Diagnosis",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "74:1--74:22",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3433180",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3433180",
abstract = "Parkinson's disease (PD) diagnostics includes numerous
analyses related to the neurological, physical, and
psychical status of the patient. Medical teams analyze
multiple symptoms and patient history considering
                 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{An:2021:MTU,
author = "Na An and Wei Qi Yan",
title = "Multitarget Tracking Using {Siamese} Neural Networks",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "75:1--75:16",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3441656",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3441656",
abstract = "In this article, we detect and track visual objects by
using Siamese network or twin neural network. The
Siamese network is constructed to classify moving
objects based on the associations of object detection
                 network and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2021:MCA,
author = "Xiaochuan Tang and Mingzhe Liu and Hao Zhong and
Yuanzhen Ju and Weile Li and Qiang Xu",
title = "{MILL}: Channel Attention-based Deep Multiple Instance
Learning for Landslide Recognition",
journal = j-TOMM,
volume = "17",
number = "2s",
pages = "76:1--76:11",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3454009",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Jun 22 08:33:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3454009",
abstract = "Landslide recognition is widely used in natural
disaster risk management. Traditional landslide
recognition is mainly conducted by geologists, which is
accurate but inefficient. This article introduces
                 multiple instance learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:NNB,
author = "Yue Li and Yan Yi and Dong Liu and Li Li and Zhu Li
and Houqiang Li",
title = "Neural-Network-Based Cross-Channel Intra Prediction",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "77:1--77:23",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3434250",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3434250",
abstract = "To reduce the redundancy among different color
channels, e.g., YUV, previous methods usually adopt a
linear model that tends to be oversimple for complex
image content. We propose a neural-network-based method
for cross-channel prediction in intra frame \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:MML,
author = "Zhandong Liu and Wengang Zhou and Houqiang Li",
title = "{MFECN}: Multi-level Feature Enhanced Cumulative
Network for Scene Text Detection",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "78:1--78:22",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3440087",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3440087",
abstract = "Recently, many scene text detection algorithms have
achieved impressive performance by using convolutional
neural networks. However, most of them do not make full
use of the context among the hierarchical multi-level
features to improve the performance of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dong:2021:SCL,
author = "Xingbo Dong and Soohyong Kim and Zhe Jin and Jung Yeon
Hwang and Sangrae Cho and Andrew Beng Jin Teoh",
title = "Secure Chaff-less Fuzzy Vault for Face Identification
Systems",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "79:1--79:22",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3442198",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3442198",
abstract = "Biometric cryptosystems such as fuzzy vaults represent
one of the most popular approaches for secret and
biometric template protection. However, they are solely
designed for biometric verification, where the user is
required to input both identity \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2021:GLE,
author = "Hezhen Hu and Wengang Zhou and Junfu Pu and Houqiang
Li",
title = "Global-Local Enhancement Network for {NMF}-Aware Sign
Language Recognition",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "80:1--80:19",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3436754",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3436754",
abstract = "Sign language recognition (SLR) is a challenging
problem, involving complex manual features (i.e., hand
gestures) and fine-grained non-manual features (NMFs)
(i.e., facial expression, mouth shapes, etc.).
Although manual features are dominant, non-manual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2021:RRN,
author = "Feng Lin and Wengang Zhou and Jiajun Deng and Bin Li
and Yan Lu and Houqiang Li",
title = "Residual Refinement Network with Attribute Guidance
for Precise Saliency Detection",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "81:1--81:19",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3440694",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3440694",
abstract = "As an important topic in the multimedia and computer
vision fields, salient object detection has been
researched for years. Recently, state-of-the-art
performance has been witnessed with the aid of the
fully convolutional networks (FCNs) and the various
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2021:IIR,
author = "Hongdi Zheng and Junfeng Wang and Jianping Zhang and
Ruirui Li",
title = "{IRTS}: an Intelligent and Reliable Transmission
Scheme for Screen Updates Delivery in {DaaS}",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "82:1--82:24",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3440035",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3440035",
abstract = "Desktop-as-a-service (DaaS) has been recognized as an
elastic and economical solution that enables users to
access personal desktops from anywhere at any time.
During the interaction process of DaaS, users rely on
screen updates to perceive execution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:SCG,
author = "Rui Wang and Dong Liang and Xiaochun Cao and Yuanfang
Guo",
title = "Semantic Correspondence with Geometric Structure
Analysis",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "83:1--83:21",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3441576",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3441576",
abstract = "This article studies the correspondence problem for
semantically similar images, which is challenging due
to the joint visual and geometric deformations. We
introduce the Flip-aware Distance Ratio method (FDR) to
solve this problem from the perspective of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:SSS,
author = "Xinfang Liu and Xiushan Nie and Junya Teng and Li Lian
and Yilong Yin",
title = "Single-shot Semantic Matching Network for Moment
Localization in Videos",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "84:1--84:14",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3441577",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3441577",
abstract = "Moment localization in videos using natural language
refers to finding the most relevant segment from videos
given a natural language query. Most of the existing
methods require video segment candidates for further
matching with the query, which leads to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alaya:2021:PBD,
author = "Bechir Alaya",
title = "Payoff-based Dynamic Segment Replication and Graph
Classification Method with Attribute Vectors Adapted to
Urban {VANET}",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "85:1--85:22",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3440018",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3440018",
abstract = "Due to the number of constraints and the dynamic
nature of vehicular ad hoc networks (VANET), effective
video broadcasting always remains a difficult task. In
this work, we proposed a quality of video visualization
guarantee model based on a feedback loop \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dhiman:2021:PWS,
author = "Chhavi Dhiman and Dinesh Kumar Vishwakarma and Paras
Agarwal",
title = "Part-wise Spatio-temporal Attention Driven {CNN}-based
{$3$D} Human Action Recognition",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "86:1--86:24",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3441628",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3441628",
abstract = "Recently, human activity recognition using skeleton
data is increasing due to its ease of acquisition and
finer shape details. Still, it suffers from a wide
range of intra-class variation, inter-class similarity
among the actions and view variation due to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nie:2021:PPF,
author = "Jie Nie and Zhi-Qiang Wei and Weizhi Nie and An-An
Liu",
title = "{PGNet}: Progressive Feature Guide Learning Network
for Three-dimensional Shape Recognition",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "87:1--87:17",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3443708",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3443708",
abstract = "Three-dimensional (3D) shape recognition is a popular
topic and has potential application value in the field
of computer vision. With the recent proliferation of
deep learning, various deep learning models have
achieved state-of-the-art performance. Among \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:VDB,
author = "Shiguang Liu and Huixin Wang and Xiaoli Zhang",
title = "Video Decolorization Based on the {CNN} and {LSTM}
Neural Network",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "88:1--88:18",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446619",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446619",
abstract = "Video decolorization is the process of transferring
three-channel color videos into single-channel
grayscale videos, which is essentially the
decolorization operation of video frames. Most existing
video decolorization algorithms directly apply image
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2021:DCN,
author = "Zhenzhen Yang and Pengfei Xu and Yongpeng Yang and
Bing-Kun Bao",
title = "A Densely Connected Network Based on {U-Net} for
Medical Image Segmentation",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "89:1--89:14",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446618",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446618",
abstract = "The U-Net has become the most popular structure in
medical image segmentation in recent years. Although
its performance for medical image segmentation is
outstanding, a large number of experiments demonstrate
that the classical U-Net network architecture
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:LCF,
author = "Donglin Zhang and Xiao-Jun Wu and Jun Yu",
title = "Label Consistent Flexible Matrix Factorization Hashing
for Efficient Cross-modal Retrieval",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "90:1--90:18",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446774",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446774",
abstract = "Hashing methods have sparked a great revolution on
large-scale cross-media search due to its effectiveness
and efficiency. Most existing approaches learn unified
hash representation in a common Hamming space to
represent all multimodal data. However, the unified
hash codes may not characterize the cross-modal data
discriminatively, because the data may vary greatly due
to its different dimensionalities, physical properties,
and statistical information. In addition, most existing
supervised cross-modal algorithms preserve the
                 similarity relationship by constructing an $ n \times n
$ pairwise similarity matrix, which requires a large
amount of calculation and loses the category
information. To mitigate these issues, a novel
cross-media hashing approach is proposed in this
article, dubbed label flexible matrix factorization
hashing (LFMH). Specifically, LFMH jointly learns the
modality-specific latent subspace with similar semantic
by the flexible matrix factorization. In addition, LFMH
guides the hash learning by utilizing the semantic
labels directly instead of the large $ n \times n $
pairwise similarity matrix. LFMH transforms the
heterogeneous data into modality-specific latent
semantic representation. Therefore, we can obtain the
hash codes by quantifying the representations, and the
learned hash codes are consistent with the supervised
labels of multimodal data. Then, we can obtain the
similar binary codes of the corresponding modality, and
the binary codes can characterize such samples
flexibly. Accordingly, the derived hash codes have more
discriminative power for single-modal and cross-modal
retrieval tasks. Extensive experiments on eight
different databases demonstrate that our model
outperforms some competitive approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lokoc:2021:RIS,
author = "Jakub Lokoc and Patrik Vesel{\'y} and Frantisek
Mejzl{\'\i}k and Gregor Kovalc{\'\i}k and Tom{\'a}s
Soucek and Luca Rossetto and Klaus Schoeffmann and
Werner Bailer and Cathal Gurrin and Loris Sauter and
Jaeyub Song and Stefanos Vrochidis and Jiaxin Wu and
Bj{\"o}rn {\thorn}{\'o}R J{\'o}nsson",
title = "Is the Reign of Interactive Search Eternal? {Findings}
from the {Video Browser Showdown 2020}",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "91:1--91:26",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3445031",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3445031",
abstract = "Comprehensive and fair performance evaluation of
information retrieval systems represents an essential
task for the current information age. Whereas
Cranfield-based evaluations with benchmark datasets
support development of retrieval models, significant
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:LIR,
author = "Qianli Xu and Ana Garcia {Del Molino} and Jie Lin and
Fen Fang and Vigneshwaran Subbaraju and Liyuan Li and
Joo-Hwee Lim",
title = "Lifelog Image Retrieval Based on Semantic Relevance
Mapping",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "92:1--92:18",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446209",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446209",
abstract = "Lifelog analytics is an emerging research area with
technologies embracing the latest advances in machine
learning, wearable computing, and data analytics.
However, state-of-the-art technologies are still
inadequate to distill voluminous multimodal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Du:2021:RTE,
author = "Gaoming Du and Jiting Wu and Hongfang Cao and Kun Xing
and Zhenmin Li and Duoli Zhang and Xiaolei Wang",
title = "A Real-Time Effective Fusion-Based Image Defogging
Architecture on {FPGA}",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "93:1--93:21",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446241",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446241",
abstract = "Foggy weather reduces the visibility of photographed
objects, causing image distortion and decreasing
overall image quality. Many approaches (e.g., image
restoration, image enhancement, and fusion-based
methods) have been proposed to work out the problem.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2021:FRS,
author = "Chenglizhao Chen and Hongmeng Zhao and Huan Yang and
Teng Yu and Chong Peng and Hong Qin",
title = "Full-reference Screen Content Image Quality Assessment
by Fusing Multilevel Structure Similarity",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "94:1--94:21",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447393",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447393",
abstract = "Screen content images (SCIs) usually comprise various
content types with sharp edges, in which artifacts or
distortions can be effectively sensed by a vanilla
structure similarity measurement in a full-reference
manner. Nonetheless, almost all of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:DBS,
author = "Honglin Li and Xiaoyang Mao and Mengdi Xu and Xiaogang
Jin",
title = "Deep-based Self-refined Face-top Coordination",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "95:1--95:23",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3446970",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3446970",
abstract = "Face-top coordination, which exists in most
clothes-fitting scenarios, is challenging due to
varieties of attributes, implicit correlations, and
tradeoffs between general preferences and individual
preferences. We present a Deep-Based Self-Refined
(DBSR) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2021:DAM,
author = "Minxuan Lin and Fan Tang and Weiming Dong and Xiao Li
and Changsheng Xu and Chongyang Ma",
title = "Distribution Aligned Multimodal and Multi-domain Image
Stylization",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "96:1--96:17",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3450525",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3450525",
abstract = "Multimodal and multi-domain stylization are two
important problems in the field of image style
transfer. Currently, there are few methods that can
perform multimodal and multi-domain stylization
simultaneously. In this study, we propose a unified
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Du:2021:IGS,
author = "Yong Du and Yangyang Xu and Taizhong Ye and Qiang Wen
and Chufeng Xiao and Junyu Dong and Guoqiang Han and
Shengfeng He",
title = "Invertible Grayscale with Sparsity Enforcing Priors",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "97:1--97:17",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3451993",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3451993",
abstract = "Color dimensionality reduction is believed as a
non-invertible process, as re-colorization results in
perceptually noticeable and unrecoverable distortion.
In this article, we propose to convert a color image
into a grayscale image that can fully recover
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qian:2021:KAM,
author = "Shengsheng Qian and Jun Hu and Quan Fang and
Changsheng Xu",
title = "Knowledge-aware Multi-modal Adaptive Graph
Convolutional Networks for Fake News Detection",
journal = j-TOMM,
volume = "17",
number = "3",
pages = "98:1--98:23",
month = aug,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3451215",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Aug 19 08:56:09 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3451215",
abstract = "In this article, we focus on fake news detection task
and aim to automatically identify the fake news from
vast amount of social media posts. To date, many
approaches have been proposed to detect fake news,
which includes traditional learning methods and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:ISI,
author = "Yu-Dong Zhang and Juan Manuel Gorriz and Zhengchao
Dong",
title = "Introduction to the Special Issue on Explainable Deep
Learning for Medical Image Computing",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "99:1--99:2",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485046",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485046",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ni:2021:LCL,
author = "Tongguang Ni and Yan Ding and Jing Xue and Kaijian Xia
and Xiaoqing Gu and Yizhang Jiang",
title = "Local Constraint and Label Embedding Multi-layer
Dictionary Learning for Sperm Head Classification",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "100:1--100:16",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458927",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458927",
abstract = "Morphological classification of human sperm heads is a
key technology for diagnosing male infertility. Due to
its sparse representation and learning capability,
dictionary learning has shown remarkable performance in
human sperm head classification. To \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2021:DAC,
author = "Bingzhi Chen and Yishu Liu and Zheng Zhang and
Yingjian Li and Zhao Zhang and Guangming Lu and
Hongbing Yu",
title = "Deep Active Context Estimation for Automated
{COVID-19} Diagnosis",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "101:1--101:22",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457124",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3457124",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2021:MIC,
author = "Xiangbin Liu and Jiesheng He and Liping Song and Shuai
Liu and Gautam Srivastava",
title = "Medical Image Classification based on an Adaptive Size
Deep Learning Model",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "102:1--102:18",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3465220",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3465220",
abstract = "With the rapid development of Artificial Intelligence
(AI), deep learning has increasingly become a research
hotspot in various fields, such as medical image
classification. Traditional deep learning models use
Bilinear Interpolation when processing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lu:2021:EFD,
author = "Siyuan Lu and Di Wu and Zheng Zhang and Shui-Hua
Wang",
title = "An Explainable Framework for Diagnosis of {COVID-19}
Pneumonia via Transfer Learning and Discriminant
Correlation Analysis",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "103:1--103:16",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3449785",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3449785",
abstract = "The new coronavirus COVID-19 has been spreading all
over the world in the last six months, and the death
toll is still rising. The accurate diagnosis of
COVID-19 is an emergent task as to stop the spreading
of the virus. In this paper, we proposed to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alizadehsani:2021:UAS,
author = "Roohallah Alizadehsani and Danial Sharifrazi and Navid
Hoseini Izadi and Javad Hassannataj Joloudari and
Afshin Shoeibi and Juan M. Gorriz and Sadiq Hussain and
Juan E. Arco and Zahra Alizadeh Sani and Fahime
Khozeimeh and Abbas Khosravi and Saeid Nahavandi and
Sheikh Mohammed Shariful Islam and U. Rajendra
Acharya",
title = "Uncertainty-Aware Semi-Supervised Method Using Large
Unlabeled and Limited Labeled {COVID-19} Data",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "104:1--104:24",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462635",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462635",
abstract = "The new coronavirus has caused more than one million
deaths and continues to spread rapidly. This virus
targets the lungs, causing respiratory distress which
can be mild or severe. The X-ray or computed tomography
(CT) images of lungs can reveal whether \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "104",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kumar:2021:DDE,
author = "Ambeshwar Kumar and Ramachandran Manikandan and Utku
Kose and Deepak Gupta and Suresh C. Satapathy",
title = "Doctor's Dilemma: Evaluating an Explainable
Subtractive Spatial Lightweight Convolutional Neural
Network for Brain Tumor Diagnosis",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "105:1--105:26",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457187",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3457187",
abstract = "In Medicine Deep Learning has become an essential tool
to achieve outstanding diagnosis on image data.
However, one critical problem is that Deep Learning
comes with complicated, black-box models so it is not
possible to analyze their trust level \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "105",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Su:2021:HRP,
author = "Ge Su and Bo Lin and Wei Luo and Jianwei Yin and
Shuiguang Deng and Honghao Gao and Renjun Xu",
title = "Hypomimia Recognition in {Parkinson}'s Disease With
Semantic Features",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "106:1--106:20",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3476778",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3476778",
abstract = "Parkinson's disease is the second most common
neurodegenerative disorder, commonly affecting elderly
people over the age of 65. As the cardinal
manifestation, hypomimia, referred to as impairments in
normal facial expressions, stays covert. Even some
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "106",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xin:2021:WEG,
author = "Qi Xin and Shaohao Hu and Shuaiqi Liu and Ling Zhao
and Shuihua Wang",
title = "{WTRPNet}: an Explainable Graph Feature Convolutional
Neural Network for Epileptic {EEG} Classification",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "107:1--107:18",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460522",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460522",
abstract = "As one of the important tools of epilepsy diagnosis,
the electroencephalogram (EEG) is noninvasive and
presents no traumatic injury to patients. It contains a
lot of physiological and pathological information that
is easy to obtain. The automatic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "107",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cheng:2021:ISI,
author = "Wen-Huang Cheng and Jiaying Liu and Nicu Sebe and
Junsong Yuan and Hong-Han Shuai",
title = "Introduction to the Special Issue on Explainable {AI}
on Multimedia Computing",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "108:1--108:2",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3489522",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3489522",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "108",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2021:LFS,
author = "Jiguo Li and Xinfeng Zhang and Jizheng Xu and Siwei Ma
and Wen Gao",
title = "Learning to Fool the Speaker Recognition",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "109:1--109:21",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468673",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3468673",
abstract = "Due to the widespread deployment of
fingerprint/face/speaker recognition systems, the risk
in these systems, especially the adversarial attack,
has drawn increasing attention in recent years.
Previous researches mainly studied the adversarial
attack to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "109",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2021:PNR,
author = "Chenggang Yan and Tong Teng and Yutao Liu and Yongbing
Zhang and Haoqian Wang and Xiangyang Ji",
title = "Precise No-Reference Image Quality Evaluation Based on
Distortion Identification",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "110:1--110:21",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468872",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3468872",
abstract = "The difficulty of no-reference image quality
assessment (NR IQA) often lies in the lack of knowledge
about the distortion in the image, which makes quality
assessment blind and thus inefficient. To tackle such
issue, in this article, we propose a novel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "110",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2021:EAM,
author = "Yung-Yao Chen and Sin-Ye Jhong and Chih-Hsien Hsia and
Kai-Lung Hua",
title = "Explainable {AI}: a Multispectral Palm-Vein
Identification System with New Augmentation Features",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "111:1--111:21",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468873",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3468873",
abstract = "Recently, as one of the most promising biometric
traits, the vein has attracted the attention of both
academia and industry because of its living body
identification and the convenience of the acquisition
process. State-of-the-art techniques can provide
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "111",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2021:XEC,
author = "Yu-Sheng Lin and Zhe-Yu Liu and Yu-An Chen and
Yu-Siang Wang and Ya-Liang Chang and Winston H. Hsu",
title = "{xCos}: an Explainable Cosine Metric for Face
Verification Task",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "112:1--112:16",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469288",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3469288",
abstract = "We study the XAI (explainable AI) on the face
recognition task, particularly the face verification.
Face verification has become a crucial task in recent
days and it has been deployed to plenty of
applications, such as access control, surveillance, and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "112",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shorfuzzaman:2021:EDL,
author = "Mohammad Shorfuzzaman and M. Shamim Hossain and
Abdulmotaleb {El Saddik}",
title = "An Explainable Deep Learning Ensemble Model for Robust
Diagnosis of Diabetic Retinopathy Grading",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "113:1--113:24",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3469841",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3469841",
abstract = "Diabetic retinopathy (DR) is one of the most common
causes of vision loss in people who have diabetes for a
prolonged period. Convolutional neural networks (CNNs)
have become increasingly popular for computer-aided DR
diagnosis using retinal fundus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "113",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2021:BBD,
author = "Zhenyu Wu and Zhaowen Wang and Ye Yuan and Jianming
Zhang and Zhangyang Wang and Hailin Jin",
title = "Black-Box Diagnosis and Calibration on {GAN}
Intra-Mode Collapse: a Pilot Study",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "114:1--114:18",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472768",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472768",
abstract = "Generative adversarial networks (GANs) nowadays are
capable of producing images of incredible realism. Two
concerns raised are whether the state-of-the-art GAN's
learned distribution still suffers from mode collapse
and what to do if so. Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "114",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xia:2021:SED,
author = "Bohui Xia and Xueting Wang and Toshihiko Yamasaki",
title = "Semantic Explanation for Deep Neural Networks Using
Feature Interactions",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "115:1--115:19",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3474557",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3474557",
abstract = "Given the promising results obtained by deep-learning
techniques in multimedia analysis, the explainability
of predictions made by networks has become important in
practical applications. We present a method to generate
semantic and quantitative \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "115",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:LDS,
author = "Yang Wang and Yang Cao and Jing Zhang and Feng Wu and
Zheng-Jun Zha",
title = "Leveraging Deep Statistics for Underwater Image
Enhancement",
journal = j-TOMM,
volume = "17",
number = "3s",
pages = "116:1--116:20",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3489520",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Dec 31 09:04:25 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3489520",
abstract = "Underwater imaging often suffers from color cast and
contrast degradation due to range-dependent medium
absorption and light scattering. Introducing image
statistics as prior has been proved to be an effective
solution for underwater image enhancement. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "116",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2021:DSG,
author = "Junyi Wu and Yan Huang and Qiang Wu and Zhipeng Gao
and Jianqiang Zhao and Liqin Huang",
title = "Dual-Stream Guided-Learning via a Priori Optimization
for Person Re-identification",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "117:1--117:22",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447715",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447715",
abstract = "The task of person re-identification (re-ID) is to
find the same pedestrian across non-overlapping camera
views. Generally, the performance of person re-ID can
be affected by background clutter. However, existing
segmentation algorithms cannot obtain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{He:2021:ACO,
author = "Zhaoliang He and Hongshan Li and Zhi Wang and Shutao
Xia and Wenwu Zhu",
title = "Adaptive Compression for Online Computer Vision: an
Edge Reinforcement Learning Approach",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "118:1--118:23",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3447878",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3447878",
abstract = "With the growth of computer vision-based applications,
an explosive amount of images have been uploaded to
cloud servers that host such online computer vision
algorithms, usually in the form of deep learning
models. JPEG has been used as the de facto \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "118",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pan:2021:SDE,
author = "Yingwei Pan and Yue Chen and Qian Bao and Ning Zhang
and Ting Yao and Jingen Liu and Tao Mei",
title = "{Smart Director}: an Event-Driven Directing System for
Live Broadcasting",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "119:1--119:18",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3448981",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3448981",
abstract = "Live video broadcasting normally requires a multitude
of skills and expertise with domain knowledge to enable
multi-camera productions. As the number of cameras
keeps increasing, directing a live sports broadcast has
now become more complicated and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "119",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:DSS,
author = "Chunyan Xu and Rong Liu and Tong Zhang and Zhen Cui
and Jian Yang and Chunlong Hu",
title = "Dual-Stream Structured Graph Convolution Network for
Skeleton-Based Action Recognition",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "120:1--120:22",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3450410",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3450410",
abstract = "In this work, we propose a dual-stream structured
graph convolution network (DS-SGCN) to solve the
skeleton-based action recognition problem. The
spatio-temporal coordinates and appearance contexts of
the skeletal joints are jointly integrated into the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "120",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:UDE,
author = "Jie Wang and Kaibin Tian and Dayong Ding and Gang Yang
and Xirong Li",
title = "Unsupervised Domain Expansion for Visual
Categorization",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "121:1--121:24",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3448108",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3448108",
abstract = "Expanding visual categorization into a novel domain
without the need of extra annotation has been a
long-term interest for multimedia intelligence.
Previously, this challenge has been approached by
unsupervised domain adaptation (UDA). Given labeled
data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "121",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mawalim:2021:TIR,
author = "Candy Olivia Mawalim and Shogo Okada and Yukiko I.
Nakano",
title = "Task-independent Recognition of Communication Skills
in Group Interaction Using Time-series Modeling",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "122:1--122:27",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3450283",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3450283",
abstract = "Case studies of group discussions are considered an
effective way to assess communication skills (CS). This
method can help researchers evaluate participants'
engagement with each other in a specific realistic
context. In this article, multimodal analysis
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "122",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:WTG,
author = "Bo Zhang and Rui Zhang and Niccolo Bisagno and Nicola
Conci and Francesco G. B. {De Natale} and Hongbo Liu",
title = "Where Are They Going? {Predicting} Human Behaviors in
Crowded Scenes",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "123:1--123:19",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3449359",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3449359",
abstract = "In this article, we propose a framework for crowd
behavior prediction in complicated scenarios. The
fundamental framework is designed using the standard
encoder-decoder scheme, which is built upon the long
short-term memory module to capture the temporal
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "123",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Silva:2021:UMC,
author = "Ellen P. Silva and Nat{\'a}lia Vieira and Glauco
Amorim and Renata Mousinho and Gustavo Guedes and
Gheorghita Ghinea and Joel A. F. {Dos Santos}",
title = "Using Multisensory Content to Impact the Quality of
Experience of Reading Digital Books",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "124:1--124:18",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458676",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458676",
abstract = "Multisensorial books enrich a story with either
traditional multimedia content or sensorial effects.
The main idea is to increase children's interest in
reading by enhancing their QoE while reading. Studies
on enriched and/or augmented e-books also \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "124",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jiang:2021:BDC,
author = "Weitao Jiang and Weixuan Wang and Haifeng Hu",
title = "Bi-Directional Co-Attention Network for Image
Captioning",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "125:1--125:20",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460474",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460474",
abstract = "Image Captioning, which automatically describes an
image with natural language, is regarded as a
fundamental challenge in computer vision. In recent
years, significant advance has been made in image
captioning through improving attention mechanism.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "125",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shen:2021:CDO,
author = "Xiangjun Shen and Jinghui Zhou and Zhongchen Ma and
Bingkun Bao and Zhengjun Zha",
title = "Cross-Domain Object Representation via Robust Low-Rank
Correlation Analysis",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "126:1--126:20",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458825",
abstract = "Cross-domain data has become very popular recently
since various viewpoints and different sensors tend to
facilitate better data representation. In this article,
we propose a novel cross-domain object representation
algorithm (RLRCA) which not only \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "126",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2021:CMH,
author = "Xing Xu and Yifan Wang and Yixuan He and Yang Yang and
Alan Hanjalic and Heng Tao Shen",
title = "Cross-Modal Hybrid Feature Fusion for Image-Sentence
Matching",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "127:1--127:23",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458281",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458281",
abstract = "Image-sentence matching is a challenging task in the
field of language and vision, which aims at measuring
the similarities between images and sentence
descriptions. Most existing methods independently map
the global features of images and sentences into
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Messina:2021:FGV,
author = "Nicola Messina and Giuseppe Amato and Andrea Esuli and
Fabrizio Falchi and Claudio Gennaro and St{\'e}phane
Marchand-Maillet",
title = "Fine-Grained Visual Textual Alignment for Cross-Modal
Retrieval Using Transformer Encoders",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "128:1--128:23",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3451390",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3451390",
abstract = "Despite the evolution of deep-learning-based
visual-textual processing systems, precise multi-modal
matching remains a challenging task. In this work, we
tackle the task of cross-modal retrieval through
image-sentence matching based on word-region \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "128",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2021:HSP,
author = "Xuan Ma and Xiaoshan Yang and Junyu Gao and Changsheng
Xu",
title = "Health Status Prediction with Local-Global
Heterogeneous Behavior Graph",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "129:1--129:21",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457893",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3457893",
abstract = "Health management is getting increasing attention all
over the world. However, existing health management
mainly relies on hospital examination and treatment,
which are complicated and untimely. The emergence of
mobile devices provides the possibility to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "129",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhai:2021:PQA,
author = "Guangtao Zhai and Wei Sun and Xiongkuo Min and Jiantao
Zhou",
title = "Perceptual Quality Assessment of Low-light Image
Enhancement",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "130:1--130:24",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3457905",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3457905",
abstract = "Low-light image enhancement algorithms (LIEA) can
light up images captured in dark or back-lighting
conditions. However, LIEA may introduce various
distortions such as structure damage, color shift, and
noise into the enhanced images. Despite various
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "130",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mishra:2021:DBR,
author = "Prerna Mishra and Santosh Kumar and Mithilesh Kumar
Chaube",
title = "Dissimilarity-Based Regularized Learning of Charts",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "131:1--131:23",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458884",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458884",
abstract = "Chart images exhibit significant variabilities that
make each image different from others even though they
belong to the same class or categories. Classification
of charts is a major challenge because each chart class
has variations in features, structure, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "131",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nandanwar:2021:NFB,
author = "Lokesh Nandanwar and Palaiahnakote Shivakumara and
Divya Krishnani and Raghavendra Ramachandra and Tong Lu
and Umapada Pal and Mohan Kankanhalli",
title = "A New Foreground-Background based Method for
Behavior-Oriented Social Media Image Classification",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "132:1--132:25",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458051",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458051",
abstract = "Due to various applications, research on personal
traits using information on social media has become an
important area. In this paper, a new method for the
classification of behavior-oriented social images
uploaded on various social media platforms is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "132",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alahmadi:2021:ABS,
author = "Mohannad Alahmadi and Peter Pocta and Hugh Melvin",
title = "An Adaptive Bitrate Switching Algorithm for Speech
Applications in Context of {WebRTC}",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "133:1--133:21",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3458751",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3458751",
abstract = "Web Real-Time Communication (WebRTC) combines a set of
standards and technologies to enable high-quality
audio, video, and auxiliary data exchange in web
browsers and mobile applications. It enables
peer-to-peer multimedia sessions over IP networks
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2021:FVS,
author = "Wei Gao and Linjie Zhou and Lvfang Tao",
title = "A Fast View Synthesis Implementation Method for Light
Field Applications",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "134:1--134:20",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3459098",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3459098",
abstract = "View synthesis (VS) for light field images is a very
time-consuming task due to the great quantity of
involved pixels and intensive computations, which may
prevent it from the practical three-dimensional
real-time systems. In this article, we propose an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "134",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2021:BCR,
author = "Jianhai Zhang and Zhiyong Feng and Yong Su and Meng
Xing",
title = "{Bayesian} Covariance Representation with Global
Informative Prior for {$3$D} Action Recognition",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "135:1--135:22",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460235",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460235",
abstract = "For the merits of high-order statistics and Riemannian
geometry, covariance matrix has become a generic
feature representation for action recognition. An
independent action can be represented by an empirical
statistics over all of its pose samples. Two \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "135",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2021:PAP,
author = "Anqi Zhu and Lin Zhang and Juntao Chen and Yicong
Zhou",
title = "Pedestrian-Aware Panoramic Video Stitching Based on a
Structured Camera Array",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "136:1--136:24",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460511",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460511",
abstract = "The panorama stitching system is an indispensable
module in surveillance or space exploration. Such a
system enables the viewer to understand the
surroundings instantly by aligning the surrounding
images on a plane and fusing them naturally. The
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "136",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2021:NDB,
author = "Yizhen Chen and Haifeng Hu",
title = "{Y-Net}: Dual-branch Joint Network for Semantic
Segmentation",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "137:1--137:22",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460940",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460940",
abstract = "Most existing segmentation networks are built upon a
``U-shaped'' encoder-decoder structure, where the
multi-level features extracted by the encoder are
gradually aggregated by the decoder. Although this
structure has been proven to be effective in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "137",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2021:DNA,
author = "Jinwei Wang and Wei Huang and Xiangyang Luo and
Yun-Qing Shi and Sunil Kr. Jha",
title = "Detecting Non-Aligned Double {JPEG} Compression Based
on Amplitude-Angle Feature",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "138:1--138:18",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3464388",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3464388",
abstract = "Due to the popularity of JPEG format images in recent
years, JPEG images will inevitably involve image
editing operation. Thus, some tampered images will leave
tracks of Non-aligned double JPEG (NA-DJPEG)
compression. By detecting the presence of NA-DJPEG
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "138",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jia:2021:RGL,
author = "Wei Jia and Li Li and Zhu Li and Xiang Zhang and Shan
Liu",
title = "Residual-guided In-loop Filter Using Convolution
Neural Network",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "139:1--139:19",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460820",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460820",
abstract = "The block-based coding structure in the hybrid video
coding framework inevitably introduces compression
artifacts such as blocking, ringing, and so on. To
compensate for those artifacts, extensive filtering
techniques were proposed in the loop of video
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "139",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2021:TMF,
author = "Zhihan Lv and Houbing Song",
title = "Trust Mechanism of Feedback Trust Weight in Multimedia
Network",
journal = j-TOMM,
volume = "17",
number = "4",
pages = "140:1--140:26",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.1145/3391296",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Jan 14 07:01:30 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3391296",
abstract = "It is necessary to solve the inaccurate data arising
from data reliability ignored by most data fusion
algorithms drawing upon collaborative filtering and
fuzzy network theory. Therefore, a model is constructed
based on the collaborative filtering \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "140",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yao:2022:SLM,
author = "Peng Yao and Jieqing Feng",
title = "Sparse {LIDAR} Measurement Fusion with Joint Updating
Cost for Fast Stereo Matching",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "1:1--1:18",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471870",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3471870",
abstract = "The complementary virtues of active and passive depth
sensors inspire the LIDAR-Stereo fusion for enhancing
the accuracy of stereo matching. However, most of the
fusion based stereo matching algorithms have exploited
dense LIDAR priors with single fusion \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Karagkioules:2022:OLA,
author = "Theodoros Karagkioules and Georgios S. Paschos and
Nikolaos Liakopoulos and Attilio Fiandrotti and
Dimitrios Tsilimantos and Marco Cagnazzo",
title = "Online Learning for Adaptive Video Streaming in Mobile
Networks",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "2:1--2:22",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3460819",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3460819",
abstract = "In this paper, we propose a novel algorithm for video
bitrate adaptation in HTTP Adaptive Streaming (HAS),
based on online learning. The proposed algorithm, named
Learn2Adapt (L2A), is shown to provide a robust bitrate
adaptation strategy which, unlike \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fan:2022:MUE,
author = "Ching-Ling Fan and Tse-Hou Hung and Cheng-Hsin Hsu",
title = "Modeling the User Experience of Watching 360${}^\circ
$ Videos with Head-Mounted Displays",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "3:1--3:23",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3463825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3463825",
abstract = "Conducting user studies to quantify the Quality of
Experience (QoE) of watching the increasingly more
popular 360${}^\circ $ videos in Head-Mounted Displays
(HMDs) is time-consuming, tedious, and expensive.
Deriving QoE models, however, is very challenging
because \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{S:2022:TRL,
author = "Baiju P. S. and Sudhish N. George",
title = "{TTV} Regularized {LRTA} Technique for the Estimation
of Haze Model Parameters in Video Dehazing",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "4:1--4:22",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3465454",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3465454",
abstract = "Nowadays, intelligent transport systems have a major
role in providing a safe and secure traffic society for
passengers, pedestrians, and vehicles. However, some
bad weather conditions such as haze or fog may affect
the visual clarity of video footage \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Aloufi:2022:MDT,
author = "Samah Aloufi and Abdulmotaleb {El Saddik}",
title = "{MMSUM} Digital Twins: a Multi-view Multi-modality
Summarization Framework for Sporting Events",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "5:1--5:25",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462777",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462777",
abstract = "Sporting events generate a massive amount of traffic
on social media with live moment-to-moment accounts as
any given situation unfolds. The generated data are
intensified by fans' feelings, reactions, and subjective
opinions towards what happens during \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:MFF,
author = "Zhoutao Wang and Qian Xie and Mingqiang Wei and Kun
Long and Jun Wang",
title = "Multi-feature Fusion {VoteNet} for {$3$D} Object
Detection",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "6:1--6:17",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462219",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462219",
abstract = "In this article, we propose a Multi-feature Fusion
VoteNet (MFFVoteNet) framework for improving the 3D
object detection performance in cluttered and heavily
occluded scenes. Our method takes the point cloud and
the synchronized RGB image as inputs to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Uddin:2022:NMM,
author = "Md Azher Uddin and Joolekha Bibi Joolee and Young-Koo
Lee and Kyung-Ah Sohn",
title = "A Novel Multi-Modal Network-Based Dynamic Scene
Understanding",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "7:1--7:19",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462218",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462218",
abstract = "In recent years, dynamic scene understanding has
gained attention from researchers because of its
widespread applications. The main important factor in
successfully understanding the dynamic scenes lies in
jointly representing the appearance and motion
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:FEA,
author = "Shiguang Liu and Huixin Wang and Min Pei",
title = "Facial-expression-aware Emotional Color Transfer Based
on Convolutional Neural Network",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "8:1--8:19",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3464382",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3464382",
abstract = "Emotional color transfer aims to change the evoked
emotion of a source image to that of a target image by
adjusting color distribution. Most of existing
emotional color transfer methods only consider the
low-level visual features of an image and ignore
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{PeresRebelo:2022:IAI,
author = "Ana Daniela {Peres Rebelo} and In{\^e}s {Guedes De
Oliveira} and Damion D. E. Verboom",
title = "The Impact of Artificial Intelligence on the
Creativity of Videos",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "9:1--9:27",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462634",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462634",
abstract = "This study explored the impact Artificial Intelligence
(AI) has on the evaluation of creative elements in
artistic videos. The aim was to verify to what extent
the use of an AI algorithm (Style Transfer) contributes
to changes in the perceived creativity \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2022:LHV,
author = "Yaguang Song and Junyu Gao and Xiaoshan Yang and
Changsheng Xu",
title = "Learning Hierarchical Video Graph Networks for
One-Stop Video Delivery",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "10:1--10:23",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3466886",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3466886",
abstract = "The explosive growth of video data has brought great
challenges to video retrieval, which aims to find out
related videos from a video collection. Most users are
usually not interested in all the content of retrieved
videos but have a more fine-grained \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mao:2022:MGD,
author = "Aihua Mao and Yuan Liang and Jianbo Jiao and Yongtuo
Liu and Shengfeng He",
title = "Mask-Guided Deformation Adaptive Network for Human
Parsing",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "11:1--11:20",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3467889",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3467889",
abstract = "Due to the challenges of densely compacted body parts,
nonrigid clothing items, and severe overlap in crowd
scenes, human parsing needs to focus more on multilevel
feature representations compared to general scene
parsing tasks. Based on this observation, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tiotsop:2022:MIM,
author = "Lohic Fotio Tiotsop and Tomas Mizdos and Marcus
Barkowsky and Peter Pocta and Antonio Servetti and
Enrico Masala",
title = "Mimicking Individual Media Quality Perception with
Neural Network based Artificial Observers",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "12:1--12:25",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3464393",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3464393",
abstract = "The media quality assessment research community has
traditionally been focusing on developing objective
algorithms to predict the result of a typical
subjective experiment in terms of Mean Opinion Score
(MOS) value. However, the MOS, being a single value,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Thong:2022:DSV,
author = "William Thong and Cees G. M. Snoek",
title = "Diversely-Supervised Visual Product Search",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "13:1--13:22",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3461646",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3461646",
abstract = "This article strives for a diversely supervised visual
product search, where queries specify a diverse set of
labels to search for. Where previous works have focused
on representing attribute, instance, or category labels
individually, we consider them \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Farhat:2022:CCC,
author = "Farshid Farhat and Mohammad Mahdi Kamani and James Z.
Wang",
title = "{CAPTAIN}: Comprehensive Composition Assistance for
Photo Taking",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "14:1--14:24",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462762",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462762",
abstract = "Many people are interested in taking astonishing
photos and sharing them with others. Emerging high-tech
hardware and software facilitate the ubiquitousness and
functionality of digital photography. Because
composition matters in photography, researchers
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Holloman:2022:DSS,
author = "Amanda K. Holloman and Chris S. Crawford",
title = "Defining Scents: a Systematic Literature Review of
Olfactory-based Computing Systems",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "15:1--15:22",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3470975",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3470975",
abstract = "The human sense of smell is a primal ability that has
the potential to reveal unexplored relationships
between user behaviors and technology. Humans use
millions of olfactory receptor cells to observe the
environment around them. Olfaction studies are
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2022:HIR,
author = "Xian-Hua Han and Yinqiang Zheng and Yen-Wei Chen",
title = "Hyperspectral Image Reconstruction Using Multi-scale
Fusion Learning",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "16:1--16:21",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477396",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3477396",
abstract = "Hyperspectral imaging is a promising imaging modality
that simultaneously captures several images for the
same scene on narrow spectral bands, and it has made
considerable progress in different fields, such as
agriculture, astronomy, and surveillance. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tasaka:2022:EMC,
author = "Shuji Tasaka",
title = "An Empirical Method for Causal Inference of Constructs
for {QoE} in Haptic-Audiovisual Communications",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "17:1--17:24",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473986",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473986",
abstract = "This article proposes an empirical method for
inferring causal directions in multidimensional Quality
of Experience (QoE) in multimedia communications,
noting that causation in QoE is perceptual. As an
example for modeling framework, we pick up a Bayesian
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2022:RIT,
author = "Dongbao Yang and Yu Zhou and Wei Shi and Dayan Wu and
Weiping Wang",
title = "{RD-IOD}: Two-Level Residual-Distillation-Based
Triple-Network for Incremental Object Detection",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "18:1--18:23",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472393",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472393",
abstract = "As a basic component in multimedia applications,
object detectors are generally trained on a fixed set
of classes that are pre-defined. However, new object
classes often emerge after the models are trained in
practice. Modern object detectors based on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hsu:2022:OIV,
author = "Chih-Fan Hsu and Tse-Hou Hung and Cheng-Hsin Hsu",
title = "Optimizing Immersive Video Coding Configurations Using
Deep Learning: a Case Study on {TMIV}",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "19:1--19:25",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471191",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3471191",
abstract = "Immersive video streaming technologies improve Virtual
Reality (VR) user experience by providing users more
intuitive ways to move in simulated worlds, e.g., with
6 Degree-of-Freedom (6DoF) interaction mode. A naive
method to achieve 6DoF is deploying \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Siegfried:2022:RUG,
author = "R{\'e}my Siegfried and Jean-Marc Odobez",
title = "Robust Unsupervised Gaze Calibration Using
Conversation and Manipulation Attention Priors",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "20:1--20:27",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472622",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472622",
abstract = "Gaze estimation is a difficult task, even for humans.
However, as humans, we are good at understanding a
situation and exploiting it to guess the expected
visual focus of attention of people, and we usually use
this information to retrieve people's gaze. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:LLS,
author = "Jing Wang and Weiqing Min and Sujuan Hou and Shengnan
Ma and Yuanjie Zheng and Shuqiang Jiang",
title = "{LogoDet-3K}: a Large-scale Image Dataset for Logo
Detection",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "21:1--21:19",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3466780",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3466780",
abstract = "Logo detection has been gaining considerable attention
because of its wide range of applications in the
multimedia field, such as copyright infringement
detection, brand visibility monitoring, and product
brand management on social media. In this article,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:ALC,
author = "Da-Chun Wu and Yu-Tsung Hsu",
title = "Authentication of {LINE} Chat History Files by
Information Hiding",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "22:1--22:23",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3474225",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3474225",
abstract = "With the prevalence of smartphones, message exchanges
via mobile chatting programs like LINE have become
popular. The messages in the form of chat records in a
LINE chat history, after being downloaded for legal
uses, might be tampered with illicitly. A \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:PPM,
author = "Changming Liu and Xiaojing Ma and Sixing Cao and
Jiayun Fu and Bin B. Zhu",
title = "Privacy-preserving Motion Detection for
{HEVC}-compressed Surveillance Video",
journal = j-TOMM,
volume = "18",
number = "1",
pages = "23:1--23:27",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472669",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:22:44 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472669",
abstract = "In the cloud era, a large amount of data is uploaded
to and processed by public clouds. The risk of privacy
leakage has become a major concern for cloud users.
Cloud-based video surveillance requires motion
detection, which may reveal the privacy of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:ISIa,
author = "Shiliang Zhang and Guorong Li and Weigang Zhang and
Qingming Huang and Tiejun Huang and Mubarak Shah and
Nicu Sebe",
title = "Introduction to the Special Issue on Fine-Grained
Visual Recognition and Re-Identification",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "24:1--24:3",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505280",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3505280",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:HMM,
author = "La Zhang and Haiyun Guo and Kuan Zhu and Honglin Qiao
and Gaopan Huang and Sen Zhang and Huichen Zhang and
Jian Sun and Jinqiao Wang",
title = "Hybrid Modality Metric Learning for Visible-Infrared
Person Re-Identification",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "25:1--25:15",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473341",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473341",
abstract = "Visible-infrared person re-identification (Re-ID) has
received increasing research attention for its great
practical value in night-time surveillance scenarios.
Due to the large variations in person pose, viewpoint,
and occlusion in the same modality, as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2022:BIB,
author = "Sheng Xu and Chang Liu and Baochang Zhang and Jinhu
L{\"u} and Guodong Guo and David Doermann",
title = "{BiRe-ID}: Binary Neural Network for Efficient Person
Re-{ID}",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "26:1--26:22",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473340",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473340",
abstract = "Person re-identification (Re-ID) has been promoted by
the significant success of convolutional neural
networks (CNNs). However, the application of such
CNN-based Re-ID methods depends on the tremendous
consumption of computation and memory resources,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2022:JGF,
author = "Zhongwei Zhao and Ran Song and Qian Zhang and Peng
Duan and Youmei Zhang",
title = "{JoT-GAN}: a Framework for Jointly Training {GAN} and
Person Re-Identification Model",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "27:1--27:18",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491225",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491225",
abstract = "To cope with the problem caused by inadequate training
data, many person re-identification (re-id) methods
exploit generative adversarial networks (GAN) for data
augmentation, where the training of GAN is typically
independent of that of the re-id model. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2022:SCP,
author = "Liqian Liang and Congyan Lang and Zun Li and Jian Zhao
and Tao Wang and Songhe Feng",
title = "Seeing Crucial Parts: Vehicle Model Verification via a
Discriminative Representation Model",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "28:1--28:22",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3474596",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3474596",
abstract = "Widely used surveillance cameras have promoted large
amounts of street scene data, which contains one
important but long-neglected object: the vehicle. Here
we focus on the challenging problem of vehicle model
verification. Most previous works usually \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2022:AIF,
author = "Chenggang Yan and Lixuan Meng and Liang Li and Jiehua
Zhang and Zhan Wang and Jian Yin and Jiyong Zhang and
Yaoqi Sun and Bolun Zheng",
title = "Age-Invariant Face Recognition by Multi-Feature
Fusion and Decomposition with Self-attention",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "29:1--29:18",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472810",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472810",
abstract = "Different from general face recognition, age-invariant
face recognition (AIFR) aims at matching faces with a
big age gap. Previous discriminative methods usually
focus on decomposing facial feature into age-related
and age-invariant components, which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhai:2022:RML,
author = "Deming Zhai and Ruifeng Shi and Junjun Jiang and
Xianming Liu",
title = "Rectified Meta-learning from Noisy Labels for Robust
Image-based Plant Disease Classification",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "30:1--30:17",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472809",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472809",
abstract = "Plant diseases serve as one of main threats to food
security and crop production. It is thus valuable to
exploit recent advances of artificial intelligence to
assist plant disease diagnosis. One popular approach is
to transform this problem as a leaf \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tan:2022:FGI,
author = "Min Tan and Fu Yuan and Jun Yu and Guijun Wang and
Xiaoling Gu",
title = "Fine-grained Image Classification via Multi-scale
Selective Hierarchical Biquadratic Pooling",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "31:1--31:23",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3492221",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3492221",
abstract = "How to extract distinctive features greatly challenges
the fine-grained image classification tasks. In
previous models, bilinear pooling has been frequently
adopted to address this problem. However, most bilinear
pooling models neglect either intra or \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cucchiara:2022:FGH,
author = "Rita Cucchiara and Matteo Fabbri",
title = "Fine-grained Human Analysis under Occlusions and
Perspective Constraints in Multimedia Surveillance",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "32:1--32:23",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3476839",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3476839",
abstract = "Human detection in the wild is a research topic of
paramount importance in computer vision, and it is the
starting step for designing intelligent systems
oriented to human interaction that work in complete
autonomy. To achieve this goal, computer vision
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:ICG,
author = "Lei Wu and Hefei Ling and Yuxuan Shi and Baiyan
Zhang",
title = "Instance Correlation Graph for Unsupervised Domain
Adaptation",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "33:1--33:23",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3486251",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3486251",
abstract = "In recent years, deep neural networks have emerged as
a dominant machine learning tool for a wide variety of
application fields. Due to the expensive cost of manual
labeling efforts, it is important to transfer knowledge
from a label-rich source domain to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mugnai:2022:FGA,
author = "Daniele Mugnai and Federico Pernici and Francesco
Turchini and Alberto {Del Bimbo}",
title = "Fine-Grained Adversarial Semi-Supervised Learning",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "34:1--34:19",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485473",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485473",
abstract = "In this article, we exploit Semi-Supervised Learning
(SSL) to increase the amount of training data to
improve the performance of Fine-Grained Visual
Categorization (FGVC). This problem has not been
investigated in the past in spite of prohibitive
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Luo:2022:ERU,
author = "Dezhao Luo and Yu Zhou and Bo Fang and Yucan Zhou and
Dayan Wu and Weiping Wang",
title = "Exploring Relations in Untrimmed Videos for
Self-Supervised Learning",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "35:1--35:21",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473342",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473342",
abstract = "Existing video self-supervised learning methods mainly
rely on trimmed videos for model training. They apply
their methods and verify the effectiveness on trimmed
video datasets including UCF101 and Kinetics-400, among
others. However, trimmed datasets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:EEC,
author = "Yabin Wang and Zhiheng Ma and Xing Wei and Shuai Zheng
and Yaowei Wang and Xiaopeng Hong",
title = "{ECCNAS}: Efficient Crowd Counting Neural Architecture
Search",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "36:1--36:19",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3465455",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3465455",
abstract = "Recent solutions to crowd counting problems have
already achieved promising performance across various
benchmarks. However, applying these approaches to
real-world applications is still challenging, because
they are computation intensive and lack the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:CFH,
author = "Wenxu Li and Gang Pan and Chen Wang and Zhen Xing and
Zhenjun Han",
title = "From Coarse to Fine: Hierarchical Structure-aware
Video Summarization",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "37:1--37:16",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485472",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485472",
abstract = "Hierarchical structure is a common characteristic for
some kinds of videos (e.g., sports videos, game
videos): The videos are composed of several actions
hierarchically and there exist temporal dependencies
among segments with different scales, where \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hossain:2022:SSA,
author = "M. Shamim Hossain and Rita Cucchiara and Ghulam
Muhammad and Diana P. Tob{\'o}n and Abdulmotaleb {El
Saddik}",
title = "Special Section on {AI-empowered} Multimedia Data
Analytics for Smart Healthcare",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "38:1--38:2",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505281",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3505281",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2022:MFT,
author = "Min Chen and Wenjing Xiao and Miao Li and Yixue Hao
and Long Hu and Guangming Tao",
title = "A Multi-feature and Time-aware-based Stress Evaluation
Mechanism for Mental Status Adjustment",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "39:1--39:18",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3462763",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3462763",
abstract = "With the rapid economic development, the prominent
social competition has led to increasing psychological
pressure of people felt from each aspect of life.
Driven by the Internet of Things and artificial
intelligence, intelligent psychological pressure
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Masud:2022:CNN,
author = "Mehedi Masud and Mohammed F. Alhamid and Yin Zhang",
title = "A Convolutional Neural Network Model Using Weighted
Loss Function to Detect Diabetic Retinopathy",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "40:1--40:16",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3470976",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3470976",
abstract = "Nowadays, artificial intelligence (AI) provides
tremendous prospects for driving future healthcare
while empowering patients and service providers. The
extensive use of digital healthcare produces a massive
amount of multimedia healthcare data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:TTM,
author = "Debin Liu and Laurence T. Yang and Puming Wang and
Ruonan Zhao and Qingchen Zhang",
title = "{TT-TSVD}: a Multi-modal Tensor Train Decomposition
with Its Application in Convolutional Neural Networks
for Smart Healthcare",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "41:1--41:17",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491223",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491223",
abstract = "Smart healthcare systems are generating a large scale
of heterogeneous high-dimensional data with complex
relationships. It is hard for current methods to
analyze such high-dimensional healthcare data.
Specifically, the traditional data reduction methods
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2022:MNM,
author = "Chun-Wei Yang and Thanh Hai Phung and Hong-Han Shuai
and Wen-Huang Cheng",
title = "Mask or Non-Mask? {Robust} Face Mask Detector via
Triplet-Consistency Representation Learning",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "42:1--42:20",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472623",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472623",
abstract = "In the absence of vaccines or medicines to stop
COVID-19, one of the effective methods to slow the
spread of the coronavirus and reduce the overloading of
healthcare is to wear a face mask. Nevertheless, to
mandate the use of face masks or coverings in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2022:DLB,
author = "Zhihan Lv and Zengchen Yu and Shuxuan Xie and Atif
Alamri",
title = "Deep Learning-based Smart Predictive Evaluation for
Interactive Multimedia-enabled Smart Healthcare",
journal = j-TOMM,
volume = "18",
number = "1s",
pages = "43:1--43:20",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3468506",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:52 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3468506",
abstract = "Two-dimensional arrays of bi-component structures made
of cobalt and permalloy elliptical dots with thickness
of 25 nm, length 1 mm and width of 225 nm, have been
prepared by a self-aligned shadow deposition technique.
Brillouin light scattering has been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Amirpour:2022:ELF,
author = "Hadi Amirpour and Antonio Pinheiro and Manuela Pereira
and Fernando J. P. Lopes and Mohammad Ghanbari",
title = "Efficient Light Field Image Compression with Enhanced
Random Access",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "44:1--44:18",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471905",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3471905",
abstract = "In light field image compression, facilitating random
access to individual views plays a significant role in
decoding views quickly, reducing memory footprint, and
decreasing the bandwidth requirement for transmission.
Highly efficient light field image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Morillo:2022:EIP,
author = "Pedro Morillo and Jos{\'e} J. Navarro-P{\'e}rez and
Juan M. Ordu{\~n}a and Marcos Fern{\'a}ndez",
title = "Evaluation of an Intervention Program Based on Mobile
Apps to Learn Sexism Prevention in Teenagers",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "45:1--45:20",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3471139",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3471139",
abstract = "The fight against sexism is nowadays one of the
flagship social movements in western countries.
Adolescence is a crucial period, and some empirical
studies have focused on the socialization of teenagers,
proving that the socialization with the surrounding
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2022:LTS,
author = "Yansong Tang and Xingyu Liu and Xumin Yu and Danyang
Zhang and Jiwen Lu and Jie Zhou",
title = "Learning from Temporal Spatial Cubism for
Cross-Dataset Skeleton-based Action Recognition",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "46:1--46:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3472722",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3472722",
abstract = "Rapid progress and superior performance have been
achieved for skeleton-based action recognition
recently. In this article, we investigate this problem
under a cross-dataset setting, which is a new,
pragmatic, and challenging task in real-world
scenarios. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kizilkaya:2022:EFF,
author = "Burak Kizilkaya and Enver Ever and Hakan Yekta Yatbaz
and Adnan Yazici",
title = "An Effective Forest Fire Detection Framework Using
Heterogeneous Wireless Multimedia Sensor Networks",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "47:1--47:21",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473037",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473037",
abstract = "With improvements in the area of Internet of Things
(IoT), surveillance systems have recently become more
accessible. At the same time, optimizing the energy
requirements of smart sensors, especially for data
transmission, has always been very important \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:UEU,
author = "Yehao Li and Jiahao Fan and Yingwei Pan and Ting Yao
and Weiyao Lin and Tao Mei",
title = "{Uni-EDEN}: Universal Encoder-Decoder Network by
Multi-Granular Vision-Language Pre-training",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "48:1--48:16",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3473140",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3473140",
abstract = "Vision-language pre-training has been an emerging and
fast-developing research topic, which transfers
multi-modal knowledge from rich-resource pre-training
task to limited-resource downstream tasks. Unlike
existing works that predominantly learn a single
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2022:CSL,
author = "Shenming Feng and Xingzhong Nong and Haifeng Hu",
title = "Cascaded Structure-Learning Network with Using
Adversarial Training for Robust Facial Landmark
Detection",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "49:1--49:20",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3474595",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3474595",
abstract = "Recently, great progress has been achieved on facial
landmark detection based on convolutional neural
network, while it is still challenging due to partial
occlusion and extreme head pose. In this paper, we
propose a Cascaded Structure-Learning Network
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Damme:2022:MLB,
author = "Sam {Van Damme} and Maria {Torres Vega} and Filip {De
Turck}",
title = "Machine Learning Based Content-Agnostic Viewport
Prediction for 360-Degree Video",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "50:1--50:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3474833",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3474833",
abstract = "Accurate and fast estimations or predictions of the
(near) future location of the users of head-mounted
devices within the virtual omnidirectional environment
open a plethora of opportunities in application domains
such as interactive immersive gaming and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yeh:2022:GVW,
author = "Chih-Kuo Yeh and Thi-Ngoc-Hanh Le and Zhi-Ying Hou and
Tong-Yee Lee",
title = "Generating Virtual Wire Sculptural Art from {$3$D}
Models",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "51:1--51:23",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3475798",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3475798",
abstract = "Wire sculptures are objects sculpted by the use of
wires. In this article, we propose practical methods to
create 3D virtual wire sculptural art from a given 3D
model. In contrast, most of the previous 3D wire art
results are reconstructed from input 2D \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2022:RGJ,
author = "Teng Sun and Chun Wang and Xuemeng Song and Fuli Feng
and Liqiang Nie",
title = "Response Generation by Jointly Modeling Personalized
Linguistic Styles and Emotions",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "52:1--52:20",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3475872",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3475872",
abstract = "Natural language generation (NLG) has been an
essential technique for various applications, like
XiaoIce and Siri, and engaged increasing attention
recently. To improve the user experience, several
emotion-aware NLG methods have been developed to
generate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Francis:2022:GRS,
author = "Jobin Francis and M. Baburaj and Sudhish N. George",
title = "An $ l_{1 / 2} $ and Graph Regularized Subspace
Clustering Method for Robust Image Segmentation",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "53:1--53:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3476514",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3476514",
abstract = "Segmenting meaningful visual structures from an image
is a fundamental and most-addressed problem in image
analysis algorithms. However, among factors such as
diverse visual patterns, noise, complex backgrounds,
and similar textures present in foreground \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:WYE,
author = "Jiahao Wang and Yunhong Wang and Nina Weng and Tianrui
Chai and Annan Li and Faxi Zhang and Samsi Yu",
title = "Will You Ever Become Popular? {Learning} to Predict
Virality of Dance Clips",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "54:1--54:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477533",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3477533",
abstract = "Dance challenges are going viral in video communities
like TikTok nowadays. Once a challenge becomes popular,
thousands of short-form videos will be uploaded within
a couple of days. Therefore, virality prediction from
dance challenges is of great \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhong:2022:DSA,
author = "Sheng-Hua Zhong and Jingxu Lin and Jianglin Lu and
Ahmed Fares and Tongwei Ren",
title = "Deep Semantic and Attentive Network for Unsupervised
Video Summarization",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "55:1--55:21",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477538",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3477538",
abstract = "With the rapid growth of video data, video
summarization is a promising approach to shorten a
lengthy video into a compact version. Although
supervised summarization approaches have achieved
state-of-the-art performance, they require frame-level
annotated \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2022:MIL,
author = "Yawen Zeng and Da Cao and Shaofei Lu and Hanling Zhang
and Jiao Xu and Zheng Qin",
title = "Moment is Important: Language-Based Video Moment
Retrieval via Adversarial Learning",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "56:1--56:21",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3478025",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3478025",
abstract = "The newly emerging language-based video moment
retrieval task aims at retrieving a target video moment
from an untrimmed video given a natural language as the
query. It is more applicable in reality since it is
able to accurately localize a specific video \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:LTP,
author = "Hanjie Wu and Yongtuo Liu and Hongmin Cai and
Shengfeng He",
title = "Learning Transferable Perturbations for Image
Captioning",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "57:1--57:18",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3478024",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3478024",
abstract = "Present studies have discovered that state-of-the-art
deep learning models can be attacked by small but
well-designed perturbations. Existing attack algorithms
for the image captioning task is time-consuming, and
their generated adversarial examples \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2022:SSS,
author = "Ziyi Sun and Yunfeng Zhang and Fangxun Bao and Ping
Wang and Xunxiang Yao and Caiming Zhang",
title = "{SADnet}: Semi-supervised Single Image Dehazing Method
Based on an Attention Mechanism",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "58:1--58:23",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3478457",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3478457",
abstract = "Many real-life tasks such as military reconnaissance
and traffic monitoring require high-quality images.
However, images acquired in foggy or hazy weather pose
obstacles to the implementation of these real-life
tasks; consequently, image dehazing is an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:TIS,
author = "Feifei Zhang and Mingliang Xu and Changsheng Xu",
title = "Tell, Imagine, and Search: End-to-end Learning for
Composing Text and Image to Image Retrieval",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "59:1--59:23",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3478642",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3478642",
abstract = "Composing Text and Image to Image Retrieval ( CTI-IR )
is an emerging task in computer vision, which allows
retrieving images relevant to a query image with text
describing desired modifications to the query image.
Most conventional cross-modal retrieval \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2022:SAM,
author = "Haoyu Ma and Bingchen Gong and Yizhou Yu",
title = "Structure-aware Meta-fusion for Image
Super-resolution",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "60:1--60:25",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3477553",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3477553",
abstract = "There are two main categories of image
super-resolution algorithms: distortion oriented and
perception oriented. Recent evidence shows that
reconstruction accuracy and perceptual quality are
typically in disagreement with each other. In this
article, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tahir:2022:NAT,
author = "Madiha Tahir and Zahid Halim and Atta Ur Rahman and
Muhammad Waqas and Shanshan Tu and Sheng Chen and Zhu
Han",
title = "Non-Acted Text and Keystrokes Database and Learning
Methods to Recognize Emotions",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "61:1--61:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3480968",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3480968",
abstract = "The modern computing applications are presently
adapting to the convenient availability of huge and
diverse data for making their pattern recognition
methods smarter. Identification of dominant emotion
solely based on the text data generated by humans is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fincato:2022:TWD,
author = "Matteo Fincato and Marcella Cornia and Federico Landi
and Fabio Cesari and Rita Cucchiara",
title = "Transform, Warp, and Dress: a New
Transformation-guided Model for Virtual Try-on",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "62:1--62:24",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491226",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491226",
abstract = "Virtual try-on has recently emerged in computer vision
and multimedia communities with the development of
architectures that can generate realistic images of a
target person wearing a custom garment. This research
interest is motivated by the large role \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2022:AMG,
author = "Ning Han and Jingjing Chen and Hao Zhang and Huanwen
Wang and Hao Chen",
title = "Adversarial Multi-Grained Embedding Network for
Cross-Modal Text-Video Retrieval",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "63:1--63:23",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3483381",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3483381",
abstract = "Cross-modal retrieval between texts and videos has
received consistent research interest in the multimedia
community. Existing studies follow a trend of learning
a joint embedding space to measure the distance between
text and video representations. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pang:2022:FUP,
author = "Bo Pang and Deming Zhai and Junjun Jiang and Xianming
Liu",
title = "Fully Unsupervised Person Re-Identification via
Selective Contrastive Learning",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "64:1--64:15",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485061",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485061",
abstract = "Person re-identification (ReID) aims at searching the
same identity person among images captured by various
cameras. Existing fully supervised person ReID methods
usually suffer from poor generalization capability
caused by domain gaps. Unsupervised \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhuang:2022:MAD,
author = "Wenlin Zhuang and Congyi Wang and Jinxiang Chai and
Yangang Wang and Ming Shao and Siyu Xia",
title = "{Music2Dance}: {DanceNet} for Music-Driven Dance
Generation",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "65:1--65:21",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485664",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485664",
abstract = "Synthesize human motions from music (i.e., music to
dance) is appealing and has attracted lots of research
interests in recent years. It is challenging because of
the requirement for realistic and complex human motions
for dance, but more importantly, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cetinic:2022:UCA,
author = "Eva Cetinic and James She",
title = "Understanding and Creating Art with {AI}: Review and
Outlook",
journal = j-TOMM,
volume = "18",
number = "2",
pages = "66:1--66:22",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3475799",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:54 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3475799",
abstract = "Technologies related to artificial intelligence (AI)
have a strong impact on the changes of research and
creative practices in visual arts. The growing number
of research initiatives and creative applications that
emerge in the intersection of AI and art \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:DVS,
author = "Zheng Zhang and Jianning Wang and Lei Zhu and
Guangming Lu",
title = "Discriminative Visual Similarity Search with
Semantically Cycle-consistent Hashing Networks",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "114:1--114:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532519",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532519",
abstract = "Deep hashing has great potential in large-scale visual
similarity search due to its preferable efficiency in
storage and computation. Technically, deep \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "114",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ge:2022:DVD,
author = "Shiming Ge and Fanzhao Lin and Chenyu Li and Daichi
Zhang and Weiping Wang and Dan Zeng",
title = "Deepfake Video Detection via Predictive Representation
Learning",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "115:1--115:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3536426",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3536426",
abstract = "Increasingly advanced deepfake approaches have made
the detection of deepfake videos very challenging. We
observe that the general deepfake videos often
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "115",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Galteri:2022:LLB,
author = "Leonardo Galteri and Lorenzo Seidenari and Pietro
Bongini and Marco Bertini and Alberto {Del Bimbo}",
title = "{LANBIQUE}: {LANguage-based Blind Image QUality
Evaluation}",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "116:1--116:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538649",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538649",
abstract = "Image quality assessment is often performed with deep
networks that are fine-tuned to regress a human
provided quality score of a given image. Usually, this
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "116",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2022:SCC,
author = "Zhihan Lv and Dongliang Chen and Haibin Lv",
title = "Smart City Construction and Management by Digital
Twins and {BIM} Big Data in {COVID-19} Scenario",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "117:1--117:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3529395",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3529395",
abstract = "With the rapid development of information technology
and the spread of Corona Virus Disease 2019 (COVID-19),
the government and urban managers are looking
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Anand:2022:CSD,
author = "Ashima Anand and Amit Kumar Singh",
title = "A Comprehensive Study of Deep Learning-based Covert
Communication",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "118:1--118:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3508365",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3508365",
abstract = "Deep learning-based methods have been popular in
multimedia analysis tasks, including classification,
detection, segmentation, and so on. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "118",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2022:EAC,
author = "Haotian Xu and Xiaobo Jin and Qiufeng Wang and Amir
Hussain and Kaizhu Huang",
title = "Exploiting Attention-Consistency Loss For
Spatial-Temporal Stream Action Recognition",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "119:1--119:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538749",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538749",
abstract = "Currently, many action recognition methods mostly
consider the information from spatial streams. We
propose a new perspective inspired by the human visual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "119",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Salim:2022:PED,
author = "Sara Salim and Nour Moustafa and Benjamin Turnbull and
Imran Razzak",
title = "Perturbation-enabled Deep Federated Learning for
Preserving {Internet of Things}-based Social Networks",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "120:1--120:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3537899",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3537899",
abstract = "Federated Learning (FL), as an emerging form of
distributed machine learning (ML), can protect
participants' private data from being substantially
disclosed to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "120",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bi:2022:DTE,
author = "An-Qi Bi and Xiao-Yang Tian and Shui-Hua Wang and
Yu-Dong Zhang",
title = "Dynamic Transfer Exemplar based Facial Emotion
Recognition Model Toward Online Video",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "121:1--121:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538385",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538385",
abstract = "In this article, we focus on the dynamic facial
emotion recognition from online video. We combine deep
neural networks with transfer learning theory and
propose a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "121",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Golmaryami:2022:SSS,
author = "Marjan Golmaryami and Rahim Taheri and Zahra Pooranian
and Mohammad Shojafar and Pei Xiao",
title = "{SETTI}: a {Self-supervised AdvErsarial Malware
DeTection ArchiTecture in an IoT} Environment",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "122:1--122:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3536425",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3536425",
abstract = "In recent years, malware detection has become an
active research topic in the area of Internet of Things
(IoT) security. The principle is to exploit knowledge
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "122",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Khan:2022:PPM,
author = "Abbas Khan and Ijaz {Ul Haq} and Tanveer Hussain and
Khan Muhammad and Mohammad Hijji and Muhammad Sajjad
and Victor Hugo C. {De Albuquerque} and Sung Wook
Baik",
title = "{PMAL}: a Proxy Model Active Learning Approach for
Vision Based Industrial Applications",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "123:1--123:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3534932",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3534932",
abstract = "Deep Learning models' performance strongly correlate
with availability of annotated data; however, massive
data labeling is laborious, expensive, and error-prone
when \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "123",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2022:DQN,
author = "Chenyi Yang and Xiaolong Xu and Xiaokang Zhou and
Lianyong Qi",
title = "{Deep Q} Network-Driven Task Offloading for Efficient
Multimedia Data Analysis in Edge Computing-Assisted
{IoV}",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "124:1--124:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3548687",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3548687",
abstract = "With the prosperity of Industry 4.0, numerous emerging
industries continue to gain popularity and their market
scales are expanding ceaselessly. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "124",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tiwari:2022:ODN,
author = "Arti Tiwari and Millie Pant",
title = "Optimized Deep-Neural Network for Content-based
Medical Image Retrieval in a Brownfield {IoMT}
Network",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "125:1--125:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546194",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3546194",
abstract = "In this paper, a brownfield Internet of Medical Things
network is introduced for imaging data that can be
easily scaled out depending on the objectives,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "125",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2022:SFM,
author = "Wei Huang and Yuze Zhang and Shaohua Wan",
title = "A Sorting Fuzzy Min-Max Model in an Embedded System
for Atrial Fibrillation Detection",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "126:1--126:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3554737",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3554737",
abstract = "Atrial fibrillation detection (AFD) has attracted much
attention in the field of embedded systems. In this
study, we propose a sorting fuzzy min-max \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "126",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2022:ISS,
author = "Xun Yang and Liang Zheng and Elisa Ricci and Meng
Wang",
title = "Introduction to the Special Section on Learning
Representations, Similarity, and Associations in
Dynamic Multimedia Environments",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "127:1--127:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3569952",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3569952",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127e",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{He:2022:RLD,
author = "Jun He and Richang Hong and Xueliang Liu and Mingliang
Xu and Qianru Sun",
title = "Revisiting Local Descriptor for Improved Few-Shot
Classification",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "127:1--127:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3511917",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3511917",
abstract = "Few-shot classification studies the problem of quickly
adapting a deep learner to understanding novel classes
based on few support images. In this context,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jiao:2022:GGL,
author = "Yingying Jiao and Haipeng Chen and Runyang Feng and
Haoming Chen and Sifan Wu and Yifang Yin and Zhenguang
Liu",
title = "{GLPose}: Global-Local Representation Learning for
Human Pose Estimation",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "128:1--128:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3519305",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3519305",
abstract = "Multi-frame human pose estimation is at the core of
many computer vision tasks. Although state-of-the-art
approaches have demonstrated remarkable results
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "128",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2022:STS,
author = "Qing Han and Huiting Liu and Weidong Min and Tiemei
Huang and Deyu Lin and Qi Wang",
title = "{$3$D} Skeleton and Two Streams Approach to Person
Re-identification Using Optimized Region Matching",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "129:1--129:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538490",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538490",
abstract = "Person re-identification (Re-ID) is a challenging and
arduous task due to non-overlapping views, complex
background, and uncontrollable occlusion in video
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "129",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2022:RRL,
author = "Xin Xu and Xin Yuan and Zheng Wang and Kai Zhang and
Ruimin Hu",
title = "Rank-in-Rank Loss for Person Re-identification",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "130:1--130:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532866",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532866",
abstract = "Person re-identification (re-ID) is commonly
investigated as a ranking problem. However, the
performance of existing re-ID models drops
dramatically, when they \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "130",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:GGA,
author = "Kunpeng Li and Chang Liu and Mike Stopa and Jun Amano
and Yun Fu",
title = "Guided Graph Attention Learning for Video-Text
Matching",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "131:1--131:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538533",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538533",
abstract = "As a bridge between videos and natural languages,
video-text matching has been a hot multimedia research
topic in recent years. Such cross-modal retrieval
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "131",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Biondi:2022:CRC,
author = "Niccol{\'o} Biondi and Federico Pernici and Matteo
Bruni and Daniele Mugnai and Alberto {Del Bimbo}",
title = "{CL$^2$R}: Compatible Lifelong Learning
Representations",
journal = j-TOMM,
volume = "18",
number = "2s",
pages = "132:1--132:??",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3564786",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3564786",
abstract = "In this article, we propose a method to partially
mimic natural intelligence for the problem of lifelong
learning representations that are compatible. We take
the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "132",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pan:2022:CIK,
author = "Yonghua Pan and Zechao Li and Liyan Zhang and Jinhui
Tang",
title = "Causal Inference with Knowledge Distilling and
Curriculum Learning for Unbiased {VQA}",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "67:1--67:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3487042",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3487042",
abstract = "Recently, many Visual Question Answering (VQA) models
rely on the correlations between questions and answers
yet neglect those between the visual information and
the textual information. They would perform badly if
the handled data distribute differently \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yanagi:2022:IRR,
author = "Rintaro Yanagi and Ren Togo and Takahiro Ogawa and
Miki Haseyama",
title = "Interactive Re-ranking via Object Entropy-Guided
Question Answering for Cross-Modal Image Retrieval",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "68:1--68:17",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485042",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485042",
abstract = "Cross-modal image-retrieval methods retrieve desired
images from a query text by learning relationships
between texts and images. Such a retrieval approach is
one of the most effective ways of achieving the
easiness of query preparation. Recent cross-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2022:SIN,
author = "Qinghongya Shi and Hong-Bo Zhang and Zhe Li and
Ji-Xiang Du and Qing Lei and Jing-Hua Liu",
title = "Shuffle-invariant Network for Action Recognition in
Videos",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "69:1--69:18",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3485665",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3485665",
abstract = "The local key features in video are important for
improving the accuracy of human action recognition.
However, most end-to-end methods focus on global
feature learning from videos, while few works consider
the enhancement of the local information in a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2022:LAS,
author = "Di Yuan and Xiaojun Chang and Zhihui Li and Zhenyu
He",
title = "Learning Adaptive Spatial-Temporal Context-Aware
Correlation Filters for {UAV} Tracking",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "70:1--70:18",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3486678",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3486678",
abstract = "Tracking in the unmanned aerial vehicle (UAV)
scenarios is one of the main components of
target-tracking tasks. Different from the
target-tracking task in the general scenarios, the
target-tracking task in the UAV scenarios is very
challenging because of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2022:ESR,
author = "Guofei Sun and Yongkang Wong and Mohan S. Kankanhalli
and Xiangdong Li and Weidong Geng",
title = "Enhanced {$3$D} Shape Reconstruction With Knowledge
Graph of Category Concept",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "71:1--71:20",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491224",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491224",
abstract = "Reconstructing three-dimensional (3D) objects from
images has attracted increasing attention due to its
wide applications in computer vision and robotic tasks.
Despite the promising progress of recent deep
learning-based approaches, which directly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:DIG,
author = "Jinfeng Li and Weifeng Liu and Yicong Zhou and Jun Yu
and Dapeng Tao and Changsheng Xu",
title = "Domain-invariant Graph for Adaptive Semi-supervised
Domain Adaptation",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "72:1--72:18",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3487194",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3487194",
abstract = "Domain adaptation aims to generalize a model from a
source domain to tackle tasks in a related but
different target domain. Traditional domain adaptation
algorithms assume that enough labeled data, which are
treated as the prior knowledge are available in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2022:OOS,
author = "Ran Shi and Jing Ma and King Ngi Ngan and Jian Xiong
and Tong Qiao",
title = "Objective Object Segmentation Visual Quality
Evaluation: Quality Measure and Pooling Method",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "73:1--73:19",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491229",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491229",
abstract = "Objective object segmentation visual quality
evaluation is an emergent member of the visual quality
assessment family. It aims to develop an objective
measure instead of a subjective survey to evaluate the
object segmentation quality in agreement with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2022:CAS,
author = "Linghua Zeng and Xinmei Tian",
title = "{CRAR}: Accelerating Stereo Matching with Cascaded
Residual Regression and Adaptive Refinement",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "74:1--74:19",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3488719",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3488719",
abstract = "Dense stereo matching estimates the depth for each
pixel of the referenced images. Recently, deep learning
algorithms have dramatically promoted the development
of stereo matching. The state-of-the-art result is
achieved by models adopting deep \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yao:2022:RGA,
author = "Lingxiang Yao and Worapan Kusakunniran and Qiang Wu
and Jingsong Xu and Jian Zhang",
title = "Recognizing Gaits Across Walking and Running Speeds",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "75:1--75:22",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3488715",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3488715",
abstract = "For decades, very few methods were proposed for
cross-mode (i.e., walking vs. running) gait
recognition. Thus, it remains largely unexplored
regarding how to recognize persons by the way they walk
and run. Existing cross-mode methods handle the
walking- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:IKB,
author = "Qun Li and Fu Xiao and Bir Bhanu and Biyun Sheng and
Richang Hong",
title = "Inner Knowledge-based {Img2Doc} Scheme for Visual
Question Answering",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "76:1--76:21",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3489142",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3489142",
abstract = "Visual Question Answering (VQA) is a research topic of
significant interest at the intersection of computer
vision and natural language understanding. Recent
research indicates that attributes and knowledge can
effectively improve performance for both \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cornia:2022:MFA,
author = "Marcella Cornia and Matteo Tomei and Lorenzo Baraldi
and Rita Cucchiara",
title = "Matching Faces and Attributes Between the Artistic and
the Real Domain: the {PersonArt} Approach",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "77:1--77:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3490033",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3490033",
abstract = "In this article, we present an approach for retrieving
similar faces between the artistic and the real domain.
The application we refer to is an interactive
exhibition inside a museum, in which a visitor can take
a photo of himself and search for a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yin:2022:MFL,
author = "Guanghao Yin and Shouqian Sun and Dian Yu and Dejian
Li and Kejun Zhang",
title = "A Multimodal Framework for Large-Scale Emotion
Recognition by Fusing Music and Electrodermal Activity
Signals",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "78:1--78:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3490686",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3490686",
abstract = "Considerable attention has been paid to physiological
signal-based emotion recognition in the field of
affective computing. For reliability and user-friendly
acquisition, electrodermal activity (EDA) has a great
advantage in practical applications. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Buckchash:2022:GLG,
author = "Himanshu Buckchash and Balasubramanian Raman",
title = "{GraSP}: Local {Grassmannian} Spatio-Temporal Patterns
for Unsupervised Pose Sequence Recognition",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "79:1--79:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491227",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491227",
abstract = "Many applications of action recognition, especially
broad domains like surveillance or anomaly-detection,
favor unsupervised methods considering that exhaustive
labeling of actions is not possible. However, very
limited work has happened in this domain. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2022:SSR,
author = "Xiaoguang Zhu and Ye Zhu and Haoyu Wang and Honglin
Wen and Yan Yan and Peilin Liu",
title = "Skeleton Sequence and {RGB} Frame Based Multi-Modality
Feature Fusion Network for Action Recognition",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "80:1--80:24",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491228",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491228",
abstract = "Action recognition has been a heated topic in computer
vision for its wide application in vision systems.
Previous approaches achieve improvement by fusing the
modalities of the skeleton sequence and RGB video.
However, such methods pose a dilemma between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chowdhury:2022:DGS,
author = "Debanjan Roy Chowdhury and Sukumar Nandi and Diganta
Goswami",
title = "Distributed Gateway Selection for Video Streaming in
{VANET} Using {IP} Multicast",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "81:1--81:24",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491388",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491388",
abstract = "The volume of video traffic as infotainment service
over vehicular ad hoc network (VANET) has rapidly
increased for past few years. Providing video streaming
as VANET infotainment service is very challenging
because of high mobility and heterogeneity of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alaya:2022:MVE,
author = "Bechir Alaya and Lamaa Sellami",
title = "Multilayer Video Encoding for {QoS} Managing of Video
Streaming in {VANET} Environment",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "82:1--82:19",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491433",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491433",
abstract = "Efficient delivery and maintenance of the quality of
service (QoS) of audio/video streams transmitted over
VANETs for mobile and heterogeneous nodes are one of
the major challenges in the convergence of this network
type and these services. In this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:WPM,
author = "Yike Wu and Shiwan Zhao and Ying Zhang and Xiaojie
Yuan and Zhong Su",
title = "When Pairs Meet Triplets: Improving Low-Resource
Captioning via Multi-Objective Optimization",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "83:1--83:20",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3492325",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3492325",
abstract = "Image captioning for low-resource languages has
attracted much attention recently. Researchers propose
to augment the low-resource caption dataset into
(image, rich-resource language, and low-resource
language) triplets and develop the dual attention
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2022:ICD,
author = "Kai-Wei Yang and Yen-Yun Huang and Jen-Wei Huang and
Ya-Rou Hsu and Chang-Lin Wan and Hong-Han Shuai and
Li-Chun Wang and Wen-Huang Cheng",
title = "Improving Crowd Density Estimation by Fusing Aerial
Images and Radio Signals",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "84:1--84:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3492346",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3492346",
abstract = "A recent line of research focuses on crowd density
estimation from RGB images for a variety of
applications, for example, surveillance and traffic
flow control. The performance drops dramatically for
low-quality images, such as occlusion, or poor light
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xia:2022:FCS,
author = "Zhihua Xia and Qiuju Ji and Qi Gu and Chengsheng Yuan
and Fengjun Xiao",
title = "A Format-compatible Searchable Encryption Scheme for
{JPEG} Images Using Bag-of-words",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "85:1--85:18",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3492705",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3492705",
abstract = "The development of cloud computing attracts
enterprises and individuals to outsource their data,
such as images, to the cloud server. However, direct
outsourcing causes the extensive concern of privacy
leakage, as images often contain rich sensitive
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Natgunanathan:2022:BBA,
author = "Iynkaran Natgunanathan and Purathani Praitheeshan and
Longxiang Gao and Yong Xiang and Lei Pan",
title = "Blockchain-Based Audio Watermarking Technique for
Multimedia Copyright Protection in Distribution
Networks",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "86:1--86:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3492803",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3492803",
abstract = "Copyright protection in multimedia protection
distribution is a challenging problem. To protect
multimedia data, many watermarking methods have been
proposed in the literature. However, most of them
cannot be used effectively in a multimedia distribution
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2022:DIE,
author = "Kehua Guo and Min Hu and Sheng Ren and Fangfang Li and
Jian Zhang and Haifu Guo and Xiaoyan Kui",
title = "Deep Illumination-Enhanced Face Super-Resolution
Network for Low-Light Images",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "87:1--87:19",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3495258",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3495258",
abstract = "Face images are typically a key component in the
fields of security and criminal investigation. However,
due to lighting and shooting angles, faces taken under
low-light conditions are often difficult to recognize.
Face super-resolution (FSR) technology \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:SSM,
author = "Xiaoming Liu and Shuo Wang and Ying Zhang and Quan
Yuan",
title = "Scribble-Supervised Meibomian Glands Segmentation in
Infrared Images",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "88:1--88:23",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3497747",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3497747",
abstract = "Infrared imaging is currently the most effective
clinical method to evaluate the morphology of the
meibomian glands (MGs) in patients. As an important
indicator for monitoring the development of MG
dysfunction, it is necessary to accurately measure
gland- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2022:TII,
author = "Kedar Nath Singh and Amit Kumar Singh",
title = "Towards Integrating Image Encryption with Compression:
a Survey",
journal = j-TOMM,
volume = "18",
number = "3",
pages = "89:1--89:21",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3498342",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:55 MDT 2022",
bibsource = "bhttps://www.math.utah.edu/pub/tex/bib/tomccap.bib;
https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib",
URL = "https://dl.acm.org/doi/10.1145/3498342",
abstract = "As digital images are consistently generated and
transmitted online, the unauthorized utilization of
these images is an increasing concern that has a
significant impact on both security and privacy issues;
additionally, the representation of digital \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{MontenegroMarin:2022:ISI,
author = "Carlos Enrique {Montenegro Marin} and Dinesh Jackson
Samuel and Nallappan Gunasekaran",
title = "Introduction to the Special Issue on {6G} Enabled
Interactive Multimedia Communication Systems",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "133:1--133:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3567835",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3567835",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133e",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:CAP,
author = "Ran Li and Wei Wei and Peinan Hao and Jian Su and
Fengyuan Sun",
title = "Context-aware Pseudo-true Video Interpolation at {6G}
Edge",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "133:1--133:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3555313",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3555313",
abstract = "In the 6G network, lots of edge devices facilitate the
low-latency transmission of video. However, with
limited processing and storage capabilities, the edge
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alharbi:2022:NSA,
author = "Abdullah Alharbi and Mohammed Aljebreen and Amr Tolba
and Konstantinos A. Lizos and Saied Abd El-Atty and
Farid Shawki",
title = "A Normalized Slicing-assigned Virtualization Method
for 6G-based Wireless Communication Systems",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "134:1--134:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546077",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3546077",
abstract = "The next generation of wireless communication systems
will rely on advantageous sixth-generation wireless
network (6G) features and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "134",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:ISIb,
author = "Yin Zhang and Iztok Humar and Jia Liu and Alireza
Jolfaei",
title = "Introduction to the Special Issue on Affective
Services based on Representation Learning",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "135:1--135:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3567836",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3567836",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "135e",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2022:DBJ,
author = "Kexin Xu and Haijun Zhang and Keping Long and Jianquan
Wang and Lei Sun",
title = "{DRL} based Joint Affective Services Computing and
Resource Allocation in {ISTN}",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "135:1--135:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561821",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561821",
abstract = "Affective services will become a research hotspot in
artificial intelligence (AI) in the next decade. In
this paper, a novel service paradigm combined with
wireless \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "135",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:AIA,
author = "Yazhou Zhang and Prayag Tiwari and Lu Rong and Rui
Chen and Nojoom A. Alnajem and M. Shamim Hossain",
title = "Affective Interaction: Attentive Representation
Learning for Multi-Modal Sentiment Classification",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "136:1--136:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527175",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3527175",
abstract = "The recent booming of artificial intelligence (AI)
applications, e.g., affective robots, human-machine
interfaces, autonomous vehicles, and so on, has
produced \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "136",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:BRJ,
author = "Xiaoqin Wang and Chen Chen and Rushi Lan and Licheng
Liu and Zhenbing Liu and Huiyu Zhou and Xiaonan Luo",
title = "Binary Representation via Jointly Personalized Sparse
Hashing",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "137:1--137:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3558769",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3558769",
abstract = "Unsupervised hashing has attracted much attention for
binary representation learning due to the requirement
of economical storage and efficiency of binary
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "137",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jin:2022:AAA,
author = "Xin Jin and Xinning Li and Hao Lou and Chenyu Fan and
Qiang Deng and Chaoen Xiao and Shuai Cui and Amit Kumar
Singh",
title = "Aesthetic Attribute Assessment of Images Numerically
on Mixed Multi-attribute Datasets",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "138:1--138:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3547144",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3547144",
abstract = "With the continuous development of social software and
multimedia technology, images have become a kind of
important carrier for spreading information and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "138",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cao:2022:SBH,
author = "Jie Cao and Youquan Wang and Haicheng Tao and Xiang
Guo",
title = "Sensor-based Human Activity Recognition Using Graph
{LSTM} and Multi-task Classification Model",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "139:1--139:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561387",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561387",
abstract = "This paper explores human activities recognition from
sensor-based multi-dimensional streams. Recently, deep
learning-based methods such as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "139",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2022:OTV,
author = "Jiawei Huang and Qichen Su and Weihe Li and Zhuoran
Liu and Tao Zhang and Sen Liu and Ping Zhong and
Wanchun Jiang and Jianxin Wang",
title = "Opportunistic Transmission for Video Streaming over
Wild {Internet}",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "140:1--140:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3488722",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3488722",
abstract = "The video streaming system employs adaptive bitrate
(ABR) algorithms to optimize a user's quality of
experience. However, it is hard for ABR \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "140",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duanmu:2022:BQE,
author = "Zhengfang Duanmu and Wentao Liu and Diqi Chen and
Zhuoran Li and Zhou Wang and Yizhou Wang and Wen Gao",
title = "A {Bayesian} Quality-of-Experience Model for Adaptive
Streaming Videos",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "141:1--141:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3491432",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3491432",
abstract = "The fundamental conflict between the enormous space of
adaptive streaming videos and the limited capacity for
subjective experiment casts significant \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "141",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ignat:2022:WDI,
author = "Oana Ignat and Santiago Castro and Yuhang Zhou and
Jiajun Bao and Dandan Shan and Rada Mihalcea",
title = "When Did It Happen? {Duration}-informed Temporal
Localization of Narrated Actions in Vlogs",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "142:1--142:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3495211",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3495211",
abstract = "We consider the task of temporal human action
localization in lifestyle vlogs. We introduce a novel
dataset consisting of manual annotations of temporal
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "142",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2022:HMU,
author = "Wuzhen Shi and Shaohui Liu",
title = "Hiding Message Using a Cycle Generative Adversarial
Network",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "143:1--143:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3495566",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3495566",
abstract = "Training an image steganography is an unsupervised
problem, because it is impossible to obtain an ideal
supervised steganographic image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "143",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hui:2022:STC,
author = "Chen Hui and Shaohui Liu and Wuzhen Shi and Feng Jiang
and Debin Zhao",
title = "Spatio-Temporal Context Based Adaptive Camcorder
Recording Watermarking",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "144:1--144:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3503160",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3503160",
abstract = "Video watermarking technology has attracted increasing
attention in the past few years, and a great deal of
traditional and deep learning-based methods \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "144",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2022:BAP,
author = "Jian Zhao and Xianhui Liu and Weidong Zhao",
title = "Balanced and Accurate Pseudo-Labels for
Semi-Supervised Image Classification",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "145:1--145:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506711",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506711",
abstract = "Image classification by semi-supervised learning has
recently become a hot spot, and the Co-Training
framework is an important method of semi-supervised
image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "145",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Stacchio:2022:THA,
author = "Lorenzo Stacchio and Alessia Angeli and Giuseppe
Lisanti and Daniela Calanca and Gustavo Marfia",
title = "Toward a Holistic Approach to the Socio-historical
Analysis of Vernacular Photos",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "146:1--146:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3507918",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3507918",
abstract = "Although one of the most popular practices in
photography since the end of the 19th century, an
increase in scholarly interest in family photo albums
dates back to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "146",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiao:2022:DAS,
author = "Hui-Chu Xiao and Wan-Lei Zhao and Jie Lin and Yi-Geng
Hong and Chong-Wah Ngo",
title = "Deeply Activated Salient Region for Instance Search",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "147:1--147:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510004",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3510004",
abstract = "The performance of instance search relies heavily on
the ability to locate and describe a wide variety of
object instances in a video/image collection. Due to
the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "147",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:CEC,
author = "Zuquan Liu and Guopu Zhu and Feng Ding and Xiangyang
Luo and Sam Kwong and Peng Li",
title = "Contrast-Enhanced Color Visual Cryptography for $ (k,
n) $ Threshold Schemes",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "148:1--148:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3508394",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3508394",
abstract = "In traditional visual cryptography schemes (VCSs),
pixel expansion remains to be an unsolved challenge. To
alleviate the impact of pixel expansion, several
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "148",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:DSS,
author = "Zhe Liu and Xian-Hua Han",
title = "Deep Self-Supervised Hyperspectral Image
Reconstruction",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "149:1--149:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510373",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3510373",
abstract = "Reconstructing a high-resolution hyperspectral (HR-HS)
image via merging a low-resolution hyperspectral
(LR-HS) image and a high-resolution RGB \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "149",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2022:SSD,
author = "Gurinder Singh and Puneet Goyal",
title = "{SDCN2}: a Shallow Densely Connected {CNN} for
Multi-Purpose Image Manipulation Detection",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "150:1--150:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510462",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3510462",
abstract = "Digital image information can be easily tampered with
to harm the integrity of someone. Thus, recognizing the
truthfulness and processing history of an image
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "150",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:SGS,
author = "Yunfei Liu and Yu Li and Shaodi You and Feng Lu",
title = "Semantic Guided Single Image Reflection Removal",
journal = j-TOMM,
volume = "18",
number = "3s",
pages = "151:1--151:??",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510821",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:31 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3510821",
abstract = "Reflection is common when we see through a glass
window, which not only is a visual disturbance but also
influences the performance of computer vision
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "151",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:IFD,
author = "Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun
Chen and Yimin Liu",
title = "Improving Feature Discrimination for Object Tracking
by Structural-similarity-based Metric Learning",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "90:1--90:23",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3497746",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3497746",
abstract = "Existing approaches usually form the tracking task as
an appearance matching procedure. However, the
discrimination ability of appearance features is
insufficient in these trackers, which is caused by
their weak feature supervision constraints and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2022:IBP,
author = "Xiaowen Huang and Jitao Sang and Changsheng Xu",
title = "Image-Based Personality Questionnaire Design",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "91:1--91:??",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3503489",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:32 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3503489",
abstract = "This article explores the problem of image-based
personality questionnaire design. Compared with the
traditional text-based personality questionnaire, the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hao:2022:DLL,
author = "Shijie Hao and Xu Han and Yanrong Guo and Meng Wang",
title = "Decoupled Low-Light Image Enhancement",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "92:1--92:19",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3498341",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3498341",
abstract = "The visual quality of photographs taken under
imperfect lightness conditions can be degenerated by
multiple factors, e.g., low lightness, imaging noise,
color distortion, and so on. Current low-light image
enhancement models focus on the improvement of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:AQR,
author = "Yibing Liu and Yangyang Guo and Jianhua Yin and
Xuemeng Song and Weifeng Liu and Liqiang Nie and Min
Zhang",
title = "Answer Questions with Right Image Regions: a Visual
Attention Regularization Approach",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "93:1--93:18",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3498340",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3498340",
abstract = "Visual attention in Visual Question Answering (VQA)
targets at locating the right image regions regarding
the answer prediction, offering a powerful technique to
promote multi-modal understanding. However, recent
studies have pointed out that the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2022:DAM,
author = "Yang Yu and Rongrong Ni and Wenjie Li and Yao Zhao",
title = "Detection of {AI-Manipulated} Fake Faces via Mining
Generalized Features",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "94:1--94:23",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3499026",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3499026",
abstract = "Recently, AI-manipulated face techniques have
developed rapidly and constantly, which has raised new
security issues in society. Although existing detection
methods consider different categories of fake faces,
the performance on detecting the fake faces \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cheng:2022:CMG,
author = "Yuhao Cheng and Xiaoguang Zhu and Jiuchao Qian and Fei
Wen and Peilin Liu",
title = "Cross-modal Graph Matching Network for Image-text
Retrieval",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "95:1--95:23",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3499027",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3499027",
abstract = "Image-text retrieval is a fundamental cross-modal task
whose main idea is to learn image-text matching.
Generally, according to whether there exist
interactions during the retrieval process, existing
image-text retrieval methods can be classified into
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dogariu:2022:GRS,
author = "Mihai Dogariu and Liviu-Daniel {\c{S}}tefan and Bogdan
Andrei Boteanu and Claudiu Lamba and Bomi Kim and
Bogdan Ionescu",
title = "Generation of Realistic Synthetic Financial
Time-series",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "96:1--96:27",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501305",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3501305",
abstract = "Financial markets have always been a point of interest
for automated systems. Due to their complex nature,
financial algorithms and fintech frameworks require
vast amounts of data to accurately respond to market
fluctuations. This data availability is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2022:CMS,
author = "Yi Zheng and Yong Zhou and Jiaqi Zhao and Ying Chen
and Rui Yao and Bing Liu and Abdulmotaleb {El Saddik}",
title = "Clustering Matters: Sphere Feature for Fully
Unsupervised Person Re-identification",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "97:1--97:18",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501404",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3501404",
abstract = "In person re-identification (Re-ID), the data
annotation cost of supervised learning, is huge and it
cannot adapt well to complex situations. Therefore,
compared with supervised deep learning methods,
unsupervised methods are more in line with actual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2022:HMB,
author = "Zengming Tang and Jun Huang",
title = "Harmonious Multi-branch Network for Person
Re-identification with Harder Triplet Loss",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "98:1--98:21",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501405",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3501405",
abstract = "Recently, advances in person re-identification (Re-ID)
has benefitted from use of the popular multi-branch
network. However, performing feature learning in a
single branch with uniform partitioning is likely to
separate meaningful local regions, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2022:TCA,
author = "Yifan Xu and Kekai Sheng and Weiming Dong and Baoyuan
Wu and Changsheng Xu and Bao-Gang Hu",
title = "Towards Corruption-Agnostic Robust Domain Adaptation",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "99:1--99:16",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501800",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3501800",
abstract = "Great progress has been achieved in domain adaptation
in decades. Existing works are always based on an ideal
assumption that testing target domains are independent
and identically distributed with training target
domains. However, due to unpredictable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2022:JSC,
author = "Jinzhi Lin and Yun Zhang and Na Li and Hongling
Jiang",
title = "Joint Source-Channel Decoding of Polar Codes for
{HEVC}-Based Video Streaming",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "100:1--100:23",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3502208",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3502208",
abstract = "Ultra High-Definition (UHD) and Virtual Reality (VR)
video streaming over 5G networks are emerging, in which
High-Efficiency Video Coding (HEVC) is used as source
coding to compress videos more efficiently and polar
code is used as channel coding to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2022:DES,
author = "Yongrui Li and Zengfu Wang and Jun Yu",
title = "Densely Enhanced Semantic Network for Conversation
System in Social Media",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "101:1--101:24",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3501799",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3501799",
abstract = "The human-computer conversation system is a
significant application in the field of multimedia. To
select an appropriate response, retrieval-based systems
model the matching between the dialogue history and
response candidates. However, most of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2022:NCN,
author = "Kai Lin and Chuanmin Jia and Xinfeng Zhang and Shanshe
Wang and Siwei Ma and Wen Gao",
title = "{NR-CNN}: Nested-Residual Guided {CNN} In-loop
Filtering for Video Coding",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "102:1--102:22",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3502723",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3502723",
abstract = "Recently, deep learning for video coding, such as deep
predictive coding, deep transform coding, and deep
in-loop filtering, has been an emerging research area.
The coding gain of hybrid coding framework could be
extensively promoted by the data-driven \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dai:2022:FFS,
author = "Hanbin Dai and Hailin Shi and Wu Liu and Linfang Wang
and Yinglu Liu and Tao Mei",
title = "{FasterPose}: a Faster Simple Baseline for Human Pose
Estimation",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "103:1--103:16",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3503464",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3503464",
abstract = "The performance of human pose estimation depends on
the spatial accuracy of keypoint localization. Most
existing methods pursue the spatial accuracy through
learning the high-resolution (HR) representation from
input images. By the experimental analysis, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Man:2022:SAR,
author = "Xin Man and Deqiang Ouyang and Xiangpeng Li and
Jingkuan Song and Jie Shao",
title = "Scenario-Aware Recurrent Transformer for Goal-Directed
Video Captioning",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "104:1--104:17",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3503927",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3503927",
abstract = "Fully mining visual cues to aid in content
understanding is crucial for video captioning. However,
most state-of-the-art video captioning methods are
limited to generating captions purely based on
straightforward information while ignoring the scenario
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "104",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2022:OCC,
author = "Tianjun Zhang and Hao Deng and Lin Zhang and Shengjie
Zhao and Xiao Liu and Yicong Zhou",
title = "Online Correction of Camera Poses for the
Surround-view System: a Sparse Direct Approach",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "106:1--106:24",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3505252",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3505252",
abstract = "The surround-view module is an indispensable component
of a modern advanced driving assistance system. By
calibrating the intrinsics and extrinsics of the
surround-view cameras accurately, a top-down
surround-view can be generated from raw fisheye images.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "106",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2022:MGB,
author = "Quan Wang and Sheng Li and Xinpeng Zhang and Guorui
Feng",
title = "Multi-granularity Brushstrokes Network for Universal
Style Transfer",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "107:1--107:17",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506710",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506710",
abstract = "Neural style transfer has been developed in recent
years, where both performance and efficiency have been
greatly improved. However, most existing methods do not
transfer the brushstrokes information of style images
well. In this article, we address this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "107",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Saxena:2022:PSU,
author = "Nidhi Saxena and Balasubramanian Raman",
title = "Pansharpening Scheme Using Bi-dimensional Empirical
Mode Decomposition and Neural Network",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "108:1--108:22",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506709",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506709",
abstract = "The pansharpening is a combination of multispectral
(MS) and panchromatic (PAN) images that produce a
high-spatial-spectral-resolution MS images. In
multiresolution analysis-based pansharpening schemes,
some spatial and spectral distortions are found. It
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "108",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2022:EEH,
author = "Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun
Chen and Jingjing Zhang",
title = "An End-to-end Heterogeneous Restraint Network for
{RGB-D} Cross-modal Person Re-identification",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "109:1--109:22",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506708",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506708",
abstract = "The RGB-D cross-modal person re-identification (re-id)
task aims to identify the person of interest across the
RGB and depth image modes. The tremendous discrepancy
between these two modalities makes this task difficult
to tackle. Few researchers pay \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "109",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2022:SRP,
author = "Caixia Liu and Dehui Kong and Shaofan Wang and Jinghua
Li and Baocai Yin",
title = "A Spatial Relationship Preserving Adversarial Network
for {$3$D} Reconstruction from a Single Depth View",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "110:1--110:22",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506733",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506733",
abstract = "Recovering the geometry of an object from a single
depth image is an interesting yet challenging problem.
While previous learning based approaches have
demonstrated promising performance, they don't fully
explore spatial relationships of objects, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "110",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ren:2022:EES,
author = "Ruyong Ren and Shaozhang Niu and Hua Ren and Shubin
Zhang and Tengyue Han and Xiaohai Tong",
title = "{ESRNet}: Efficient Search and Recognition Network for
Image Manipulation Detection",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "111:1--111:23",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506853",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506853",
abstract = "With the widespread use of smartphones and the rise of
intelligent software, we can manipulate captured photos
anytime and anywhere, so the fake photos finally
obtained look ``Real.'' If these intelligent operation
methods are maliciously applied to our \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "111",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duan:2022:NMS,
author = "Mingxing Duan and Kenli Li and Jiayan Deng and Bin
Xiao and Qi Tian",
title = "A Novel Multi-Sample Generation Method for Adversarial
Attacks",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "112:1--112:21",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3506852",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3506852",
abstract = "Deep learning models are widely used in daily life,
which bring great convenience to our lives, but they
are vulnerable to attacks. How to build an attack
system with strong generalization ability to test the
robustness of deep learning systems is a hot \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "112",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2022:ATA,
author = "Yang Guo and Wei Gao and Siwei Ma and Ge Li",
title = "Accelerating Transform Algorithm Implementation for
Efficient Intra Coding of {8K UHD} Videos",
journal = j-TOMM,
volume = "18",
number = "4",
pages = "113:1--113:20",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.1145/3507970",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Mar 24 08:21:57 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3507970",
abstract = "Real-time ultra-high-definition (UHD) video
applications have attracted much attention, where the
encoder side urgently demands the high-throughput
two-dimensional (2D) transform hardware implementation
for the latest video coding standards. This article
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "113",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shao:2023:SIP,
author = "Xuan Shao and Ying Shen and Lin Zhang and Shengjie
Zhao and Dandan Zhu and Yicong Zhou",
title = "{SLAM} for Indoor Parking: a Comprehensive Benchmark
Dataset and a Tightly Coupled Semantic Framework",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3510856",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3510856",
abstract = "For the task of autonomous indoor parking, various
Visual-Inertial Simultaneous Localization And Mapping
(SLAM) systems are expected to achieve \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sharma:2023:WBA,
author = "Prasen Sharma and Ira Bisht and Arijit Sur",
title = "Wavelength-based Attributed Deep Neural Network for
Underwater Image Restoration",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3511021",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3511021",
abstract = "Background: Underwater images, in general, suffer from
low contrast and high color distortions due to the
non-uniform attenuation of the light as it \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:SCE,
author = "Jie Li and Ling Han and Chong Zhang and Qiyue Li and
Zhi Liu",
title = "Spherical Convolution Empowered Viewport Prediction in
360 Video Multicast with Limited {FoV} Feedback",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3511603",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3511603",
abstract = "Field of view (FoV) prediction is critical in
360-degree video multicast, which is a key component of
the emerging virtual reality and augmented reality
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Le:2023:ASN,
author = "Thi-Ngoc-Hanh Le and Chih-Kuo Yeh and Ying-Chi Lin and
Tong-Yee Lee",
title = "Animating Still Natural Images Using Warping",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3511894",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3511894",
abstract = "From a single still image, a looping video could be
generated by imparting subtle motion to objects in the
image. The results are a hybrid of photography and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiong:2023:RRD,
author = "Lizhi Xiong and Xiao Han and Ching-Nung Yang and
Zhihua Xia",
title = "{RDH-DES}: Reversible Data Hiding over Distributed
Encrypted-Image Servers Based on Secret Sharing",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3512797",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3512797",
abstract = "Reversible Data Hiding in Encrypted Image (RDHEI)
schemes may redistribute the data hiding procedure to
other parties and can preserve privacy of the cover
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhen:2023:TAO,
author = "Peining Zhen and Shuqi Wang and Suming Zhang and
Xiaotao Yan and Wei Wang and Zhigang Ji and Hai-Bao
Chen",
title = "Towards Accurate Oriented Object Detection in Aerial
Images with Adaptive Multi-level Feature Fusion",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3513133",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3513133",
abstract = "Detecting objects in aerial images is a long-standing
and challenging problem since the objects in aerial
images vary dramatically in size and orientation. Most
existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2023:DSD,
author = "Yue Song and Hao Tang and Nicu Sebe and Wei Wang",
title = "Disentangle Saliency Detection into Cascaded Detail
Modeling and Body Filling",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3513134",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3513134",
abstract = "Salient object detection has been long studied to
identify the most visually attractive objects in
images/videos. Recently, a growing amount of approaches
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:BSG,
author = "Yong Zhang and Yingwei Pan and Ting Yao and Rui Huang
and Tao Mei and Chang-Wen Chen",
title = "Boosting Scene Graph Generation with Visual Relation
Saliency",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514041",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3514041",
abstract = "The scene graph is a symbolic data structure that
comprehensively describes the objects and visual
relations in a visual scene, while ignoring the
inherent perceptual \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:BVL,
author = "Jingwen Chen and Jianjie Luo and Yingwei Pan and Yehao
Li and Ting Yao and Hongyang Chao and Tao Mei",
title = "Boosting Vision-and-Language Navigation with Direction
Guiding and Backtracing",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "9:1--9:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3526024",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3526024",
abstract = "Vision-and-Language Navigation (VLN) has been an
emerging and fast-developing research topic, where an
embodied agent is required to navigate in a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rao:2023:DPZ,
author = "Yunbo Rao and Ziqiang Yang and Shaoning Zeng and
Qifeng Wang and Jiansu Pu",
title = "Dual Projective Zero-Shot Learning Using Text
Descriptions",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "10:1--10:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514247",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3514247",
abstract = "Zero-shot learning (ZSL) aims to recognize image
instances of unseen classes solely based on the
semantic descriptions of the unseen classes. In this
field, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2023:MVS,
author = "Hang Yu and Chilam Cheang and Yanwei Fu and Xiangyang
Xue",
title = "Multi-view Shape Generation for a {$3$D} Human-like
Body",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "11:1--11:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514248",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3514248",
abstract = "Three-dimensional (3D) human-like body reconstruction
via a single RGB image has attracted significant
research attention recently. Most of the existing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:WST,
author = "Weidong Chen and Guorong Li and Xinfeng Zhang and
Shuhui Wang and Liang Li and Qingming Huang",
title = "Weakly Supervised Text-based Actor-Action Video
Segmentation by Clip-level Multi-instance Learning",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "12:1--12:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514250",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3514250",
abstract = "In real-world scenarios, it is common that a video
contains multiple actors and their activities.
Selectively localizing one specific actor and its
action spatially and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shen:2023:QFC,
author = "Feihong Shen and Jun Liu",
title = "Quantum {Fourier} Convolutional Network",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "13:1--13:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3514249",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3514249",
abstract = "The neural network and quantum computing are both
significant and appealing fields, with their
interactive disciplines promising for large-scale
computing tasks \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:BBT,
author = "Xiaotian Wu and Peng Yao",
title = "{Boolean}-based Two-in-One Secret Image Sharing by
Adaptive Pixel Grouping",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "14:1--14:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3517140",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3517140",
abstract = "The two-in-one secret image sharing (TiOSIS) technique
is a hybrid scheme that protects a secret image by
combining visual cryptography (VCS) and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yadav:2023:DML,
author = "Ashima Yadav and Dinesh Kumar Vishwakarma",
title = "A Deep Multi-level Attentive Network for Multimodal
Sentiment Analysis",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "15:1--15:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3517139",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3517139",
abstract = "Multimodal sentiment analysis has attracted increasing
attention with broad application prospects. Most of the
existing methods have focused on a single modality,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2023:NGA,
author = "Honghao Gao and Baobin Dai and Huaikou Miao and
Xiaoxian Yang and Ramon J. Duran Barroso and Hussain
Walayat",
title = "A Novel {GAPG} Approach to Automatic Property
Generation for Formal Verification: The {GAN}
Perspective",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "16:1--16:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3517154",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3517154",
abstract = "Formal methods have been widely used to support
software testing to guarantee correctness and
reliability. For example, model checking technology
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:LSS,
author = "Pengyi Zhang and Huanzhang Dou and Wenhu Zhang and
Yuhan Zhao and Zequn Qin and Dongping Hu and Yi Fang
and Xi Li",
title = "A Large-Scale Synthetic Gait Dataset Towards
in-the-Wild Simulation and Comparison Study",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "17:1--17:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3517199",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3517199",
abstract = "Gait recognition has a rapid development in recent
years. However, current gait recognition focuses
primarily on ideal laboratory scenes, leaving the gait
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2023:DAB,
author = "Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and
Haifeng Hu",
title = "Double Attention Based on Graph Attention Network for
Image Multi-Label Classification",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "18:1--18:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3519030",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3519030",
abstract = "The task of image multi-label classification is to
accurately recognize multiple objects in an input
image. Most of the recent works need to leverage the
label \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:ANM,
author = "Xianlin Zhang and Mengling Shen and Xueming Li and
Xiaojie Wang",
title = "{AABLSTM}: a Novel Multi-task Based {CNN-RNN} Deep
Model for Fashion Analysis",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "19:1--19:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3519029",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3519029",
abstract = "With the rapid growth of online commerce and
fashion-related applications, visual clothing analysis
and recognition has become a hotspot in computer
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:GML,
author = "Deyin Liu and Lin (Yuanbo) Wu and Richang Hong and
Zongyuan Ge and Jialie Shen and Farid Boussaid and
Mohammed Bennamoun",
title = "Generative Metric Learning for Adversarially Robust
Open-world Person Re-Identification",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "20:1--20:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3522714",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3522714",
abstract = "The vulnerability of re-identification (re-ID) models
under adversarial attacks is of significant concern as
criminals may use adversarial perturbations to evade
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:BHI,
author = "Shuo Wang and Huixia Ben and Yanbin Hao and Xiangnan
He and Meng Wang",
title = "Boosting Hyperspectral Image Classification with Dual
Hierarchical Learning",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "21:1--21:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3522713",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3522713",
abstract = "Hyperspectral image (HSI) classification aims at
predicting the pixel-wise labels in an image, where
there are only a few labeled pixel samples (hard
labels) for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:DUD,
author = "Dayan Wu and Qi Dai and Bo Li and Weiping Wang",
title = "Deep Uncoupled Discrete Hashing via Similarity Matrix
Decomposition",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "22:1--22:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524021",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524021",
abstract = "Hashing has been drawing increasing attention in the
task of large-scale image retrieval owing to its
storage and computation efficiency, especially
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cheung:2023:SNA,
author = "Ming Cheung and Weiwei Sun and James She and Jiantao
Zhou",
title = "Social Network Analytic-Based Online Counterfeit
Seller Detection using User Shared Images",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "23:1--23:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524135",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524135",
abstract = "Selling counterfeit online has become a serious
problem, especially with the advancement of social
media and mobile technology. Instead of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feihong:2023:THQ,
author = "Feihong Lu and Hang Chen and Kang Li and Qiliang Deng
and Jian Zhao and Kaipeng Zhang and Hong Han",
title = "Toward High-quality Face-Mask Occluded Restoration",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "24:1--24:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524137",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524137",
abstract = "Face-mask occluded restoration aims at restoring the
masked region of a human face, which has attracted
increasing attention in the context of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:CSL,
author = "Yajing Liu and Zhiwei Xiong and Ya Li and Yuning Lu
and Xinmei Tian and Zheng-Jun Zha",
title = "Category-Stitch Learning for Union Domain
Generalization",
journal = j-TOMM,
volume = "19",
number = "1",
pages = "25:1--25:??",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524136",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524136",
abstract = "Domain generalization aims at generalizing the network
trained on multiple domains to unknown but related
domains. Under the assumption that different domains
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ferrari:2023:CRR,
author = "Claudio Ferrari and Federico Becattini and Leonardo
Galteri and Alberto {Del Bimbo}",
title = "{(Compress and Restore)$^N$}: a Robust Defense Against
Adversarial Attacks on Image Classification",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "26:1--26:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524619",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524619",
abstract = "Modern image classification approaches often rely on
deep neural networks, which have shown pronounced
weakness to adversarial examples: images \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2023:SSC,
author = "Yaguang Song and Xiaoshan Yang and Changsheng Xu",
title = "Self-supervised Calorie-aware Heterogeneous Graph
Networks for Food Recommendation",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "27:1--27:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524618",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524618",
abstract = "With the rapid development of online recipe sharing
platforms, food recommendation is emerging as an
important application. Although \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xue:2023:LEE,
author = "Feng Xue and Tian Yang and Kang Liu and Zikun Hong and
Mingwei Cao and Dan Guo and Richang Hong",
title = "{LCSNet}: End-to-end Lipreading with Channel-aware
Feature Selection",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "28:1--28:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524620",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524620",
abstract = "Lipreading is a task of decoding the movement of the
speaker's lip region into text. In recent years,
lipreading methods based on deep neural network
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fu:2023:LPA,
author = "Zilong Fu and Hongtao Xie and Shancheng Fang and Yuxin
Wang and Mengting Xing and Yongdong Zhang",
title = "Learning Pixel Affinity Pyramid for Arbitrary-Shaped
Text Detection",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "29:1--29:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524617",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524617",
abstract = "Arbitrary-shaped text detection in natural images is a
challenging task due to the complexity of the
background and the diversity of text properties. The
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{CardiaNeto:2023:LSA,
author = "Jo{\~a}o Baptista {Cardia Neto} and Claudio Ferrari
and Aparecido {Nilceu Marana} and Stefano Berretti and
Alberto {Del Bimbo}",
title = "Learning Streamed Attention Network from Descriptor
Images for Cross-Resolution {$3$D} Face Recognition",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "30:1--30:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527158",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3527158",
abstract = "In this article, we propose a hybrid framework for
cross-resolution 3D face recognition which utilizes a
Streamed Attention Network (SAN) that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2023:TMM,
author = "Xin Huang",
title = "On Teaching Mode of {MTI} Translation Workshop Based
on {IPT} Corpus for {Tibetan} Areas of {China}",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "31:1--31:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527173",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3527173",
abstract = "With the technological turn of applied research in
translation, increasing attention has been paid to the
teaching of translation technology. This article
addresses \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:MMM,
author = "Liming Xu and Xianhua Zeng and Weisheng Li and Bochuan
Zheng",
title = "{MFGAN}: Multi-modal Feature-fusion for {CT} Metal
Artifact Reduction Using {GANs}",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "32:1--32:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3528172",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3528172",
abstract = "Due to the existence of metallic implants in certain
patients, the Computed Tomography (CT) images from
these patients are often corrupted by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2023:DIP,
author = "Yuzhang Hu and Wenhan Yang and Jiaying Liu and
Zongming Guo",
title = "Deep Inter Prediction with Error-Corrected
Auto-Regressive Network for Video Coding",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "33:1--33:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3528173",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3528173",
abstract = "Modern codecs remove temporal redundancy of a video
via inter prediction, i.e., searching previously coded
frames for similar blocks and storing motion \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:IIT,
author = "Yue Li and Li Zhang and Kai Zhang",
title = "{iDAM}: Iteratively Trained Deep In-loop Filter with
Adaptive Model Selection",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "34:1--34:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3529107",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3529107",
abstract = "As a rapid development of neural-network-based machine
learning algorithms, deep learning methods are being
tentatively used in a much wider range than \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jaiswal:2023:CNN,
author = "Rahul Kumar Jaiswal and Rajesh Kumar Dubey",
title = "{CAQoE}: a Novel No-Reference Context-aware Speech
Quality Prediction Metric",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "35:1--35:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3529394",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3529394",
abstract = "The quality of speech degrades while communicating
over Voice over Internet Protocol applications, for
example, Google Meet, Microsoft Skype, and Apple
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiang:2023:BPP,
author = "Tao Xiang and Honghong Zeng and Biwen Chen and
Shangwei Guo",
title = "{BMIF}: Privacy-preserving Blockchain-based Medical
Image Fusion",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "36:1--36:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531016",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3531016",
abstract = "Medical image fusion generates a fused image
containing multiple features extracted from different
source images, and it is of great help in clinical
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:DDB,
author = "Xiaoke Zhu and Changlong Li and Xiaopan Chen and Xinyu
Zhang and Xiao-Yuan Jing",
title = "Distance and Direction Based Deep Discriminant Metric
Learning for Kinship Verification",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "37:1--37:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531014",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3531014",
abstract = "Image-based kinship verification is an important task
in computer vision and has many applications in
practice, such as missing children search and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhuang:2023:OPF,
author = "Weiming Zhuang and Xin Gan and Yonggang Wen and Shuai
Zhang",
title = "Optimizing Performance of Federated Person
Re-identification: Benchmarking and Analysis",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "38:1--38:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531013",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3531013",
abstract = "Increasingly stringent data privacy regulations limit
the development of person re-identification (ReID)
because person ReID training requires \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{DeDivitiis:2023:DFF,
author = "Lavinia {De Divitiis} and Federico Becattini and
Claudio Baecchi and Alberto {Del Bimbo}",
title = "Disentangling Features for Fashion Recommendation",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "39:1--39:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531017",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3531017",
abstract = "Online stores have become fundamental for the fashion
industry, revolving around recommendation systems to
suggest appropriate items to customers. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chan:2023:UFH,
author = "Ka-Hou Chan and Sio-Kei Im",
title = "Using Four Hypothesis Probability Estimators for
{CABAC} in Versatile Video Coding",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "40:1--40:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3531015",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3531015",
abstract = "This article introduces the key technologies involved
in four hypothetical probability estimators for
Context-based Adaptive Binary Arithmetic Coding
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2023:ATD,
author = "Mengqi Yuan and Bing-Kun Bao and Zhiyi Tan and
Changsheng Xu",
title = "Adaptive Text Denoising Network for Image Caption
Editing",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "41:1--41:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532627",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532627",
abstract = "Image caption editing, which aims at editing the
inaccurate descriptions of the images, is an
interdisciplinary task of computer vision and natural
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:IQA,
author = "Xiaoyu Zhang and Wei Gao and Ge Li and Qiuping Jiang
and Runmin Cong",
title = "Image Quality Assessment-driven Reinforcement Learning
for Mixed Distorted Image Restoration",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "42:1--42:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532625",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532625",
abstract = "Due to the diversity of the degradation process that
is difficult to model, the recovery of mixed distorted
images is still a challenging problem. The deep
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bai:2023:DDI,
author = "Chongyang Bai and Maksim Bolonkin and Viney Regunath
and V. S. Subrahmanian",
title = "{DIPS}: a Dyadic Impression Prediction System for
Group Interaction Videos",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "43:1--43:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532865",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532865",
abstract = "We consider the problem of predicting the impression
that one subject has of another in a video clip showing
a group of interacting people. Our novel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:SHL,
author = "Yuqing Liu and Xinfeng Zhang and Shanshe Wang and
Siwei Ma and Wen Gao",
title = "Sequential Hierarchical Learning with Distribution
Transformation for Image Super-Resolution",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "44:1--44:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532864",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532864",
abstract = "Multi-scale design has been considered in recent image
super-resolution (SR) works to explore the hierarchical
feature information. Existing multi-scale \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:JJD,
author = "Haidong Wang and Xuan He and Zhiyong Li and Jin Yuan
and Shutao Li",
title = "{JDAN}: Joint Detection and Association Network for
Real-Time Online Multi-Object Tracking",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "45:1--45:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3533253",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3533253",
abstract = "In the last few years, enormous strides have been made
for object detection and data association, which are
vital subtasks for one-stage online multi-object
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiao:2023:NRD,
author = "Mengyao Xiao and Xiaolong Li and Yao Zhao and Bin Ma
and Guodong Guo",
title = "A Novel Reversible Data Hiding Scheme Based on
Pixel-Residual Histogram",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "46:1--46:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3534565",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3534565",
abstract = "Prediction-error expansion (PEE) is the most popular
reversible data hiding (RDH) technique due to its
efficient capacity-distortion tradeoff. With the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:MGF,
author = "Jiazhi Liu and Feng Liu",
title = "Modified {$2$D}-Ghost-Free Stereoscopic Display with
Depth-of-Field Effects",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "47:1--47:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3534964",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3534964",
abstract = "Backward-compatible stereoscopic display, a novel
display technique that can simultaneously present
satisfying 3D effects to viewers with stereo \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:RAC,
author = "Jingwen Chen and Yingwei Pan and Yehao Li and Ting Yao
and Hongyang Chao and Tao Mei",
title = "Retrieval Augmented Convolutional Encoder-decoder
Networks for Video Captioning",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "48:1--48:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3539225",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3539225",
abstract = "Video captioning has been an emerging research topic
in computer vision, which aims to generate a natural
sentence to correctly reflect the visual \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:CSA,
author = "Guanyu Zhu and Yong Zhou and Rui Yao and Hancheng Zhu
and Jiaqi Zhao",
title = "Cyclic Self-attention for Point Cloud Recognition",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "49:1--49:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3538648",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3538648",
abstract = "Point clouds provide a flexible geometric
representation for computer vision research. However,
the harsh demands for the number of input points and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:EMF,
author = "Dinghao Yang and Wei Gao and Ge Li and Hui Yuan and
Junhui Hou and Sam Kwong",
title = "Exploiting Manifold Feature Representation for
Efficient Classification of {$3$D} Point Clouds",
journal = j-TOMM,
volume = "19",
number = "1s",
pages = "50:1--50:??",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3539611",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:33 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3539611",
abstract = "In this paper, we propose an efficient point cloud
classification method via manifold learning based
feature representation. Different from conventional
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lan:2023:STS,
author = "Xiaohan Lan and Yitian Yuan and Xin Wang and Zhi Wang
and Wenwu Zhu",
title = "A Survey on Temporal Sentence Grounding in Videos",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "51:1--51:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3532626",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3532626",
abstract = "Temporal sentence grounding in videos (TSGV), which
aims at localizing one target segment from an untrimmed
video with respect to a given sentence \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qiao:2023:HPI,
author = "Yu Qiao and Yuhao Liu and Ziqi Wei and Yuxin Wang and
Qiang Cai and Guofeng Zhang and Xin Yang",
title = "Hierarchical and Progressive Image Matting",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "52:1--52:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3540201",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3540201",
abstract = "Most matting research resorts to advanced semantics to
achieve high-quality alpha mattes, and a direct
low-level features combination is usually explored to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2023:LDS,
author = "Fei Peng and Wenyan Jiang and Min Long",
title = "A Low Distortion and Steganalysis-resistant Reversible
Data Hiding for {$2$D} Engineering Graphics",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "53:1--53:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3539661",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3539661",
abstract = "To reduce the distortion resulting from the large
number of crossing quantization cells and resist
steganalysis, a reversible data hiding scheme for 2D
engineering \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mai:2023:MGU,
author = "Sijie Mai and Songlong Xing and Jiaxuan He and Ying
Zeng and Haifeng Hu",
title = "Multimodal Graph for Unaligned Multimodal Sequence
Analysis via Graph Convolution and Graph Pooling",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "54:1--54:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3542927",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3542927",
abstract = "Multimodal sequence analysis aims to draw inferences
from visual, language, and acoustic sequences. A
majority of existing works focus on the aligned fusion
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2023:PLN,
author = "Qi Zheng and Jianfeng Dong and Xiaoye Qu and Xun Yang
and Yabing Wang and Pan Zhou and Baolong Liu and Xun
Wang",
title = "Progressive Localization Networks for Language-Based
Moment Localization",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "55:1--55:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3543857",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3543857",
abstract = "This article targets the task of language-based video
moment localization. The language-based setting of this
task allows for an open set of target activities,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:LCE,
author = "Yue Zhang and Fanghui Zhang and Yi Jin and Yigang Cen
and Viacheslav Voronin and Shaohua Wan",
title = "Local Correlation Ensemble with {GCN} Based on
Attention Features for Cross-domain Person Re-{ID}",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "56:1--56:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3542820",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3542820",
abstract = "Person re-identification (Re-ID) has achieved great
success in single-domain. However, it remains a
challenging task to adapt a Re-ID model trained on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chakareski:2023:MWF,
author = "Jacob Chakareski and Mahmudur Khan and Tanguy
Ropitault and Steve Blandino",
title = "Millimeter Wave and Free-space-optics for Future
Dual-connectivity {6DOF} Mobile Multi-user {VR}
Streaming",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "57:1--57:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3544494",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3544494",
abstract = "Dual-connectivity streaming is a key enabler of
next-generation six Degrees Of Freedom (6DOF) Virtual
Reality (VR) scene immersion. Indeed, using
conventional sub-6 \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2023:IPG,
author = "Yun-Shao Lin and Yi-Ching Liu and Chi-Chun Lee",
title = "An Interaction-process-guided Framework for
Small-group Performance Prediction",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "58:1--58:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3558768",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3558768",
abstract = "A small group is a fundamental interaction unit for
achieving a shared goal. Group performance can be
automatically predicted using computational methods to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2023:EEA,
author = "Na Zheng and Xuemeng Song and Tianyu Su and Weifeng
Liu and Yan Yan and Liqiang Nie",
title = "Egocentric Early Action Prediction via Adversarial
Knowledge Distillation",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "59:1--59:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3544493",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3544493",
abstract = "Egocentric early action prediction aims to recognize
actions from the first-person view by only observing a
partial video segment, which is challenging due to the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:ISR,
author = "Li Wang and Ke Li and Jingjing Tang and Yuying Liang",
title = "Image Super-Resolution via Lightweight
Attention-Directed Feature Aggregation Network",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "60:1--60:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546076",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3546076",
abstract = "The advent of convolutional neural networks (CNNs) has
brought substantial progress in image super-resolution
(SR) reconstruction. However, most SR methods pursue
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lin:2023:FAC,
author = "Jiaying Lin and Xin Tan and Ke Xu and Lizhuang Ma and
Rynson W. H. Lau",
title = "Frequency-aware Camouflaged Object Detection",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "61:1--61:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3545609",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3545609",
abstract = "Camouflaged object detection (COD) is important as it
has various potential applications. Unlike salient
object detection (SOD), which tries to identify
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2023:HNR,
author = "Shuang Liang and Anjie Zhu and Jiasheng Zhang and Jie
Shao",
title = "Hyper-node Relational Graph Attention Network for
Multi-modal Knowledge Graph Completion",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "62:1--62:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3545573",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3545573",
abstract = "Knowledge graphs often suffer from incompleteness, and
knowledge graph completion (KGC) aims at inferring the
missing triplets through \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2023:LVT,
author = "Yaya Shi and Haiyang Xu and Chunfeng Yuan and Bing Li
and Weiming Hu and Zheng-Jun Zha",
title = "Learning Video-Text Aligned Representations for Video
Captioning",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "63:1--63:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3546828",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3546828",
abstract = "Video captioning requires that the model has the
abilities of video understanding, video-text alignment,
and text generation. Due to the semantic gap between
vision \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:NRQ,
author = "Yang Yang and Yingqiu Ding and Ming Cheng and Weiming
Zhang",
title = "No-reference Quality Assessment for Contrast-distorted
Images Based on Gray and Color-gray-difference Space",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "64:1--64:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3555355",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3555355",
abstract = "No-reference image quality assessment is a basic and
challenging problem in the field of image processing.
Among them, contrast distortion has a great impact on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:REC,
author = "Jia Wang and Jingcheng Ke and Hong-Han Shuai and
Yung-Hui Li and Wen-Huang Cheng",
title = "Referring Expression Comprehension Via Enhanced
Cross-modal Graph Attention Networks",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "65:1--65:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3548688",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3548688",
abstract = "Referring expression comprehension aims to localize a
specific object in an image according to a given
language description. It is still challenging to
comprehend \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:BLL,
author = "Dengyong Zhang and Pu Huang and Xiangling Ding and
Feng Li and Wenjie Zhu and Yun Song and Gaobo Yang",
title = "{L$^2$BEC$^2$}: Local Lightweight Bidirectional
Encoding and Channel Attention Cascade for Video Frame
Interpolation",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "66:1--66:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3547660",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3547660",
abstract = "Video frame interpolation (VFI) is of great importance
for many video applications, yet it is still
challenging even in the era of deep learning. Some
existing VFI models \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:PBI,
author = "Yushu Zhang and Qing Tan and Shuren Qi and Mingfu
Xue",
title = "{PRNU}-based Image Forgery Localization with Deep
Multi-scale Fusion",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "67:1--67:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3548689",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3548689",
abstract = "Photo-response non-uniformity (PRNU), as a class of
device fingerprint, plays a key role in the forgery
detection/localization for visual media. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dong:2023:SEG,
author = "Shanshan Dong and Tianzi Niu and Xin Luo and Wu Liu
and Xinshun Xu",
title = "Semantic Embedding Guided Attention with Explicit
Visual Feature Fusion for Video Captioning",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "68:1--68:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550276",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550276",
abstract = "Video captioning, which bridges vision and language,
is a fundamental yet challenging task in computer
vision. To generate accurate and comprehensive
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:SBS,
author = "Shunxin Xu and Ke Sun and Dong Liu and Zhiwei Xiong
and Zheng-Jun Zha",
title = "Synergy between Semantic Segmentation and Image
Denoising via Alternate Boosting",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "69:1--69:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3548459",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3548459",
abstract = "The capability of image semantic segmentation may be
deteriorated due to the noisy input image, where image
denoising prior to segmentation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2023:SSI,
author = "Dan Song and Chu-Meng Zhang and Xiao-Qian Zhao and
Teng Wang and Wei-Zhi Nie and Xuan-Ya Li and An-An
Liu",
title = "Self-supervised Image-based {$3$D} Model Retrieval",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "70:1--70:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3548690",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3548690",
abstract = "Image-based 3D model retrieval aims at organizing
unlabeled 3D models according to the relevance to the
labeled 2D images. With easy accessibility of 2D images
and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nousias:2023:DSM,
author = "Stavros Nousias and Gerasimos Arvanitis and Aris Lalos
and Konstantinos Moustakas",
title = "Deep Saliency Mapping for {$3$D} Meshes and
Applications",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "71:1--71:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550073",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550073",
abstract = "Nowadays, three-dimensional (3D) meshes are widely
used in various applications in different areas (e.g.,
industry, education, entertainment and safety). The 3D
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:TNR,
author = "Yun Liu and Xiaohua Yin and Zuliang Wan and Guanghui
Yue and Zhi Zheng",
title = "Toward A No-reference Omnidirectional Image Quality
Evaluation by Using Multi-perceptual Features",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "72:1--72:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3549544",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3549544",
abstract = "Compared to ordinary images, omnidirectional image
(OI) usually has a broader view and a higher
resolution, and image quality assessment (IQA)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:RIE,
author = "Hua Wu and Xin Li and Gang Wang and Guang Cheng and
Xiaoyan Hu",
title = "Resolution Identification of Encrypted Video Streaming
Based on {HTTP/2} Features",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "73:1--73:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3551891",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3551891",
abstract = "With the inevitable dominance of video traffic on the
Internet, Internet service providers (ISP) are striving
to deliver video streaming with high quality. Video
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qin:2023:QEC,
author = "Qipu Qin and Cheolkon Jung",
title = "Quality Enhancement of Compressed $ 360$-Degree Videos
Using Viewport-based Deep Neural Networks",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "74:1--74:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3551641",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3551641",
abstract = "360-degree video provides omnidirectional views by a
bounding sphere, thus also called omnidirectional
video. For omnidirectional video, people can only see
specific \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2023:AIS,
author = "Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and
Haifeng Hu",
title = "Aligning Image Semantics and Label Concepts for Image
Multi-Label Classification",
journal = j-TOMM,
volume = "19",
number = "2",
pages = "75:1--75:??",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550278",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:34 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550278",
abstract = "Image multi-label classification task is mainly to
correctly predict multiple object categories in the
images. To capture the correlation between labels,
graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jabeen:2023:RMA,
author = "Summaira Jabeen and Xi Li and Muhammad Shoib Amin and
Omar Bourahla and Songyuan Li and Abdul Jabbar",
title = "A Review on Methods and Applications in Multimodal
Deep Learning",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "76:1--76:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3545572",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3545572",
abstract = "Deep Learning has implemented a wide range of
applications and has become increasingly popular in
recent years. The goal of multimodal deep learning
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2023:IRG,
author = "Sophie C. C. Sun and Yongkang Zhao and Fang-Wei Fu and
Yawei Ren",
title = "Improved Random Grid-based Cheating Prevention Visual
Cryptography Using Latin Square",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "77:1--77:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550275",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550275",
abstract = "Visual cryptography scheme is a method of encrypting
secret image into n noiselike shares. The secret image
can be reconstructed by stacking \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dong:2023:VFI,
author = "Jiong Dong and Kaoru Ota and Mianxiong Dong",
title = "Video Frame Interpolation: a Comprehensive Survey",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "78:1--78:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3556544",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3556544",
abstract = "Video Frame Interpolation (VFI) is a fascinating and
challenging problem in the computer vision (CV) field,
aiming to generate non-existing frames \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cao:2023:DKP,
author = "Gaofeng Cao and Fei Zhou and Kanglin Liu and Anjie
Wang and Leidong Fan",
title = "A Decoupled Kernel Prediction Network Guided by Soft
Mask for Single Image {HDR} Reconstruction",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "79:1--79:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550277",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550277",
abstract = "Recent works on single image high dynamic range (HDR)
reconstruction fail to hallucinate plausible textures,
resulting in information missing and artifacts
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:PCQ,
author = "Yipeng Liu and Qi Yang and Yiling Xu and Le Yang",
title = "Point Cloud Quality Assessment: Dataset Construction
and Learning-based No-reference Metric",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "80:1--80:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550274",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550274",
abstract = "Full-reference (FR) point cloud quality assessment
(PCQA) has achieved impressive progress in recent
years. However, in many cases, obtaining the reference
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:PAC,
author = "Cheng Xu and Zejun Chen and Jiajie Mai and Xuemiao Xu
and Shengfeng He",
title = "Pose- and Attribute-consistent Person Image
Synthesis",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "81:1--81:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3554739",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3554739",
abstract = "Person Image Synthesis aims at transferring the
appearance of the source person image into a target
pose. Existing methods cannot handle large pose
variations and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Park:2023:SCQ,
author = "Jae Hyun Park and Sanghoon Kim and Joo Chan Lee and
Jong Hwan Ko",
title = "Scalable Color Quantization for Task-centric Image
Compression",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "82:1--82:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3551389",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3551389",
abstract = "Conventional image compression techniques targeted for
the perceptual quality are not generally optimized for
classification tasks using deep neural networks
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Puig:2023:FFP,
author = "Joan Manuel Marqu{\`e}s Puig and Helena Rif{\`a}-Pous
and Samia Oukemeni",
title = "From False-Free to Privacy-Oriented Communitarian
Microblogging Social Networks",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "83:1--83:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3555354",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3555354",
abstract = "Online Social Networks (OSNs) have gained enormous
popularity in recent years. They provide a dynamic
platform for sharing content (text messages or \ldots{}
) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2023:QGP,
author = "Yiming Tang and Yi Yu",
title = "Query-Guided Prototype Learning with Decoder Alignment
and Dynamic Fusion in Few-Shot Segmentation",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "84:1--84:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3555314",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3555314",
abstract = "Few-shot segmentation aims to segment objects
belonging to a specific class under the guidance of a
few annotated examples. Most existing approaches follow
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:MCM,
author = "Zhiming Liu and Kai Niu and Zhiqiang He",
title = "{ML-CookGAN}: Multi-Label Generative Adversarial
Network for Food Image Generation",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "85:1--85:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3554738",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3554738",
abstract = "Generating food images from recipe and ingredient
information can be applied to many tasks such as food
recommendation, recipe development, and health
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Alwaely:2023:GGB,
author = "Basheer Alwaely and Charith Abhayaratne",
title = "{GHOSM}: Graph-based Hybrid Outline and Skeleton
Modelling for Shape Recognition",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "86:1--86:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3554922",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3554922",
abstract = "An efficient and accurate shape detection model plays
a major role in many research areas. With the emergence
of more complex shapes in real-life applications,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jonna:2023:DDK,
author = "Sankaraganesh Jonna and Moushumi Medhi and Rajiv
Ranjan Sahay",
title = "{Distill-DBDGAN}: Knowledge Distillation and
Adversarial Learning Framework for Defocus Blur
Detection",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "87:1--87:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3557897",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3557897",
abstract = "Defocus blur detection (DBD) aims to segment the
blurred regions from a given image affected by defocus
blur. It is a crucial pre-processing step for various
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ding:2023:BRD,
author = "Xuewei Ding and Yingwei Pan and Yehao Li and Ting Yao
and Dan Zeng and Tao Mei",
title = "Boosting Relationship Detection in Images with
Multi-Granular Self-Supervised Learning",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "88:1--88:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3556978",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3556978",
abstract = "Visual and spatial relationship detection in images
has been a fast-developing research topic in the
multimedia field, which learns to recognize the
semantic/spatial \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chu:2023:RLT,
author = "Binfei Chu and Yiting Lin and Bineng Zhong and Zhenjun
Tang and Xianxian Li and Jing Wang",
title = "Robust Long-Term Tracking via Localizing Occluders",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "89:1--89:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3557896",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3557896",
abstract = "Occlusion is known as one of the most challenging
factors in long-term tracking because of its
unpredictable shape. Existing works devoted into the
design of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:CPG,
author = "Huisi Wu and Zhaoze Wang and Zhuoying Li and Zhenkun
Wen and Jing Qin",
title = "Context Prior Guided Semantic Modeling for Biomedical
Image Segmentation",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "90:1--90:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3558520",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3558520",
abstract = "Most state-of-the-art deep networks proposed for
biomedical image segmentation are developed based on
U-Net. While remarkable success has been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:OBM,
author = "Jun Wu and Tianliang Zhu and Jiahui Zhu and Tianyi Li
and Chunzhi Wang",
title = "A Optimized {BERT} for Multimodal Sentiment Analysis",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "91:1--91:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566126",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3566126",
abstract = "Sentiment analysis of one modality (e.g., text or
image) has been broadly studied. However, not much
attention has been paid to the sentiment analysis of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:PTM,
author = "Yongzong Xu and Zhijing Yang and Tianshui Chen and Kai
Li and Chunmei Qing",
title = "Progressive Transformer Machine for Natural Character
Reenactment",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "92:1--92:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3559107",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3559107",
abstract = "Character reenactment aims to control a target
person's full-head movement by a driving monocular
sequence that is made up of the driving character
video. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tan:2023:IVV,
author = "Chong Hong Tan and Koksheik Wong and Vishnu Monn
Baskaran and Kiki Adhinugraha and David Taniar",
title = "Is it Violin or {Viola}? {Classifying} the
Instruments' Music Pieces using Descriptive
Statistics",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "93:1--93:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563218",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3563218",
abstract = "Classifying music pieces based on their instrument
sounds is pivotal for analysis and application
purposes. Given its importance, techniques using
machine learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2023:ESM,
author = "KN Singh and OP Singh and Amit Kumar Singh and Amrit
Kumar Agrawal",
title = "{EiMOL}: a Secure Medical Image Encryption Algorithm
based on Optimization and the {Lorenz} System",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "94:1--94:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561513",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561513",
abstract = "Nowadays, the demand for digital images from different
intelligent devices and sensors has dramatically
increased in smart healthcare. Due to advanced
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qiao:2023:UUE,
author = "Ziteng Qiao and Dianxi Shi and Xiaodong Yi and Yanyan
Shi and Yuhui Zhang and Yangyang Liu",
title = "{UEFPN}: Unified and Enhanced Feature Pyramid Networks
for Small Object Detection",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "95:1--95:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561824",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561824",
abstract = "Object detection models based on feature pyramid
networks have made significant progress in general
object detection. However, small object detection is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:DLB,
author = "Linwei Zhu and Yun Zhang and Na Li and Gangyi Jiang
and Sam Kwong",
title = "Deep Learning-Based Intra Mode Derivation for
Versatile Video Coding",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "96:1--96:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3563699",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3563699",
abstract = "In intra coding, Rate Distortion Optimization (RDO) is
performed to achieve the optimal intra mode from a
pre-defined candidate list. The optimal intra mode is
also \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2023:LEI,
author = "Donghuo Zeng and Jianming Wu and Gen Hattori and Rong
Xu and Yi Yu",
title = "Learning Explicit and Implicit Dual Common Subspaces
for Audio-visual Cross-modal Retrieval",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "97:1--97:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3564608",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3564608",
abstract = "Audio-visual tracks in video contain rich semantic
information with potential in many applications and
research. Since the audio-visual data have inconsistent
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2023:RTI,
author = "Qiqi Gao and Jie Li and Tiejun Zhao and Yadong Wang",
title = "Real-time Image Enhancement with Attention
Aggregation",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "98:1--98:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3564607",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3564607",
abstract = "Image enhancement has stimulated significant research
works over the past years for its great application
potential in video conferencing scenarios. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:TVB,
author = "Yucheng Zhu and Xiongkuo Min and Dandan Zhu and
Guangtao Zhai and Xiaokang Yang and Wenjun Zhang and Ke
Gu and Jiantao Zhou",
title = "Toward Visual Behavior and Attention Understanding for
Augmented 360 Degree Videos",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "99:1--99:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565024",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3565024",
abstract = "Augmented reality (AR) overlays digital content onto
reality. In an AR system, correct and precise
estimations of user visual fixations and head movements
can \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mei:2023:MSS,
author = "Haiyang Mei and Letian Yu and Ke Xu and Yang Wang and
Xin Yang and Xiaopeng Wei and Rynson W. H. Lau",
title = "Mirror Segmentation via Semantic-aware Contextual
Contrasted Feature Learning",
journal = j-TOMM,
volume = "19",
number = "2s",
pages = "100:1--100:??",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566127",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:35 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3566127",
abstract = "Mirrors are everywhere in our daily lives. Existing
computer vision systems do not consider mirrors, and
hence may get confused by the reflected content
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:PSN,
author = "Yi Zhang and Fang-Yi Chao and Wassim Hamidouche and
Olivier Deforges",
title = "{PAV-SOD}: a New Task towards Panoramic Audiovisual
Saliency Detection",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "101:1--101:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565267",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3565267",
abstract = "Object-level audiovisual saliency detection in
$360^\circ$ panoramic real-life dynamic scenes is
important for exploring and modeling human perception
in immersive \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xie:2023:TDW,
author = "Chi Xie and Zikun Zhuang and Shengjie Zhao and Shuang
Liang",
title = "Temporal Dropout for Weakly Supervised Action
Localization",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "102:1--102:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3567827",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3567827",
abstract = "Weakly supervised action localization is a challenging
problem in video understanding and action recognition.
Existing models usually formulate the training
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2023:MBR,
author = "Yangyang Guo and Liqiang Nie and Harry Cheng and
Zhiyong Cheng and Mohan Kankanhalli and Alberto {Del
Bimbo}",
title = "On Modality Bias Recognition and Reduction",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "103:1--103:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565266",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3565266",
abstract = "Making each modality in multi-modal data contribute is
of vital importance to learning a versatile multi-modal
model. Existing methods, however, are often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:CTC,
author = "Kang Xu and Weixin Li and Xia Wang and Xiaoyan Hu and
Ke Yan and Xiaojie Wang and Xuan Dong",
title = "{CUR} Transformer: a Convolutional Unbiased Regional
Transformer for Image Denoising",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "104:1--104:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3566125",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3566125",
abstract = "Image denoising is a fundamental problem in computer
vision and multimedia computation. Non-local filters
are effective for image denoising. But existing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "104",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2023:BPL,
author = "Wenxin Huang and Xuemei Jia and Xian Zhong and Xiao
Wang and Kui Jiang and Zheng Wang",
title = "Beyond the Parts: Learning Coarse-to-Fine Adaptive
Alignment Representation for Person Search",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "105:1--105:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565886",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3565886",
abstract = "Person search is a time-consuming computer vision task
that entails locating and recognizing query people in
scenic pictures. Body components are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "105",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2023:DAP,
author = "Hongchuan Yu and Mengqing Huang and Jian Jun Zhang",
title = "Domain Adaptation Problem in Sketch Based Image
Retrieval",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "106:1--106:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3565368",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3565368",
abstract = "In this article, we present two algorithms that
discover the discriminative structures of sketches,
given pairs of sketches and photos in sketch-based
image retrieval \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "106",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2023:TIF,
author = "Han Yan and Haijun Zhang and Jianyang Shi and
Jianghong Ma and Xiaofei Xu",
title = "Toward Intelligent Fashion Design: a Texture and Shape
Disentangled Generative Adversarial Network",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "107:1--107:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3567596",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3567596",
abstract = "Texture and shape in fashion, constituting essential
elements of garments, characterize the body and surface
of the fabric and outline the silhouette of clothing,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "107",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dou:2023:MTP,
author = "Peng Dou and Ying Zeng and Zhuoqun Wang and Haifeng
Hu",
title = "Multiple Temporal Pooling Mechanisms for Weakly
Supervised Temporal Action Localization",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "108:1--108:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3567828",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3567828",
abstract = "Recent action localization works learn in a weakly
supervised manner to avoid the expensive cost of human
labeling. Those works are mostly based on the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "108",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:MSE,
author = "Lei Li and Zhiyuan Zhou and Suping Wu and Yongrong
Cao",
title = "Multi-scale Edge-guided Learning for {$3$D}
Reconstruction",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "109:1--109:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3568678",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3568678",
abstract = "Single-view three-dimensional (3D) object
reconstruction has always been a long-term challenging
task. Objects with complex topologies are hard to
accurately \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "109",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:LFR,
author = "Zhengxue Wang and Guangwei Gao and Juncheng Li and Hui
Yan and Hao Zheng and Huimin Lu",
title = "Lightweight Feature De-redundancy and Self-calibration
Network for Efficient Image Super-resolution",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "110:1--110:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3569900",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3569900",
abstract = "In recent years, thanks to the inherent powerful
feature representation and learning abilities of the
convolutional neural network (CNN), deep CNN-steered
single \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "110",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2023:FTF,
author = "Zhijie Huang and Jun Sun and Xiaopeng Guo",
title = "{FastCNN}: Towards Fast and Accurate Spatiotemporal
Network for {HEVC} Compressed Video Enhancement",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "111:1--111:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3569583",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3569583",
abstract = "Deep neural networks have achieved remarkable success
in HEVC compressed video quality enhancement. However,
most existing multiframe-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "111",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:DPS,
author = "Xiaohan Wang and Linchao Zhu and Fei Wu and Yi Yang",
title = "A Differentiable Parallel Sampler for Efficient Video
Classification",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "112:1--112:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3569584",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3569584",
abstract = "It is crucial to sample a small portion of relevant
frames for efficient video classification. The existing
methods mainly develop hand-designed sampling
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "112",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:TFE,
author = "Junjie Li and Jin Yuan and Zhiyong Li",
title = "{TP-FER}: an Effective Three-phase Noise-tolerant
Recognizer for Facial Expression Recognition",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "113:1--113:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570329",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3570329",
abstract = "Single-label facial expression recognition (FER),
which aims to classify single expression for facial
images, usually suffers from the label noisy and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "113",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2023:LEF,
author = "Baojin Huang and Zhongyuan Wang and Guangcheng Wang
and Zhen Han and Kui Jiang",
title = "Local Eyebrow Feature Attention Network for Masked
Face Recognition",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "114:1--114:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3569943",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3569943",
abstract = "During the COVID-19 coronavirus epidemic, wearing
masks has become increasingly popular. Traditional
occlusion face recognition algorithms are almost
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "114",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:ESI,
author = "Bin-Cheng Yang and Gangshan Wu",
title = "Efficient Single-image Super-resolution Using Dual
path Connections with Multiple scale Learning",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "115:1--115:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570164",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3570164",
abstract = "Deep convolutional neural networks have been
demonstrated to be effective for single-image
super-resolution in recent years. On the one hand,
residual \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "115",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2023:AAM,
author = "Wei Zhou and Yanke Hou and Dihu Chen and Haifeng Hu
and Tao Su",
title = "Attention-Augmented Memory Network for Image
Multi-Label Classification",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "116:1--116:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570166",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3570166",
abstract = "The purpose of image multi-label classification is to
predict all the object categories presented in an
image. Some recent works exploit graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "116",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hui:2023:MGC,
author = "Shuaixiong Hui and Qiang Guo and Xiaoyu Geng and
Caiming Zhang",
title = "Multi-Guidance {CNNs} for Salient Object Detection",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "117:1--117:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570507",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3570507",
abstract = "Feature refinement and feature fusion are two key
steps in convolutional neural networks-based salient
object detection (SOD). In this article, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xing:2023:PPI,
author = "Kai Xing and Tao Li and Xuanhan Wang",
title = "{ProposalVLAD} with Proposal-Intra Exploring for
Temporal Action Proposal Generation",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "118:1--118:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571747",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571747",
abstract = "Temporal action proposal generation aims to localize
temporal segments of human activities in videos.
Current boundary-based proposal generation methods can
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "118",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2023:DUK,
author = "Hao Tang and Lei Ding and Songsong Wu and Bin Ren and
Nicu Sebe and Paolo Rota",
title = "Deep Unsupervised Key Frame Extraction for Efficient
Video Classification",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "119:1--119:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571735",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571735",
abstract = "Video processing and analysis have become an urgent
task, as a huge amount of videos (e.g., YouTube, Hulu)
are uploaded online every day. The extraction of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "119",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:ERI,
author = "Ling Zhang and Chengjiang Long and Xiaolong Zhang and
Chunxia Xiao",
title = "Exploiting Residual and Illumination with {GANs} for
Shadow Detection and Shadow Removal",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "120:1--120:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571745",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571745",
abstract = "Residual image and illumination estimation have been
proven to be helpful for image enhancement. In this
article, we propose a general framework, called RI-GAN,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "120",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:DRI,
author = "Yushu Zhang and Nuo Chen and Shuren Qi and Mingfu Xue
and Zhongyun Hua",
title = "Detection of Recolored Image by Texture Features in
Chrominance Components",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "121:1--121:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571076",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571076",
abstract = "Image recoloring is an emerging editing technique that
can change the color style of an image by modifying
pixel values without altering the original image
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "121",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xue:2023:HFF,
author = "Han Xue and Jun Ling and Anni Tang and Li Song and
Rong Xie and Wenjun Zhang",
title = "High-Fidelity Face Reenactment Via Identity-Matched
Correspondence Learning",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "122:1--122:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571857",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571857",
abstract = "Face reenactment aims to generate an animation of a
source face using the poses and expressions from a
target face. Although recent methods have made
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "122",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:PHD,
author = "Haozhe Chen and Hang Zhou and Jie Zhang and Dongdong
Chen and Weiming Zhang and Kejiang Chen and Gang Hua
and Nenghai Yu",
title = "Perceptual Hashing of Deep Convolutional Neural
Networks for Model Copy Detection",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "123:1--123:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572777",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572777",
abstract = "In recent years, many model intellectual property (IP)
proof methods for IP protection have been proposed,
such as model watermarking and model \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "123",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Duan:2023:MGL,
author = "Wei Duan and Yi Yu and Xulong Zhang and Suhua Tang and
Wei Li and Keizo Oyama",
title = "Melody Generation from Lyrics with Local
Interpretability",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "124:1--124:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572031",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572031",
abstract = "Melody generation aims to learn the distribution of
real melodies to generate new melodies conditioned on
lyrics, which has been a very interesting topic in the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "124",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:TFG,
author = "Shiguang Liu and Huixin Wang",
title = "Talking Face Generation via Facial Anatomy",
journal = j-TOMM,
volume = "19",
number = "3",
pages = "125:1--125:??",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571746",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:36 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571746",
abstract = "To generate the corresponding talking face from a
speech audio and a face image, it is essential to match
the variations in the facial appearance with the speech
audio \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "125",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2023:TIA,
author = "Zengri Zeng and Baokang Zhao and Han-Chieh Chao and
Ilsun You and Kuo-Hui Yeh and Weizhi Meng",
title = "Towards Intelligent Attack Detection Using {DNA}
Computing",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "126:1--126:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561057",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561057",
abstract = "In recent years, frequent network attacks have
seriously threatened the interests and security of
humankind. To address this threat, many detection
methods \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "126",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:DCB,
author = "Jinxia Wang and Rui Chen and Zhihan Lv",
title = "{DNA} Computing-Based Multi-Source Data Storage Model
in Digital Twins",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "127:1--127:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561823",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561823",
abstract = "The work aims to study the application of
Deoxyribonucleic Acid (DNA) multi-source data storage
in Digital Twins (DT). Through the investigation of the
research \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ahmed:2023:DBC,
author = "Fawad Ahmed and Muneeb Ur Rehman and Jawad Ahmad and
Muhammad Shahbaz Khan and Wadii Boulila and Gautam
Srivastava and Jerry Chun-Wei Lin and William J.
Buchanan",
title = "A {DNA} Based Colour Image Encryption Scheme Using A
Convolutional Autoencoder",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "128:1--128:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3570165",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3570165",
abstract = "With the advancement in technology, digital images can
easily be transmitted and stored over the Internet.
Encryption is used to avoid illegal interception of
digital \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "128",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Menon:2023:EEM,
author = "Vignesh V. Menon and Hadi Amirpour and Mohammad
Ghanbari and Christian Timmerer",
title = "{EMES}: Efficient Multi-encoding Schemes for
{HEVC}-based Adaptive Bitrate Streaming",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "129:1--129:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3575659",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3575659",
abstract = "In HTTP Adaptive Streaming (HAS), videos are encoded
at multiple bitrates and spatial resolutions (i.e.,
representations ) to adapt to the heterogeneity of
network \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "129",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:VAC,
author = "Jiwei Zhang and Yi Yu and Suhua Tang and Jianming Wu
and Wei Li",
title = "Variational Autoencoder with {CCA} for Audio-Visual
Cross-modal Retrieval",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "130:1--130:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3575658",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3575658",
abstract = "Cross-modal retrieval is to utilize one modality as a
query to retrieve data from another modality, which has
become a popular topic in information \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "130",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Le:2023:SAV,
author = "Thi-Ngoc-Hanh Le and Ya-Hsuan Chen and Tong-Yee Lee",
title = "Structure-aware Video Style Transfer with Map Art",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "131:1--131:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572030",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572030",
abstract = "Changing the style of an image/video while preserving
its content is a crucial criterion to assess a new
neural style transfer algorithm. However, it is very
challenging to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "131",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2023:PMT,
author = "Sirui Zhao and Hongyu Jiang and Hanqing Tao and Rui
Zha and Kun Zhang and Tong Xu and Enhong Chen",
title = "{PEDM}: a Multi-task Learning Model for Persona-aware
Emoji-embedded Dialogue Generation",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "132:1--132:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571819",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571819",
abstract = "As a vivid and linguistic symbol, Emojis have become a
prevailing medium interspersed in text-based
communication (e.g., social media and chit-chat) to
express \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "132",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hung:2023:FCN,
author = "Heyu Huang and Runmin Cong and Lianhe Yang and Ling Du
and Cong Wang and Sam Kwong",
title = "Feedback Chain Network for Hippocampus Segmentation",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "133:1--133:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3571744",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3571744",
abstract = "The hippocampus plays a vital role in the diagnosis
and treatment of many neurological disorders. In recent
years, deep learning technology has made \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yao:2023:CRA,
author = "Xuanrong Yao and Xin Wang and Yue Liu and Wenwu Zhu",
title = "Continual Recognition with Adaptive Memory Update",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "134:1--134:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3573202",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3573202",
abstract = "Class incremental continual learning aims to improve
the ability of modern classification models to
continually recognize new classes without forgetting
the previous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "134",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:AAM,
author = "Jingyao Wang and Luntian Mou and Lei Ma and Tiejun
Huang and Wen Gao",
title = "{AMSA}: Adaptive Multimodal Learning for Sentiment
Analysis",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "135:1--135:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572915",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572915",
abstract = "Efficient recognition of emotions has attracted
extensive research interest, which makes new
applications in many fields possible, such as
human-computer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "135",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2023:JAC,
author = "Shaoning Zeng and Yunbo Rao and Bob Zhang and Yong
Xu",
title = "Joint Augmented and Compressed Dictionaries for Robust
Image Classification",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "136:1--136:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572910",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572910",
abstract = "Dictionary-based Classification (DC) has been a
promising learning theory in multimedia computing.
Previous studies focused on learning a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "136",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wanyan:2023:DSG,
author = "Yuyang Wanyan and Xiaoshan Yang and Xuan Ma and
Changsheng Xu",
title = "Dual Scene Graph Convolutional Network for Motivation
Prediction",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "137:1--137:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572914",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572914",
abstract = "Humans can easily infer the motivations behind human
actions from only visual data by comprehensively
analyzing the complex context information and utilizing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "137",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lei:2023:LUD,
author = "Fei Lei and Zhongqi Cao and Yuning Yang and Yibo Ding
and Cong Zhang",
title = "Learning the User's Deeper Preferences for Multi-modal
Recommendation Systems",
journal = j-TOMM,
volume = "19",
number = "3s",
pages = "138:1--138:??",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3573010",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3573010",
abstract = "Recommendation system plays an important role in the
rapid development of micro-video sharing platform.
Micro-video has rich modal features, such as visual,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "138",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2023:FDP,
author = "Xuehu Yan and Longlong Li and Lei Sun and Jia Chen and
Shudong Wang",
title = "Fake and Dishonest Participant Immune Secret Image
Sharing",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "139:1--139:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572842",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572842",
abstract = "Secret image sharing (SIS) has received increased
attention from the research community because of its
usefulness in multiparty secure computing, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "139",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:SCF,
author = "Song Yang and Qiang Li and Wenhui Li and Xuan-Ya Li
and Ran Jin and Bo Lv and Rui Wang and Anan Liu",
title = "Semantic Completion and Filtration for Image-Text
Retrieval",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "140:1--140:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572844",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572844",
abstract = "Image-text retrieval is a vital task in computer
vision and has received growing attention, since it
connects cross-modality data. It comes with the
critical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "140",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2023:MSK,
author = "Xuan Ma and Xiaoshan Yang and Changsheng Xu",
title = "Multi-Source Knowledge Reasoning Graph Network for
Multi-Modal Commonsense Inference",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "141:1--141:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3573201",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3573201",
abstract = "As a crucial part of natural language processing,
the event-centered commonsense inference task has attracted
increasing attention. With a given observed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "141",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:APA,
author = "Shangxi Wu and Jitao Sang and Kaiyuan Xu and Jiaming
Zhang and Jian Yu",
title = "Attention, Please! {Adversarial} Defense via
Activation Rectification and Preservation",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "142:1--142:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572843",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572843",
abstract = "This study provides a new understanding of the
adversarial attack problem by examining the correlation
between adversarial attack and visual attention change.
In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "142",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:CSA,
author = "Kan Wang and Changxing Ding and Jianxin Pang and
Xiangmin Xu",
title = "Context Sensing Attention Network for Video-based
Person Re-identification",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "143:1--143:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3573203",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3573203",
abstract = "Video-based person re-identification (ReID) is
challenging due to the presence of various
interferences in video frames. Recent approaches handle
this problem \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "143",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:SSL,
author = "Wenjing Wang and Lilang Lin and Zejia Fan and Jiaying
Liu",
title = "Semi-supervised Learning for {Mars} Imagery
Classification and Segmentation",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "144:1--144:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572916",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572916",
abstract = "With the progress of Mars exploration, numerous Mars
image data are being collected and need to be analyzed.
However, due to the severe train-test gap \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "144",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:DDD,
author = "Hui Liu and Shanshan Li and Jicheng Zhu and Kai Deng
and Meng Liu and Liqiang Nie",
title = "{DDIFN}: a Dual-discriminator Multi-modal Medical
Image Fusion Network",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "145:1--145:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3574136",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3574136",
abstract = "Multi-modal medical image fusion is a long-standing
important research topic that can obtain informative
medical images and assist doctors diagnose and treat
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "145",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2023:DGD,
author = "Xintian Wu and Huanyu Wang and Yiming Wu and Xi Li",
title = "{D$^3$T-GAN}: Data-Dependent Domain Transfer {GANs}
for Image Generation with Limited Data",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "146:1--146:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3576858",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3576858",
abstract = "As an important and challenging problem, image
generation with limited data aims at generating
realistic images through training a GAN model given few
samples. A \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "146",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:NLA,
author = "Dandan Zhu and Xuan Shao and Qiangqiang Zhou and
Xiongkuo Min and Guangtao Zhai and Xiaokang Yang",
title = "A Novel Lightweight Audio-visual Saliency Model for
Videos",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "147:1--147:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3576857",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3576857",
abstract = "Audio information has not been considered an important
factor in visual attention models despite many
psychological studies that have shown the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "147",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Abdussalam:2023:NNC,
author = "Amr Abdussalam and Zhongfu Ye and Ammar Hawbani and
Majjed Al-Qatf and Rashid Khan",
title = "{NumCap}: a Number-controlled Multi-caption Image
Captioning Network",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "148:1--148:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3576927",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3576927",
abstract = "Image captioning is a promising task that attracted
researchers in the last few years. Existing image
captioning models are primarily trained to generate one
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "148",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:DML,
author = "Hao Liu and Zhaoyu Yan and Bing Liu and Jiaqi Zhao and
Yong Zhou and Abdulmotaleb {El Saddik}",
title = "Distilled Meta-learning for Multi-Class Incremental
Learning",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "149:1--149:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3576045",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3576045",
abstract = "Meta-learning approaches have recently achieved
promising performance in multi-class incremental
learning. However, meta-learners still suffer from
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "149",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2023:GAT,
author = "Jin Yuan and Shikai Chen and Yao Zhang and Zhongchao
Shi and Xin Geng and Jianping Fan and Yong Rui",
title = "Graph Attention Transformer Network for Multi-label
Image Classification",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "150:1--150:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578518",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578518",
abstract = "Multi-label classification aims to recognize multiple
objects or attributes from images. The key to solving
this issue relies on effectively characterizing the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "150",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hou:2023:UUI,
author = "Guojia Hou and Yuxuan Li and Huan Yang and Kunqian Li
and Zhenkuan Pan",
title = "{UID2021}: an Underwater Image Dataset for Evaluation
of No-Reference Quality Assessment Metrics",
journal = j-TOMM,
volume = "19",
number = "4",
pages = "151:1--151:??",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578584",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Jun 22 10:29:37 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578584",
abstract = "Achieving subjective and objective quality assessment
of underwater images is of high significance in
underwater visual perception and image/video
processing. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "151",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Carlsson:2023:CUS,
author = "Niklas Carlsson and Derek Eager",
title = "Cross-User Similarities in Viewing Behavior for
360${}^\circ $ Video and Caching Implications",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "152:1--152:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3507917",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3507917",
abstract = "The demand and usage of 360${}^\circ $ video services
are expected to increase. However, despite these
services being highly bandwidth intensive, not much is
known about the potential value that basic bandwidth
saving techniques such as server or edge-network
on-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "152",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:EEH,
author = "Ziqiang Li and Pengfei Xia and Xue Rui and Bin Li",
title = "Exploring the Effect of High-frequency Components in
{GANs} Training",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "153:1--153:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578585",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578585",
abstract = "Generative Adversarial Networks (GANs) have the
ability to generate images that are visually
indistinguishable from real images. However, recent
studies have revealed that generated and real images
share significant differences in the frequency domain.
In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "153",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yin:2023:FFM,
author = "Haibing Yin and Hongkui Wang and Li Yu and Junhui
Liang and Guangtao Zhai",
title = "Feedforward and Feedback Modulations Based Foveated
{JND} Estimation for Images",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "154:1--154:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579094",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579094",
abstract = "The just noticeable difference (JND) reveals the key
characteristic of visual perception, which has been
widely used in many perception-based image and video
applications. Nevertheless, the modulatory mechanism of
the human visual system (HVS) has not \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "154",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:MID,
author = "Taocun Yang and Yaping Huang and Yanlin Xie and Junbo
Liu and Shengchun Wang",
title = "{MixOOD}: Improving Out-of-distribution Detection with
Enhanced Data Mixup",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "155:1--155:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578935",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578935",
abstract = "Detecting out-of-distribution (OOD) inputs for deep
learning models is a critical task when models are
deployed in real-world environments. Recently, a large
number of works have been dedicated to tackling the OOD
detection problem. One of the most \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "155",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wei:2023:MLC,
author = "Hao Wei and Rui Chen",
title = "A Multi-Level Consistency Network for High-Fidelity
Virtual Try-On",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "156:1--156:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3580500",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3580500",
abstract = "The 2D virtual try-on task aims to transfer a target
clothing image to the corresponding region of a person
image. Although an extensive amount of research has
been conducted due to its immense applications, this
task still remains a great challenge to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "156",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hao:2023:FGT,
author = "Jiachang Hao and Haifeng Sun and Pengfei Ren and
Yiming Zhong and Jingyu Wang and Qi Qi and Jianxin
Liao",
title = "Fine-Grained Text-to-Video Temporal Grounding from
Coarse Boundary",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "157:1--157:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579825",
abstract = "Text-to-video temporal grounding aims to locate a
target video moment that semantically corresponds to
the given sentence query in an untrimmed video. In this
task, fully supervised works require text descriptions
for each event along with its temporal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "157",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:DLH,
author = "Weixin Li and Tiantian Cao and Chang Liu and Xue Tian
and Ya Li and Xiaojie Wang and Xuan Dong",
title = "Dual-Lens {HDR} using Guided {$3$D} Exposure {CNN} and
Guided Denoising Transformer",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "158:1--158:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579167",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579167",
abstract = "We study the high dynamic range (HDR) imaging problem
in dual-lens systems. Existing methods usually treat
the HDR imaging problem as an image fusion problem and
the HDR result is estimated by fusing the aligned short
exposure image and long exposure \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "158",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2023:HHF,
author = "Xin Yang and Hengrui Li and Xiaochuan Li and Tao Li",
title = "{HIFGAN}: a High-Frequency Information-Based
Generative Adversarial Network for Image
Super-Resolution",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "159:1--159:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578934",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578934",
abstract = "Since the neural network was introduced into the
super-resolution (SR) field, many SR deep models have
been proposed and have achieved excellent results.
However, there are two main drawbacks: one is that the
methods based on the best peak-signal-to-noise
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "159",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:DMO,
author = "Yang Li",
title = "Detection of Moving Object Using Superpixel Fusion
Network",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "160:1--160:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579998",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579998",
abstract = "Moving object detection is still a challenging task in
complex scenes. The existing methods based on deep
learning mainly use U-Nets and have achieved amazing
results. However, they ignore the local continuity
between pixels. In order to solve this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "160",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pan:2023:BTO,
author = "Yingwei Pan and Yehao Li and Ting Yao and Tao Mei",
title = "Bottom-up and Top-down Object Inference Networks for
Image Captioning",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "161:1--161:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3580366",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3580366",
abstract = "A bottom-up and top-down attention mechanism has led
to the revolutionizing of image captioning techniques,
which enables object-level attention for multi-step
reasoning over all the detected objects. However, when
humans describe an image, they often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "161",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2023:MMK,
author = "Duoduo Feng and Xiangteng He and Yuxin Peng",
title = "{MKVSE}: Multimodal Knowledge Enhanced Visual-semantic
Embedding for Image-text Retrieval",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "162:1--162:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3580501",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3580501",
abstract = "Image-text retrieval aims to take the text (image)
query to retrieve the semantically relevant images
(texts), which is fundamental and critical in search
systems, online shopping, and social networks.
Existing works have shown the effectiveness of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "162",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2023:BTG,
author = "Mengyi Zhao and Hao Tang and Pan Xie and Shuling Dai
and Nicu Sebe and Wei Wang",
title = "Bidirectional Transformer {GAN} for Long-term Human
Motion Prediction",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "163:1--163:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579359",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579359",
abstract = "The mainstream motion prediction methods usually focus
on short-term prediction, and their predicted long-term
motions often fall into an average pose, i.e., the
freezing forecasting problem [27]. To mitigate this
problem, we propose a novel Bidirectional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "163",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:RVS,
author = "Jian Wang and Qiang Ling and Peiyan Li",
title = "Robust Video Stabilization based on Motion
Decomposition",
journal = j-TOMM,
volume = "19",
number = "5",
pages = "164:1--164:??",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3580498",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 07:03:55 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3580498",
abstract = "Video stabilization aims to eliminate camera jitter
and improve the visual experience of shaky videos.
Video stabilization methods often ignore the active
movement of the foreground objects and the camera, and
may result in distortion and over-smoothing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "164",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Franti:2023:DPC,
author = "Pasi Fr{\"a}nti and Nancy Fazal",
title = "Design Principles for Content Creation in
Location-Based Games",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "165:1--165:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3583689",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3583689",
abstract = "Location-based games have been around since 2000
across various fields, including education, health, and
entertainment. The main challenge facing such games
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "165",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2023:VNB,
author = "Chenchi Zhang and Wenbo Ma and Jun Xiao and Hanwang
Zhang and Jian Shao and Yueting Zhuang and Long Chen",
title = "{VL-NMS}: Breaking Proposal Bottlenecks in Two-stage
Visual-language Matching",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "166:1--166:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3579095",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3579095",
abstract = "The prevailing framework for matching multimodal
inputs is based on a two-stage process: (1) detecting
proposals with an object detector and (2) matching
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "166",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mackowski:2023:MPI,
author = "Micha{\l} Ma{\'c}kowski and Piotr Brzoza and Mateusz
Kawulok and Rafa{\l} Meisel and Dominik Spinczyk",
title = "Multimodal Presentation of Interactive Audio-Tactile
Graphics Supporting the Perception of Visual
Information by Blind People",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "167:1--167:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3586076",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3586076",
abstract = "Due to the limitations in the perception of graphical
information by blind people and the need to substitute
the sense of sight with other senses, the correct use
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "167",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Man:2023:TTE,
author = "Xin Man and Jie Shao and Feiyu Chen and Mingxing Zhang
and Heng Tao Shen",
title = "{TEVL}: Trilinear Encoder for Video-language
Representation Learning",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "168:1--168:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3585388",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3585388",
abstract = "Pre-training model on large-scale unlabeled web videos
followed by task-specific fine-tuning is a canonical
approach to learning video and language \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "168",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ricci:2023:MLA,
author = "Simone Ricci and Tiberio Uricchio and Alberto {Del
Bimbo}",
title = "Meta-learning Advisor Networks for Long-tail and Noisy
Labels in Social Image Classification",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "169:1--169:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3584360",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3584360",
abstract = "Deep neural networks (DNNs) for social image
classification are prone to performance reduction and
overfitting when trained on datasets plagued by
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "169",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:LBR,
author = "Chen Li and Li Song and Rong Xie and Wenjun Zhang",
title = "Local Bidirection Recurrent Network for Efficient
Video Deblurring with the Fused Temporal Merge Module",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "170:1--170:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587468",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587468",
abstract = "Video deblurring methods exploit the correlation
between consecutive blurry inputs to generate sharp
frames. However, designing an effective and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "170",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Niu:2023:VCL,
author = "Tian-Zi Niu and Zhen-Duo Chen and Xin Luo and Peng-Fei
Zhang and Zi Huang and Xin-Shun Xu",
title = "Video Captioning by Learning from Global Sentence and
Looking Ahead",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "171:1--171:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587252",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587252",
abstract = "Video captioning aims to automatically generate
natural language sentences describing the content of a
video. Although encoder-decoder-based models \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "171",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:GAE,
author = "Yang Wang and Bo Dong and Ke Xu and Haiyin Piao and
Yufei Ding and Baocai Yin and Xin Yang",
title = "A Geometrical Approach to Evaluate the Adversarial
Robustness of Deep Neural Networks",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "172:1--172:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587936",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587936",
abstract = "Deep neural networks (DNNs) are widely used for
computer vision tasks. However, it has been shown that
deep models are vulnerable to adversarial \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "172",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiang:2023:LML,
author = "Suncheng Xiang and Dahong Qian and Mengyuan Guan and
Binjie Yan and Ting Liu and Yuzhuo Fu and Guanjie You",
title = "Less Is More: Learning from Synthetic Data with
Fine-Grained Attributes for Person Re-Identification",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "173:1--173:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588441",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588441",
abstract = "Person re-identification (ReID) plays an important
role in applications such as public security and video
surveillance. Recently, learning from synthetic data
[9], \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "173",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Siekkinen:2023:NNA,
author = "Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen",
title = "Neural Network Assisted Depth Map Packing for
Compression Using Standard Hardware Video Codecs",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "174:1--174:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588440",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588440",
abstract = "Depth maps are needed by various graphics rendering
and processing operations. Depth map streaming is often
necessary when such operations are performed in a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "174",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{vanRensburg:2023:OWD,
author = "Bianca Jansen van Rensburg and Pauline Puteaux and
William Puech and Jean-Pierre Pedeboy",
title = "{$3$D} Object Watermarking from Data Hiding in the
Homomorphic Encrypted Domain",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "175:1--175:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588573",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588573",
abstract = "For over a decade, 3D objects are an increasingly
popular form of media. It has become necessary and
urgent to secure them during their transmission or
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "175",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:CSR,
author = "Hao Liu and Xiaoshan Yang and Changsheng Xu",
title = "Counterfactual Scenario-relevant Knowledge-enriched
Multi-modal Emotion Reasoning",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "176:1--176:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3583690",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3583690",
abstract = "Multi-modal video emotion reasoning (MERV) has
recently attracted increasing attention due to its
potential application in human-computer interaction.
This \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "176",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ayoughi:2023:SCE,
author = "Melika Ayoughi and Pascal Mettes and Paul Groth",
title = "Self-contained Entity Discovery from Captioned
Videos",
journal = j-TOMM,
volume = "19",
number = "5s",
pages = "177:1--177:??",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3583138",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Mon Jul 3 08:37:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3583138",
abstract = "This article introduces the task of visual named
entity discovery in videos without the need for
task-specific supervision or task-specific external
knowledge \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "177",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xie:2023:CFP,
author = "Jin Xie and Yanwei Pang and Jing Pan and Jing Nie and
Jiale Cao and Jungong Han",
title = "Complementary Feature Pyramid Network for Object
Detection",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "178:1--178:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3584362",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3584362",
abstract = "The way of constructing a robust feature pyramid is
crucial for object detection. However, existing feature
pyramid methods, which aggregate multi-level features
by using element-wise sum or concatenation, are
inefficient to construct a robust feature \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "178",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:DCP,
author = "Tianyi Wang and Harry Cheng and Kam Pui Chow and
Liqiang Nie",
title = "Deep Convolutional Pooling Transformer for Deepfake
Detection",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "179:1--179:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588574",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588574",
abstract = "Recently, Deepfake has drawn considerable public
attention due to security and privacy concerns in
social media digital forensics. As the wildly spreading
Deepfake videos on the Internet become more realistic,
traditional detection techniques have failed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "179",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chan:2023:LDF,
author = "Patrick P. K. Chan and Xiaoman Hu and Haorui Song and
Peng Peng and Keke Chen",
title = "Learning Disentangled Features for Person
Re-identification under Clothes Changing",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "180:1--180:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3584359",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3584359",
abstract = "Clothes changing is one of the challenges in person
re-identification (ReID), since clothes provide
remarkable and reliable information for decision,
especially when the resolution of an image is low.
Variation of clothes significantly downgrades standard
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "180",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2023:CFG,
author = "Rongfei Zeng and Mai Su and Ruiyun Yu and Xingwei
Wang",
title = "{CD$^2$}: Fine-grained {$3$D} Mesh Reconstruction with
Twice Chamfer Distance",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "181:1--181:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3582694",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3582694",
abstract = "Monocular 3D reconstruction is to reconstruct the
shape of object and its other information from a single
RGB image. In 3D reconstruction, polygon mesh, with
detailed surface information and low computational
cost, is the most prevalent expression form \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "181",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Niu:2023:SEV,
author = "Tian-Zi Niu and Shan-Shan Dong and Zhen-Duo Chen and
Xin Luo and Shanqing Guo and Zi Huang and Xin-Shun Xu",
title = "Semantic Enhanced Video Captioning with Multi-feature
Fusion",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "182:1--182:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588572",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588572",
abstract = "Video captioning aims to automatically describe a
video clip with informative sentences. At present, deep
learning-based models have become the mainstream for
this task and achieved competitive results on public
datasets. Usually, these methods leverage \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "182",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:TBV,
author = "Kun Li and Jiaxiu Li and Dan Guo and Xun Yang and Meng
Wang",
title = "Transformer-Based Visual Grounding with Cross-Modality
Interaction",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "183:1--183:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587251",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587251",
abstract = "This article tackles the challenging yet important
task of Visual Grounding (VG), which aims to localize a
visual region in the given image referred by a natural
language query. Existing efforts on the VG task are
twofold: (1) two-stage methods first \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "183",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xie:2023:VPG,
author = "Jiayuan Xie and Jiali Chen and Yi Cai and Qingbao
Huang and Qing Li",
title = "Visual Paraphrase Generation with Key Information
Retained",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "184:1--184:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3585010",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3585010",
abstract = "Visual paraphrase generation task aims to rewrite a
given image-related original sentence into a new
paraphrase, where the paraphrase needs to have the same
expressed meaning as the original sentence but have a
difference in expression form. Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "184",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2023:NVS,
author = "Bingzheng Liu and Jianjun Lei and Bo Peng and Chuanbo
Yu and Wanqing Li and Nam Ling",
title = "Novel View Synthesis from a Single Unposed Image via
Unsupervised Learning",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "186:1--186:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587467",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587467",
abstract = "Novel view synthesis aims to generate novel views from
one or more given source views. Although existing
methods have achieved promising performance, they
usually require paired views with different poses to
learn a pixel transformation. This article \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "186",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2023:LLI,
author = "Mingliang Zhou and Hongyue Leng and Bin Fang and Tao
Xiang and Xuekai Wei and Weijia Jia",
title = "Low-light Image Enhancement via a Frequency-based
Model with Structure and Texture Decomposition",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "187:1--187:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3590965",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3590965",
abstract = "This article proposes a frequency-based structure and
texture decomposition model in a Retinex-based
framework for low-light image enhancement and noise
suppression. First, we utilize the total
variation-based noise estimation to decompose the
observed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "187",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:AAM,
author = "Hongguang Zhu and Yunchao Wei and Yao Zhao and Chunjie
Zhang and Shujuan Huang",
title = "{AMC}: Adaptive Multi-expert Collaborative Network for
Text-guided Image Retrieval",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "188:1--188:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3584703",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3584703",
abstract = "Text-guided image retrieval integrates reference image
and text feedback as a multimodal query to search the
image corresponding to user intention. Recent
approaches employ multi-level matching, multiple
accesses, or multiple subnetworks for better \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "188",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fontanini:2023:UDM,
author = "Tomaso Fontanini and Luca Donati and Massimo Bertozzi
and Andrea Prati",
title = "Unsupervised Discovery and Manipulation of Continuous
Disentangled Factors of Variation",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "189:1--189:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3591358",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3591358",
abstract = "Learning a disentangled representation of a
distribution in a completely unsupervised way is a
challenging task that has drawn attention recently. In
particular, much focus has been put in separating
factors of variation (i.e., attributes) within the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "189",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kumar:2023:AFS,
author = "Puneet Kumar and Gaurav Bhatt and Omkar Ingle and
Daksh Goyal and Balasubramanian Raman",
title = "Affective Feedback Synthesis Towards Multimodal Text
and Image Data",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "190:1--190:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3589186",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3589186",
abstract = "In this article, we have defined a novel task of
affective feedback synthesis that generates feedback
for input text and corresponding images in a way
similar to humans responding to multimodal data. A
feedback synthesis system has been proposed and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "190",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:AAS,
author = "Yikun Xu and Xingxing Wei and Pengwen Dai and Xiaochun
Cao",
title = "{A$^2$SC}: Adversarial Attacks on Subspace
Clustering",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "191:1--191:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3587097",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3587097",
abstract = "Many studies demonstrate that supervised learning
techniques are vulnerable to adversarial examples.
However, adversarial threats in unsupervised learning
have not drawn sufficient scholarly attention. In this
article, we formally address the unexplored \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "191",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2023:DTV,
author = "Xianhua Zeng and Saiyuan Chen and Yicai Xie and
Tianxing Liao",
title = "{3V3D}: Three-View Contextual Cross-slice Difference
Three-dimensional Medical Image Segmentation
Adversarial Network",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "192:1--192:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3592614",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3592614",
abstract = "In three-dimensional (3D) medical image segmentation,
it is still a great challenge to obtain the
multidimensional feature information contained in voxel
images using a single view for smaller segmentation
targets, and the robustness of models obtained by
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "192",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Becattini:2023:VLS,
author = "Federico Becattini and Pietro Bongini and Luana Bulla
and Alberto {Del Bimbo} and Ludovica Marinucci and
Misael Mongiov{\`\i} and Valentina Presutti",
title = "{VISCOUNTH}: a Large-scale Multilingual Visual
Question Answering Dataset for Cultural Heritage",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "193:1--193:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3590773",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3590773",
abstract = "Visual question answering has recently been settled as
a fundamental multi-modal reasoning task of artificial
intelligence that allows users to get information about
visual content by asking questions in natural language.
In the cultural heritage domain, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "193",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hsu:2023:RMS,
author = "Wei-Yen Hsu and Pei-Wen Jian",
title = "Recurrent Multi-scale Approximation-Guided Network for
Single Image Super-Resolution",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "194:1--194:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3592613",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3592613",
abstract = "Single-image super-resolution (SISR) is an essential
topic in computer vision applications. However, most
CNN-based SISR approaches directly learn the
relationship between low- and high-resolution images
while ignoring the contextual texture and detail
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "194",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:HAW,
author = "Bo Li and Yong Zhang and Chengyang Zhang and Xinglin
Piao and Baocai Yin",
title = "Hypergraph Association Weakly Supervised Crowd
Counting",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "195:1--195:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3594670",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3594670",
abstract = "Weakly supervised crowd counting involves the
regression of the number of individuals present in an
image, using only the total number as the label.
However, this task is plagued by two primary
challenges: the large variation of head size and uneven
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "195",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tai:2023:MAS,
author = "Yichun Tai and Hailin Shi and Dan Zeng and Hang Du and
Yibo Hu and Zicheng Zhang and Zhijiang Zhang and Tao
Mei",
title = "Multi-Agent Semi-{Siamese} Training for Long-Tail and
Shallow Face Learning",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "196:1--196:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3594669",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3594669",
abstract = "With the recent development of deep convolutional
neural networks and large-scale datasets, deep face
recognition has made remarkable progress and been
widely used in various applications. However, unlike
the existing public face datasets, in many real-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "196",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2023:PEE,
author = "Rui Li and Baopeng Zhang and Wei Liu and Zhu Teng and
Jianping Fan",
title = "{PANet}: an End-to-end Network Based on Relative
Motion for Online Multi-object Tracking",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "197:1--197:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3595379",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3595379",
abstract = "The popular tracking-by-detection paradigm of
multi-object tracking (MOT) takes detections of each
frame as the input and associates detections from one
frame to another. Existing association methods based on
the relative motion have attracted attention,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "197",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2023:SBD,
author = "Ye Yuan and Jiawan Zhang",
title = "Shot Boundary Detection Using Color Clustering and
Attention Mechanism",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "198:1--198:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3595923",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3595923",
abstract = "Shot boundary detection (SBD) is widely used in scene
segmentation, semantic analysis, and video retrieval.
However, existing SBD algorithms have certain
applications in video processing, but they have the
following three problems. First, these algorithms
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "198",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2023:TIS,
author = "Cong Huang and Xiulian Peng and Dong Liu and Yan Lu",
title = "Text Image Super-Resolution Guided by Text Structure
and Embedding Priors",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "199:1--199:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3595924",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3595924",
abstract = "We aim to super-resolve text images from
unrecognizable low-resolution inputs. Existing
super-resolution methods mainly learn a direct mapping
from low-resolution to high-resolution images by
exploring low-level features, which usually generate
blurry \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "199",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhu:2023:MLR,
author = "Jie Zhu and Bo Peng and Wanqing Li and Haifeng Shen
and Qingming Huang and Jianjun Lei",
title = "Modeling Long-range Dependencies and Epipolar Geometry
for Multi-view Stereo",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "200:1--200:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3596445",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3596445",
abstract = "This article proposes a network, referred to as
Multi-View Stereo TRansformer (MVSTR) for depth
estimation from multi-view images. By modeling
long-range dependencies and epipolar geometry, the
proposed MVSTR is capable of extracting dense features
with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "200",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:IFD,
author = "Xiumei Chen and Xiangtao Zheng and Xiaoqiang Lu",
title = "Identity Feature Disentanglement for Visible-Infrared
Person Re-Identification",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "201:1--201:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3595183",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3595183",
abstract = "Visible-infrared person re-identification (VI-ReID)
task aims to retrieve persons from different spectrum
cameras (i.e., visible and infrared images). The
biggest challenge of VI-ReID is the huge cross-modal
discrepancy caused by different imaging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "201",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shu:2023:CAP,
author = "Zhenyu Shu and Ling Gao and Shun Yi and Fangyu Wu and
Xin Ding and Ting Wan and Shiqing Xin",
title = "Context-Aware {$3$D} Points of Interest Detection via
Spatial Attention Mechanism",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "202:1--202:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597026",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597026",
abstract = "Detecting points of interest is a fundamental problem
in 3D shape analysis and can be beneficial to various
tasks in multimedia processing. Traditional
learning-based detection methods usually rely on each
vertex's geometric features to discriminate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "202",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2023:CCF,
author = "Zhen Chen and Ming Yang and Shiliang Zhang",
title = "Complementary Coarse-to-Fine Matching for Video Object
Segmentation",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "203:1--203:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3596496",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3596496",
abstract = "Semi-supervised Video Object Segmentation (VOS) needs
to establish pixel-level correspondences between a
video frame and preceding segmented frames to leverage
their segmentation clues. Most works rely on features
at a single scale to establish those \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "203",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Srinivas:2023:CBN,
author = "Kankanala Srinivas and Ashish Kumar Bhandari",
title = "Context-Based Novel Histogram Bin Stretching Algorithm
for Automatic Contrast Enhancement",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "204:1--204:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597303",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597303",
abstract = "This article presents CHBS, a novel context-based
histogram bin stretching method that enhances the
contrast by increasing the range of gray levels and
randomness among the gray levels. It comprises image
spatial contextual information and discrete cosine
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "204",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2023:UDA,
author = "Zhenjun Tang and Zhiyuan Chen and Zhixin Li and Bineng
Zhong and Xianquan Zhang and Xinpeng Zhang",
title = "Unifying Dual-Attention and {Siamese} Transformer
Network for Full-Reference Image Quality Assessment",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "205:1--205:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597434",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597434",
abstract = "Image Quality Assessment (IQA) is a critical task of
computer vision. Most Full-Reference (FR) IQA methods
have limitation in the accurate prediction of
perceptual qualities of the traditional distorted
images and the Generative Adversarial Networks (GANs).
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "205",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2023:LSR,
author = "Geyu Tang and Xingyu Gao and Zhenyu Chen",
title = "Learning Semantic Representation on Visual Attribute
Graph for Person Re-identification and Beyond",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "206:1--206:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3487044",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3487044",
abstract = "Person re-identification (re-ID) aims to match
pedestrian pairs captured from different cameras.
Recently, various attribute-based models have been
proposed to combine the pedestrian attribute as an
auxiliary semantic information to learn a more
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "206",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Deng:2023:LGL,
author = "Zijun Deng and Xiangteng He and Yuxin Peng",
title = "{LFR-GAN}: Local Feature Refinement based Generative
Adversarial Network for Text-to-Image Generation",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "207:1--207:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3589002",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3589002",
abstract = "Text-to-image generation aims to generate images from
text descriptions. Its main challenge lies in two
aspects: (1) Semantic consistency, i.e., the generated
images should be semantically consistent with the input
text; and (2) Visual reality, i.e., the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "207",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Du:2023:WSH,
author = "Yongchao Du and Min Wang and Zhenbo Lu and Wengang
Zhou and Houqiang Li",
title = "Weakly Supervised Hashing with Reconstructive
Cross-modal Attention",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "208:1--208:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3589185",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3589185",
abstract = "On many popular social websites, images are usually
associated with some meta-data such as textual tags,
which involve semantic information relevant to the
image and can be used to supervise the representation
learning for image retrieval. However, these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "208",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2023:CSC,
author = "Meng Wang and Jizheng Xu and Li Zhang and Junru Li and
Kai Zhang and Shiqi Wang and Siwei Ma",
title = "Compressed Screen Content Image Super Resolution",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "209:1--209:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3589963",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3589963",
abstract = "Screen content has become one of the prominent mediums
in the increasingly connected world. With the
prevalence of remote collaboration and communication
such as virtual conferences and online education,
recent years have witnessed a dramatic increase in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "209",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2023:CUH,
author = "Boqiang Xu and Jian Liang and Lingxiao He and Jinlin
Wu and Chao Fan and Zhenan Sun",
title = "Color-Unrelated Head-Shoulder Networks for
Fine-Grained Person Re-identification",
journal = j-TOMM,
volume = "19",
number = "6",
pages = "210:1--210:??",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.1145/3599730",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:46 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3599730",
abstract = "Person re-identification (re-id) attempts to match
pedestrian images with the same identity across
non-overlapping cameras. Existing methods usually study
person re-id by learning discriminative features based
on the clothing attributes (e.g., color, \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "210",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2024:IBC,
author = "Zhenbo Xu and Hai-Miao Hu and Liu Liu and Dongping
Zhang and Shifeng Zhang and Wenming Tan",
title = "Instance-Based Continual Learning: a Real-World
Dataset and Baseline for Fresh Recognition",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3591209",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3591209",
abstract = "Real-time learning on real-world data streams with
temporal relations is essential for intelligent agents.
However, current online Continual Learning (CL)
benchmarks adopt the mini-batch setting and are
composed of temporally unrelated and disjoint tasks
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "1",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2024:RHG,
author = "Xiaoping Liang and Zhenjun Tang and Zhixin Li and
Mengzhu Yu and Hanyun Zhang and Xianquan Zhang",
title = "Robust Hashing via Global and Local Invariant Features
for Image Copy Detection",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600234",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3600234",
abstract = "Robust hashing is a powerful technique for processing
large-scale images. Currently, many reported image
hashing schemes do not perform well in balancing the
performances of discrimination and robustness, and thus
they cannot efficiently detect image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "2",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sarma:2024:DID,
author = "Sandipan Sarma and Arijit Sur",
title = "{DiRaC-I}: Identifying Diverse and Rare Training
Classes for Zero-Shot Learning",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603147",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3603147",
abstract = "Zero-Shot Learning (ZSL) is an extreme form of
transfer learning that aims at learning from a few
``seen classes'' to have an understanding about the
``unseen classes'' in the wild. Given a dataset in ZSL
research, most existing works use a predetermined,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "3",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2024:SSJ,
author = "Chengyu Zheng and Ning Song and Ruoyu Zhang and Lei
Huang and Zhiqiang Wei and Jie Nie",
title = "Scale-Semantic Joint Decoupling Network for Image-Text
Retrieval in Remote Sensing",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603628",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3603628",
abstract = "Image-text retrieval in remote sensing aims to provide
flexible information for data analysis and application.
In recent years, state-of-the-art methods are dedicated
to ``scale decoupling'' and ``semantic decoupling''
strategies to further enhance the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "4",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:ZSS,
author = "Jiankai Li and Yunhong Wang and Weixin Li",
title = "Zero-shot Scene Graph Generation via Triplet
Calibration and Reduction",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3604284",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3604284",
abstract = "Scene Graph Generation (SGG) plays a pivotal role in
downstream vision-language tasks. Existing SGG methods
typically suffer from poor compositional
generalizations on unseen triplets. They are generally
trained on incompletely annotated scene graphs that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "5",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yaqoob:2024:APT,
author = "Abid Yaqoob and Gabriel-Miro Muntean",
title = "Advanced Predictive Tile Selection Using Dynamic
Tiling for Prioritized 360${}^\circ $ Video {VR}
Streaming",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "6:1--6:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603146",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3603146",
abstract = "The widespread availability of smart computing and
display devices such as mobile phones, gaming consoles,
laptops, and tethered/untethered head-mounted displays
has fueled an increase in demand for omnidirectional
(360${}^\circ $) videos. 360${}^\circ $ video
applications \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "6",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:LGR,
author = "Jia Wang and Hong-Han Shuai and Yung-Hui Li and
Wen-Huang Cheng",
title = "Language-guided Residual Graph Attention Network and
Data Augmentation for Visual Grounding",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "7:1--7:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3604557",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3604557",
abstract = "Visual grounding is an essential task in understanding
the semantic relationship between the given text
description and the target object in an image. Due to
the innate complexity of language and the rich semantic
context of the image, it is still a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "7",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:ACN,
author = "Haoran Wang and Yajie Wang and Baosheng Yu and Yibing
Zhan and Chunfeng Yuan and Wankou Yang",
title = "Attentional Composition Networks for Long-Tailed Human
Action Recognition",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "8:1--8:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603253",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3603253",
abstract = "The problem of long-tailed visual recognition has been
receiving increasing research attention. However, the
long-tailed distribution problem remains underexplored
for video-based visual recognition. To address this
issue, in this article we propose a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "8",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:SSC,
author = "Zi-Chao Zhang and Zhen-Duo Chen and Zhen-Yu Xie and
Xin Luo and Xin-Shun Xu",
title = "{S3Mix}: Same Category Same Semantics Mixing for
Augmenting Fine-grained Images",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "9:1--9:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605892",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3605892",
abstract = "Data augmentation is a common technique to improve the
generalization performance of models for image
classification. Although methods such as Mixup and
CutMix that mix images randomly are indeed instrumental
in general image classification, randomly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "9",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tan:2024:TBR,
author = "Mingkui Tan and Zhiquan Wen and Leyuan Fang and Qi
Wu",
title = "Transformer-Based Relational Inference Network for
Complex Visual Relational Reasoning",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "10:1--10:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605781",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3605781",
abstract = "Visual Relational Reasoning is the basis of many
vision-and-language based tasks (e.g., visual question
answering and referring expression comprehension). In
this article, we regard the complex referring
expression comprehension (c-REF) task as the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "10",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:SSL,
author = "Yiming Yang and Weipeng Hu and Haifeng Hu",
title = "Syncretic Space Learning Network for {NIR-VIS} Face
Recognition",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "11:1--11:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3607143",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3607143",
abstract = "To overcome the technical bottleneck of face
recognition in low-light scenarios, Near-InfraRed and
VISible (NIR-VIS) heterogeneous face recognition is
proposed for matching well-lit VIS faces with poorly
lit NIR faces. Current cross-modal synthesis \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "11",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:DWG,
author = "Chenghua Li and Zongze Li and Jing Sun and Yun Zhang
and Xiaoping Jiang and Fan Zhang",
title = "Dynamic Weighted Gradient Reversal Network for
Visible-infrared Person Re-identification",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "12:1--12:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3607535",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3607535",
abstract = "Due to intra-modality variations and cross-modality
discrepancy, visible-infrared person re-identification
(VI Re-ID) is an important and challenging task in
intelligent video surveillance. The cross-modality
discrepancy is mainly caused by the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "12",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2024:TFI,
author = "Jiajun Song and Zhuo Li and Weiqing Min and Shuqiang
Jiang",
title = "Towards Food Image Retrieval via
Generalization-Oriented Sampling and Loss Function
Design",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "13:1--13:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3600095",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3600095",
abstract = "Food computing has increasingly received widespread
attention in the multimedia field. As a basic task of
food computing, food image retrieval has wide
applications, that is, food image retrieval can help
users to find the desired food from a large number
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "13",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jin:2024:CBN,
author = "Yiting Jin and Jie Wu and Wanliang Wang and Yidong Yan
and Jiawei Jiang and Jianwei Zheng",
title = "Cascading Blend Network for Image Inpainting",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "14:1--14:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3608952",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3608952",
abstract = "Image inpainting refers to filling in unknown regions
with known knowledge, which is in full flourish
accompanied by the popularity and prosperity of deep
convolutional networks. Current inpainting methods have
excelled in completing small-sized \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "14",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2024:DLS,
author = "Kehua Guo and Liang Chen and Xiangyuan Zhu and Xiaoyan
Kui and Jian Zhang and Heyuan Shi",
title = "Double-Layer Search and Adaptive Pooling Fusion for
Reference-Based Image Super-Resolution",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "15:1--15:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3604937",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3604937",
abstract = "Reference-based image super-resolution (RefSR) aims to
reconstruct high-resolution (HR) images from
low-resolution (LR) images by introducing HR reference
images. The key step of RefSR is to transfer reference
features to LR features. However, existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "15",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2024:UOF,
author = "Jing Zhao and Bin Li and Jiahao Li and Ruiqin Xiong
and Yan Lu",
title = "A Universal Optimization Framework for Learning-based
Image Codec",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "16:1--16:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3580499",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3580499",
abstract = "Recently, machine learning-based image compression has
attracted increasing interests and is approaching the
state-of-the-art compression ratio. But unlike
traditional codec, it lacks a universal optimization
method to seek efficient representation for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "16",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:ICS,
author = "Liping Zhang and Shukai Chen and Fei Lin and Wei Ren
and Kim-Kwang Raymond Choo and Geyong Min",
title = "{$1$DIEN}: Cross-session Electrocardiogram
Authentication Using {$1$D} Integrated {EfficientNet}",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "17:1--17:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3609800",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3609800",
abstract = "The potential of using electrocardiogram (ECG), an
important physiological signal for humans, as a new
biometric trait has been demonstrated, and ongoing
efforts have focused on utilizing deep learning (e.g.,
2D neural networks) to improve authentication
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "17",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:DMP,
author = "Baian Chen and Zhilei Chen and Xiaowei Hu and Jun Xu
and Haoran Xie and Jing Qin and Mingqiang Wei",
title = "Dynamic Message Propagation Network for {RGB-D} and
Video Salient Object Detection",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "18:1--18:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597612",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597612",
abstract = "Exploiting long-range semantic contexts and geometric
information is crucial to infer salient objects from
RGB and depth features. However, existing methods
mainly focus on excavating local features within fixed
regions by continuously feeding forward \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "18",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2024:SSM,
author = "Xiang Gao and Wei Hu and Guo-Jun Qi",
title = "Self-supervised Multi-view Learning via Auto-encoding
{$3$D} Transformations",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "19:1--19:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597613",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597613",
abstract = "3D object representation learning is a fundamental
challenge in computer vision to infer about the 3D
world. Recent advances in deep learning have shown
their efficiency in 3D object recognition, among which
view-based methods have performed best so far.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "19",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:EAE,
author = "Dewang Wang and Gaobo Yang and Zhiqing Guo and Jiyou
Chen",
title = "Enhancing Adversarial Embedding based Image
Steganography via Clustering Modification Directions",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "20:1--20:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3603377",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3603377",
abstract = "Image steganography is a technique used to conceal
secret information within cover images without being
detected. However, the advent of convolutional neural
networks (CNNs) has threatened the security of image
steganography. Due to the inherent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "20",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2024:DHO,
author = "Xiaojia Zhao and Tingting Xu and Qiangqiang Shen and
Youfa Liu and Yongyong Chen and Jingyong Su",
title = "Double High-Order Correlation Preserved Robust
Multi-View Ensemble Clustering",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "21:1--21:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3612923",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3612923",
abstract = "Ensemble clustering (EC), utilizing multiple basic
partitions (BPs) to yield a robust consensus
clustering, has shown promising clustering performance.
Nevertheless, most current algorithms suffer from two
challenging hurdles: (1) a surge of EC-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "21",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tasaka:2024:UQM,
author = "Shuji Tasaka",
title = "Usefulness of {QoS} in Multidimensional {QoE}
Prediction for Haptic-Audiovisual Communications",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "22:1--22:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3613246",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3613246",
abstract = "This article investigates prediction of Quality of
Experience (QoE) by comparing borrowing-from-neighbor
situations and isolated ones. We demonstrate that joint
utilization of multiple QoE measures enhances the
accuracy of QoE prediction compared to that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "22",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:EIC,
author = "Ching-Nung Yang and Xiaotian Wu and Min-Jung Chung",
title = "Enhancement of Information Carrying and Decoding for
Visual Cryptography with Error Correction",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "23:1--23:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3612927",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3612927",
abstract = "Recently, three visual cryptography schemes with t
-error-correcting capability (VCSs- t EC) were
introduced for preventing the shadows carrying
additional information from being corrupted by noise
interference. However, the concerns on VCS- t EC, such
as the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "23",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:SSV,
author = "Yuqing Zhang and Yong Zhang and Shaofan Wang and Yun
Liang and Baocai Yin",
title = "Semi-supervised Video Object Segmentation Via an Edge
Attention Gated Graph Convolutional Network",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "24:1--24:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3611389",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3611389",
abstract = "Video object segmentation (VOS) exhibits heavy
occlusions, large deformation, and severe motion blur.
While many remarkable convolutional neural networks are
devoted to the VOS task, they often mis-identify
background noise as the target or output coarse
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "24",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wen:2024:VSI,
author = "Wenying Wen and Minghui Huang and Yushu Zhang and
Yuming Fang and Yifan Zuo",
title = "Visual Security Index Combining {CNN} and Filter for
Perceptually Encrypted Light Field Images",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "25:1--25:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3612924",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3612924",
abstract = "Visual security index (VSI) represents a quantitative
index for the visual security evaluation of
perceptually encrypted images. Recently, the research
on visual security of encrypted light field (LF) images
faces two challenges. One is that the existing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "25",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:CCS,
author = "Linlin Liu and Haijun Zhang and Qun Li and Jianghong
Ma and Zhao Zhang",
title = "Collocated Clothing Synthesis with {GANs} Aided by
Textual Information: a Multi-Modal Framework",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "26:1--26:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3614097",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3614097",
abstract = "Synthesizing realistic images of fashion items which
are compatible with given clothing images, as well as
conditioning on multiple modalities, brings novel and
exciting applications together with enormous economic
potential. In this work, we propose a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "26",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lou:2024:SSC,
author = "Xulei Lou and Tinghui Wu and Haifeng Hu and Dihu
Chen",
title = "Self-Supervised Consistency Based on Joint Learning
for Unsupervised Person Re-identification",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "27:1--27:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3612926",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3612926",
abstract = "Recently, unsupervised domain adaptive person
re-identification (Re-ID) methods have been extensively
studied thanks to not requiring annotations, and they
have achieved excellent performance. Most of the
existing methods aim to train the Re-ID model for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "27",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:CAP,
author = "Yichi Zhang and Gongchun Ding and Dandan Ding and Zhan
Ma and Zhu Li",
title = "On Content-Aware Post-Processing: Adapting
Statistically Learned Models to Dynamic Content",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "28:1--28:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3612925",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3612925",
abstract = "Learning-based post-processing methods generally
produce neural models that are statistically optimal on
their training datasets. These models, however, neglect
intrinsic variations of local video content and may
fail to process unseen content. To address \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "28",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2024:DIC,
author = "Jing Xu and Bing Liu and Yong Zhou and Mingming Liu
and Rui Yao and Zhiwen Shao",
title = "Diverse Image Captioning via Conditional Variational
Autoencoder and Dual Contrastive Learning",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "29:1--29:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3614435",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3614435",
abstract = "Diverse image captioning has achieved substantial
progress in recent years. However, the discriminability
of generative models and the limitation of cross
entropy loss are generally overlooked in the
traditional diverse image captioning models, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "29",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zou:2024:CLN,
author = "Cong Zou and Rui Wang and Cheng Jin and Sanyi Zhang
and Xin Wang",
title = "{S$^2$CL-LeafNet}: Recognizing Leaf Images
Like Human Botanists",
journal = j-TOMM,
volume = "20",
number = "1",
pages = "30:1--30:??",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3615659",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Sep 29 07:50:48 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3615659",
abstract = "Automatically classifying plant leaves is a
challenging fine-grained classification task because of
the diversity in leaf morphology, including size,
texture, shape, and venation. Although powerful deep
learning-based methods have achieved great \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "30",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Namasudra:2024:ISI,
author = "Suyel Namasudra and Pascal Lorenz and Seifedine Kadry
and Syed Ahmad Chan Bukhari",
title = "Introduction to the Special Issue on {DNA}-centric
Modeling and Practice for Next-generation Computing and
Communication Systems",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "31:1--31:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3578364",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3578364",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "31",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wan:2024:ESI,
author = "Shaohua Wan and Yi Jin and Guangdong Xu and Michele
Nappi",
title = "Editorial to Special Issue on Multimedia Cognitive
Computing for Intelligent Transportation System",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "32:1--32:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3604938",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3604938",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "32",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2024:TEL,
author = "Ruonan Zhao and Laurence T. Yang and Debin Liu and
Wanli Lu and Chenlu Zhu and Yiheng Ruan",
title = "Tensor-Empowered {LSTM} for Communication-Efficient
and Privacy-Enhanced Cognitive Federated Learning in
Intelligent Transportation Systems",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "33:1--33:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3575661",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3575661",
abstract = "Multimedia cognitive computing as a revolutionary
emerging concept of artificial intelligence emulating
the reasoning process like human brains can facilitate
the evolution of intelligent transportation systems
(ITS) to be smarter, safer, and more \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "33",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2024:RSB,
author = "Hongjian Shi and Hao Wang and Ruhui Ma and Yang Hua
and Tao Song and Honghao Gao and Haibing Guan",
title = "Robust Searching-Based Gradient Collaborative
Management in Intelligent Transportation System",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "34:1--34:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3549939",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3549939",
abstract = "With the rapid development of big data and the
Internet of Things (IoT), traffic data from an
Intelligent Transportation System (ITS) is becoming
more and more accessible. To understand and simulate
the traffic patterns from the traffic data, Multimedia
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "34",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Weng:2024:HHC,
author = "Zejia Weng and Zuxuan Wu and Hengduo Li and Jingjing
Chen and Yu-Gang Jiang",
title = "{HCMS}: Hierarchical and Conditional Modality
Selection for Efficient Video Recognition",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "35:1--35:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3572776",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3572776",
abstract = "Videos are multimodal in nature. Conventional video
recognition pipelines typically fuse multimodal
features for improved performance. However, this is not
only computationally expensive but also neglects the
fact that different videos rely on different \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "35",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:DAS,
author = "Shixiong Zhang and Wenmin Wang and Honglei Li and
Shenyong Zhang",
title = "E-detector: Asynchronous Spatio-temporal for
Event-based Object Detection in Intelligent
Transportation System",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "36:1--36:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3584361",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3584361",
abstract = "In intelligent transportation systems, various
sensors, including radar and conventional frame
cameras, are used to improve system robustness in
various challenging scenarios. An event camera is a
novel bio-inspired sensor that has attracted the
interest \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "36",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Padhy:2024:MVA,
author = "Ram Prasad Padhy and Pankaj Kumar Sa and Fabio
Narducci and Carmen Bisogni and Sambit Bakshi",
title = "Monocular Vision-aided Depth Measurement from {RGB}
Images for Autonomous {UAV} Navigation",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "37:1--37:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3550485",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3550485",
abstract = "Monocular vision-based 3D scene understanding has been
an integral part of many machine vision applications.
Always, the objective is to measure the depth using a
single RGB camera, which is at par with the depth
cameras. In this regard, monocular vision- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "37",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2024:SID,
author = "Zhihan Lv and Fabio Poiesi and Qi Dong and Jaime
Lloret and Houbing Song",
title = "Special Issue on Deep Learning for Intelligent Human
Computer Interaction",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "38:1--38:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3605151",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3605151",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "38",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gong:2024:MMM,
author = "Wenjuan Gong and Yue Zhang and Wei Wang and Peng Cheng
and Jordi Gonz{\`a}lez",
title = "{Meta-MMFNet}: Meta-learning-based Multi-model Fusion
Network for Micro-expression Recognition",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "39:1--39:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3539576",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3539576",
abstract = "Despite its wide applications in criminal
investigations and clinical communications with
patients suffering from autism, automatic
micro-expression recognition remains a challenging
problem because of the lack of training data and
imbalanced classes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "39",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Djenouri:2024:EAG,
author = "Youcef Djenouri and Asma Belhadi and Gautam Srivastava
and Jerry Chun-Wei Lin",
title = "An Efficient and Accurate {GPU}-based Deep Learning
Model for Multimedia Recommendation",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "40:1--40:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3524022",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3524022",
abstract = "This article proposes the use of deep learning in
human-computer interaction and presents a new
explainable hybrid framework for recommending relevant
hashtags on a set of orphan tweets, which are tweets
without hashtags. The approach starts by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "40",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Loveleen:2024:EDH,
author = "Gaur Loveleen and Bhandari Mohan and Bhadwal Singh
Shikhar and Jhanjhi Nz and Mohammad Shorfuzzaman and
Mehedi Masud",
title = "Explanation-Driven {HCI} Model to Examine the
Mini-Mental State for {Alzheimer}'s Disease",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "41:1--41:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3527174",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3527174",
abstract = "Directing research on Alzheimer's disease toward only
early prediction and accuracy cannot be considered a
feasible approach toward tackling a ubiquitous
degenerative disease today. Applying deep learning
(DL), Explainable artificial intelligence, and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "41",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:AAD,
author = "Mi Li and Wei Zhang and Bin Hu and Jiaming Kang and
Yuqi Wang and Shengfu Lu",
title = "Automatic Assessment of Depression and Anxiety through
Encoding Pupil-wave from {HCI} in {VR} Scenes",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "42:1--42:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3513263",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3513263",
abstract = "At present, there have been many studies on the
methods of using the deep learning regression model to
assess depression level based on behavioral signals
(facial expression, speech, and language); however, the
research on the assessment method of anxiety \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "42",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qayyum:2024:SFB,
author = "Abdul Qayyum and Imran Razzak and M. Tanveer and Moona
Mazher",
title = "Spontaneous Facial Behavior Analysis Using Deep
Transformer-based Framework for Child-computer
Interaction",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "43:1--43:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3539577",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3539577",
abstract = "A fascinating challenge in robotics-human interaction
is imitating the emotion recognition capability of
humans to robots with the aim to make human-robotics
interaction natural, genuine and intuitive. To achieve
the natural interaction in affective \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "43",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:FBH,
author = "Xiaowei Chen and Xiao Jiang and Lishuang Zhan and
Shihui Guo and Qunsheng Ruan and Guoliang Luo and
Minghong Liao and Yipeng Qin",
title = "Full-body Human Motion Reconstruction with Sparse
Joint Tracking Using Flexible Sensors",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "44:1--44:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3564700",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3564700",
abstract = "Human motion tracking is a fundamental building block
for various applications including computer animation,
human-computer interaction, healthcare, and so on. To
reduce the burden of wearing multiple sensors, human
motion prediction from sparse sensor \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "44",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qiao:2024:SSL,
author = "Shanbao Qiao and Neal N. Xiong and Yongbin Gao and
Zhijun Fang and Wenjun Yu and Juan Zhang and Xiaoyan
Jiang",
title = "Self-Supervised Learning of Depth and Ego-Motion for
{$3$D} Perception in Human Computer Interaction",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "45:1--45:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3588571",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3588571",
abstract = "3D perception of depth and ego-motion is of vital
importance in intelligent agent and Human Computer
Interaction (HCI) tasks, such as robotics and
autonomous driving. There are different kinds of
sensors that can directly obtain 3D depth information.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "45",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kang:2024:DGN,
author = "Yan Kang and Bin Pu and Yongqi Kou and Yun Yang and
Jianguo Chen and Khan Muhammad and Po Yang and Lida Xu
and Mohammad Hijji",
title = "A Deep Graph Network with Multiple Similarity for User
Clustering in Human-Computer Interaction",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "46:1--46:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3549954",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3549954",
abstract = "User counterparts, such as user attributes in social
networks or user interests, are the keys to more
natural Human-Computer Interaction (HCI). In addition,
users' attributes and social structures help us
understand the complex interactions in HCI. Most
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "46",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Mahmud:2024:SHA,
author = "Bahar Mahmud and Guan Hong and Bernard Fong",
title = "A Study of Human--{AI} Symbiosis for Creative Work:
Recent Developments and Future Directions in Deep
Learning",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "47:1--47:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3542698",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3542698",
abstract = "Recent advances in Artificial Intelligence (AI),
particularly deep learning, are having an enormous
impact on our society today. Record numbers of jobs
previously held by people have been automated, from
manufacturing to transportation to customer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "47",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gu:2024:PPR,
author = "Xiaoling Gu and Jie Huang and Yongkang Wong and Jun Yu
and Jianping Fan and Pai Peng and Mohan S.
Kankanhalli",
title = "{PAINT}: Photo-realistic Fashion Design Synthesis",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "48:1--48:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3545610",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3545610",
abstract = "In this article, we investigate a new problem of
generating a variety of multi-view fashion designs
conditioned on a human pose and texture examples of
arbitrary sizes, which can replace the repetitive and
low-level design work for fashion designers. To
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "48",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dai:2024:UDA,
author = "Qingfeng Dai and Yongkang Wong and Guofei Sun and
Yanwei Wang and Zhou Zhou and Mohan S. Kankanhalli and
Xiangdong Li and Weidong Geng",
title = "Unsupervised Domain Adaptation by Causal Learning for
Biometric Signal-based {HCI}",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "49:1--49:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3583885",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3583885",
abstract = "Biometric signal based human-computer interface (HCI)
has attracted increasing attention due to its wide
application in healthcare, entertainment,
neurocomputing, and so on. In recent years, deep
learning-based approaches have made great progress on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "49",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiao:2024:RRD,
author = "Yi Xiao and Tong Liu and Yu Han and Yue Liu and
Yongtian Wang",
title = "Realtime Recognition of Dynamic Hand Gestures in
Practical Applications",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "50:1--50:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3561822",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3561822",
abstract = "Dynamic hand gesture acting as a semaphoric gesture is
a practical and intuitive mid-air gesture interface.
Nowadays benefiting from the development of deep
convolutional networks, the gesture recognition has
already achieved a high accuracy, however, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "50",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gou:2024:HMA,
author = "Jianping Gou and Liyuan Sun and Baosheng Yu and
Shaohua Wan and Dacheng Tao",
title = "Hierarchical Multi-Attention Transfer for Knowledge
Distillation",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "51:1--51:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3568679",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3568679",
abstract = "Knowledge distillation (KD) is a powerful and widely
applicable technique for the compression of deep
learning models. The main idea of knowledge
distillation is to transfer knowledge from a large
teacher model to a small student model, where the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "51",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Deb:2024:AIC,
author = "Subhrajyoti Deb and Abhilash Das and Nirmalya Kar",
title = "An Applied Image Cryptosystem on {Moore}'s Automaton
Operating on {$ \delta (q_k) / F_2 $}",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "52:1--52:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3614433",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3614433",
abstract = "The volume of multimedia-based image data or video
frames in Web 3.0 is constantly increasing, owing to
the advancement of real-time data transmission.
However, security vulnerabilities frequently impair the
performance of real-time applications. Many \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "52",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{You:2024:IAV,
author = "Sisi You and Yukun Zuo and Hantao Yao and Changsheng
Xu",
title = "Incremental Audio-Visual Fusion for Person Recognition
in Earthquake Scene",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "53:1--53:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3614434",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3614434",
abstract = "Earthquakes have a profound impact on social harmony
and property, resulting in damage to buildings and
infrastructure. Effective earthquake rescue efforts
require rapid and accurate determination of whether any
survivors are trapped in the rubble of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "53",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2024:BSG,
author = "Shiqi Sun and Danlan Huang and Xiaoming Tao and
Chengkang Pan and Guangyi Liu and Changwen Chen",
title = "Boosting Scene Graph Generation with Contextual
Information",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "54:1--54:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3615868",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3615868",
abstract = "Scene graph generation (SGG) has been developed to
detect objects and their relationships from the visual
data and has attracted increasing attention in recent
years. Existing works have focused on extracting object
context for SGG. However, very few \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "54",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zheng:2024:CAG,
author = "Jianwei Zheng and Yu Liu and Yuchao Feng and Honghui
Xu and Meiyu Zhang",
title = "Contrastive Attention-guided Multi-level Feature
Registration for Reference-based Super-resolution",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "55:1--55:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3616495",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3616495",
abstract = "Given low-quality input and assisted by referential
images, reference-based super-resolution (RefSR)
strives to enlarge the spatial size with the guarantee
of realistic textures, for which sophisticated
feature-matching strategies are naturally demanded.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "55",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2024:AAL,
author = "Shangxi Wu and Jitao Sang and Kaiyan Xu and Guanhua
Zheng and Changsheng Xu",
title = "Adaptive Adversarial Logits Pairing",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "56:1--56:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3616375",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3616375",
abstract = "Adversarial examples provide an opportunity as well as
impose a challenge for understanding image
classification systems. Based on the analysis of the
adversarial training solution-Adversarial Logits
Pairing (ALP), we observed in this work that: (1) The
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "56",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:BBA,
author = "Ying Chen and Rui Yao and Yong Zhou and Jiaqi Zhao and
Bing Liu and Abdulmotaleb {El Saddik}",
title = "Black-box Attack against Self-supervised Video Object
Segmentation Models with Contrastive Loss",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "57:1--57:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617502",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617502",
abstract = "Deep learning models have been proven to be
susceptible to malicious adversarial attacks, which
manipulate input images to deceive the model into
making erroneous decisions. Consequently, the threat
posed to these models serves as a poignant reminder of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "57",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2024:RFO,
author = "Shuang Liang and Wentao Ma and Chi Xie",
title = "Relation with Free Objects for Action Recognition",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "58:1--58:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617596",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617596",
abstract = "Relevant objects are widely used for aiding human
action recognition in still images. Such objects are
founded by a dedicated and pre-trained object detector
in all previous methods. Such methods have two
drawbacks. First, training an object detector
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "58",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{He:2024:FMW,
author = "Qiaolin He and Zhijie Zheng and Haifeng Hu",
title = "A Feature Map is Worth a Video Frame: Rethinking
Convolutional Features for Visible-Infrared Person
Re-identification",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "59:1--59:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617375",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617375",
abstract = "Visible-Infrared Person Re-identification (VI-ReID)
aims to search for the identity of the same person
across different spectra. The feature maps obtained
from the convolutional layers are generally used for
loss calculation in the later stages of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "59",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2024:GCL,
author = "Wuliang Huang and Yiqiang Chen and Xinlong Jiang and
Teng Zhang and Qian Chen",
title = "{GJFusion}: a Channel-Level Correlation Construction
Method for Multimodal Physiological Signal Fusion",
journal = j-TOMM,
volume = "20",
number = "2",
pages = "60:1--60:??",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617503",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Fri Nov 3 14:55:26 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617503",
abstract = "Physiological signal based ubiquitous computing has
garnered significant attention. However, the
heterogeneity among multimodal physiological signals
poses a critical challenge to practical applications.
To traverse this heterogeneity gap, recent studies
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "60",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shen:2024:SAC,
author = "Chengji Shen and Zhenjiang Liu and Xin Gao and Zunlei
Feng and Mingli Song",
title = "Self-Adaptive Clothing Mapping Based Virtual Try-on",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "61:1--61:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3613453",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3613453",
abstract = "VTON (Virtual Try-ON), as an innovative visual
application in e-commerce scenarios with great
commercial value, has been widely studied in recent
years. Due to its better robustness and realistic
effect, deformation-synthesize-based VTON has become
the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "61",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Baldrati:2024:CIR,
author = "Alberto Baldrati and Marco Bertini and Tiberio
Uricchio and Alberto {Del Bimbo}",
title = "Composed Image Retrieval using Contrastive Learning
and Task-oriented {CLIP}-based Features",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "62:1--62:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617597",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617597",
abstract = "Given a query composed of a reference image and a
relative caption, the Composed Image Retrieval goal is
to retrieve images visually similar to the reference
one that integrates the modifications expressed by the
caption. Given that recent research has \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "62",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:CMM,
author = "Yan Wang and Peize Li and Qingyi Si and Hanwen Zhang
and Wenyu Zang and Zheng Lin and Peng Fu",
title = "Cross-modality Multiple Relations Learning for
Knowledge-based Visual Question Answering",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "63:1--63:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3618301",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3618301",
abstract = "Knowledge-based visual question answering not only
needs to answer the questions based on images but also
incorporates external knowledge to study reasoning in
the joint space of vision and language. To bridge the
gap between visual content and semantic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "63",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2024:IDB,
author = "Qiang Guo and Zhi Zhang and Mingliang Zhou and Hong
Yue and Huayan Pu and Jun Luo",
title = "Image Defogging Based on Regional Gradient Constrained
Prior",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "64:1--64:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617834",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617834",
abstract = "Foggy days limit the functionality of outdoor
surveillance systems. However, it is still a challenge
for existing methods to maintain the uniformity of
defogging between image regions with a similar depth of
field and large differences in appearance. To
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "64",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2024:PDP,
author = "Jintao Guo and Lei Qi and Yinghuan Shi and Yang Gao",
title = "{PLACE Dropout}: a Progressive Layer-wise and
Channel-wise Dropout for Domain Generalization",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "65:1--65:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624015",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3624015",
abstract = "Domain generalization (DG) aims to learn a generic
model from multiple observed source domains that
generalizes well to arbitrary unseen target domains
without further training. The major challenge in DG is
that the model inevitably faces a severe \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "65",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiong:2024:VLS,
author = "Yuan Xiong and Jingru Wang and Zhong Zhou",
title = "{VirtualLoc}: Large-scale Visual Localization Using
Virtual Images",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "66:1--66:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3622788",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3622788",
abstract = "Robust and accurate camera pose estimation is
fundamental in computer vision. Learning-based
regression approaches acquire six-degree-of-freedom
camera parameters accurately from visual cues of an
input image. However, most are trained on street-view
and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "66",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:ECD,
author = "Yiheng Zhang and Ting Yao and Zhaofan Qiu and Tao
Mei",
title = "Explaining Cross-domain Recognition with Interpretable
Deep Classifier",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "67:1--67:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3623399",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3623399",
abstract = "The recent advances in deep learning predominantly
construct models in their internal representations, and
it is opaque to explain the rationale behind and
decisions to human users. Such explainability is
especially essential for domain adaptation, whose
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "67",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:AGM,
author = "Ruimin Wang and Fasheng Wang and Yiming Su and Jing
Sun and Fuming Sun and Haojie Li",
title = "Attention-guided Multi-modality Interaction Network
for {RGB-D} Salient Object Detection",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "68:1--68:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624747",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3624747",
abstract = "The past decade has witnessed great progress in RGB-D
salient object detection (SOD). However, there are two
bottlenecks that limit its further development. The
first one is low-quality depth maps. Most existing
methods directly use raw depth maps to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "68",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Rime:2024:HWY,
author = "Jemily Rime and Alan Archer-Boyd and Tom Collins",
title = "How Will You Pod? {Implications} of Creators'
Perspectives for Designing Innovative Podcasting
Tools",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "69:1--69:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625099",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625099",
abstract = "While centred on the medium of audio, podcasts are
often a multimedia concern, and one that has become
hugely popular in recent years, though relatively
little is known about the perspectives of podcast
creators and their visions of innovation. This
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "69",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cheung:2024:LPF,
author = "Ming Cheung",
title = "Learning from the Past: Fast {NAS} for Tasks and
Datasets",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "70:1--70:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3618000",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3618000",
abstract = "Nowadays, with the advancement of technology, many
retail companies require in-house data scientist teams
to build machine learning tasks, such as user
segmentation and item price prediction. These teams
typically use a trial-and-error process to obtain a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "70",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:UIQ,
author = "Xinyue Li and Haiyong Xu and Gangyi Jiang and Mei Yu
and Ting Luo and Xuebo Zhang and Hongwei Ying",
title = "Underwater Image Quality Assessment from Synthetic to
Real-world: Dataset and Objective Method",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "71:1--71:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624983",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3624983",
abstract = "The complicated underwater environment and lighting
conditions lead to severe influence on the quality of
underwater imaging, which tends to impair underwater
exploration and research. To effectively evaluate the
quality of underwater images, an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "71",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hou:2024:DLL,
author = "Sujuan Hou and Jiacheng Li and Weiqing Min and Qiang
Hou and Yanna Zhao and Yuanjie Zheng and Shuqiang
Jiang",
title = "Deep Learning for Logo Detection: a Survey",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "72:1--72:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3611309",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3611309",
abstract = "Logo detection has gradually become a research hotspot
in the field of computer vision and multimedia for its
various applications, such as social media monitoring,
intelligent transportation, and video advertising
recommendation. Recent advances in this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "72",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2024:DLB,
author = "Yunjie Peng and Jinlin Wu and Boqiang Xu and Chunshui
Cao and Xu Liu and Zhenan Sun and Zhiqiang He",
title = "Deep Learning Based Occluded Person Re-Identification:
a Survey",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "73:1--73:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3610534",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3610534",
abstract = "Occluded person re-identification (Re-ID) focuses on
addressing the occlusion problem when retrieving the
person of interest across non-overlapping cameras. With
the increasing demand for intelligent video
surveillance and the application of person Re-ID
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "73",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Manzoor:2024:MRL,
author = "Muhammad Arslan Manzoor and Sarah Albarri and Ziting
Xian and Zaiqiao Meng and Preslav Nakov and Shangsong
Liang",
title = "Multimodality Representation Learning: a Survey on
Evolution, Pretraining and Its Applications",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "74:1--74:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617833",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617833",
abstract = "Multimodality Representation Learning, as a technique
of learning to embed information from different
modalities and their correlations, has achieved
remarkable success on a variety of applications, such
as Visual Question Answering (VQA), Natural \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "74",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2024:BFS,
author = "Yanyan Shi and Shaowu Yang and Wenjing Yang and Dianxi
Shi and Xuehui Li",
title = "Boosting Few-shot Object Detection with Discriminative
Representation and Class Margin",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "75:1--75:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3608478",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3608478",
abstract = "Classifying and accurately locating a visual category
with few annotated training samples in computer vision
has motivated the few-shot object detection technique,
which exploits transferring the source-domain detection
model to the target domain. Under \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "75",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cheng:2024:VFH,
author = "Harry Cheng and Yangyang Guo and Tianyi Wang and Qi Li
and Xiaojun Chang and Liqiang Nie",
title = "Voice-Face Homogeneity Tells Deepfake",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "76:1--76:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625231",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625231",
abstract = "Detecting forgery videos is highly desirable due to
the abuse of deepfake. Existing detection approaches
contribute to exploring the specific artifacts in
deepfake videos and fit well on certain data. However,
the growing technique on these artifacts \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "76",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ye:2024:VSA,
author = "Jin Ye and Meng Dan and Wenchao Jiang",
title = "A Visual Sensitivity Aware {ABR} Algorithm for {DASH}
via Deep Reinforcement Learning",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "77:1--77:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3591108",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3591108",
abstract = "In order to cope with the fluctuation of network
bandwidth and provide smooth video services, adaptive
video streaming technology is proposed. In particular,
the adaptive bitrate (ABR) algorithm is widely used in
dynamic adaptive streaming over HTTP (DASH). \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "77",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:TRH,
author = "Jian Wang and Xiao Wang and Guosheng Zhao",
title = "Task Recommendation via Heterogeneous Multi-modal
Features and Decision Fusion in Mobile Crowdsensing",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "78:1--78:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3626239",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3626239",
abstract = "In the decision-making process of the behavior of
mobile crowdsensing, using a single view to learn a
user's preference will lead to a mismatch between the
user's wishes and the final task recommendation list,
resulting in the low efficiency of the model \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "78",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lei:2024:BDV,
author = "Si-Chao Lei and Yue-Jiao Gong and Xiao-Lin Xiao and
Yi-cong Zhou and Jun Zhang",
title = "Boosting Diversity in Visual Search with {Pareto}
Non-Dominated Re-Ranking",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "79:1--79:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625296",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625296",
abstract = "The field of visual search has gained significant
attention recently, particularly in the context of web
search engines and e-commerce product search platforms.
However, the abundance of web images presents a
challenge for modern image retrieval systems,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "79",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:ISS,
author = "Huijie Zhang and Pu Li and Xiaobai Liu and Xianfeng
Yang and Li An",
title = "An Iterative Semi-supervised Approach with Pixel-wise
Contrastive Loss for Road Extraction in Aerial Images",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "80:1--80:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3606374",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3606374",
abstract = "Extracting roads in aerial images has numerous
applications in artificial intelligence and multimedia
computing, including traffic pattern analysis and
parking space planning. Learning deep neural networks,
though very successful, demands vast amounts of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "80",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fang:2024:IAS,
author = "Jing Fang and Yinbo Yu and Zhongyuan Wang and Xin Ding
and Ruimin Hu",
title = "An Image Arbitrary-Scale Super-Resolution Network
Using Frequency-domain Information",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "81:1--81:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3616376",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3616376",
abstract = "Image super-resolution (SR) is a technique to recover
lost high-frequency information in low-resolution (LR)
images. Since spatial-domain information has been
widely exploited, there is a new trend to involve
frequency-domain information in SR tasks. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "81",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Luo:2024:TES,
author = "Xiao Luo and Wei Ju and Yiyang Gu and Yifang Qin and
Siyu Yi and Daqing Wu and Luchen Liu and Ming Zhang",
title = "Toward Effective Semi-supervised Node Classification
with Hybrid Curriculum Pseudo-labeling",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "82:1--82:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3626528",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3626528",
abstract = "Semi-supervised node classification is a crucial
challenge in relational data mining and has attracted
increasing interest in research on graph neural
networks (GNNs). However, previous approaches merely
utilize labeled nodes to supervise the overall
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "82",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2024:FDN,
author = "Wen Guo and Wuzhou Quan and Junyu Gao and Tianzhu
Zhang and Changsheng Xu",
title = "Feature Disentanglement Network: Multi-Object Tracking
Needs More Differentiated Features",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "83:1--83:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3626825",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3626825",
abstract = "To reduce computational redundancies, a common
approach is to integrate detection and
re-identification (Re-ID) into a single network in
multi-object tracking (MOT), referred to as ``tracking
by detection.'' Most of the previous research has
focused on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "83",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Khaleel:2024:VVC,
author = "Mohammed Khaleel and Azeez Idris and Wallapak
Tavanapong and Jacob R. Pratt and Junghwan Oh and Piet
C. de Groen",
title = "{VisActive}: Visual-concept-based Active Learning for
Image Classification under Class Imbalance",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "84:1--84:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617999",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617999",
abstract = "Active learning methods recommend the most informative
images from a large unlabeled dataset for manual
labeling. These methods improve the performance of an
image classifier while minimizing manual labeling
efforts. We propose VisActive, a visual-concept-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "84",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:GLB,
author = "Honghua Chen and Zhiqi Li and Mingqing Wei and Jun
Wang",
title = "Geometric and Learning-Based Mesh Denoising: a
A Comprehensive Survey",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "85:1--85:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625098",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625098",
abstract = "Mesh denoising is a fundamental problem in digital
geometry processing. It seeks to remove surface noise
while preserving surface intrinsic signals as
accurately as possible. While traditional wisdom has
been built upon specialized priors to smooth \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "85",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2024:BNL,
author = "Ning Han and Yawen Zeng and Chuhao Shi and Guangyi
Xiao and Hao Chen and Jingjing Chen",
title = "{BiC-Net}: Learning Efficient Spatio-temporal Relation
for Text-Video Retrieval",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "86:1--86:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3627103",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3627103",
abstract = "The task of text-video retrieval aims to understand
the correspondence between language and vision and has
gained increasing attention in recent years. Recent
works have demonstrated the superiority of local
spatio-temporal relation learning with graph-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "86",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2024:ADD,
author = "Yuan Feng and Yaojun Hu and Pengfei Fang and Sheng Liu
and Yanhong Yang and Shengyong Chen",
title = "Asymmetric Dual-Decoder {U-Net} for Joint Rain and
Haze Removal",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "87:1--87:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3628451",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3628451",
abstract = "This work studies the multi-weather restoration
problem. In real-life scenarios, rain and haze, two
often co-occurring common weather phenomena, can
greatly degrade the clarity and quality of the scene
images, leading to a performance drop in the visual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "87",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xie:2024:SGD,
author = "Yurui Xie and Ling Guan",
title = "Sparsity-guided Discriminative Feature Encoding for
Robust Keypoint Detection",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "88:1--88:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3628432",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3628432",
abstract = "Existing handcrafted keypoint detectors typically
focus on designing specific local structures manually
while ignoring whether they have enough flexibility to
explore diverse visual patterns in an image. Despite
the advancement of learning-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "88",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Beuve:2024:HLD,
author = "Nicolas Beuve and Wassim Hamidouche and Olivier
D{\'e}forges",
title = "Hierarchical Learning and Dummy Triplet Loss for
Efficient Deepfake Detection",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "89:1--89:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3626101",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3626101",
abstract = "The advancement of generative models has made it
easier to create highly realistic Deepfake videos. This
accessibility has led to a surge in research on
Deepfake detection to mitigate potential misuse.
Typically, Deepfake detection models utilize binary
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "89",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiang:2024:RPR,
author = "Suncheng Xiang and Dahong Qian and Jingsheng Gao and
Zirui Zhang and Ting Liu and Yuzhuo Fu",
title = "Rethinking Person Re-Identification via Semantic-based
Pretraining",
journal = j-TOMM,
volume = "20",
number = "3",
pages = "90:1--90:??",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3628452",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Dec 21 10:47:32 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3628452",
abstract = "Pretraining is a dominant paradigm in computer vision.
Generally, supervised ImageNet pretraining is commonly
used to initialize the backbones of person
re-identification (Re-ID) models. However, recent works
show a surprising result that CNN-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "90",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2024:HSE,
author = "Min Peng and Xiaohu Shao and Yu Shi and Xiangdong
Zhou",
title = "Hierarchical Synergy-Enhanced Multimodal Relational
Network for Video Question Answering",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "91:1--91:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630101",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630101",
abstract = "Video question answering (VideoQA) is challenging as
it requires reasoning about natural language and
multimodal interactive relations. Most existing methods
apply attention mechanisms to extract interactions
between the question and the video or to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "91",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ren:2024:CIT,
author = "Bin Ren and Hao Tang and Fanyang Meng and Ding Runwei
and Philip H. S. Torr and Nicu Sebe",
title = "Cloth Interactive Transformer for Virtual Try-On",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "92:1--92:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3617374",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3617374",
abstract = "The 2D image-based virtual try-on has aroused
increased interest from the multimedia and computer
vision fields due to its enormous commercial value.
Nevertheless, most existing image-based virtual try-on
approaches directly combine the person-identity
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "92",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nie:2024:CSI,
author = "Xiushan Nie and Yang Shi and Ziyu Meng and Jin Huang
and Weili Guan and Yilong Yin",
title = "Complex Scenario Image Retrieval via Deep
Similarity-aware Hashing",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "93:1--93:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624016",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3624016",
abstract = "When performing hashing-based image retrieval, it is
difficult to learn discriminative hash codes especially
for the multi-label, zero-shot and fine-grained
settings. This is due to the fact that the similarities
vary, even within the same category, under \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "93",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tan:2024:CLS,
author = "Jiawei Tan and Hongxing Wang and Junsong Yuan",
title = "Characters Link Shots: Character Attention Network for
Movie Scene Segmentation",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "94:1--94:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630257",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630257",
abstract = "Movie scene segmentation aims to automatically segment
a movie into multiple story units, i.e., scenes, each
of which is a series of semantically coherent and
time-continual shots. Previous methods have continued
efforts on shot semantic association, but \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "94",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2024:RRT,
author = "Mingliang Zhou and Xinwen Zhao and Futing Luo and Jun
Luo and Huayan Pu and Tao Xiang",
title = "Robust {RGB-T} Tracking via Adaptive Modality Weight
Correlation Filters and Cross-modality Learning",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "95:1--95:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630100",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630100",
abstract = "RGBT tracking is gaining popularity due to its ability
to provide effective tracking results in a variety of
weather conditions. However, feature specificity and
complementarity have not been fully used in existing
models that directly fuse the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "95",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:SOQ,
author = "Zicheng Zhang and Wei Sun and Yingjie Zhou and Jun Jia
and Zhichao Zhang and Jing Liu and Xiongkuo Min and
Guangtao Zhai",
title = "Subjective and Objective Quality Assessment for
in-the-Wild Computer Graphics Images",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "96:1--96:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631357",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3631357",
abstract = "Computer graphics images (CGIs) are artificially
generated by means of computer programs and are widely
perceived under various scenarios, such as games,
streaming media, etc. In practice, the quality of CGIs
consistently suffers from poor rendering \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "96",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Roy:2024:CLV,
author = "Shuvendu Roy and Ali Etemad",
title = "Contrastive Learning of View-invariant Representations
for Facial Expressions Recognition",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "97:1--97:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632960",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632960",
abstract = "Although there has been much progress in the area of
facial expression recognition (FER), most existing
methods suffer when presented with images that have
been captured from viewing angles that are non-frontal
and substantially different from those used \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "97",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:GRA,
author = "Jun Liu and Jiantao Zhou and Haiwei Wu and Weiwei Sun
and Jinyu Tian",
title = "Generating Robust Adversarial Examples against Online
Social Networks {(OSNs)}",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "98:1--98:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632528",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632528",
abstract = "Online Social Networks (OSNs) have blossomed into
prevailing transmission channels for images in the
modern era. Adversarial examples (AEs) deliberately
designed to mislead deep neural networks (DNNs) are
found to be fragile against the inevitable lossy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "98",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yao:2024:CMS,
author = "Tao Yao and Yiru Li and Ying Li and Yingying Zhu and
Gang Wang and Jun Yue",
title = "Cross-modal Semantically Augmented Network for
Image-text Matching",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "99:1--99:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631356",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3631356",
abstract = "Image-text matching plays an important role in solving
the problem of cross-modal information processing.
Since there are nonnegligible semantic differences
between heterogeneous pairwise data, a crucial
challenge is how to learn a unified
representation. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "99",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Telili:2024:DBL,
author = "Ahmed Telili and Sid Ahmed Fezza and Wassim Hamidouche
and Hanene F. Z. Brachemi Meftah",
title = "{2BiVQA}: Double Bi-{LSTM}-based Video Quality
Assessment of {UGC} Videos",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "100:1--100:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632178",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632178",
abstract = "Recently, with the growing popularity of mobile
devices as well as video sharing platforms (e.g.,
YouTube, Facebook, TikTok, and Twitch), User-Generated
Content (UGC) videos have become increasingly common
and now account for a large portion of multimedia
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "100",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:WMS,
author = "Hongzhou Chen and Haihan Duan and Maha Abdallah and
Yufeng Zhu and Yonggang Wen and Abdulmotaleb {El
Saddik} and Wei Cai",
title = "{Web3 Metaverse}: State-of-the-Art and Vision",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "101:1--101:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630258",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630258",
abstract = "The metaverse, as a rapidly evolving socio-technical
phenomenon, exhibits significant potential across
diverse domains by leveraging Web3 (a.k.a. Web 3.0)
technologies such as blockchain, smart contracts, and
non-fungible tokens (NFTs). This survey aims \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "101",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:GBC,
author = "Lilong Wang and Yunhui Shi and Jin Wang and Shujun
Chen and Baocai Yin and Nam Ling",
title = "Graph Based Cross-Channel Transform for Color Image
Compression",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "102:1--102:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3631710",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3631710",
abstract = "Adaptive transform coding is gaining more and more
attention for better mining of image content over fixed
transforms such as discrete cosine transform (DCT). As
a special case, graph transform learning establishes a
novel paradigm for the graph-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "102",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Han:2024:SHO,
author = "Kai Han and Yu Liu and Rukai Wei and Ke Zhou and
Jinhui Xu and Kun Long",
title = "Supervised Hierarchical Online Hashing for Cross-modal
Retrieval",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "103:1--103:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632527",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632527",
abstract = "Online cross-modal hashing has gained attention for
its adaptability in processing streaming data. However,
existing methods only define the hard similarity
between data using labels. This results in poor
retrieval performance, as they fail to exploit the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "103",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Fu:2024:SOT,
author = "Fengyi Fu and Shancheng Fang and Weidong Chen and
Zhendong Mao",
title = "Sentiment-Oriented Transformer-Based Variational
Autoencoder Network for Live Video Commenting",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "104:1--104:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633334",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633334",
abstract = "Automatic live video commenting is getting increasing
attention due to its significance in narration
generation, topic explanation, etc. However, the
diverse sentiment consideration of the generated
comments is missing from current methods. Sentimental
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "104",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2024:JCJ,
author = "Yuxiang Peng and Chong Fu and Guixing Cao and Wei Song
and Junxin Chen and Chiu-Wing Sham",
title = "{JPEG}-compatible Joint Image Compression and
Encryption Algorithm with File Size Preservation",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "105:1--105:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633459",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633459",
abstract = "Joint image compression and encryption algorithms are
intensively investigated due to their powerful
capability of simultaneous image data compression and
sensitive information protection. Unfortunately, most
of the existing algorithms suffered from \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "105",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:TEC,
author = "Daizong Liu and Xiaoye Qu and Jianfeng Dong and Pan
Zhou and Zichuan Xu and Haozhao Wang and Xing Di and
Weining Lu and Yu Cheng",
title = "Transform-Equivariant Consistency Learning for
Temporal Sentence Grounding",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "106:1--106:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634749",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634749",
abstract = "This paper addresses the temporal sentence grounding
(TSG). Although existing methods have made decent
achievements in this task, they not only severely rely
on abundant video-query paired data for training, but
also easily fail into the dataset \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "106",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2024:STR,
author = "Yijie Hu and Bin Dong and Kaizhu Huang and Lei Ding
and Wei Wang and Xiaowei Huang and Qiu-Feng Wang",
title = "Scene Text Recognition via Dual-path Network with
Shape-driven Attention Alignment",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "107:1--107:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633517",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633517",
abstract = "Scene text recognition (STR), one typical
sequence-to-sequence problem, has drawn much attention
recently in multimedia applications. To guarantee good
performance, it is essential for STR to obtain aligned
character-wise features from the whole-image \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "107",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2024:NHN,
author = "Rongjiao Liang and Shichao Zhang and Wenzhen Zhang and
Guixian Zhang and Jinyun Tang",
title = "Nonlocal Hybrid Network for Long-tailed Image
Classification",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "108:1--108:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630256",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630256",
abstract = "It is a significant issue to deal with long-tailed
data when classifying images. A nonlocal hybrid network
(NHN) that takes into account both feature learning and
classifier learning is proposed. The NHN can capture
the existence of dependencies between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "108",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2024:DMC,
author = "Piao Shi and Min Hu and Xuefeng Shi and Fuji Ren",
title = "Deep Modular Co-Attention Shifting Network for
Multimodal Sentiment Analysis",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "109:1--109:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634706",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634706",
abstract = "Human Multimodal Sentiment Analysis (MSA) is an
attractive research that studies sentiment expressed
from multiple heterogeneous modalities. While
transformer-based methods have achieved great success,
designing an effective ``co-attention'' model to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "109",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:VLS,
author = "Jing Zhang and Dan Guo and Xun Yang and Peipei Song
and Meng Wang",
title = "Visual-linguistic-stylistic Triple Reward for
Cross-lingual Image Captioning",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "110:1--110:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634917",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634917",
abstract = "Generating image captions in different languages is
worth exploring and essential for non-native speakers.
Nevertheless, collecting paired annotation for every
language is time-consuming and impractical,
particularly for minor languages. To this end, the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "110",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jia:2024:ENC,
author = "Zhaoyang Jia and Yan Lu and Houqiang Li",
title = "Exploring Neighbor Correspondence Matching for
Multiple-hypotheses Video Frame Synthesis",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "111:1--111:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633780",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633780",
abstract = "Video frame synthesis, which consists of interpolation
and extrapolation, is an essential video processing
technique that can be applied to various scenarios.
However, most existing methods cannot handle small
objects or large motion well, especially in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "111",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2024:GPI,
author = "Sheng Zhou and Dan Guo and Xun Yang and Jianfeng Dong
and Meng Wang",
title = "Graph Pooling Inference Network for Text-based {VQA}",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "112:1--112:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634918",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634918",
abstract = "Effectively leveraging objects and optical character
recognition (OCR) tokens to reason out pivotal scene
text is critical for the challenging Text-based Visual
Question Answering (TextVQA) task. Graph-based models
can effectively capture the semantic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "112",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hu:2024:OBS,
author = "Hengtong Hu and Lingxi Xie and Xinyue Huo and Richang
Hong and Qi Tian",
title = "One-Bit Supervision for Image Classification: Problem,
Solution, and Beyond",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "113:1--113:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633779",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633779",
abstract = "This article presents one-bit supervision, a novel
setting of learning with fewer labels, for image
classification. Instead of the training model using the
accurate label of each sample, our setting requires the
model to interact with the system by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "113",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2024:DCB,
author = "Hang Yuan and Wei Gao and Siwei Ma and Yiqiang Yan",
title = "Divide-and-conquer-based {RDO}-free {CU} Partitioning
for {8K} Video Compression",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "114:1--114:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634705",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634705",
abstract = "8K (7689$ \times $4320) ultra-high definition (UHD)
videos are growing popular with the improvement of
human visual experience demand. Therefore, the
compression of 8K UHD videos has become a top priority
in the third-generation audio video coding standard
(AVS3). \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "114",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:DWA,
author = "Mingyu Li and Tao Zhou and Zhuo Huang and Jian Yang
and Jie Yang and Chen Gong",
title = "Dynamic Weighted Adversarial Learning for
Semi-Supervised Classification under Intersectional
Class Mismatch",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "115:1--115:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635310",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635310",
abstract = "Nowadays, class-mismatch problem has drawn intensive
attention in Semi-Supervised Learning (SSL), where the
classes of labeled data are assumed to be only a subset
of the classes of unlabeled data. However, in a more
realistic scenario, the labeled data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "115",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2024:SLC,
author = "Hui Huang and Di Xiao and Jia Liang",
title = "Secure Low-complexity Compressive Sensing with
Preconditioning Prior Regularization Reconstruction",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "116:1--116:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635308",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635308",
abstract = "Compressive sensing (CS), a breakthrough technology in
image processing, provides a privacy-preserving layer
and image reconstruction while performing sensing and
recovery processes, respectively. Unfortunately, it
still faces high-complexity, low- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "116",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Clement:2024:SDH,
author = "Nathan Clement and Alan Schoen and Arnold Boedihardjo
and Andrew Jenkins",
title = "Synthetic Data and Hierarchical Object Detection in
Overhead Imagery",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "117:1--117:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635309",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635309",
abstract = "The performance of neural network models is often
limited by the availability of big datasets. To treat
this problem, we survey and develop novel synthetic
data generation and augmentation techniques for
enhancing low/zero-sample learning in satellite
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "117",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bian:2024:PAL,
author = "Jiang Bian and Xuhong Li and Tao Wang and Qingzhong
Wang and Jun Huang and Chen Liu and Jun Zhao and
Feixiang Lu and Dejing Dou and Haoyi Xiong",
title = "{P$^2$ANet}: a Large-Scale Benchmark for Dense Action
Detection from Table Tennis Match Broadcasting Videos",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "118:1--118:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633516",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633516",
abstract = "While deep learning has been widely used for video
analytics, such as video classification and action
detection, dense action detection with fast-moving
subjects from sports videos is still challenging. In
this work, we release yet another sports video
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "118",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:AIG,
author = "Jifan Yang and Zhongyuan Wang and Guangcheng Wang and
Baojin Huang and Yuhong Yang and Weiping Tu",
title = "Auxiliary Information Guided Self-attention for Image
Quality Assessment",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "119:1--119:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635716",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635716",
abstract = "Image quality assessment (IQA) is an important problem
in computer vision with many applications. We propose a
transformer-based multi-task learning framework for the
IQA task. Two subtasks: constructing an auxiliary
information error map and completing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "119",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Feng:2024:EVT,
author = "Zhanzhou Feng and Jiaming Xu and Lei Ma and Shiliang
Zhang",
title = "Efficient Video Transformers via Spatial-temporal
Token Merging for Action Recognition",
journal = j-TOMM,
volume = "20",
number = "4",
pages = "120:1--120:??",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633781",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat Jan 13 15:13:22 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633781",
abstract = "Transformer has exhibited promising performance in
various video recognition tasks but brings a huge
computational cost in modeling spatial-temporal cues.
This work aims to boost the efficiency of existing
video transformers for action recognition through
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "120",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:PCA,
author = "Shupei Zhang and Chenqiu Zhao and Anup Basu",
title = "Principal Component Approximation Network for Image
Compression",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "121:1--121:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637490",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3637490",
abstract = "In this work, we propose a novel principal component
approximation network (PCANet) for image compression.
The proposed network is based on the assumption that a
set of images can be decomposed into several shared
feature matrices, and an image can be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "121",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:TEC,
author = "Tianyu Zhang and Weiqing Min and Tao Liu and Shuqiang
Jiang and Yong Rui",
title = "Toward Egocentric Compositional Action Anticipation
with Adaptive Semantic Debiasing",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "122:1--122:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633333",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633333",
abstract = "Predicting the unknown from the first-person
perspective is expected as a necessary step toward
machine intelligence, which is essential for practical
applications including autonomous driving and robotics.
As a human-level task, egocentric action \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "122",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:AVT,
author = "Yu Liu and Mingbo Zhao and Zhao Zhang and Yuping Liu
and Shuicheng Yan",
title = "Arbitrary Virtual Try-on Network: Characteristics
Preservation and Tradeoff between Body and Clothing",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "123:1--123:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3636426",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3636426",
abstract = "Deep learning based virtual try-on system has achieved
some encouraging progress recently, but there still
remain several big challenges that need to be solved,
such as trying on arbitrary clothes of all types,
trying on the clothes from one category to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "123",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:CCM,
author = "Shih-Wei Yang and Li-Hsiang Shen and Hong-Han Shuai
and Kai-Ten Feng",
title = "{CMAF}: Cross-Modal Augmentation via Fusion for
Underwater Acoustic Image Recognition",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "124:1--124:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3636427",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3636427",
abstract = "Underwater image recognition is crucial for underwater
detection applications. Fish classification has been
one of the emerging research areas in recent years.
Existing image classification models usually classify
data collected from terrestrial \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "124",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:SAR,
author = "Yazhou Zhang and Yang Yu and Mengyao Wang and Min
Huang and M. Shamim Hossain",
title = "Self-Adaptive Representation Learning Model for
Multi-Modal Sentiment and Sarcasm Joint Analysis",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "125:1--125:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635311",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635311",
abstract = "Sentiment and sarcasm are intimate and complex, as
sarcasm often deliberately elicits an emotional
response in order to achieve its specific purpose.
Current challenges in multi-modal sentiment and sarcasm
joint detection mainly include multi-modal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "125",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qi:2024:DSD,
author = "Lei Qi and Peng Dong and Tan Xiong and Hui Xue and Xin
Geng",
title = "{DoubleAUG}: Single-domain Generalized Object Detector
in Urban via Color Perturbation and Dual-style Memory",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "126:1--126:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3634683",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3634683",
abstract = "Object detection in urban scenarios is crucial for
autonomous driving in intelligent traffic systems.
However, unlike conventional object detection tasks,
urban-scene images vary greatly in style. For example,
images taken on sunny days differ \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "126",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2024:ICM,
author = "Dan Shi and Lei Zhu and Jingjing Li and Guohua Dong
and Huaxiang Zhang",
title = "Incomplete Cross-Modal Retrieval with Deep Correlation
Transfer",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "127:1--127:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637442",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3637442",
abstract = "Most cross-modal retrieval methods assume the
multi-modal training data is complete and has a
one-to-one correspondence. However, in the real world,
multi-modal data generally suffers from missing
modality information due to the uncertainty of data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "127",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zeng:2024:MPS,
author = "Xianhua Zeng and Xinyu Wang and Yicai Xie",
title = "Multiple Pseudo-{Siamese} Network with Supervised
Contrast Learning for Medical Multi-modal Retrieval",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "128:1--128:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637441",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3637441",
abstract = "Medical multi-modal retrieval aims to provide doctors
with similar medical images from different modalities,
which can greatly promote the efficiency and accuracy
of clinical diagnosis. However, most existing medical
retrieval methods hardly support the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "128",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{You:2024:MOT,
author = "Sisi You and Hantao Yao and Bing-Kun Bao and
Changsheng Xu",
title = "Multi-object Tracking with Spatial-Temporal Tracklet
Association",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "129:1--129:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635155",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635155",
abstract = "Recently, the tracking-by-detection methods have
achieved excellent performance in Multi-Object Tracking
(MOT), which focuses on obtaining a robust feature for
each object and generating tracklets based on feature
similarity. However, they are confronted \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "129",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bingol:2024:QEW,
author = "G{\"u}lnaziye Bing{\"o}l and Simone Porcu and
Alessandro Floris and Luigi Atzori",
title = "{QoE} Estimation of {WebRTC}-based Audio-visual
Conversations from Facial and Speech Features",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "130:1--130:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638251",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638251",
abstract = "The utilization of user's facial- and speech-related
features for the estimation of the Quality of
Experience (QoE) of multimedia services is still
underinvestigated despite its potential. Currently,
only the use of either facial or speech features
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "130",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qiu:2024:LOP,
author = "Heqian Qiu and Hongliang Li and Qingbo Wu and Hengcan
Shi and Lanxiao Wang and Fanman Meng and Linfeng Xu",
title = "Learning Offset Probability Distribution for Accurate
Object Detection",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "131:1--131:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637214",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3637214",
abstract = "Object detection combines object classification and
object localization problems. Current object detection
methods heavily depend on regression networks to locate
objects, which are optimized with various regression
loss functions to predict offsets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "131",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Floris:2024:CMP,
author = "Alessandro Floris and Simone Porcu and Luigi Atzori",
title = "Controlling Media Player with Hands: a Transformer
Approach and a Quality of Experience Assessment",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "132:1--132:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638560",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638560",
abstract = "In this article, we propose a Hand Gesture Recognition
(HGR) system based on a novel deep transformer (DT)
neural network for media player control. The extracted
hand skeleton features are processed by separate
transformers for each finger in isolation to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "132",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:EVR,
author = "Jingyu Li and Zhendong Mao and Hao Li and Weidong Chen
and Yongdong Zhang",
title = "Exploring Visual Relationships via Transformer-based
Graphs for Enhanced Image Captioning",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "133:1--133:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638558",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638558",
abstract = "Image captioning (IC), bringing vision to language,
has drawn extensive attention. A crucial aspect of IC
is the accurate depiction of visual relations among
image objects. Visual relations encompass two primary
facets: content relations and structural \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "133",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2024:HLD,
author = "Zeyu Ma and Siwei Wang and Xiao Luo and Zhonghui Gu
and Chong Chen and Jinxing Li and Xian-Sheng Hua and
Guangming Lu",
title = "{HARR}: Learning Discriminative and High-Quality Hash
Codes for Image Retrieval",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "134:1--134:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3627162",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3627162",
abstract = "This article studies deep unsupervised hashing, which
has attracted increasing attention in large-scale image
retrieval. The majority of recent approaches usually
reconstruct semantic similarity information, which then
guides the hash code learning. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "134",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:CWS,
author = "Chengyang Zhang and Yong Zhang and Bo Li and Xinglin
Piao and Baocai Yin",
title = "{CrowdGraph}: Weakly supervised Crowd Counting via
Pure Graph Neural Network",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "135:1--135:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638774",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638774",
abstract = "Most existing weakly supervised crowd counting methods
utilize Convolutional Neural Networks (CNN) or
Transformer to estimate the total number of individuals
in an image. However, both CNN-based (grid-to-count
paradigm) and Transformer-based (sequence-to-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "135",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:WGO,
author = "Jie Wang and Guoqiang Li and Jie Shi and Jinwen Xi",
title = "Weighted Guided Optional Fusion Network for {RGB-T}
Salient Object Detection",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "136:1--136:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3624984",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3624984",
abstract = "There is no doubt that the rational and effective use
of visible and thermal infrared image data information
to achieve cross-modal complementary fusion is the key
to improving the performance of RGB-T salient object
detection (SOD). A meticulous analysis \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "136",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:JAV,
author = "Yibo Zhang and Weiguo Lin and Junfeng Xu",
title = "Joint Audio-Visual Attention with Contrastive Learning
for More General Deepfake Detection",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "137:1--137:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625100",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625100",
abstract = "With the continuous advancement of deepfake
technology, there has been a surge in the creation of
realistic fake videos. Unfortunately, the malicious
utilization of deepfake poses a significant threat to
societal morality and political security. Therefore,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "137",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:KIM,
author = "Depei Wang and Ruifeng Xu and Lianglun Cheng and
Zhuowei Wang",
title = "Knowledge-integrated Multi-modal Movie Turning Point
Identification",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "138:1--138:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638557",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638557",
abstract = "The rapid development of artificial intelligence
provides rich technologies and tools for the automated
understanding of literary works. As a comprehensive
carrier of storylines, movies are natural multimodal
data sources that provide sufficient data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "138",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:DCF,
author = "Chunpu Liu and Guanglei Yang and Wangmeng Zuo and
Tianyi Zang",
title = "{DPDFormer}: a Coarse-to-Fine Model for Monocular
Depth Estimation",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "139:1--139:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638559",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638559",
abstract = "Monocular depth estimation attracts great attention
from computer vision researchers for its convenience in
acquiring environment depth information. Recently,
classification-based MDE methods have shown promising
performance and begun to act as an essential \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "139",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yan:2024:TSP,
author = "Yunyao Yan and Guoqing Xiang and Huizhu Jia and Jie
Chen and Xiaofeng Huang and Xiaodong Xie",
title = "Two-Stage Perceptual Quality Oriented Rate Control
Algorithm for {HEVC}",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "140:1--140:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3636510",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3636510",
abstract = "As a practical technique in mainstream video coding
applications, rate control is critically important to
ensure compression quality under limited bitrate
constraints. However, most rate control methods mainly
focus on objective quality while ignoring the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "140",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:VDG,
author = "Zongyi Li and Yuxuan Shi and Hefei Ling and Jiazhong
Chen and Boyuan Liu and Runsheng Wang and Chengxin
Zhao",
title = "Viewpoint Disentangling and Generation for
Unsupervised Object {Re-ID}",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "141:1--141:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632959",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632959",
abstract = "Unsupervised object Re-ID aims to learn discriminative
identity features from a fully unlabeled dataset to
solve the open-class re-identification problem.
Satisfying results have been achieved in existing
unsupervised Re-ID methods, primarily trained with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "141",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Dai:2024:TLF,
author = "Kuai Dai and Xutao Li and Huiwei Lin and Yin Jiang and
Xunlai Chen and Yunming Ye and Di Xian",
title = "{TinyPredNet}: a Lightweight Framework for Satellite
Image Sequence Prediction",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "142:1--142:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638773",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638773",
abstract = "Satellite image sequence prediction aims to precisely
infer future satellite image frames with historical
observations, which is a significant and challenging
dense prediction task. Though existing deep learning
models deliver promising performance for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "142",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2024:RRA,
author = "Yingnan Ma and Chenqiu Zhao and Bingran Huang and
Xudong Li and Anup Basu",
title = "{RAST}: Restorable Arbitrary Style Transfer",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "143:1--143:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638770",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638770",
abstract = "The objective of arbitrary style transfer is to apply
a given artistic or photo-realistic style to a target
image. Although current methods have shown some success
in transferring style, arbitrary style transfer still
has several issues, including content \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "143",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hsu:2024:CDA,
author = "Wei-Yen Hsu and Hsien-Wen Lin",
title = "Context-detail-aware United Network for Single Image
Deraining",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "144:1--144:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639407",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3639407",
abstract = "Images captured outdoors are often affected by rainy
days, resulting in a severe deterioration in the visual
quality of the captured images and a decrease in the
performance of related applications. Therefore, single
image deraining has attracted \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "144",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:TSM,
author = "Yao Liu and Gangfeng Cui and Jiahui Luo and Xiaojun
Chang and Lina Yao",
title = "Two-stream Multi-level Dynamic Point Transformer for
Two-person Interaction Recognition",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "145:1--145:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639470",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3639470",
abstract = "As a fundamental aspect of human life, two-person
interactions contain meaningful information about
people's activities, relationships, and social
settings. Human action recognition serves as the
foundation for many smart applications, with a strong
focus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "145",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:MCT,
author = "Chengxin Chen and Pengyuan Zhang",
title = "Modality-collaborative Transformer with Hybrid Feature
Reconstruction for Robust Emotion Recognition",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "146:1--146:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640343",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3640343",
abstract = "As a vital aspect of affective computing, Multimodal
Emotion Recognition has been an active research area in
the multimedia community. Despite recent progress, this
field still confronts two major challenges in
real-world applications: (1) improving the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "146",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2024:UOI,
author = "Jiafeng Huang and Tianjun Zhang and Shengjie Zhao and
Lin Zhang and Yicong Zhou",
title = "An Underwater Organism Image Dataset and a Lightweight
Module Designed for Object Detection Networks",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "147:1--147:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640465",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3640465",
abstract = "Long-term monitoring and recognition of underwater
organism objects are of great significance in marine
ecology, fisheries science and many other disciplines.
Traditional techniques in this field, including manual
fishing-based ones and sonar-based ones, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "147",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:PPM,
author = "Jing Liu and Litao Shang and Yuting Su and Weizhi Nie
and Xin Wen and Anan Liu",
title = "Privacy-preserving Multi-source Cross-domain
Recommendation Based on Knowledge Graph",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "148:1--148:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639706",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3639706",
abstract = "The cross-domain recommender systems aim to alleviate
the data sparsity problem in the target domain by
transferring knowledge from the auxiliary domain.
However, existing works ignore the fact that the data
sparsity problem may also exist in the single
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "148",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:BDB,
author = "Xingyu Liu and Zhongyun Hua and Shuang Yi and Yushu
Zhang and Yicong Zhou",
title = "Bi-directional Block Encoding for Reversible Data
Hiding over Encrypted Images",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "149:1--149:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638771",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638771",
abstract = "Reversible data hiding over encrypted images (RDH-EI)
technology is a viable solution for privacy-preserving
cloud storage, as it enables the reversible embedding
of additional data into images while maintaining image
confidentiality. Since the data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "149",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yi:2024:OVS,
author = "Peng Yi and Zhongyuan Wang and Laigan Luo and Kui
Jiang and Zheng He and Junjun Jiang and Tao Lu and
Jiayi Ma",
title = "Omniscient Video Super-Resolution with
Explicit-Implicit Alignment",
journal = j-TOMM,
volume = "20",
number = "5",
pages = "150:1--150:??",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3640346",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Apr 10 08:42:41 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3640346",
abstract = "When considering the temporal relationships, most
previous video super-resolution (VSR) methods follow
the iterative or recurrent framework. The iterative
framework adopts neighboring low-resolution (LR) frames
from a sliding window, while the recurrent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "150",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Singh:2024:ISI,
author = "Amit Kumar Singh and Deepa Kundur and Mauro Conti",
title = "Introduction to the Special Issue on Integrity of
Multimedia and Multimodal Data in {Internet of
Things}",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "151:1--151:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3643040",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3643040",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "151",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:BBS,
author = "Wenyuan Yang and Shaocong Wu and Jianwei Fei and
Xianwang Zeng and Yuemin Ding and Zhihua Xia",
title = "A Bitcoin-based Secure Outsourcing Scheme for
Optimization Problem in Multimedia {Internet of
Things}",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "152:1--152:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3637489",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3637489",
abstract = "With the development of the Internet of Things (IoT)
and cloud computing, various multimedia data such as
audio, video, and images have experienced explosive
growth, ushering in the era of big data. Large-scale
computing tasks in the Multimedia Internet \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "152",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:PIP,
author = "Qingzhi Liu and Yuchen Huang and Chenglu Jin and
Xiaohan Zhou and Ying Mao and Cagatay Catal and Long
Cheng",
title = "Privacy and Integrity Protection for {IoT} Multimodal
Data Using Machine Learning and Blockchain",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "153:1--153:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638769",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638769",
abstract = "With the wide application of Internet of Things (IoT)
technology, large volumes of multimodal data are
collected and analyzed for various diagnoses, analyses,
and predictions to help in decision-making and
management. However, the research on protecting
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "153",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jonker:2024:DPE,
author = "Simon Jonker and Malthe Jelstrup and Weizhi Meng and
Brooke Lampe",
title = "Detecting Post Editing of Multimedia Images using
Transfer Learning and Fine Tuning",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "154:1--154:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3633284",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3633284",
abstract = "In the domain of general image forgery detection, a
myriad of different classification solutions have been
developed to distinguish a ``tampered'' image from a
``pristine'' image. In this work, we aim to develop a
new method to tackle the problem of binary \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "154",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bisogni:2024:IEB,
author = "Carmen Bisogni and Lucia Cascone and Michele Nappi and
Chiara Pero",
title = "{IoT}-enabled Biometric Security: Enhancing Smart Car
Safety with Depth-based Head Pose Estimation",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "155:1--155:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3639367",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3639367",
abstract = "Advanced Driver Assistance Systems (ADAS) are
experiencing higher levels of automation, facilitated
by the synergy among various sensors integrated within
vehicles, thereby forming an Internet of Things (IoT)
framework. Among these sensors, cameras have \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "155",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Nouma:2024:TED,
author = "Saif E. Nouma and Attila A. Yavuz",
title = "Trustworthy and Efficient Digital Twins in
Post-Quantum Era with Hybrid Hardware-Assisted
Signatures",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "156:1--156:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3638250",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3638250",
abstract = "Digital Twins (DT) virtually model cyber-physical
objects via sensory inputs by simulating or monitoring
their behavior. Therefore, DTs usually harbor vast
quantities of Internet of Things (IoT) components
(e.g., sensors) that gather, process, and offload
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "156",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:VDS,
author = "Fan Li and Yanxiang Chen and Haiyang Liu and Zuxing
Zhao and Yuanzhi Yao and Xin Liao",
title = "Vocoder Detection of Spoofing Speech Based on {GAN}
Fingerprints and Domain Generalization",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "157:1--157:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3630751",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3630751",
abstract = "As an important part of the text-to-speech (TTS)
system, vocoders convert acoustic features into speech
waveforms. The difference in vocoders is key to
producing different types of forged speech in the TTS
system. With the rapid development of general
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "157",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gao:2024:IMC,
author = "Jing Gao and Peng Li and Asif Ali Laghari and Gautam
Srivastava and Thippa Reddy Gadekallu and Sidra Abbas
and Jianing Zhang",
title = "Incomplete Multiview Clustering via Semidiscrete
Optimal Transport for Multimedia Data Mining in {IoT}",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "158:1--158:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3625548",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3625548",
abstract = "With the wide deployment of the Internet of Things
(IoT), large volumes of incomplete multiview data that
violates data integrity is generated by various
applications, which inevitably produces negative
impacts on the quality of service of IoT systems.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "158",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:PAR,
author = "Zhenyu Liu and Da Li and Xinyu Zhang and Zhang Zhang
and Peng Zhang and Caifeng Shan and Jungong Han",
title = "Pedestrian Attribute Recognition via Spatio-temporal
Relationship Learning for Visual Surveillance",
journal = j-TOMM,
volume = "20",
number = "6",
pages = "159:1--159:??",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3632624",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:44 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3632624",
abstract = "Pedestrian attribute recognition (PAR) aims at
predicting the visual attributes of a pedestrian image.
PAR has been used as soft biometrics for visual
surveillance and IoT security. Most of the current PAR
methods are developed based on discrete images.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "159",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Garcia:2024:SNF,
author = "Roberto Garc{\'\i}a and Ana Cediel and Merc{\`e}
Teixid{\'o} and Rosa Gil",
title = "Semantics and Non-fungible Tokens for Copyright
Management on the Metaverse and Beyond",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "186:1--186:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3585387",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3585387",
abstract = "Recent initiatives related to the Metaverse focus on
better visualization, like augmented or virtual
reality, but also persistent digital objects. To
guarantee real ownership of these digital objects, open
systems based on public blockchains and Non-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "186",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xie:2024:RCA,
author = "Tianxiu Xie and Keke Gai and Liehuang Zhu and Shuo
Wang and Zijian Zhang",
title = "{RAC-Chain}: an Asynchronous Consensus-based
Cross-chain Approach to Scalable Blockchain for
Metaverse",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "187:1--187:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3586011",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3586011",
abstract = "The metaverse, as an emerging technical term,
conceptually aims to construct a virtual digital space
that runs parallel to the physical world. Due to human
behaviors and interactions being represented in the
virtual world, security in the metaverse is a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "187",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ren:2024:HCC,
author = "Yongjun Ren and Zhiying Lv and Neal N. Xiong and Jin
Wang",
title = "{HCNCT}: a Cross-chain Interaction Scheme for the
Blockchain-based Metaverse",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "188:1--188:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3594542",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3594542",
abstract = "As a new type of digital living space that blends
virtual and reality, Metaverse combines many emerging
technologies. It provides an immersive experience based
on VR technology and stores and protects users' digital
content and digital assets through \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "188",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:QQC,
author = "Shuangmin Chen and Rui Xu and Jian Xu and Shiqing Xin
and Changhe Tu and Chenglei Yang and Lin Lu",
title = "{QuickCSGModeling}: Quick {CSG} Operations Based on
Fusing Signed Distance Fields for {VR} Modeling",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "189:1--189:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3599729",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3599729",
abstract = "The latest advancements in Virtual Reality (VR) enable
the creation of 3D models within a holographic
immersive simulation environment. In this article, we
create QuickCSGModeling, a user-friendly mid-air
interactive modeling system. We first prepare a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "189",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:PPA,
author = "Qinnan Zhang and Zehui Xiong and Jianming Zhu and
Sheng Gao and Wanting Yang",
title = "A Privacy-preserving Auction Mechanism for Learning
Model as an {NFT} in Blockchain-driven Metaverse",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "190:1--190:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3599971",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3599971",
abstract = "The Metaverse, envisioned as the next-generation
Internet, will be constructed via twinning a practical
world in a virtual form, wherein Metaverse service
providers (MSPs) are required to collect massive data
from Metaverse users (MUs). In this regard, a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "190",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:MMI,
author = "Han Wang and Hui Li and Abla Smahi and Feng Zhao and
Yao Yao and Ching Chuen Chan and Shiyu Wang and Wenyuan
Yang and Shuo-Yen Robert Li",
title = "{MIS}: a Multi-Identifier Management and Resolution
System in the Metaverse",
journal = j-TOMM,
volume = "20",
number = "7",
pages = "191:1--191:??",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3597641",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Sat May 18 06:38:45 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3597641",
abstract = "The metaverse gradually evolves into a virtual world
containing a series of interconnected sub-metaverses.
Diverse digital resources, including identities,
contents, services, and supporting data, are key
components of the sub-metaverse. Therefore, a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "191",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:HFM,
author = "Jinliang Liu and Zhedong Zheng and Zongxin Yang and Yi
Yang",
title = "High Fidelity Makeup via {$2$D} and {$3$D} Identity
Preservation Net",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "230:1--230:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3656475",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3656475",
abstract = "In this article, we address the challenging makeup
transfer task, aiming to transfer makeup from a
reference image to a source image while preserving
facial geometry and background consistency. Existing
deep neural network-based methods have shown \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "230",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Huang:2024:RTA,
author = "Junjian Huang and Hao Ren and Shulin Liu and Yong Liu
and Chuanlu Lv and Jiawen Lu and Changyong Xie and Hong
Lu",
title = "Real-Time Attentive Dilated {$U$}-Net for Extremely
Dark Image Enhancement",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "231:1--231:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3654668",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3654668",
abstract = "Images taken under low-light conditions suffer from
poor visibility, color distortion, and graininess, all
of which degrade the image quality and hamper the
performance of downstream vision tasks, such as object
detection and instance segmentation in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "231",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xiong:2024:ICI,
author = "Mingfu Xiong and Kaikang Hu and Zhihan Lyu and Fei
Fang and Zhongyuan Wang and Ruimin Hu and Khan
Muhammad",
title = "Inter-camera Identity Discrimination for Unsupervised
Person Re-identification",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "232:1--232:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3652858",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3652858",
abstract = "Unsupervised person re-identification (Re-ID) has
garnered significant attention because of its
data-friendly nature, as it does not require labeled
data. Existing approaches primarily address this
challenge by employing feature-clustering techniques to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "232",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2024:PGE,
author = "Jiaqi Yu and Jinhai Yang and Hua Yang and Renjie Pan
and Pingrui Lai and Guangtao Zhai",
title = "Psychology-Guided Environment Aware Network for
Discovering Social Interaction Groups from Videos",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "233:1--233:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3657295",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3657295",
abstract = "Social interaction is a common phenomenon in human
societies. Different from discovering groups based on
the similarity of individuals' actions, social
interaction focuses more on the mutual influence
between people. Although people can easily judge
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "233",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:SSS,
author = "Qi Liu and Xinchen Liu and Kun Liu and Xiaoyan Gu and
Wu Liu",
title = "{SigFormer}: Sparse Signal-guided Transformer for
Multi-modal Action Segmentation",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "234:1--234:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3657296",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3657296",
abstract = "Multi-modal human action segmentation is a critical
and challenging task with a wide range of applications.
Nowadays, the majority of approaches concentrate on the
fusion of dense signals (i.e., RGB, optical flow, and
depth maps). However, the potential \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "234",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lyu:2024:DDB,
author = "Jun Lyu and Shouang Yan and M. Shamim Hossain",
title = "{DBGAN}: Dual Branch Generative Adversarial Network
for Multi-Modal {MRI} Translation",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "235:1--235:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3657298",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3657298",
abstract = "Existing magnetic resonance imaging translation models
rely on generative adversarial networks, primarily
employing simple convolutional neural networks.
Unfortunately, these networks struggle to capture
global representations and contextual relationships
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "235",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:BDG,
author = "Dejun Zhang and Mian Zhang and Xuefeng Tan and Jun
Liu",
title = "Bridging the Domain Gap in Scene Flow Estimation via
Hierarchical Smoothness Refinement",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "236:1--236:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661823",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3661823",
abstract = "This article introduces SmoothFlowNet3D, an innovative
encoder-decoder architecture specifically designed for
bridging the domain gap in scene flow estimation. To
achieve this goal, SmoothFlowNet3D divides the scene
flow estimation task into two stages: \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "236",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:ISC,
author = "Ning Chen and Zhipeng Cheng and Xuwei Fan and Zhang
Liu and Bangzhen Huang and Yifeng Zhao and Lianfen
Huang and Xiaojiang Du and Mohsen Guizani",
title = "Integrated Sensing, Communication, and Computing for
Cost-effective Multimodal Federated Perception",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "237:1--237:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661313",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3661313",
abstract = "Federated learning (FL) is a prominent paradigm of 6G
edge intelligence (EI), which mitigates privacy
breaches and high communication pressure caused by
conventional centralized model training in the
artificial intelligence of things (AIoT). The execution
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "237",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:LVC,
author = "Jiayu Yang and Chunhui Yang and Fei Xiong and Yongqi
Zhai and Ronggang Wang",
title = "Learned Video Compression with Adaptive Temporal Prior
and Decoded Motion-aided Quality Enhancement",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "238:1--238:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661824",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3661824",
abstract = "Learned video compression has drawn great attention
and shown promising compression performance recently.
In this article, we focus on the two components in the
learned video compression framework, the conditional
entropy model and quality enhancement \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "238",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gu:2024:RAF,
author = "Xiaoling Gu and Junkai Zhu and Yongkang Wong and
Zizhao Wu and Jun Yu and Jianping Fan and Mohan
Kankanhalli",
title = "Recurrent Appearance Flow for Occlusion-Free Virtual
Try-On",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "239:1--239:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659581",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3659581",
abstract = "Image-based virtual try-on aims at transferring a
target in-shop garment onto a reference person, and has
garnered significant attention from the research
communities recently. However, previous methods have
faced severe challenges in handling occlusion
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "239",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lyu:2024:ISI,
author = "Yuanjie Lyu and Penggang Qin and Tong Xu and Chen Zhu
and Enhong Chen",
title = "{InteractNet}: Social Interaction Recognition for
Semantic-rich Videos",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "240:1--240:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663668",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663668",
abstract = "The overwhelming surge of online video platforms has
raised an urgent need for social interaction
recognition techniques. Compared with simple short-term
actions, long-term social interactions in semantic-rich
videos could reflect more complicated \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "240",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bhattacharjee:2024:ESM,
author = "Mrinmoy Bhattacharjee and Prasanna Mahadeva S. R. and
Prithwijit Guha",
title = "Exploration of Speech and Music Information for Movie
Genre Classification",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "241:1--241:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664197",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664197",
abstract = "Movie genre prediction from trailers is mostly
attempted in a multi-modal manner. However, the
characteristics of movie trailer audio indicate that
this modality alone might be highly effective in genre
prediction. Movie trailer audio predominantly
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "241",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sarto:2024:TRA,
author = "Sara Sarto and Marcella Cornia and Lorenzo Baraldi and
Alessandro Nicolosi and Rita Cucchiara",
title = "Towards Retrieval-Augmented Architectures for Image
Captioning",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "242:1--242:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663667",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663667",
abstract = "The objective of image captioning models is to bridge
the gap between the visual and linguistic modalities by
generating natural language descriptions that
accurately reflect the content of input images. In
recent years, researchers have leveraged deep
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "242",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:PAP,
author = "Kaihui Yang and Junwei Han and Guangyu Guo and Chaowei
Fang and Yingzi Fan and Lechao Cheng and Dingwen
Zhang",
title = "Progressive Adapting and Pruning: Domain-Incremental
Learning for Saliency Prediction",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "243:1--243:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661312",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3661312",
abstract = "Saliency prediction (SAP) plays a crucial role in
simulating the visual perception function of human
beings. In practical situations, humans can quickly
grasp saliency extraction in new image domains.
However, current SAP methods mainly concentrate on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "243",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2024:HED,
author = "Lv Tang and Xinfeng Zhang",
title = "High Efficiency Deep-learning Based Video
Compression",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "244:1--244:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3661311",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3661311",
abstract = "Although deep learning techniques have achieved
significant improvements in image compression, their
advantages are not fully explored in video compression,
which leads to the performance of deep-learning-based
video compression (DLVC) being obviously \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "244",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Gomes:2024:AAG,
author = "Pedro de Medeiros Gomes and Silvia Rossi and Laura
Toni",
title = "{AGAR} --- Attention Graph-{RNN} for Adaptative Motion
Prediction of Point Clouds of Deformable Objects",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "245:1--245:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3662183",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3662183",
abstract = "This article focuses on motion prediction for point
cloud sequences in the challenging case of deformable
3D objects, such as human body motion. First, we
investigate the challenges caused by deformable shapes
and complex motions present in this type of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "245",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ye:2024:UUR,
author = "Jiabo Ye and Junfeng Tian and Ming Yan and Haiyang Xu
and Qinghao Ye and Yaya Shi and Xiaoshan Yang and Xuwu
Wang and Ji Zhang and Liang He and Xin Lin",
title = "{UniQRNet}: Unifying Referring Expression Grounding
and Segmentation with {QRNet}",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "246:1--246:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3660638",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3660638",
abstract = "Referring expression comprehension aims to align
natural language queries with visual scenes, which
requires establishing fine-grained correspondence
between vision and language. This has important
applications in multi-modal reasoning systems. Existing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "246",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2024:BQA,
author = "Wei Zhou and Qi Yang and Wu Chen and Qiuping Jiang and
Guangtao Zhai and Weisi Lin",
title = "Blind Quality Assessment of Dense {$3$D} Point Clouds
with Structure Guided Resampling",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "247:1--247:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664199",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664199",
abstract = "Objective quality assessment of three-dimensional (3D)
point clouds is essential for the development of
immersive multimedia systems in real-world
applications. Despite the success of perceptual quality
evaluation for 2D images and videos, blind/no-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "247",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhao:2024:EWZ,
author = "Yuli Zhao and Yin Zhang and Francis C. M. Lau and Hai
Yu and Zhiliang Zhu and Bin Zhang",
title = "Expanding-Window Zigzag Decodable Fountain Codes for
Scalable Multimedia Transmission",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "248:1--248:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664610",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664610",
abstract = "In this article, we present a coding method called
expanding-window zigzag decodable fountain code with
unequal error protection property (EWF-ZD UEP code) to
achieve scalable multimedia transmission. The key idea
of the EWF-ZD UEP code is to utilize bit-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "248",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jin:2024:USR,
author = "Xuanyu Jin and Ni Li and Wanzeng Kong and Jiajia Tang
and Bing Yang",
title = "Unbiased Semantic Representation Learning Based on
Causal Disentanglement for Domain Generalization",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "249:1--249:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659953",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3659953",
abstract = "Domain generalization primarily mitigates domain shift
among multiple source domains, generalizing the trained
model to an unseen target domain. However, the spurious
correlation usually caused by context prior (e.g.,
background) makes it challenging to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "249",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2024:SSM,
author = "Bo Peng and Lin Sun and Jianjun Lei and Bingzheng Liu
and Haifeng Shen and Wanqing Li and Qingming Huang",
title = "Self-Supervised Monocular Depth Estimation via
Binocular Geometric Correlation Learning",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "250:1--250:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663570",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663570",
abstract = "Monocular depth estimation aims to infer a depth map
from a single image. Although supervised learning-based
methods have achieved remarkable performance, they
generally rely on a large amount of labor-intensively
annotated data. Self-supervised methods, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "250",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:DPJ,
author = "Yang Yang and Shuailong Qiu and Lanling Zeng and
Zhigeng Pan",
title = "Detail-preserving Joint Image Upsampling",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "251:1--251:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665246",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665246",
abstract = "Image operators can be instrumental to computational
imaging and photography. However, many of them are
computationally intensive. In this article, we propose
an effective yet efficient joint upsampling method to
accelerate various image operators. We \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "251",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kang:2024:OCM,
author = "Xiao Kang and Xingbo Liu and Wen Xue and Xiushan Nie
and Yilong Yin",
title = "Online Cross-modal Hashing With Dynamic Prototype",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "252:1--252:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665249",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665249",
abstract = "Online cross-modal hashing has received increasing
attention due to its efficiency and effectiveness in
handling cross-modal streaming data retrieval. Despite
the promising performance, these methods mainly focus
on the supervised learning paradigm, demanding
expensive and laborious work to obtain clean annotated
data. Existing unsupervised online hashing methods
mostly struggle to construct instructive semantic
correlations among data chunks, resulting in the
forgetting of accumulated data distribution. To this
end, we propose a Dynamic Prototype-based Online
Cross-modal Hashing method, called DPOCH. Based on the
pre-learned reliable common representations, DPOCH
generates prototypes incrementally as sketches of
accumulated data and updates them dynamically for
adapting streaming data. Thereafter, the
prototype-based semantic embedding and similarity
graphs are designed to promote stability and
generalization of the hashing process, thereby
obtaining globally adaptive hash codes and hash
functions. Experimental results on benchmarked datasets
demonstrate that the proposed DPOCH outperforms
state-of-the-art unsupervised online cross-modal
hashing methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "252",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:SFS,
author = "Yuqing Yang and Boris Joukovsky and Jos{\'e} Oramas
Mogrovejo and Tinne Tuytelaars and Nikos Deligiannis",
title = "{SNIPPET}: a Framework for Subjective Evaluation of
Visual Explanations Applied to {DeepFake} Detection",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "253:1--253:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665248",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665248",
abstract = "Explainable Artificial Intelligence (XAI) attempts to
help humans understand machine learning decisions
better and has been identified as a critical component
toward increasing the trustworthiness of complex
black-box systems, such as deep neural \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "253",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pan:2024:IAL,
author = "Jinwang Pan and Xianming Liu and Yuanchao Bai and
Deming Zhai and Junjun Jiang and Debin Zhao",
title = "Illumination-Aware Low-Light Image Enhancement with
Transformer and Auto-Knee Curve",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "254:1--254:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664653",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664653",
abstract = "Images captured under low-light conditions suffer from
several combined degradation factors, including low
brightness, low contrast, noise, and color bias. Many
learning-based techniques attempt to learn the
low-to-clear mapping between low-light and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "254",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tiotsop:2024:MID,
author = "Lohic Fotio Tiotsop and Antonio Servetti and Peter
Pocta and Glenn {Van Wallendael} and Marcus Barkowsky
and Enrico Masala",
title = "Multiple Image Distortion {DNN} Modeling Individual
Subject Quality Assessment",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "255:1--255:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664198",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664198",
abstract = "A recent research direction is focused on training
Deep Neural Networks (DNNs) to replicate individual
subject assessments of media quality. These DNNs are
referred to as Artificial Intelligence-based Observers
(AIOs). An AIO is designed to simulate, in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "255",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2024:HHK,
author = "Yunhui Xu and Youru Li and Muhao Xu and Zhenfeng Zhu
and Yao Zhao",
title = "{HKA}: a Hierarchical Knowledge Alignment Framework
for Multimodal Knowledge Graph Completion",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "256:1--256:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3664288",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3664288",
abstract = "Recent years have witnessed the successful application
of knowledge graph techniques in structured data
processing, while how to incorporate knowledge from
visual and textual modalities into knowledge graphs has
been given less attention. To better \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "256",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2024:MFG,
author = "Li Zhou and Zhenyu Liu and Yutong Li and Yuchi Duan
and Huimin Yu and Bin Hu",
title = "Multi Fine-Grained Fusion Network for Depression
Detection",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "257:1--257:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665247",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665247",
abstract = "Depression is an illness that involves emotional and
mental health. Currently, depression detection through
interviews is the most popular way. With the
advancement of natural language processing and
sentiment analysis, automated interview-based
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "257",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2024:CTI,
author = "Chenlei Lv and Dan Zhang and Shengling Geng and
Zhongke Wu and Hui Huang",
title = "Color Transfer for Images: a Survey",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "258:1--258:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3635152",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3635152",
abstract = "High-quality image generation is an important topic in
digital visualization. As a sub-topic of the research,
                 color transfer aims to produce a high-quality image with
                 an ideal color scheme learned from the reference one. In
this article, we investigate the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "258",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:RAR,
author = "Zhihao Zhang and Jun Wang and Shengjie Li and Lei Jin
and Hao Wu and Jian Zhao and Bo Zhang",
title = "Review and Analysis of {RGBT} Single Object Tracking
Methods: a Fusion Perspective",
journal = j-TOMM,
volume = "20",
number = "8",
pages = "259:1--259:??",
month = aug,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3651308",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Wed Aug 28 06:37:02 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3651308",
abstract = "Visual tracking is a fundamental task in computer
vision with significant practical applications in
various domains, including surveillance, security,
robotics, and human-computer interaction. However, it
may face limitations in visible light data, such
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "259",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:CST,
author = "Yuantong Zhang and Daiqin Yang and Zhenzhong Chen and
Wenpeng Ding",
title = "Continuous Space-Time Video Super-Resolution with
Multi-Stage Motion Information Reorganization",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "273:1--273:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665646",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665646",
abstract = "Space-time video super-resolution (ST-VSR) aims to
simultaneously expand a given source video to a higher
frame rate and resolution. However, most existing
schemes either consider fixed intermediate time and
scale or fail to exploit long-range temporal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "273",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Shi:2024:DAT,
author = "Caijuan Shi and Yuanfan Zheng and Zhen Chen",
title = "Domain Adaptive Thermal Object Detection with Unbiased
Granularity Alignment",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "274:1--274:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665892",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665892",
abstract = "Domain Adaptive Object Detection (DAOD) alleviates the
challenge of labor-intensive annotations by
transferring semantic information from a labeled source
domain to an unlabeled target domain. However, the DAOD
suffers from biased discrimination and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "274",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:AAS,
author = "Ziyi Liu and You Yang and Kejun Wu and Qiong Liu and
Xinghua Xu and Xiaoxuan Ma and Jiang Tang",
title = "{ASIFusion}: an Adaptive Saliency Injection-Based
Infrared and Visible Image Fusion Network",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "275:1--275:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665893",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665893",
abstract = "The purpose of infrared and visible image fusion
(IVIF) is to acquire a more informative fused image by
leveraging complementary information, facilitating
human perception and machine vision. Among the existing
fusion methods, the saliency-based methods \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "275",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2024:LAC,
author = "Xu Wu and Zhihui Lai and Jie Zhou and Xianxu Hou and
Witold Pedrycz and Linlin Shen",
title = "Light-Aware Contrastive Learning for Low-Light Image
Enhancement",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "276:1--276:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665498",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665498",
abstract = "Low-Light Image Enhancement (LLIE) presents challenges
due to texture information loss and uneven
illumination, which can distort feature distribution
and reduce the quality of the enhanced images. However,
current deep learning methods for LLIE only use
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "276",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Bandung:2024:IVD,
author = "Yoanes Bandung and Mokhamad Arfan Wicaksono and Sean
Pribadi and Armein Z. R. Langi and Dion Tanjung",
title = "{IoT} Video Delivery Optimization through Machine
Learning-Based Frame Resolution Adjustment",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "277:1--277:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3665929",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3665929",
abstract = "Providing acceptable video quality in the Internet of
Things (IoT) implementation poses a significant
challenge, mainly when the application is performed on
low-cost and low-power devices. This research focuses
on developing a frame resolution adjustment \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "277",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Ma:2024:ANR,
author = "Jingwei Ma and Kangkang Bian and Yang Xu and Lei Zhu",
title = "{ANAGL}: a Noise-Resistant and Anti-Sparse Graph
Learning for Micro-Video Recommendation",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "278:1--278:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3670407",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3670407",
abstract = "In recent years, graph convolutional networks (GCNs)
have seen widespread utilization within micro-video
recommendation systems, facilitating the understanding
of user preferences through interactions with
micro-videos. Despite the commendable performance
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "278",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:VVB,
author = "Wuyang Chen and Boqing Zhu and Kele Xu and Yong Dou
and Dawei Feng",
title = "{VoiceStyle}: Voice-Based Face Generation via
Cross-Modal Prototype Contrastive Learning",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "279:1--279:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3671002",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3671002",
abstract = "Can we predict a person's appearance solely based on
their voice? This article explores this question by
focusing on generating a face from an unheard voice
segment. Our proposed method, VoiceStyle, combines
cross-modal representation learning with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "279",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Cai:2024:TAC,
author = "Chen Cai and Kim-Hui Yap and Suchen Wang",
title = "Toward Attribute-Controlled Fashion Image Captioning",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "280:1--280:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3671000",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3671000",
abstract = "Fashion image captioning is a critical task in the
fashion industry that aims to automatically generate
product descriptions for fashion items. However,
existing fashion image captioning models predict a
fixed caption for a particular fashion item once
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "280",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lv:2024:SVI,
author = "Kai Lv and Haobo Chen and Chuyang Zhao and Kai Tu and
Junru Chen and Yadong Li and Boxun Li and Youfang Lin",
title = "Style Variable and Irrelevant Learning for
Generalizable Person Re-identification",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "281:1--281:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3671003",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3671003",
abstract = "Domain generalization person re-identification
(DG-ReID) has gained much attention recently due to the
poor performance of supervised re-identification on
unseen domains. The goal of domain generalization is to
develop a model that is insensitive to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "281",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:SSC,
author = "Mengran Li and Ronghui Zhang and Yong Zhang and
Xinglin Piao and Shiyu Zhao and Baocai Yin",
title = "{SCAE}: Structural Contrastive Auto-Encoder for
Incomplete Multi-View Representation Learning",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "282:1--282:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672078",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672078",
abstract = "Describing an object from multiple perspectives often
leads to incomplete data representation. Consequently,
learning consistent representations for missing data
from multiple views has emerged as a key focus in the
realm of Incomplete Multi-view \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "282",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:MDE,
author = "Hanzhang Wang and Deming Zhai and Xiong Zhou and
Junjun Jiang and Xianming Liu",
title = "{Mix-DDPM}: Enhancing Diffusion Models through Fitting
Mixture Noise with Global Stochastic Offset",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "283:1--283:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672080",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672080",
abstract = "Denoising diffusion probabilistic models (DDPM) have
shown impressive performance in various domains as a
class of deep generative models. In this article, we
introduce the mixture noise-based DDPM (Mix-DDPM),
which considers the Markov diffusion \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "283",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hou:2024:TLF,
author = "Wenxuan Hou and Guangyao Li and Yapeng Tian and Di
Hu",
title = "Toward Long Form Audio-Visual Video Understanding",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "284:1--284:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672079",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672079",
abstract = "We live in a world filled with never-ending streams of
multimodal information. As a more natural recording of
the real scenario, long form audio-visual videos
                 (LFAVs) are expected to serve as an important bridge for better
exploring and understanding the world. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "284",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yu:2024:MIQ,
author = "Encheng Yu and Jianer Zhou and Zhenyu Li and Gareth
Tyson and Weichao Li and Xinyi Zhang and Zhiwei Xu and
Gaogang Xie",
title = "Mustang: Improving {QoE} for Real-Time Video in
Cellular Networks by Masking Jitter",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "285:1--285:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672399",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672399",
abstract = "The advent of 5G and interactive live broadcasting has
led to a growing trend of people preferring real-time
interactive video services on mobile devices,
particularly mobile phones. In this work, we measure
the performance of Google congestion control,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "285",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:MPC,
author = "Yan Li and Xiangyuan Lan and Haifeng Chen and Ke Lu
and Dongmei Jiang",
title = "Multimodal {PEAR} Chain-of-Thought Reasoning for
Multimodal Sentiment Analysis",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "286:1--286:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672398",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672398",
abstract = "Multimodal sentiment analysis aims to predict
sentiments from multimodal signals such as audio,
video, and text. Existing methods often rely on
Pre-trained Language Models (PLMs) to extract semantic
information from textual data, lacking an in-depth
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "286",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2024:BSS,
author = "Zechen Liang and Yuan-Gen Wang and Wei Lu and Xiaochun
Cao",
title = "Boosting Semi-Supervised Learning with Dual-Threshold
Screening and Similarity Learning",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "287:1--287:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672563",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672563",
abstract = "How to effectively utilize unlabeled data for training
is a key problem in Semi-Supervised Learning (SSL).
Existing SSL methods often consider the unlabeled data
whose predictions are beyond a fixed threshold (e.g.,
0.95) and discard those less than 0.95. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "287",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:SSE,
author = "Chen Chen and Lingfeng Qu and Hadi Amirpour and
Xingjun Wang and Christian Timmerer and Zhihong Tian",
title = "On the Security of Selectively Encrypted {HEVC} Video
Bitstreams",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "288:1--288:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672568",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672568",
abstract = "With the growing applications of video, ensuring its
security has become of utmost importance. Selective
encryption (SE) has gained significant attention in the
field of video content protection due to its
compatibility with video codecs, favorable visual
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "288",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Qin:2024:MGP,
author = "Tai Qin and Ge Li and Wei Gao and Shan Liu",
title = "Multi-Grained Point Cloud Geometry Compression via
Dual-Model Prediction with Extended Octree",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "289:1--289:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3671001",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3671001",
abstract = "The state-of-the-art geometry-based point cloud
compression (G-PCC) (Octree) is the fine-grained
approach, which uses the octree to partition point
clouds into voxels and predicts them based on neighbor
occupancy in narrower spaces. However, G-PCC (Octree).
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "289",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:LDI,
author = "Jiehua Zhang and Liang Li and Chenggang Yan and Zhan
Wang and Changliang Xu and Jiyong Zhang and Chuqiao
Chen",
title = "Learning Domain Invariant Features for Unsupervised
Indoor Depth Estimation Adaptation",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "290:1--290:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672397",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672397",
abstract = "Predicting depth maps from monocular images has made
an impressive performance in the past years. However,
most depth estimation methods are trained with paired
image-depth map data or multi-view images (e.g., stereo
pair and monocular sequence), which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "290",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xu:2024:CPC,
author = "Yiling Xu and Yujie Zhang and Qi Yang and Xiaozhong Xu
and Shan Liu",
title = "Compressed Point Cloud Quality Index by Combining
Global Appearance and Local Details",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "291:1--291:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672567",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672567",
abstract = "In recent years, many standardized algorithms for
                 point cloud compression (PCC) have been developed and
achieved remarkable compression ratios. To provide
guidance for rate-distortion optimization and codec
evaluation, point cloud quality assessment (PCQA)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "291",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:MFT,
author = "Zhilei Liu and Xiaoxing Liu and Sen Chen and Jiaxing
Liu and Longbiao Wang and Chongke Bi",
title = "Multimodal Fusion for Talking Face Generation
Utilizing Speech-Related Facial Action Units",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "292:1--292:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672565",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672565",
abstract = "Talking face generation is to synthesize a
lip-synchronized talking face video by inputting an
arbitrary face image and corresponding audio clips. The
current talking face model can be divided into four
parts: visual feature extraction, audio feature
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "292",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2024:KVK,
author = "Zizhao Wu and Siyu Liu and Peioyan Lu and Ping Yang
and Yongkang Wong and Xiaoling Gu and Mohan S.
Kankanhalli",
title = "{KF-VTON}: Keypoints-Driven Flow Based Virtual Try-On
Network",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "293:1--293:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3673903",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3673903",
abstract = "Image-based virtual try-on aims to fit a target
garment to a reference person. Most existing methods
are limited to solving the Garment-To-Person (G2P)
try-on task that transfers a garment from a clean
product image to the reference person and do not
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "293",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhuo:2024:UVE,
author = "Linhai Zhuo and Yuqian Fu and Jingjing Chen and Yixin
Cao and Yu-Gang Jiang",
title = "Unified View Empirical Study for Large Pretrained
Model on Cross-Domain Few-Shot Learning",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "294:1--294:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3673231",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3673231",
abstract = "The challenge of cross-domain few-shot learning
(CD-FSL) stems from the substantial distribution
disparities between target and source domain images,
necessitating a model with robust generalization
capabilities. In this work, we posit that large-scale
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "294",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zuo:2024:PEP,
author = "Ruifan Zuo and Chaoqun Zheng and Fengling Li and Lei
Zhu and Zheng Zhang",
title = "Privacy-Enhanced Prototype-Based Federated Cross-Modal
Hashing for Cross-Modal Retrieval",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "295:1--295:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674507",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674507",
abstract = "Cross-modal hashing is widely used for efficient
similarity searches, improving data processing
efficiency, and reducing storage costs. Existing
cross-modal hashing methods primarily focus on
centralized training scenarios, where fixed-scale and
fixed-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "295",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Song:2024:TDV,
author = "Xue Song and Jingjing Chen and Bin Zhu and Yu-Gang
Jiang",
title = "Text-Driven Video Prediction",
journal = j-TOMM,
volume = "20",
number = "9",
pages = "296:1--296:??",
month = sep,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3675171",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Tue Sep 24 06:42:16 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3675171",
abstract = "Current video generation models usually convert
signals indicating appearance and motion received from
inputs (e.g., image and text) or latent spaces (e.g.,
noise vectors) into consecutive frames, fulfilling a
stochastic generation process for the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "296",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hussain:2024:SYA,
author = "Walayat Hussain and Honghao Gao and Rafiul Karim and
Abdulmotaleb El Saddik",
title = "Seventeen Years of the {{\booktitle{ACM Transactions
on Multimedia Computing, Communications and
Applications}}}: a Bibliometric Overview",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "297:1--297:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3660347",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:43:12 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3660347",
abstract = "ACM Transactions on Multimedia Computing,
Communications, and Applications has been dedicated to
advancing multimedia research, fostering discoveries,
innovations, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "297",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yuan:2024:UFL,
author = "Bowen Yuan and Jiahao Lu and Sisi You and Bing-Kun
Bao",
title = "Unbiased Feature Learning with Causal Intervention for
Visible-Infrared Person Re-Identification",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "298:1--298:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674737",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674737",
abstract = "Visible-infrared person re-identification (VI-ReID)
aims to match individuals across different modalities.
Existing methods can learn class-separable features but
still struggle with modality gaps within class due to
the modality-specific information, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "298",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chan:2024:AFF,
author = "Sixian Chan and Xianpeng Zeng and Xinhua Wang and Jie
Hu and Cong Bai",
title = "Auxiliary Feature Fusion and Noise Suppression for
{HOI} Detection",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "299:1--299:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674980",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674980",
abstract = "In recent years, one-stage HOI (Human-Object
Interaction) detection methods tend to divide the
original task into multiple sub-tasks by using a
                 multi-branch network structure. However, insufficient
                 attention is paid to information communication
between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "299",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:GMM,
author = "Yefan Li and Fuqing Duan and Ke Lu",
title = "Gated Multi-Modal Edge Refinement Network for Light
Field Salient Object Detection",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "300:1--300:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674836",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674836",
abstract = "Light field can be decoded into multiple
representations and provides valuable focus and depth
information. This breakthrough overcomes the
limitations of traditional 2D and 3D saliency detection
methods, opening up new possibilities for more accurate
and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "300",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Hao:2024:HHC,
author = "Dongze Hao and Qunbo Wang and Xinxin Zhu and Jing
Liu",
title = "{HCCL}: Hierarchical Counterfactual Contrastive
Learning for Robust Visual Question Answering",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "301:1--301:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3673902",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3673902",
abstract = "Despite most state-of-the-art models having achieved
amazing performance in Visual Question Answering (VQA),
they usually utilize biases to answer the question.
Recently, some studies synthesize counterfactual
training samples to help the model to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "301",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Jia:2024:HBS,
author = "Jun Jia and Zhongpai Gao and Yiwei Yang and Wei Sun
and Dandan Zhu and Xiaohong Liu and Xiongkuo Min and
Guangtao Zhai",
title = "Hidden Barcode in Sub-Images with Invisible Locating
Marker",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "302:1--302:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674976",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674976",
abstract = "The prevalence of the Internet of Things has led to
the widespread adoption of 2D barcodes as a means of
                 offline-to-online communication. However, 2D barcodes
are not ideal for publicity materials due to their
space-consuming nature. Recent works have \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "302",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lu:2024:MDE,
author = "Junxin Lu and Yongbin Gao and Jieyu Chen and Jeng-Neng
Hwang and Hamido Fujita and Zhijun Fang",
title = "Monocular Depth and Ego-motion Estimation with Scale
Based on Superpixel and Normal Constraints",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "303:1--303:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674977",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674977",
abstract = "Three-dimensional perception in intelligent virtual
and augmented reality (VR/AR) and autonomous vehicles
(AV) applications is critical and attracting
significant attention. The self-supervised monocular
depth and ego-motion estimation serves as a more
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "303",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Guo:2024:DYC,
author = "Zhenjiang Guo and Xiaohai He and Yu Yang and Linbo
Qing and Honggang Chen",
title = "{DAG-YOLO}: a Context-Feature Adaptive fusion Rotating
Detection Network in Remote Sensing Images",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "304:1--304:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674978",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674978",
abstract = "Object detection in remote sensing image (RSI)
research has seen significant advancements,
particularly with the advent of deep learning. However,
challenges such as orientation, scale, aspect ratio
variations, dense object distribution, and category
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "304",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhou:2024:MML,
author = "Yong Zhou and Zeming Xie and Jiaqi Zhao and Wenliang
Du and Rui Yao and Abdulmotaleb {El Saddik}",
title = "Multi-Modal {LiDAR} Point Cloud Semantic Segmentation
with Salience Refinement and Boundary Perception",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "305:1--305:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674979",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674979",
abstract = "Point cloud segmentation is essential for scene
understanding, which provides advanced information for
many applications, such as autonomous driving, robots,
and virtual reality. To improve the accuracy and
robustness of point cloud segmentation, many \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "305",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wang:2024:HRS,
author = "Yuanyuan Wang and Meng Liu and Xuemeng Song and
Liqiang Nie",
title = "Harnessing Representative Spatial-Temporal Information
for Video Question Answering",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "306:1--306:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3675399",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3675399",
abstract = "Video question answering, aiming to answer a natural
language question related to the given video, has
become prevalent in the past few years. Although
remarkable improvements have been obtained, it is still
exposed to the challenge of insufficient \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "306",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liao:2024:RFM,
author = "Guibiao Liao and Wei Gao",
title = "Rethinking Feature Mining for Light Field Salient
Object Detection",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "307:1--307:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3676967",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3676967",
abstract = "Light field salient object detection (LF SOD) has
recently received increasing attention. However, most
current works typically rely on an individual focal
stack backbone for feature extraction. This manner
ignores the characteristic of blurred saliency-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "307",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liang:2024:NTH,
author = "Chao Liang and Linchao Zhu and Zongxin Yang and Wei
Chen and Yi Yang",
title = "Noise-Tolerant Hybrid Prototypical Learning with Noisy
{Web} Data",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "308:1--308:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672396",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672396",
abstract = "We focus on the challenging problem of learning an
unbiased classifier from a large number of potentially
relevant but noisily labeled web images given only a
few clean labeled images. This problem is particularly
practical because it reduces the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "308",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Peng:2024:DDL,
author = "Yitao Peng and Lianghua He and Die Hu and Yihang Liu
and Longzhen Yang and Shaohua Shang",
title = "Decoupling Deep Learning for Enhanced Image
Recognition Interpretability",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "309:1--309:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3674837",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3674837",
abstract = "The quest for enhancing the interpretability of neural
networks has become a prominent focus in recent
research endeavors. Prototype-based neural networks
have emerged as a promising avenue for imbuing models
with interpretability by gauging the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "309",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Sun:2024:DDC,
author = "Baoli Sun and Yanjun Guo and Tiantian Yan and Xinchen
Ye and Zhihui Wang and Haojie Li and Zhiyong Wang",
title = "Digging into Depth and Color Spaces: a Mapping
Constraint Network for Depth Super-Resolution",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "310:1--310:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3677123",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3677123",
abstract = "Scene depth super-resolution (DSR) poses an inherently
ill-posed problem due to the extremely large space of
one-to-many mapping functions from a given
low-resolution (LR) depth map, which possesses limited
depth information, to multiple plausible high-resolution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "310",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Seufert:2024:COC,
author = "Michael Seufert and Marius Spangenberger and Fabian
Poign{\'e}e and Florian Wamser and Werner Robitza and
Christian Timmerer and Tobias Ho{\ss}feld",
title = "{COBIRAS}: Offering a Continuous Bit Rate Slide to
Maximize {DASH} Streaming Bandwidth Utilization",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "311:1--311:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3677379",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3677379",
abstract = "Reaching close-to-optimal bandwidth utilization in
dynamic adaptive streaming over HTTP (DASH) systems
can, in theory, be achieved with a small discrete set
of bit rate representations. This includes typical bit
rate ladders used in state-of-the-art DASH \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "311",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tang:2024:MLF,
author = "Zhangyong Tang and Tianyang Xu and Xiao-Jun Wu and
Josef Kittler",
title = "Multi-Level Fusion for Robust {RGBT} Tracking via
Enhanced Thermal Representation",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "312:1--312:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678176",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3678176",
abstract = "Due to the limitations of visible (RGB) sensors in
challenging scenarios, such as nighttime and foggy
environments, the thermal infrared (TIR) modality draws
increasing attention as an auxiliary source for robust
tracking systems. Currently, the existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "312",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Tu:2024:RFI,
author = "Hanyue Tu and Li Li and Wengang Zhou and Houqiang Li",
title = "Reconstruction-Free Image Compression for Machine
Vision via Knowledge Transfer",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "313:1--313:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678471",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3678471",
abstract = "Reconstruction-free image compression for machine
vision aims to perform machine vision tasks directly on
compressed-domain representations instead of
reconstructed images. Existing reports have validated
the feasibility of compressed-domain machine \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "313",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Zhang:2024:USD,
author = "Gai Zhang and Xinfeng Zhang and Lv Tang",
title = "Unified and Scalable Deep Image Compression Framework
for Human and Machine",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "314:1--314:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678472",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3678472",
abstract = "Image compression aims to minimize the amount of data
in image representation while maintaining a certain
visual quality for humans, which is an essential
technique for storage and transmission. Recently, along
with the development of computer vision, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "314",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:LCA,
author = "Fengyong Li and Huajun Zhai and Teng Liu and Xinpeng
Zhang and Chuan Qin",
title = "Learning Compressed Artifact for {JPEG} Manipulation
Localization Using Wide-Receptive-Field Network",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "315:1--315:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678883",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3678883",
abstract = "JPEG image manipulation localization aims to
accurately classify and locate tampered regions in JPEG
images. Existing image manipulation localization
schemes usually consider diverse data streams of
spatial domain, e.g. noise inconsistency and local
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "315",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yin:2024:EIL,
author = "Shukang Yin and Sirui Zhao and Hao Wang and Tong Xu
and Enhong Chen",
title = "Exploiting Instance-level Relationships in Weakly
Supervised Text-to-Video Retrieval",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "316:1--316:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663571",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663571",
abstract = "Text-to-Video Retrieval is a typical cross-modal
retrieval task that has been studied extensively under
a conventional supervised setting. Recently, some works
have sought to extend the problem to a weakly
supervised formulation, which can be more \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "316",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Latifzadeh:2024:EDA,
author = "Kayhan Latifzadeh and Nima Gozalpour and V. Javier
Traver and Tuukka Ruotsalo and Aleksandra
Kawala-Sterniuk and Luis A. Leiva",
title = "Efficient Decoding of Affective States from
Video-elicited {EEG} Signals: an Empirical
Investigation",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "317:1--317:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663669",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663669",
abstract = "Affect decoding through brain-computer interfacing
(BCI) holds great potential to capture users' feelings
and emotional responses via non-invasive
electroencephalogram (EEG) sensing. Yet, little
research has been conducted to understand efficient
decoding \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "317",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2024:LCA,
author = "Ziyue Wu and Junyu Gao and Shucheng Huang and
Changsheng Xu",
title = "Learning Commonsense-aware Moment-Text Alignment for
Fast Video Temporal Grounding",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "318:1--318:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3663368",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3663368",
abstract = "Grounding temporal video segments described in natural
language queries effectively and efficiently is a
crucial capability needed in vision-and-language
fields. In this article, we deal with the fast video
temporal grounding (FVTG) task, aiming at \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "318",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Lorenzi:2024:MDC,
author = "Daniele Lorenzi and Farzad Tashtarian and Hermann
Hellwagner and Christian Timmerer",
title = "{MEDUSA}: a Dynamic Codec Switching Approach in {HTTP}
Adaptive Streaming",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "319:1--319:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3656175",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3656175",
abstract = "HTTP Adaptive Streaming (HAS) solutions utilize
various Adaptive BitRate (ABR) algorithms to
dynamically select appropriate video representations,
aiming at adapting to fluctuations in network
bandwidth. However, current ABR implementations have a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "319",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Pi:2024:EVA,
author = "Ruoyan Pi and Peng Wu and Xiangteng He and Yuxin
Peng",
title = "{EOGT}: Video Anomaly Detection with Enhanced Object
Information and Global Temporal Dependency",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "320:1--320:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3662185",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3662185",
abstract = "Video anomaly detection (VAD) aims to identify events
or scenes in videos that deviate from typical patterns.
Existing approaches primarily focus on reconstructing
or predicting frames to detect anomalies and have shown
improved performance in recent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "320",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yue:2024:MGR,
author = "Shengbin Yue and Yunbin Tu and Liang Li and Shengxiang
Gao and Zhengtao Yu",
title = "Multi-Grained Representation Aggregating Transformer
with Gating Cycle for Change Captioning",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "321:1--321:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3660346",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3660346",
abstract = "Change captioning aims to describe the difference
within an image pair in natural language, which
combines visual comprehension and language generation.
Although significant progress has been achieved, it
remains a key challenge of perceiving the object
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "321",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Wu:2024:ADS,
author = "Jingjing Wu and Xi Zhou and Xiaohong Li and Hao Liu
and Meibin Qi and Richang Hong",
title = "Asymmetric Deformable Spatio-temporal Framework for
Infrared Object Tracking",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "322:1--322:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3678882",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3678882",
abstract = "The Infrared Object Tracking (IOT) task aims to locate
objects in infrared sequences. Since color and texture
information is unavailable in infrared modality, most
existing infrared trackers merely rely on capturing
spatial contexts from the image to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "322",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:DPP,
author = "Zhenyu Li and Shanshan Gao and Deqian Mao and Shouwen
Song and Lei Li and Yuanfeng Zhou",
title = "Deep Plug-and-Play Non-Iterative Cluster for {$3$D}
Global Feature Extraction",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "323:1--323:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3679204",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3679204",
abstract = "Efficient and accurate point cloud feature extraction
is crucial for critical tasks such as 3D recognition
and semantic segmentation. However, existing global
feature extraction methods for 3D data often require
designing different models for different \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "323",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Xue:2024:SAA,
author = "Mingfu Xue and Yinghao Wu and Leo Yu Zhang and Dujuan
Gu and Yushu Zhang and Weiqiang Liu",
title = "{SSAT}: Active Authorization Control and {User}'s
Fingerprint Tracking Framework for {DNN IP}
Protection",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "324:1--324:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3679202",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3679202",
abstract = "As training a high-performance deep neural network
(DNN) model requires a large amount of data, powerful
computing resources and expert knowledge, protecting
well-trained DNN models from intellectual property (IP)
infringement has raised serious concerns \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "324",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Li:2024:FSF,
author = "Yongkang Li and Qifan Liang and Zhen Han and Wenjun
Mai and Zhongyuan Wang",
title = "Few-Shot Face Sketch-to-Photo Synthesis via
Global-Local Asymmetric Image-to-Image Translation",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "325:1--325:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3672400",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3672400",
abstract = "Face sketch-to-photo synthesis is widely used in law
enforcement and digital entertainment, which can be
achieved by Image-to-Image (I2I) translation.
Traditional I2I translation algorithms usually regard
the bidirectional translation of two image domains
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "325",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Chen:2024:AAL,
author = "Shuqin Chen and Xian Zhong and Yi Zhang and Lei Zhu
and Ping Li and Xiaokang Yang and Bin Sheng",
title = "Action-aware Linguistic Skeleton Optimization Network
for Non-autoregressive Video Captioning",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "326:1--326:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3679203",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3679203",
abstract = "Non-autoregressive video captioning methods generate
visual words in parallel but often overlook semantic
correlations among them, especially regarding verbs,
leading to lower caption quality. To address this, we
integrate action information of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "326",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Yang:2024:LFR,
author = "Yancun Yang and Weiqing Min and Jingru Song and Guorui
Sheng and Lili Wang and Shuqiang Jiang",
title = "Lightweight Food Recognition via Aggregation Block and
Feature Encoding",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "327:1--327:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3680285",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3680285",
abstract = "Food image recognition has recently been given
considerable attention in the multimedia field in light
of its possible implications on health. The
characteristics of the dispersed distribution of
ingredients in food images put forward higher
requirements \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "327",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Liu:2024:MMR,
author = "Huaijin Liu and Jixiang Du and Yong Zhang and Hongbo
Zhang and Jiandian Zeng",
title = "{MSSA}: Multi-Representation Semantics-Augmented Set
Abstraction for {$3$D} Object Detection",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "328:1--328:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3686157",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3686157",
abstract = "Accurate recognition and localization of 3D objects is
a fundamental research problem in 3D computer vision.
Benefiting from transformation-free point cloud
processing and flexible receptive fields, point-based
methods have become accurate in 3D point \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "328",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}
@Article{Kawai:2024:RBH,
author = "Vinicius Sato Kawai and Lucas Pascotti Valem and
Alexandro Baldassin and Edson Borin and Daniel Carlos
Guimar{\~a}es Pedronette and Longin Jan Latecki",
title = "Rank-based Hashing for Effective and Efficient Nearest
Neighbor Search for Image Retrieval",
journal = j-TOMM,
volume = "20",
number = "10",
pages = "329:1--329:??",
month = oct,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.1145/3659580",
ISSN = "1551-6857 (print), 1551-6865 (electronic)",
ISSN-L = "1551-6857",
bibdate = "Thu Oct 31 10:45:31 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
URL = "https://dl.acm.org/doi/10.1145/3659580",
abstract = "The large and growing amount of digital data creates a
pressing need for approaches capable of indexing and
retrieving multimedia content. A traditional and
fundamental challenge consists of effectively and
efficiently performing nearest-neighbor searches. After
decades of research, several different methods are
available, including trees, hashing, and graph-based
approaches. Most of the current methods exploit
learning to hash approaches based on deep learning. In
spite of effective results and compact codes obtained,
such methods often require a significant amount of
labeled data for training. Unsupervised approaches also
rely on expensive training procedures usually based on
a huge amount of data. In this work, we propose an
unsupervised data-independent approach for nearest
neighbor searches, which can be used with different
features, including deep features trained by transfer
learning. The method uses a rank-based formulation and
exploits a hashing approach for efficient ranked list
computation at query time. A comprehensive experimental
evaluation was conducted on seven public datasets,
considering deep features based on CNNs and
Transformers. Both effectiveness and efficiency aspects
were evaluated. The proposed approach achieves
remarkable results in comparison to traditional and
state-of-the-art methods. Hence, it is an attractive
and innovative solution, especially when costly
training procedures need to be avoided.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Multimed Comput. Commun. Appl.",
articleno = "329",
fjournal = "ACM Transactions on Multimedia Computing,
Communications, and Applications",
journal-URL = "https://dl.acm.org/loi/tomm",
}