@Preamble{
"\ifx \undefined \circled \def \circled #1{(#1)}\fi" #
"\ifx \undefined \k \let \k = \c \fi" #
"\ifx \undefined \ocirc \def \ocirc #1{{\accent'27#1}}\fi" #
"\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi" #
"\ifx \undefined \reg \def \reg {\circled{R}}\fi"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-PROC-VLDB-ENDOWMENT = "Proceedings of the VLDB Endowment"}
@Article{Hill:2008:TMO,
author = "Mark D. Hill",
title = "Is transactional memory an oxymoron?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1--1",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453858",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zobel:2008:DSH,
author = "Justin Zobel",
title = "Databases and the silification of health",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "2--2",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453859",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Blott:2008:WWH,
author = "Stephen Blott and Roger Weber",
title = "What's wrong with high-dimensional similarity
search?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "3--3",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453861",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bruno:2008:CPD,
author = "Nicolas Bruno and Surajit Chaudhuri",
title = "Constrained physical design tuning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "4--15",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453863",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kementsietsidis:2008:SMQ,
  author =       "Anastasios Kementsietsidis and Frank Neven and Dieter
                 {Van de Craen} and Stijn Vansummeren",
title = "Scalable multi-query optimization for exploratory
queries over federated scientific databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "16--27",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453864",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeWitt:2008:CIC,
author = "David J. DeWitt and Erik Paulson and Eric Robinson and
Jeffrey Naughton and Joshua Royalty and Srinath Shankar
and Andrew Krioukov",
title = "{Clustera}: an integrated computation and data
management system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "28--41",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453865",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheung:2008:PPE,
author = "Alvin Cheung and Samuel Madden",
title = "Performance profiling with {EndoScope}, an
acquisitional software monitoring framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "42--53",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453866",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bar-Yossef:2008:MSE,
author = "Ziv Bar-Yossef and Maxim Gurevich",
title = "Mining search engine query logs via suggestion
sampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "54--65",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453868",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akdere:2008:PBC,
author = "Mert Akdere and U{\u{g}}ur {\c{C}}etintemel and Nesime
Tatbul",
title = "Plan-based complex event detection across distributed
sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "66--77",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453869",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lachmann:2008:FRP,
author = "Alexander Lachmann and Mirek Riedewald",
title = "Finding relevant patterns in bursty sequences",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "78--89",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453870",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheng:2008:CLW,
author = "Hao Cheng and Kien A. Hua and Khanh Vu",
title = "Constrained locally weighted clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "90--101",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453871",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hay:2008:RSR,
author = "Michael Hay and Gerome Miklau and David Jensen and Don
Towsley and Philipp Weis",
title = "Resisting structural re-identification in anonymized
social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "102--114",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453873",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Terrovitis:2008:PPA,
author = "Manolis Terrovitis and Nikos Mamoulis and Panos
Kalnis",
title = "Privacy-preserving anonymization of set-valued data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "115--125",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453874",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pang:2008:AQR,
author = "HweeHwa Pang and Kyriakos Mouratidis",
title = "Authenticating the query results of text search
engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "126--137",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453875",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kundu:2008:SST,
author = "Ashish Kundu and Elisa Bertino",
title = "Structural signatures for tree data structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "138--150",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453876",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Roitman:2008:MDC,
author = "Haggai Roitman and David Carmel and Elad Yom-Tov",
title = "Maintaining dynamic channel profiles on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "151--162",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453878",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2008:WDD,
author = "Fan Yang and Nitin Gupta and Chavdar Botev and
Elizabeth F. Churchill and George Levchenko and Jayavel
Shanmugasundaram",
title = "{WYSIWYG} development of data driven {Web}
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "163--175",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453879",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Baykan:2008:WPL,
author = "Eda Baykan and Monika Henzinger and Ingmar Weber",
title = "{Web} page language identification based on {URLs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "176--187",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453880",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2008:PQO,
author = "Wook-Shin Han and Wooseong Kwak and Jinsoo Lee and Guy
M. Lohman and Volker Markl",
title = "Parallelizing query optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "188--200",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453882",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hadjieleftheriou:2008:HSS,
author = "Marios Hadjieleftheriou and Xiaohui Yu and Nick Koudas
and Divesh Srivastava",
title = "Hashed samples: selectivity estimators for set
similarity selection queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "201--212",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453883",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cohen:2008:TEU,
author = "Edith Cohen and Haim Kaplan",
title = "Tighter estimation using bottom $k$ sketches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "213--229",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453884",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexe:2008:STB,
author = "Bogdan Alexe and Wang-Chiew Tan and Yannis
Velegrakis",
title = "{STBenchmark}: towards a benchmark for mapping
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "230--244",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453886",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Katsis:2008:ISR,
author = "Yannis Katsis and Alin Deutsch and Yannis
Papakonstantinou",
title = "Interactive source registration in community-oriented
information integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "245--259",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453887",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hernandez:2008:DED,
author = "Mauricio A. Hern{\'a}ndez and Paolo Papotti and
Wang-Chiew Tan",
title = "Data exchange with data-metadata translations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "260--273",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453888",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2008:OPN,
author = "Jin Li and Kristin Tufte and Vladislav Shkapenyuk and
Vassilis Papadimos and Theodore Johnson and David
Maier",
title = "Out-of-order processing: a new architecture for
high-performance stream systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "274--288",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453890",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2008:SET,
author = "Wook-Shin Han and Haifeng Jiang and Howard Ho and
Quanzhong Li",
title = "{StreamTX}: extracting tuples from streaming {XML}
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "289--300",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453891",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jin:2008:SWT,
author = "Cheqing Jin and Ke Yi and Lei Chen and Jeffrey Xu Yu
and Xuemin Lin",
title = "Sliding-window top-$k$ queries on uncertain streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "301--312",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453892",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koch:2008:CPD,
author = "Christoph Koch and Dan Olteanu",
title = "Conditioning probabilistic databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "313--325",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453894",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Beskales:2008:EST,
author = "George Beskales and Mohamed A. Soliman and Ihab F.
Ilyas",
title = "Efficient search for the top-$k$ probable nearest
neighbors in uncertain databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "326--339",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453895",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2008:BML,
author = "Daisy Zhe Wang and Eirinaios Michelakis and Minos
Garofalakis and Joseph M. Hellerstein",
title = "{BayesStore}: managing large, uncertain data
repositories with probabilistic graphical models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "340--351",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453896",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2008:TIT,
author = "Daniel Deutch and Tova Milo",
title = "Type inference and type checking for queries on
execution traces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "352--363",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453898",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shang:2008:TVH,
author = "Haichuan Shang and Ying Zhang and Xuemin Lin and
Jeffrey Xu Yu",
title = "Taming verification hardness: an efficient algorithm
for testing subgraph isomorphism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "364--375",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453899",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Golab:2008:GNO,
author = "Lukasz Golab and Howard Karloff and Flip Korn and
Divesh Srivastava and Bei Yu",
title = "On generating near-optimal tableaux for conditional
functional dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "376--390",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453900",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2008:PFD,
author = "Wenfei Fan and Shuai Ma and Yanli Hu and Jie Liu and
Yinghui Wu",
title = "Propagating functional dependencies with conditions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "391--407",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453901",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Antonellis:2008:SQR,
  author =       "Ioannis Antonellis and Hector Garcia-Molina and
                 Chi-Chao Chang",
title = "{Simrank++}: query rewriting through link analysis of
the click graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "408--421",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453903",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lizorkin:2008:AEO,
author = "Dmitry Lizorkin and Pavel Velikhov and Maxim Grinev
and Denis Turdakov",
title = "Accuracy estimate and optimization techniques for
{SimRank} computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "422--433",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453904",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2008:EES,
author = "Badrish Chandramouli and Jun Yang",
title = "End-to-end support for joins in large-scale
publish\slash subscribe systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "434--450",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453905",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Machanavajjhala:2008:SRP,
author = "Ashwin Machanavajjhala and Erik Vee and Minos
Garofalakis and Jayavel Shanmugasundaram",
title = "Scalable ranked publish\slash subscribe",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "451--462",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453906",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Teubner:2008:DCF,
author = "Jens Teubner and Torsten Grust and Sebastian Maneth
and Sherif Sakr",
title = "Dependable cardinality forecasts for {XQuery}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "463--477",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453908",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2008:HBS,
author = "Hongzhi Wang and Jianzhong Li and Jizhou Luo and Hong
Gao",
title = "Hash-base subgraph query processing method for
graph-structured {XML} documents",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "478--489",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453909",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cohen:2008:GXS,
author = "Sara Cohen",
title = "Generating {XML} structure using examples and
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "490--501",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453910",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Holloway:2008:ROD,
author = "Allison L. Holloway and David J. DeWitt",
title = "Read-optimized databases, in depth",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "502--513",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453912",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koltsidas:2008:FSL,
author = "Ioannis Koltsidas and Stratis D. Viglas",
title = "Flashing up the storage layer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "514--525",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453913",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sears:2008:RCL,
author = "Russell Sears and Mark Callaghan and Eric Brewer",
title = "{Rose}: compressed, log-structured replication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "526--537",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453914",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cafarella:2008:WEP,
author = "Michael J. Cafarella and Alon Halevy and Daisy Zhe
Wang and Eugene Wu and Yang Zhang",
title = "{WebTables}: exploring the power of tables on the
{Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "538--549",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453916",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Garrod:2008:SQR,
author = "Charles Garrod and Amit Manjhi and Anastasia Ailamaki
and Bruce Maggs and Todd Mowry and Christopher Olston
and Anthony Tomasic",
title = "Scalable query result caching for {Web} applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "550--561",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453917",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Braga:2008:OMD,
author = "Daniele Braga and Stefano Ceri and Florian Daniel and
Davide Martinenghi",
title = "Optimization of multi-domain queries on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "562--573",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453918",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kwon:2008:FTS,
author = "YongChul Kwon and Magdalena Balazinska and Albert
Greenberg",
title = "Fault-tolerant stream processing using a distributed,
replicated file system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "574--585",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453920",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yeh:2008:LLW,
author = "Mi-Yen Yeh and Kun-Lung Wu and Philip S. Yu and
Ming-Syan Chen",
title = "{LeeWave}: level-wise distribution of wavelet
coefficients for processing $k$ {NN} queries over
distributed streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "586--597",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453921",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aguilera:2008:PSD,
author = "Marcos K. Aguilera and Wojciech Golab and Mehul A.
Shah",
title = "A practical scalable distributed {B-tree}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "598--609",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453922",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiao:2008:MMS,
author = "Lin Qiao and Vijayshankar Raman and Frederick Reiss
and Peter J. Haas and Guy M. Lohman",
title = "Main-memory scan sharing for multi-core {CPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "610--621",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453924",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Johnson:2008:RWP,
author = "Ryan Johnson and Vijayshankar Raman and Richard Sidle
and Garret Swart",
title = "Row-wise parallel predicate evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "622--634",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453925",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Soundararajan:2008:DPC,
author = "Gokul Soundararajan and Jin Chen and Mohamed A. Sharaf
and Cristiana Amza",
title = "Dynamic partitioning of the cache hierarchy in shared
data centers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "635--646",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453926",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Neumann:2008:RRS,
author = "Thomas Neumann and Gerhard Weikum",
title = "{RDF-3X}: a {RISC}-style engine for {RDF}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "647--659",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453927",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Simitsis:2008:MCE,
author = "Alkis Simitsis and Akanksha Baid and Yannis Sismanis
and Berthold Reinwald",
title = "Multidimensional content {eXploration}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "660--671",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453929",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fontoura:2008:RTS,
author = "Marcus Fontoura and Vanja Josifovski and Ravi Kumar
and Christopher Olston and Andrew Tomkins and Sergei
Vassilvitskii",
title = "Relaxation in text search using taxonomies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "672--683",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453930",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nguyen:2008:LEF,
author = "Hoa Nguyen and Thanh Nguyen and Juliana Freire",
title = "Learning to extract form labels",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "684--694",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453931",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jayapandian:2008:ACF,
author = "Magesh Jayapandian and H. V. Jagadish",
title = "Automated creation of a forms-based database query
interface",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "695--709",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453932",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yahia:2008:ENA,
author = "Sihem Amer Yahia and Michael Benedikt and Laks V. S.
Lakshmanan and Julia Stoyanovich",
title = "Efficient network aware search in collaborative
tagging sites",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "710--721",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453934",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheng:2008:CUD,
author = "Reynold Cheng and Jinchuan Chen and Xike Xie",
title = "Cleaning uncertain data with quality guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "722--735",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453935",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2008:PNA,
author = "Jiansheng Huang and Ting Chen and AnHai Doan and
Jeffrey F. Naughton",
title = "On the provenance of non-answers to queries over
extracted data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "736--747",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453936",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2008:DAP,
author = "Shenghuo Zhu and Tao Li and Zhiyuan Chen and Dingding
Wang and Yihong Gong",
title = "Dynamic active probing of helpdesk databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "748--760",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453937",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Curino:2008:GDS,
author = "Carlo A. Curino and Hyun J. Moon and Carlo Zaniolo",
title = "Graceful database schema evolution: the {PRISM}
workbench",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "761--772",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453939",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chai:2008:ARD,
author = "Xiaoyong Chai and Mayssam Sayyadian and AnHai Doan and
Arnon Rosenthal and Len Seligman",
title = "Analyzing and revising data integration schemas to
improve their matchability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "773--784",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453940",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Talukdar:2008:LCD,
author = "Partha Pratim Talukdar and Marie Jacob and Muhammad
Salman Mehmood and Koby Crammer and Zachary G. Ives and
Fernando Pereira and Sudipto Guha",
title = "Learning to create data-integrating queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "785--796",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453941",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Re:2008:ALP,
author = "Christopher R{\'e} and Dan Suciu",
title = "Approximate lineage for probabilistic databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "797--808",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453943",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sen:2008:ESC,
author = "Prithviraj Sen and Amol Deshpande and Lise Getoor",
title = "Exploiting shared correlations in probabilistic
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "809--820",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453944",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rastogi:2008:ACU,
author = "Vibhor Rastogi and Dan Suciu and Evan Welbourne",
title = "Access control over uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "821--832",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453945",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2008:ABG,
author = "Graham Cormode and Divesh Srivastava and Ting Yu and
Qing Zhang",
title = "Anonymizing bipartite graph data using safe
groupings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "833--844",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453947",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bu:2008:PPS,
author = "Yingyi Bu and Ada Wai-Chee Fu and Raymond Chi-Wing
Wong and Lei Chen and Jiuyong Li",
title = "Privacy preserving serial data publishing by role
composition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "845--856",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453948",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiao:2008:OPQ,
author = "Xiaokui Xiao and Yufei Tao",
title = "Output perturbation with query relaxation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "857--869",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453949",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lomet:2008:TTI,
author = "David Lomet and Mingsheng Hong and Rimma Nehme and Rui
Zhang",
title = "Transaction time indexing with version compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "870--881",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453951",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Moon:2008:MQT,
author = "Hyun J. Moon and Carlo A. Curino and Alin Deutsch and
Chien-Yi Hou and Carlo Zaniolo",
title = "Managing and querying transaction-time databases under
schema evolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "882--895",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453952",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sherkat:2008:EST,
author = "Reza Sherkat and Davood Rafiei",
title = "On efficiently searching trajectories and archival
data for historical similarities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "896--908",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453953",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pu:2008:KQC,
author = "Ken Q. Pu and Xiaohui Yu",
title = "Keyword query cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "909--920",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453955",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2008:RIR,
author = "Ziyang Liu and Yi Chen",
title = "Reasoning and identifying relevant matches for {XML}
keyword search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "921--932",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453956",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiao:2008:EJE,
author = "Chuan Xiao and Wei Wang and Xuemin Lin",
title = "{Ed-Join}: an efficient algorithm for similarity joins
with edit distance constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "933--944",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453957",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2008:SAH,
author = "Sanjay Agrawal and Kaushik Chakrabarti and Surajit
Chaudhuri and Venkatesh Ganti",
title = "Scalable ad-hoc entity extraction from text
collections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "945--957",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453958",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2008:SSS,
author = "Parag Agrawal and Daniel Kifer and Christopher
Olston",
title = "Scheduling shared scans of large data files",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "958--969",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453960",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nath:2008:OMV,
author = "Suman Nath and Phillip B. Gibbons",
title = "Online maintenance of very large random samples on
flash storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "970--983",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453961",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ge:2008:SLA,
author = "Tingjian Ge and Stan Zdonik",
title = "A skip-list approach for efficiently processing
forecasting queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "984--995",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453962",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Phan:2008:RRF,
author = "Thomas Phan and Wen-Syan Li",
title = "A request-routing framework for {SOA}-based enterprise
computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "996--1007",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453963",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Weiss:2008:HSI,
author = "Cathrin Weiss and Panagiotis Karras and Abraham
Bernstein",
title = "{Hexastore}: sextuple indexing for {Semantic Web} data
management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1008--1019",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453965",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shahabi:2008:ILS,
author = "Cyrus Shahabi and Lu-An Tang and Songhua Xing",
title = "Indexing land surface for efficient {kNN} query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1020--1031",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453966",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wong:2008:ESQ,
author = "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Jian Pei
and Yip Sing Ho and Tai Wong and Yubao Liu",
title = "Efficient skyline querying with variable user
preferences on nominal attributes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1032--1043",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453967",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2008:ETP,
author = "Lin Guo and Sihem Amer Yahia and Raghu Ramakrishnan
and Jayavel Shanmugasundaram and Utkarsh Srivastava and
Erik Vee",
title = "Efficient top-$k$ processing over query-dependent
functions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1044--1055",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453968",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2008:FER,
author = "Wei Wu and Fei Yang and Chee-Yong Chan and Kian-Lee
Tan",
title = "{FINCH}: evaluating reverse $k$-Nearest-Neighbor
queries on location data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1056--1067",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453970",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jeung:2008:DCT,
author = "Hoyoung Jeung and Man Lung Yiu and Xiaofang Zhou and
Christian S. Jensen and Heng Tao Shen",
title = "Discovery of convoys in trajectory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1068--1080",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453971",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2008:TTC,
author = "Jae-Gil Lee and Jiawei Han and Xiaolei Li and Hector
Gonzalez",
title = "{TraClass}: trajectory classification using
hierarchical region-based and trajectory-based
clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1081--1094",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453972",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nutanong:2008:VDQ,
author = "Sarana Nutanong and Rui Zhang and Egemen Tanin and
Lars Kulik",
title = "The {V*-Diagram}: a query-dependent approach to moving
{KNN} queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1095--1106",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453973",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guravannavar:2008:RPB,
author = "Ravindra Guravannavar and S. Sudarshan",
title = "Rewriting procedures for batched bindings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1107--1123",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453975",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{D:2008:IRP,
author = "Harish D. and Pooja N. Darera and Jayant R. Haritsa",
title = "Identifying robust plans through plan diagram
reduction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1124--1140",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453976",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaudhuri:2008:PYG,
author = "Surajit Chaudhuri and Vivek Narasayya and Ravi
Ramamurthy",
title = "A pay-as-you-go framework for query execution
feedback",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1141--1152",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453977",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Condie:2008:ERM,
author = "Tyson Condie and David Chu and Joseph M. Hellerstein
and Petros Maniatis",
title = "{Evita Raced}: metacompilation for declarative
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1153--1165",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453978",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chiang:2008:DDQ,
author = "Fei Chiang and Ren{\'e}e J. Miller",
title = "Discovering data quality rules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1166--1177",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453980",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2008:MNR,
author = "Xiang Zhang and Feng Pan and Wei Wang and Andrew
Nobel",
title = "Mining non-redundant high order correlations in binary
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1178--1188",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453981",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dalvi:2008:KSE,
author = "Bhavana Bharat Dalvi and Meghana Kshirsagar and S.
Sudarshan",
title = "Keyword search on external memory data graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1189--1204",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453982",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koltsidas:2008:SHD,
author = "Ioannis Koltsidas and Heiko M{\"u}ller and Stratis D.
Viglas",
title = "Sorting hierarchical data in external memory for
archiving",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "1",
pages = "1205--1216",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1453856.1453983",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:36 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Metwally:2008:SSP,
author = "Ahmed Metwally and Fatih Emek{\c{c}}i and Divyakant
Agrawal and Amr {El Abbadi}",
title = "{SLEUTH}: {Single-pubLisher attack dEtection Using
correlaTion Hunting}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1217--1228",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454161",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Poess:2008:ECK,
author = "Meikel Poess and Raghunath Othayoth Nambiar",
title = "Energy cost, the key challenge of today's data
centers: a power consumption analysis of {TPC}-{C}
results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1229--1240",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454162",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Madhavan:2008:GDW,
author = "Jayant Madhavan and David Ko and Lucja Kot and Vignesh
Ganapathy and Alex Rasmussen and Alon Halevy",
title = "{Google}'s {Deep Web} crawl",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1241--1252",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454163",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Weis:2008:ISD,
author = "Melanie Weis and Felix Naumann and Ulrich Jehle and
Jens Lufter and Holger Schuster",
title = "Industry-scale duplicate detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1253--1264",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454165",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaiken:2008:SEE,
author = "Ronnie Chaiken and Bob Jenkins and Per-{\AA}ke Larson
and Bill Ramsey and Darren Shakib and Simon Weaver and
Jingren Zhou",
title = "{SCOPE}: easy and efficient parallel processing of
massive data sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1265--1276",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454166",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cooper:2008:PYH,
author = "Brian F. Cooper and Raghu Ramakrishnan and Utkarsh
Srivastava and Adam Silberstein and Philip Bohannon and
Hans-Arno Jacobsen and Nick Puz and Daniel Weaver and
Ramana Yerneni",
title = "{PNUTS}: {Yahoo!}'s hosted data serving platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1277--1288",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454167",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Acharya:2008:RSF,
author = "Srini Acharya and Peter Carlin and Cesar
Galindo-Legaria and Krzysztof Kozielczyk and Pawel
Terlecki and Peter Zabback",
title = "Relational support for flexible schema scenarios",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1289--1300",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454169",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mukherjee:2008:OSS,
author = "Niloy Mukherjee and Bharath Aleti and Amit Ganesh and
Krishna Kunchithapadam and Scott Lynn and Sujatha
Muthulingam and Kam Shergill and Shaoyu Wang and Wei
Zhang",
title = "{Oracle SecureFiles System}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1301--1312",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454170",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chhugani:2008:EIS,
author = "Jatin Chhugani and Anthony D. Nguyen and Victor W. Lee
and William Macy and Mostafa Hagog and Yen-Kuang Chen
and Akram Baransi and Sanjeev Kumar and Pradeep Dubey",
title = "Efficient implementation of sorting on multi-core
{SIMD CPU} architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1313--1324",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454171",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dey:2008:EAQ,
author = "Atreyee Dey and Sourjya Bhaumik and Harish D. and
Jayant R. Haritsa",
title = "Efficiently approximating query optimizer plan
diagrams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1325--1336",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454173",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Slezak:2008:BAD,
author = "Dominik {\'S}l{\k{e}}zak and Jakub Wr{\'o}blewski and
Victoria Eastwood and Piotr Synak",
title = "{Brighthouse}: an analytic data warehouse for ad-hoc
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1337--1345",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454174",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ziauddin:2008:OPC,
author = "Mohamed Ziauddin and Dinesh Das and Hong Su and Yali
Zhu and Khaled Yagoub",
title = "Optimizer plan change management: improved stability
and performance in {Oracle 11g}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1346--1355",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454175",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2008:TPX,
author = "Zhen Hua Liu and Sivasankaran Chandrasekar and Thomas
Baby and Hui J. Chang",
title = "Towards a physical {XML} independent {XQuery\slash
SQL\slash XML} engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1356--1367",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454177",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2008:CQP,
author = "Allison W. Lee and Mohamed Zait",
title = "Closing the query processing loop in {Oracle 11g}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1368--1378",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454178",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jain:2008:TSS,
author = "Namit Jain and Shailendra Mishra and Anand Srinivasan
and Johannes Gehrke and Jennifer Widom and Hari
Balakrishnan and U{\u{g}}ur {\c{C}}etintemel and Mitch
Cherniack and Richard Tibbetts and Stan Zdonik",
title = "Towards a streaming {SQL} standard",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1379--1390",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454179",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2008:ESG,
author = "Yu Huang and Ziyang Liu and Yi Chen",
title = "{eXtract}: a snippet generation system for {XML}
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1392--1395",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454181",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Terwilliger:2008:LIQ,
author = "James F. Terwilliger and Sergey Melnik and Philip A.
Bernstein",
title = "Language-integrated querying of {XML} data in {SQL}
server",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1396--1399",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454182",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mathis:2008:XXC,
author = "Christian Mathis and Andreas M. Weiner and Theo
H{\"a}rder and Caesar Ralf Franz Hoppen",
title = "{XTCcmp}: {XQuery} compilation on {XTC}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1400--1403",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454183",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tian:2008:PGG,
author = "Yuanyuan Tian and Jignesh M. Patel and Viji Nair and
Sebastian Martini and Matthias Kretzler",
title = "{Periscope\slash GQ}: a graph querying toolkit",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1404--1407",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454184",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Balmin:2008:SSS,
author = "Andrey Balmin and Latha Colby and Emiran Curtmola and
Quanzhong Li and Fatma {\"O}zcan and Sharath Srinivas
and Zografoula Vagena",
title = "{SEDA}: a system for search, exploration, discovery,
and analysis of {XML Data}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1408--1411",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454185",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Motahari:2008:PSD,
author = "Hamid Motahari and Boualem Benatallah and Regis
Saint-Paul and Fabio Casati and Periklis Andritsos",
title = "Process spaceship: discovering and exploring process
views from event logs in data spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1412--1415",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454186",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lupu:2008:PPP,
author = "Mihai Lupu and Y. C. Tay",
title = "{P$^{3}$N}: profiling the potential of a peer-based
data management system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1416--1419",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454188",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tlili:2008:PLT,
author = "Mounir Tlili and W. Kokou Dedzoe and Esther Pacitti
and Patrick Valduriez and Reza Akbarinia and Pascal
Molli and G{\'e}r{\^o}me Canals and St{\'e}phane
Lauri{\`e}re",
title = "{P2P} logging and timestamping for reconciliation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1420--1423",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454189",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Luu:2008:ASP,
author = "Toan Luu and Gleb Skobeltsyn and Fabius Klemm and
Maroje Puh and Ivana Podnar {\v{Z}}arko and Martin Rajman and
Karl Aberer",
title = "{AlvisP2P}: scalable peer-to-peer text retrieval in a
structured {P2P} network",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1424--1427",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454190",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abiteboul:2008:WEP,
author = "S. Abiteboul and T. Allard and P. Chatalic and G.
Gardarin and A. Ghitescu and F. Goasdou{\'e} and I.
Manolescu and B. Nguyen and M. Ouazara and A. Somani
and N. Travers and G. Vasile and S. Zoupanos",
title = "{WebContent}: efficient {P2P Warehousing} of {Web}
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1428--1431",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454191",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jurczyk:2008:DED,
author = "Pawel Jurczyk and Li Xiong",
title = "{DObjects}: enabling distributed data services for
metacomputing platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1432--1435",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454192",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shao:2008:ETR,
author = "Qihong Shao and Yi Chen and Shu Tao and Xifeng Yan and
Nikos Anerousis",
title = "{EasyTicket}: a ticket routing recommendation engine
for enterprise problem resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1436--1439",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454193",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Duda:2008:ACI,
author = "Cristian Duda and Gianni Frey and Donald Kossmann and
Chong Zhou",
title = "{AJAXSearch}: crawling, indexing and searching {Web
2.0} applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1440--1443",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454195",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2008:MSH,
author = "Kun Liu and Evimaria Terzi and Tyrone Grandison",
title = "{ManyAspects}: a system for highlighting diverse
concepts in documents",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1444--1447",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454196",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Curtmola:2008:XDC,
author = "Emiran Curtmola and Alin Deutsch and Dionysios
Logothetis and K. K. Ramakrishnan and Divesh Srivastava
and Kenneth Yocum",
title = "{XTreeNet}: democratic community search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1448--1451",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454197",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2008:EVK,
author = "Guoliang Li and Jianhua Feng and Jianyong Wang and
Lizhu Zhou",
title = "An effective and versatile keyword search engine on
heterogeneous data sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1452--1455",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454198",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Baid:2008:DME,
author = "Akanksha Baid and Andrey Balmin and Heasoo Hwang and
Erik Nijkamp and Jun Rao and Berthold Reinwald and
Alkis Simitsis and Yannis Sismanis and Frank van Ham",
title = "{DBPubs}: multidimensional exploration of database
publications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1456--1459",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454199",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2008:SDQ,
author = "Wenfei Fan and Floris Geerts and Xibei Jia",
title = "{Semandaq}: a data quality system based on conditional
functional dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1460--1463",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454200",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Katsis:2008:RTI,
author = "Yannis Katsis and Alin Deutsch and Yannis
Papakonstantinou and Keliang Zhao",
title = "{RIDE}: a tool for interactive source registration in
community-oriented information integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1464--1467",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454202",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexe:2008:CEM,
author = "Bogdan Alexe and Wang-Chiew Tan and Yannis
Velegrakis",
title = "Comparing and evaluating mapping systems with
{STBenchmark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1468--1471",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454203",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Logothetis:2008:AHD,
author = "Dionysios Logothetis and Kenneth Yocum",
title = "Ad-hoc data processing in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1472--1475",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454204",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Weigel:2008:LSC,
author = "Felix Weigel and Biswanath Panda and Mirek Riedewald
and Johannes Gehrke and Manuel Calimlim",
title = "Large-scale collaborative analysis and extraction of
{Web} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1476--1479",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454205",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Crecelius:2008:MSS,
author = "Tom Crecelius and Mouna Kacimi and Sebastian Michel
and Thomas Neumann and Josiane Xavier Parreira and Ralf
Schenkel and Gerhard Weikum",
title = "Making {SENSE}: socially enhanced search and
exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1480--1483",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454206",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2008:ASD,
author = "Wentian Lu and Gerome Miklau",
title = "{AuditGuard}: a system for database auditing under
retention restrictions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1484--1487",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454207",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hu:2008:QVQ,
author = "Ling Hu and Kenneth A. Ross and Yuan-Chi Chang and
Christian A. Lang and Donghui Zhang",
title = "{QueryScope}: visualizing queries for repeatable
database tuning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1488--1491",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454209",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hose:2008:WIT,
author = "Katja Hose and Daniel Klan and Matthias Marx and
Kai-Uwe Sattler",
title = "When is it time to rethink the aggregate configuration
of your {OLAP} server?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1492--1495",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454210",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kallman:2008:HSH,
author = "Robert Kallman and Hideaki Kimura and Jonathan Natkins
and Andrew Pavlo and Alexander Rasin and Stanley Zdonik
and Evan P. C. Jones and Samuel Madden and Michael
Stonebraker and Yang Zhang and John Hugg and Daniel J.
Abadi",
title = "{H-store}: a high-performance, distributed main memory
transaction processing system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1496--1499",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454211",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Perlman:2008:OIN,
author = "Eric Perlman and Randal Burns and Michael Kazhdan",
title = "Organizing and indexing non-convex regions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1500--1503",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454212",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Paquet:2008:CME,
author = "Eric Paquet and Herna L. Viktor",
title = "{Capri\slash MR}: exploring protein databases from a
structural and physicochemical point of view",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1504--1507",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454213",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2008:CMM,
author = "Fan Guo and Lei Li and Christos Faloutsos and Eric P.
Xing",
title = "{C-DEM}: a multi-modal query system for {Drosophila}
embryo databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1508--1511",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454214",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Milo:2008:QMD,
author = "Tova Milo and Daniel Deutch",
title = "Querying and monitoring distributed business
processes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1512--1515",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454216",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Franklin:2008:FTD,
author = "Michael Franklin and Alon Halevy and David Maier",
title = "A first tutorial on dataspaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1516--1517",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454217",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Franconi:2008:ODM,
author = "Enrico Franconi",
title = "Ontologies and databases: myths and challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1518--1519",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454218",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Balazinska:2008:SAP,
author = "Magdalena Balazinska and Christopher R{\'e} and Dan
Suciu",
title = "Systems aspects of probabilistic data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1520--1521",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454219",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2008:RIC,
author = "Wenfei Fan and Floris Geerts and Xibei Jia",
title = "A revival of integrity constraints for data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1522--1523",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454220",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Moro:2008:XSS,
author = "Mirella M. Moro and Zografoula Vagena and Vassilis J.
Tsotras",
title = "{XML} structural summaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1524--1525",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454221",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sharaf:2008:SCQ,
author = "Mohamed A. Sharaf and Alexandros Labrinidis and Panos
K. Chrysanthis",
title = "Scheduling continuous queries in data stream
management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1526--1527",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454222",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kriegel:2008:DCM,
author = "Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur
Zimek",
title = "Detecting clusters in moderate-to-high dimensional
data: subspace clustering, pattern-based clustering,
and correlation clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1528--1529",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454223",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2008:FFI,
author = "Graham Cormode and Marios Hadjieleftheriou",
title = "Finding frequent items in data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1530--1541",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454225",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2008:QMT,
author = "Hui Ding and Goce Trajcevski and Peter Scheuermann and
Xiaoyue Wang and Eamonn Keogh",
title = "Querying and mining of time series data: experimental
comparison of representations and distance measures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1542--1552",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454226",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sidirourgos:2008:CSS,
author = "Lefteris Sidirourgos and Romulo Goncalves and Martin
Kersten and Niels Nes and Stefan Manegold",
title = "Column-store support for {RDF} data management: not
all swans are white",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1553--1563",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454227",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sans:2008:PBN,
author = "Virginie Sans and Dominique Laurent",
title = "Prefix based numbering schemes for {XML}: techniques,
applications and performances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1564--1573",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454228",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2008:BEM,
author = "Su Chen and Christian S. Jensen and Dan Lin",
title = "A benchmark for evaluating moving object indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1574--1585",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454229",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dittrich:2008:DRM,
author = "Jens Dittrich and Lukas Blunschi and Marcos Antonio
Vaz Salles",
title = "Dwarfs in the rearview mirror: how big are they
really?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1586--1597",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454230",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shao:2008:CTE,
author = "Jie Shao and Heng Tao Shen and Xiaofang Zhou",
title = "Challenges and techniques for effective and efficient
similarity search in large video databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1598--1603",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454232",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hopfgartner:2008:SIM,
author = "Frank Hopfgartner",
title = "Studying interaction methodologies in video
retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1604--1608",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454233",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lo:2008:MPR,
author = "David Lo and Siau-Cheng Khoo",
title = "Mining patterns and rules for software specification
discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1609--1616",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454234",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Biveinis:2008:TEM,
author = "Laurynas Biveinis and Simonas {\v{S}}altenis",
title = "Towards efficient main-memory use for optimum tree
index update",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1617--1622",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454236",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Holupirek:2008:IFT,
author = "Alexander Holupirek and Marc H. Scholl",
title = "Implementing filesystems by tree-aware {DBMSs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1623--1630",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454237",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Avanes:2008:AWS,
author = "Artin Avanes and Johann-Christoph Freytag",
title = "Adaptive workflow scheduling under resource allocation
constraints and network dynamics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1631--1637",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454238",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zerr:2008:PPD,
author = "Sergej Zerr and Wolfgang Nejdl",
title = "Privacy preserving document indexing infrastructure
for a distributed environment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1638--1643",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454240",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Miao:2008:GTG,
author = "Jiajia Miao",
title = "{GS-TMS}: a global stream-based threat monitor
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1644--1651",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454241",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kharlamov:2008:III,
author = "Evgeny Kharlamov and Werner Nutt",
title = "Incompleteness in information integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1652--1658",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454242",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2008:QWB,
author = "Daniel Deutch and Tova Milo",
title = "Querying {Web}-based applications under models of
uncertainty",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1659--1665",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454244",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Silvasti:2008:XDF,
author = "Panu Silvasti and Seppo Sippu and Eljas
Soisalon-Soininen",
title = "{XML}-document-filtering automaton",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1666--1671",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454245",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Scholl:2008:CDD,
author = "Tobias Scholl and Alfons Kemper",
title = "Community-driven data grids",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "1",
number = "2",
pages = "1672--1677",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1454159.1454246",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:44 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gatterbauer:2009:BIA,
author = "Wolfgang Gatterbauer and Magdalena Balazinska and
Nodira Khoussainova and Dan Suciu",
title = "Believe it or not: adding belief annotations to
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1--12",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2009:SSB,
author = "Zhenjie Zhang and Beng Chin Ooi and Srinivasan
Parthasarathy and Anthony K. H. Tung",
title = "Similarity search on {Bregman} divergence: towards
non-metric indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "13--24",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2009:CSA,
author = "Zhiping Zeng and Anthony K. H. Tung and Jianyong Wang
and Jianhua Feng and Lizhu Zhou",
title = "Comparing stars: on approximating graph edit
distance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "25--36",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Whang:2009:IBE,
author = "Steven Euijong Whang and Hector Garcia-Molina and Chad
Brower and Jayavel Shanmugasundaram and Sergei
Vassilvitskii and Erik Vee and Ramana Yerneni",
title = "Indexing {Boolean} expressions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "37--48",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2009:SDS,
author = "Yongluan Zhou and Ali Salehi and Karl Aberer",
title = "Scalable delivery of stream query result",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "49--60",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2009:SBI,
author = "Michael Benedikt and James Cheney",
title = "Schema-based independence analysis for {XML} updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "61--72",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nehme:2009:TSD,
author = "Rimma V. Nehme and Elke A. Rundensteiner and Elisa
Bertino",
title = "Tagging stream data for rich real-time services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "73--84",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarma:2009:RMP,
author = "Atish Das Sarma and Ashwin Lall and Danupon Nanongkai
and Jun Xu",
title = "Randomized multi-pass streaming skyline algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "85--96",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Reeves:2009:MMT,
author = "Galen Reeves and Jie Liu and Suman Nath and Feng
Zhao",
title = "Managing massive time series streams with multi-scale
compressed trickles",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "97--108",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2009:PAM,
author = "Tianyi Wu and Dong Xin and Qiaozhu Mei and Jiawei
Han",
title = "Promotion analysis in multi-dimensional space",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "109--120",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarkas:2009:MDK,
author = "Nikos Sarkas and Nilesh Bansal and Gautam Das and Nick
Koudas",
title = "Measure-driven keyword-query expansion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "121--132",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2009:UTD,
author = "Bin Liu and H. V. Jagadish",
title = "Using trees to depict a forest",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "133--144",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmeleegy:2009:OPW,
author = "Hazem Elmeleegy and Ahmed K. Elmagarmid and Emmanuel
Cecchet and Walid G. Aref and Willy Zwaenepoel",
title = "Online piece-wise linear approximation of numerical
streams with precision guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "145--156",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Stern:2009:WTE,
author = "Mirco Stern and Erik Buchmann and Klemens B{\"o}hm",
title = "A wavelet transform for efficient consolidation of
sensor relations with quality guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "157--168",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2009:EAQ,
author = "Liu Yu and Jianzhong Li and Hong Gao and Xiaolin
Fang",
title = "Enabling $ \epsilon $-approximate querying in sensor
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "169--180",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nandi:2009:HUS,
author = "Arnab Nandi and Philip A. Bernstein",
title = "{HAMSTER}: using search clicklogs for schema and
taxonomy matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "181--192",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kot:2009:CUE,
author = "Lucja Kot and Christoph Koch",
title = "Cooperative update exchange in the {Youtopia} system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "193--204",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papapetrou:2009:RBA,
author = "Panagiotis Papapetrou and Vassilis Athitsos and George
Kollios and Dimitrios Gunopulos",
title = "Reference-based alignment in large sequence
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "205--216",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2009:TCM,
author = "Sudipto Das and Shyam Antony and Divyakant Agrawal and
Amr {El Abbadi}",
title = "Thread cooperation in multicore architectures for
frequency counting over multiple data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "217--228",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mueller:2009:SWQ,
author = "Rene Mueller and Jens Teubner and Gustavo Alonso",
title = "Streams on wires: a query compiler for {FPGAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "229--240",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2009:FPD,
author = "Badrish Chandramouli and Jonathan Goldstein and David
Maier",
title = "On-the-fly progress detection in iterative stream
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "241--252",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kraska:2009:CRC,
author = "Tim Kraska and Martin Hentschel and Gustavo Alonso and
Donald Kossmann",
title = "Consistency rationing in the cloud: pay only when it
matters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "253--264",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lomet:2009:LKR,
author = "David Lomet and Mohamed F. Mokbel",
title = "Locking key ranges with unbundled transaction
services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "265--276",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Candea:2009:SPJ,
author = "George Candea and Neoklis Polyzotis and Radek
Vingralek",
title = "A scalable, predictable join operator for highly
concurrent data warehouses",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "277--288",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2009:ATA,
author = "Rahul Gupta and Sunita Sarawagi",
title = "Answering table augmentation queries from unstructured
lists on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "289--300",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cautis:2009:ERX,
author = "Bogdan Cautis and Alin Deutsch and Nicola Onose and
Vasilis Vassalos",
title = "Efficient rewriting of {XPath} queries using {Query
Set Specifications}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "301--312",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2009:SSR,
author = "Ziyang Liu and Peng Sun and Yi Chen",
title = "Structured search result differentiation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "313--324",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dragut:2009:HAM,
author = "Eduard C. Dragut and Thomas Kabisch and Clement Yu and
Ulf Leser",
title = "A hierarchical approach to model {Web} query
interfaces for {Web} source integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "325--336",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cong:2009:ERT,
author = "Gao Cong and Christian S. Jensen and Dingming Wu",
title = "Efficient retrieval of the top-$k$ most relevant
spatial {Web} objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "337--348",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dragut:2009:SWR,
author = "Eduard Dragut and Fang Fang and Prasad Sistla and
Clement Yu and Weiyi Meng",
title = "Stop word and related problems in {Web} interface
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "349--360",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2009:LAT,
author = "Devesh Agrawal and Deepak Ganesan and Ramesh Sitaraman
and Yanlei Diao and Shashi Singh",
title = "Lazy-Adaptive {Tree}: an optimized index structure for
flash devices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "361--372",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2009:MDM,
author = "Rubao Lee and Xiaoning Ding and Feng Chen and Qingda
Lu and Xiaodong Zhang",
title = "{MCC-DB}: minimizing cache conflicts in multi-core
processors for databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "373--384",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Willhalm:2009:SSU,
author = "Thomas Willhalm and Nicolae Popovici and Yazan Boshmaf
and Hasso Plattner and Alexander Zeier and Jan
Schaffner",
title = "{SIMD-scan}: ultra fast in-memory table scan using
on-chip vector processing units",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "385--394",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaudhuri:2009:MDC,
author = "Surajit Chaudhuri and Venkatesh Ganti and Dong Xin",
title = "Mining document collections to facilitate accurate
approximate entity matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "395--406",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2009:RAR,
author = "Wenfei Fan and Xibei Jia and Jianzhong Li and Shuai
Ma",
title = "Reasoning about record matching rules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "407--418",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dobra:2009:TCE,
author = "Alin Dobra and Chris Jermaine and Florin Rusu and Fei
Xu",
title = "Turbo-charging estimate convergence in {DBO}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "419--430",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cohen:2009:CSA,
author = "Edith Cohen and Nick Duffield and Haim Kaplan and
Carsten Lund and Mikkel Thorup",
title = "Composable, scalable, and accurate weight
summarization of unaggregated data sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "431--442",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2009:DOA,
author = "Sai Wu and Shouxu Jiang and Beng Chin Ooi and Kian-Lee
Tan",
title = "Distributed online aggregations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "443--454",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koloniari:2009:RBC,
author = "Georgia Koloniari and Evaggelia Pitoura",
title = "A recall-based cluster formation game in peer-to-peer
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "455--466",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fekete:2009:QIA,
author = "Alan Fekete and Shirley N. Goldrei and Jorge P{\'e}rez
Asenjo",
title = "Quantifying isolation anomalies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "467--478",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Johnson:2009:IOS,
author = "Ryan Johnson and Ippokratis Pandis and Anastasia
Ailamaki",
title = "Improving {OLTP} scalability using speculative lock
inheritance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "479--489",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sears:2009:SBR,
author = "Russell Sears and Eric Brewer",
title = "Segment-based recovery: write-ahead logging
revisited",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "490--501",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2009:UAR,
author = "Jian Li and Barna Saha and Amol Deshpande",
title = "A unified approach to ranking in probabilistic
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "502--513",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arasu:2009:LST,
author = "Arvind Arasu and Surajit Chaudhuri and Raghav
Kaushik",
title = "Learning string transformations from examples",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "514--525",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2009:PHP,
author = "Graham Cormode and Antonios Deligiannakis and Minos
Garofalakis and Andrew McGregor",
title = "Probabilistic histograms for probabilistic data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "526--537",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Greenshpan:2009:AM,
author = "Ohad Greenshpan and Tova Milo and Neoklis Polyzotis",
title = "Autocompletion for mashups",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "538--549",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2009:ICD,
author = "Xin Luna Dong and Laure Berti-Equille and Divesh
Srivastava",
title = "Integrating conflicting data: the role of source
dependence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "550--561",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2009:TDC,
author = "Xin Luna Dong and Laure Berti-Equille and Divesh
Srivastava",
title = "Truth discovery and copying detection in a dynamic
world",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "562--573",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Golab:2009:SD,
author = "Lukasz Golab and Howard Karloff and Flip Korn and
Avishek Saha and Divesh Srivastava",
title = "Sequential dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "574--585",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Denev:2009:SFQ,
author = "Dimitar Denev and Arturas Mazeika and Marc Spaniol and
Gerhard Weikum",
title = "{SHARC}: framework for quality-conscious {Web}
archiving",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "586--597",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Beskales:2009:MQP,
author = "George Beskales and Mohamed A. Soliman and Ihab F.
Ilyas and Shai Ben-David",
title = "Modeling and querying possible repairs in duplicate
detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "598--609",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mindolin:2009:DRI,
author = "Denis Mindolin and Jan Chomicki",
title = "Discovering relative importance of skyline
attributes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "610--621",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2009:PDB,
author = "Min-Soo Kim and Jiawei Han",
title = "A particle-and-density based evolutionary clustering
method for dynamic networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "622--633",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2009:SRD,
author = "Xiaoyan Yang and Cecilia M. Procopiuc and Divesh
Srivastava",
title = "Summarizing relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "634--645",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cohen:2009:CWS,
author = "Edith Cohen and Haim Kaplan and Subhabrata Sen",
title = "Coordinated weighted sampling for estimating
aggregates over multiple weight assignments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "646--657",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2009:PLB,
author = "Hongrae Lee and Raymond T. Ng and Kyuseok Shim",
title = "Power-law based estimation of set similarity join
size",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "658--669",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karras:2009:OSL,
author = "Panagiotis Karras",
title = "Optimality and scalability in lattice histogram
construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "670--681",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vigfusson:2009:APD,
author = "Ymir Vigfusson and Adam Silberstein and Brian F.
Cooper and Rodrigo Fonseca",
title = "Adaptively parallelizing distributed range queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "682--693",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tatikonda:2009:MTS,
author = "Shirish Tatikonda and Srinivasan Parthasarathy",
title = "Mining tree-structured data on multicore systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "694--705",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Unterbrunner:2009:PPU,
author = "P. Unterbrunner and G. Giannikis and G. Alonso and D.
Fauser and D. Kossmann",
title = "Predictable performance for unpredictable workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "706--717",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2009:GCB,
author = "Yang Zhou and Hong Cheng and Jeffrey Xu Yu",
title = "Graph clustering based on structural\slash attribute
similarities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "718--729",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{AlHasan:2009:OSS,
author = "Mohammad {Al Hasan} and Mohammed J. Zaki",
title = "Output space sampling for graph patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "730--741",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2009:MGP,
author = "Chen Chen and Cindy X. Lin and Matt Fredrikson and
Mihai Christodorescu and Xifeng Yan and Jiawei Han",
title = "Mining graph patterns efficiently via randomized
summaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "742--753",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amer-Yahia:2009:GRS,
author = "Sihem Amer-Yahia and Senjuti Basu Roy and Ashish
Chawlat and Gautam Das and Cong Yu",
title = "Group recommendation: semantics and efficiency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "754--765",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhagat:2009:CBG,
author = "Smriti Bhagat and Graham Cormode and Balachander
Krishnamurthy and Divesh Srivastava",
title = "Class-based graph anonymization for social network
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "766--777",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarkas:2009:ISS,
author = "Nikos Sarkas and Gautam Das and Nick Koudas",
title = "Improved search for socially annotated data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "778--789",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Machanavajjhala:2009:DPA,
author = "Ashwin Machanavajjhala and Johannes Gehrke and
Michaela G{\"o}tz",
title = "Data publishing against realistic adversaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "790--801",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pang:2009:SVO,
author = "HweeHwa Pang and Jilian Zhang and Kyriakos
Mouratidis",
title = "Scalable verification for outsourced dynamic
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "802--813",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiao:2009:ORP,
author = "Xiaokui Xiao and Yufei Tao and Minghua Chen",
title = "Optimal random perturbation at multiple privacy
levels",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "814--825",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Assent:2009:ADE,
author = "Ira Assent and Marc Wichterich and Ralph Krieger and
Hardy Kremer and Thomas Seidl",
title = "Anticipatory {DTW} for efficient similarity search in
time series databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "826--837",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tsirogiannis:2009:IPL,
author = "Dimitris Tsirogiannis and Sudipto Guha and Nick
Koudas",
title = "Improving the performance of list intersection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "838--849",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaushik:2009:CHP,
author = "Raghav Kaushik and Dan Suciu",
title = "Consistent histograms in the presence of distinct
value counts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "850--861",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aggarwal:2009:GCI,
author = "Charu Aggarwal and Yan Xie and Philip S. Yu",
title = "{GConnect}: a connectivity index for massive
disk-resident graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "862--873",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2009:SES,
author = "Di Yang and Elke A. Rundensteiner and Matthew O.
Ward",
title = "A shared execution strategy for multiple pattern
mining requests over streaming data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "874--885",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2009:DJP,
author = "Lei Zou and Lei Chen and M. Tamer {\"O}zsu",
title = "Distance-join: pattern match query in a large graph
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "886--897",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wan:2009:CCP,
author = "Qian Wan and Raymond Chi-Wing Wong and Ihab F. Ilyas
and M. Tamer {\"O}zsu and Yu Peng",
title = "Creating competitive products",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "898--909",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mueller:2009:DPF,
author = "Rene Mueller and Jens Teubner and Gustavo Alonso",
title = "Data processing on {FPGAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "910--921",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abouzeid:2009:HAH,
author = "Azza Abouzeid and Kamil Bajda-Pawlikowski and Daniel
Abadi and Avi Silberschatz and Alexander Rasin",
title = "{HadoopDB}: an architectural hybrid of {MapReduce} and
{DBMS} technologies for analytical workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "922--933",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2009:ASV,
author = "Yeye He and Jeffrey F. Naughton",
title = "Anonymization of set-valued data via top-down, local
generalization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "934--945",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2009:AGF,
author = "Lei Zou and Lei Chen and M. Tamer {\"O}zsu",
title = "$k$-automorphism: a general framework for privacy
preserving network publication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "946--957",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koudas:2009:DBM,
author = "Nick Koudas and Divesh Srivastava and Ting Yu and Qing
Zhang",
title = "Distribution based microdata anonymization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "958--969",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meier:2009:CTB,
author = "Michael Meier and Michael Schmidt and Georg Lausen",
title = "On chase termination beyond stratification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "970--981",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Moerkotte:2009:PBP,
author = "Guido Moerkotte and Thomas Neumann and Gabriele
Steidl",
title = "Preventing bad plans by bounding the impact of
cardinality estimation errors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "982--993",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaudhuri:2009:ECQ,
author = "Surajit Chaudhuri and Vivek Narasayya and Ravi
Ramamurthy",
title = "Exact cardinality query optimization for optimizer
testing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "994--1005",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{tenCate:2009:LSM,
author = "Balder ten Cate and Laura Chiticariu and Phokion
Kolaitis and Wang-Chiew Tan",
title = "Laconic schema mappings: computing the core with {SQL}
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1006--1017",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arenas:2009:ISM,
author = "Marcelo Arenas and Jorge P{\'e}rez and Juan Reutter
and Cristian Riveros",
title = "Inverting schema mappings: bridging the gap between
theory and practice",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1018--1029",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Terwilliger:2009:FFF,
author = "James F. Terwilliger and Philip A. Bernstein and
Sergey Melnik",
title = "Full-fidelity flexible object-oriented {XML} access",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1030--1041",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2009:PAM,
author = "Ting Wang and Ling Liu",
title = "Privacy-aware mobile services over road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1042--1053",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{U:2009:FAA,
author = "Leong Hou U. and Nikos Mamoulis and Kyriakos
Mouratidis",
title = "A fair assignment algorithm for multiple preference
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1054--1065",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mishima:2009:PED,
author = "Takeshi Mishima and Hiroshi Nakamura",
  title =        "{Pangea}: an eager database replication middleware
                 guaranteeing snapshot isolation without modification of
                 database servers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1066--1077",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmeleegy:2009:HRT,
author = "Hazem Elmeleegy and Jayant Madhavan and Alon Halevy",
title = "Harvesting relational tables from lists on the web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1078--1089",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cafarella:2009:DIR,
author = "Michael J. Cafarella and Alon Halevy and Nodira
Khoussainova",
title = "Data integration for the relational web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1090--1101",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gottlob:2009:NOS,
author = "Georg Gottlob and Reinhard Pichler and Vadim
Savenkov",
title = "Normalization and optimization of schema mappings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1102--1113",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xing:2009:CMN,
author = "Songhua Xing and Cyrus Shahabi and Bei Pan",
title = "Continuous monitoring of nearest neighbors on land
surface",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1114--1125",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wong:2009:EMM,
author = "Raymond Chi-Wing Wong and M. Tamer {\"O}zsu and Philip
S. Yu and Ada Wai-Chee Fu and Lian Liu",
title = "Efficient method for maximizing bichromatic reverse
nearest neighbor",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1126--1137",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheema:2009:LUE,
author = "Muhammad Aamir Cheema and Xuemin Lin and Ying Zhang
and Wei Wang and Wenjie Zhang",
title = "Lazy updates: an efficient technique to continuously
monitoring reverse {kNN}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1138--1149",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2009:NMM,
author = "Ling Chen and Sourav S. Bhowmick and Wolfgang Nejdl",
title = "{NEAR-Miner}: mining evolution associations of {Web}
site directories for efficient maintenance of {Web}
archives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1150--1161",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wong:2009:AEO,
author = "W. K. Wong and David W. Cheung and Edward Hung and Ben
Kao and Nikos Mamoulis",
title = "An audit environment for outsourcing of frequent
itemset mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1162--1173",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mozafari:2009:PNB,
author = "Barzan Mozafari and Carlo Zaniolo",
title = "Publishing naive {Bayesian} classifiers: privacy
without accuracy loss",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1174--1185",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tzoumas:2009:WAI,
author = "Kostas Tzoumas and Man Lung Yiu and Christian S.
Jensen",
title = "Workload-aware indexing of continuously moving
objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1186--1197",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2009:EIU,
author = "Meihui Zhang and Su Chen and Christian S. Jensen and
Beng Chin Ooi and Zhenjie Zhang",
title = "Effectively indexing uncertain moving objects for
predictive queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1198--1209",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sankaranarayanan:2009:POS,
author = "Jagan Sankaranarayanan and Hanan Samet and Houman
Alborzi",
title = "Path oracles for spatial networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1210--1221",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimura:2009:CMC,
author = "Hideaki Kimura and George Huo and Alexander Rasin and
Samuel Madden and Stanley B. Zdonik",
title = "Correlation maps: a compressed access method for
exploiting soft functional dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1222--1233",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schnaitter:2009:IIP,
author = "Karl Schnaitter and Neoklis Polyzotis and Lise
Getoor",
title = "Index interactions in physical design tuning:
modeling, analysis, and applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1234--1245",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Duan:2009:TDC,
author = "Songyun Duan and Vamsidhar Thummala and Shivnath
Babu",
title = "Tuning database configuration parameters with
{iTuned}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1246--1257",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Salles:2009:ECR,
author = "Marcos Vaz Salles and Tuan Cao and Benjamin Sowell and
Alan Demers and Johannes Gehrke and Christoph Koch and
Walker White",
title = "An evaluation of checkpoint recovery for massively
multiplayer online games",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1258--1269",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Muller:2009:ECS,
author = "Emmanuel M{\"u}ller and Stephan G{\"u}nnemann and Ira
Assent and Thomas Seidl",
title = "Evaluating clustering in subspace projections of high
dimensional data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1270--1281",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassanzadeh:2009:FEC,
author = "Oktie Hassanzadeh and Fei Chiang and Hyun Chul Lee and
Ren{\'e}e J. Miller",
title = "Framework for evaluating clustering algorithms in
duplicate detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "1",
pages = "1282--1293",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:50 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2009:DMM,
author = "Hongfei Guo and Dan Jones and Jennifer Beckmann and
Praveen Seshadri",
title = "Declarative management in {Microsoft SQL} server",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1294--1305",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{El-Helw:2009:SRS,
author = "Amr El-Helw and Ihab F. Ilyas and Calisto Zuzarte",
title = "{StatAdvisor}: recommending statistical views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1306--1317",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Canim:2009:OPA,
author = "Mustafa Canim and George A. Mihaila and Bishwaranjan
Bhattacharjee and Kenneth A. Ross and Christian A.
Lang",
title = "An object placement advisor for {DB2} using solid
state storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1318--1329",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhide:2009:XXP,
author = "Manish Bhide and Manoj K. Agarwal and Amir Bar-Or and
Sriram Padmanabhan and Srinivas K. Mittapalli and
Girish Venkatachaliah",
title = "{XPEDIA}: {XML} processing for data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1330--1341",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bamford:2009:XR,
author = "Roger Bamford and Vinayak Borkar and Matthias Brantner
and Peter M. Fischer and Daniela Florescu and David
Graf and Donald Kossmann and Tim Kraska and Dan Muresan
and Sorin Nasoi and Markos Zacharioudakis",
title = "{XQuery} reloaded",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1342--1353",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2009:BXS,
author = "Ning Zhang and Nipun Agarwal and Sivasankaran
Chandrasekar and Sam Idicula and Vijay Medi and Sabina
Petride and Balasubramanyam Sthanikam",
title = "Binary {XML} storage and query processing in {Oracle
11g}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1354--1365",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bellamkonda:2009:ESO,
author = "Srikanth Bellamkonda and Rafi Ahmed and Andrew
Witkowski and Angela Amor and Mohamed Zait and
Chun-Chieh Lin",
title = "Enhanced subquery optimizations in {Oracle}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1366--1377",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2009:SVH,
author = "Changkyu Kim and Tim Kaldewey and Victor W. Lee and
Eric Sedlar and Anthony D. Nguyen and Nadathur Satish
and Jatin Chhugani and Andrea {Di Blas} and Pradeep
Dubey",
title = "Sort vs. {Hash} revisited: fast join implementation on
modern multi-core {CPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1378--1389",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2009:EOJ,
author = "Yu Xu and Pekka Kostamaa",
title = "Efficient outer join data skew handling in parallel
{DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1390--1396",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Friedman:2009:SMP,
author = "Eric Friedman and Peter Pawlowski and John
Cieslewicz",
title = "{SQL\slash MapReduce}: a practical approach to
self-describing, polymorphic, and parallelizable
user-defined functions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1402--1413",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gates:2009:BHL,
author = "Alan F. Gates and Olga Natkovich and Shubham Chopra
and Pradeep Kamath and Shravan M. Narayanamurthy and
Christopher Olston and Benjamin Reed and Santhosh
Srinivasan and Utkarsh Srivastava",
title = "Building a high-level dataflow system on top of
{Map-Reduce}: the {Pig} experience",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1414--1425",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Panda:2009:PMP,
author = "Biswanath Panda and Joshua S. Herbach and Sugato Basu
and Roberto J. Bayardo",
title = "{PLANET}: massively parallel learning of tree
ensembles with {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1426--1437",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Legler:2009:RDT,
author = "Thomas Legler and Wolfgang Lehner and Jan Schaffner
and Jens Kr{\"u}ger",
title = "Robust and distributed top-n frequent-pattern mining
with {SAP BW} accelerator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1438--1449",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dieu:2009:TUF,
author = "Nicolas Dieu and Adrian Dragusanu and Fran{\c{c}}oise
Fabret and Fran{\c{c}}ois Llirbat and Eric Simon",
title = "1,000 tables under the form",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1450--1461",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhattacharjee:2009:EIC,
author = "Bishwaranjan Bhattacharjee and Lipyeow Lim and Timothy
Malkemus and George Mihaila and Kenneth Ross and
Sherman Lau and Cathy McArthur and Zoltan Toth and Reza
Sherkat",
title = "Efficient index compression in {DB2 LUW}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1462--1473",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lacroix:2009:SSW,
author = "Zo{\'e} Lacroix and Christophe Legendre and Spyro
Mousses",
title = "Storing scientific workflows in a database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1474--1480",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cohen:2009:MSN,
author = "Jeffrey Cohen and Brian Dolan and Mark Dunlap and
Joseph M. Hellerstein and Caleb Welton",
title = "{MAD} skills: new analysis practices for big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1481--1492",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ley:2009:DSL,
author = "Michael Ley",
title = "{DBLP}: some lessons learned",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1493--1500",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mukherjee:2009:OSP,
author = "Niloy Mukherjee and Amit Ganesh and Vinayagam
Djegaradjane and Sujatha Muthulingam and Wei Zhang and
Krishna Kunchithapadam and Scott Lynn and Bharath Aleti
and Kam Shergill and Shaoyu Wang",
title = "{Oracle SecureFiles}: prepared for the digital
deluge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1501--1511",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Baumgartner:2009:SWD,
author = "Robert Baumgartner and Georg Gottlob and Marcus
Herzog",
title = "Scalable {Web} data extraction for online market
intelligence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1512--1523",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rajaraman:2009:KHP,
author = "Anand Rajaraman",
title = "{Kosmix}: high-performance topic exploration using the
deep {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1524--1529",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nehme:2009:QMM,
author = "Rimma V. Nehme and Karen E. Works and Elke A.
Rundensteiner and Elisa Bertino",
title = "Query mesh: multi-route query processing technology",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1530--1533",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cudre-Mauroux:2009:DSS,
 author = "P. Cudr{\'e}-Mauroux and H. Kimura and K.-T. Lim and J.
Rogers and R. Simakov and E. Soroush and P. Velikhov
and D. L. Wang and M. Balazinska and J. Becla and D.
DeWitt and B. Heath and D. Maier and S. Madden and J.
Patel and M. Stonebraker and S. Zdonik",
title = "A demonstration of {SciDB}: a science-oriented
{DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1534--1537",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2009:MMM,
author = "Kuien Liu and Ke Deng and Zhiming Ding and Mingshu Li
and Xiaofang Zhou",
title = "{MOIR\slash MT}: monitoring large-scale road network
traffic in real-time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1538--1541",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Colle:2009:ODR,
author = "Romain Colle and Leonidas Galanis and Supiti
Buranawatanachoke and Stratos Papadomanolakis and Yujun
Wang",
title = "{Oracle Database Replay}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1542--1545",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Borisov:2009:DPD,
author = "Nedyalko Borisov and Shivnath Babu and Sandeep
Uttamchandani and Ramani Routray and Aameek Singh",
title = "{DIADS}: a problem diagnosis tool for databases and
storage area networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1546--1549",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Herschel:2009:ASA,
author = "Melanie Herschel and Mauricio A. Hern{\'a}ndez and
Wang-Chiew Tan",
title = "{Artemis}: a system for analyzing missing answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1550--1553",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2009:DTS,
 author = "Eugene Wu and Philippe Cudr{\'e}-Mauroux and Samuel
Madden",
title = "Demonstration of the {TrajStore} system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1554--1557",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ali:2009:MCS,
author = "M. H. Ali and C. Gerea and B. S. Raman and B. Sezgin
and T. Tarnavski and T. Verona and P. Wang and P.
Zabback and A. Ananthanarayan and A. Kirilov and M. Lu
and A. Raizman and R. Krishnan and R. Schindlauer and
T. Grabs and S. Bjeletich and B. Chandramouli and J.
Goldstein and S. Bhat and Ying Li and V. {Di Nicola}
and X. Wang and David Maier and S. Grell and O. Nano
and I. Santos",
title = "{Microsoft CEP Server} and online behavioral
targeting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1558--1561",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Krompass:2009:TMD,
author = "Stefan Krompass and Harumi Kuno and Janet L. Wiener
and Kevin Wilkinson and Umeshwar Dayal and Alfons
Kemper",
title = "A testbed for managing dynamic mixed workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1562--1565",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ahmad:2009:DSC,
author = "Yanif Ahmad and Christoph Koch",
title = "{DBToaster}: a {SQL} compiler for high-performance
delta processing in main-memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1566--1569",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Preda:2009:AAK,
author = "Nicoleta Preda and Fabian M. Suchanek and Gjergji
Kasneci and Thomas Neumann and Maya Ramanath and
Gerhard Weikum",
title = "{ANGIE}: active knowledge for interactive
exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1570--1573",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kopcke:2009:CEE,
author = "Hanna K{\"o}pcke and Andreas Thor and Erhard Rahm",
title = "Comparative evaluation of entity resolution approaches
with {FEVER}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1574--1577",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brauer:2009:RDR,
author = "Falk Brauer and Wojciech Barczynski and Gregor
Hackenbroich and Marcus Schramm and Adrian Mocan and
Felix F{\"o}rster",
title = "{RankIE}: document retrieval on ranked entity graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1578--1581",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mecca:2009:CEM,
author = "Giansalvatore Mecca and Paolo Papotti and Salvatore
Raunich and Marcello Buoncristiano",
 title = "Concise and expressive mappings with {+Spicy}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1582--1585",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cruz:2009:AEM,
author = "Isabel F. Cruz and Flavio Palandri Antonelli and
Cosmin Stroe",
title = "{AgreementMaker}: efficient matching for large
real-world schemas and ontologies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1586--1589",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassanzadeh:2009:LQW,
author = "Oktie Hassanzadeh and Reynold Xin and Ren{\'e}e J.
Miller and Anastasios Kementsietsidis and Lipyeow Lim
and Min Wang",
title = "{Linkage Query Writer}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1590--1593",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2009:SEE,
author = "Xiaoyuan Wang and Xingzhi Sun and Feng Cao and Li Ma
and Nick Kanellos and Kang Zhang and Yue Pan and Yong
Yu",
title = "{SMDM}: enhancing enterprise-wide master data
management using semantic {Web} technologies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1594--1597",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gubanov:2009:IUR,
author = "Michael N. Gubanov and Lucian Popa and Howard Ho and
Hamid Pirahesh and Jeng-Yih Chang and Shr-Chang Chen",
title = "{IBM UFO} repository: object-oriented data
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1598--1601",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2009:MSW,
author = "Huajun Chen and Bin Lu and Yuan Ni and Guotong Xie and
Chunying Zhou and Jinhua Mi and Zhaohui Wu",
title = "Mashup by surfing a {Web} of data {APIs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1602--1605",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pichler:2009:DDE,
author = "Reinhard Pichler and Vadim Savenkov",
title = "{DEMo}: data exchange modeling tool",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1606--1609",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Letchner:2009:LDW,
author = "Julie Letchner and Christopher R{\'e} and Magdalena
Balazinska and Matthai Philipose",
title = "Lahar demonstration: warehousing {Markovian} streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1610--1613",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2009:WAC,
author = "Peng Sun and Ziyang Liu and Sivaramakrishnan Natarajan
and Susan B. Davidson and Yi Chen",
title = "{WOLVES}: achieving correct provenance analysis by
detecting and resolving unsound workflow views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1614--1617",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dai:2009:TTI,
author = "Chenyun Dai and Gabriel Ghinita and Elisa Bertino and
Ji-Won Byun and Ninghui Li",
title = "{TIAMAT}: a tool for interactive analysis of microdata
anonymization techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1618--1621",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2009:IIN,
author = "Yintao Yu and Cindy X. Lin and Yizhou Sun and Chen
Chen and Jiawei Han and Binbin Liao and Tianyi Wu and
ChengXiang Zhai and Duo Zhang and Bo Zhao",
title = "{iNextCube}: information network-enhanced text cube",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1622--1625",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thusoo:2009:HWS,
author = "Ashish Thusoo and Joydeep Sen Sarma and Namit Jain and
Zheng Shao and Prasad Chakka and Suresh Anthony and Hao
Liu and Pete Wyckoff and Raghotham Murthy",
title = "{Hive}: a warehousing solution over a map-reduce
framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1626--1629",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Satish:2009:TEB,
author = "Arjun Satish and Ramesh Jain and Amarnath Gupta",
title = "{Tolkien}: an event based storytelling system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1630--1633",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarigol:2009:ESN,
author = "Emre Sarig{\"o}l and Oriana Riva and Patrick Stuedi
and Gustavo Alonso",
title = "Enabling social networking in ad hoc networks of
mobile phones",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1634--1637",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bao:2009:PVD,
author = "Zhuowei Bao and Sarah Cohen-Boulakia and Susan B.
Davidson and Pierrick Girard",
title = "{PDiffView}: viewing the difference in provenance of
workflow results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1638--1641",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2009:GOW,
author = "Daniel Deutch and Tova Milo and Tom Yam",
title = "Goal-oriented {Web}-site navigation for on-line
shoppers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1642--1645",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pereira:2009:AWQ,
author = "Fernando Pereira and Anand Rajaraman and Sunita
Sarawagi and William Tunstall-Pedoe and Gerhard Weikum
and Alon Halevy",
title = "Answering {Web} questions using structured data: dream
or reality?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1646--1646",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bernstein:2009:HBB,
author = "Philip A. Bernstein and Daniel J. Abadi and Michael J.
Cafarella and Joseph M. Hellerstein and Donald Kossmann
and Samuel Madden",
title = "How best to build {Web}-scale data managers?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1647--1647",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Manegold:2009:DAE,
author = "Stefan Manegold and Martin L. Kersten and Peter
Boncz",
title = "Database architecture evolution: mammals flourished
long before dinosaurs became extinct",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1648--1653",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2009:DFR,
author = "Xin Luna Dong and Felix Naumann",
title = "Data fusion: resolving data conflicts for
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1654--1655",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Heer:2009:DVS,
author = "Jeffrey Heer and Joseph M. Hellerstein",
title = "Data visualization and social data analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1656--1657",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaudhuri:2009:KQR,
author = "Surajit Chaudhuri and Gautam Das",
title = "Keyword querying and ranking in databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1658--1659",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hadjieleftheriou:2009:EAS,
author = "Marios Hadjieleftheriou and Chen Li",
title = "Efficient approximate search on string collections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1660--1661",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Srivastava:2009:ITD,
author = "Divesh Srivastava and Suresh Venkatasubramanian",
title = "Information theory for data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1662--1663",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abadi:2009:COD,
author = "Daniel J. Abadi and Peter A. Boncz and Stavros
Harizopoulos",
title = "Column-oriented database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "2",
number = "2",
pages = "1664--1665",
month = aug,
year = "2009",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:54:57 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Srivastava:2010:ERT,
author = "Divesh Srivastava and Lukasz Golab and Rick Greer and
Theodore Johnson and Joseph Seidel and Vladislav
Shkapenyuk and Oliver Spatscheck and Jennifer Yates",
title = "Enabling real time data analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1--2",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Matsudaira:2010:HEB,
author = "Paul Matsudaira",
title = "High-end biological imaging generates very large
{$3$D+} and dynamic datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "3--3",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cho:2010:DWD,
author = "Junghoo Cho and Hector Garcia-Molina",
title = "Dealing with {Web} data: history and look ahead",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "4--4",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
remark = "10-year best paper award",
}
@Article{Kemme:2010:DRT,
author = "Bettina Kemme and Gustavo Alonso",
title = "Database replication: a tale of research across
communities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "5--12",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
remark = "10-year best paper award",
}
@Article{Canim:2010:BDR,
author = "Mustafa Canim and Murat Kantarcio{\u{g}}lu and Bijit
Hore and Sharad Mehrotra",
title = "Building disclosure risk aware query optimizers for
relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "13--24",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Allard:2010:SPD,
author = "Tristan Allard and Nicolas Anciaux and Luc Bouganim
and Yanli Guo and Lionel {Le Folgoc} and Benjamin Nguyen
and Philippe Pucheral and Indrajit Ray and Indrakshi
Ray and Shaoyi Yin",
title = "Secure personal data servers: a vision paper",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "25--35",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fabbri:2010:PMR,
author = "Daniel Fabbri and Kristen LeFevre and Qiang Zhu",
title = "{PolicyReplay}: misconfiguration-response queries for
data breach reporting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "36--47",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Curino:2010:SWD,
author = "Carlo Curino and Evan Jones and Yang Zhang and Sam
Madden",
title = "{Schism}: a workload-driven approach to database
replication and partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "48--57",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qin:2010:TTS,
author = "Lu Qin and Jeffrey Xu Yu and Lijun Chang",
title = "Ten thousand {SQLs}: parallel keyword queries
computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "58--69",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thomson:2010:CDD,
author = "Alexander Thomson and Daniel J. Abadi",
title = "The case for determinism in database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "70--80",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexe:2010:MCI,
author = "Bogdan Alexe and Mauricio Hern{\'a}ndez and Lucian
Popa and Wang-Chiew Tan",
title = "{MapMerge}: correlating independent schema mappings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "81--92",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Greco:2010:CTC,
author = "Sergio Greco and Francesca Spezzano",
title = "Chase termination: a constraints rewriting approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "93--104",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marnette:2010:SDE,
author = "Bruno Marnette and Giansalvatore Mecca and Paolo
Papotti",
title = "Scalable data exchange with functional dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "105--116",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kanza:2010:IRS,
author = "Yaron Kanza and Roy Levin and Eliyahu Safra and
Yehoshua Sagiv",
title = "Interactive route search in the presence of order
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "117--128",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lang:2010:EMM,
author = "Willis Lang and Jignesh M. Patel",
title = "Energy management for {MapReduce} clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "129--139",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Baid:2010:TSK,
author = "Akanksha Baid and Ian Rae and Jiexing Li and AnHai
Doan and Jeffrey Naughton",
title = "Toward scalable keyword search over relational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "140--149",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mozafari:2010:REN,
author = "Barzan Mozafari and Kai Zeng and Carlo Zaniolo",
title = "From regular expressions to nested words: unifying
languages and query execution for relational and {XML}
sequences",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "150--161",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Grust:2010:ASL,
author = "Torsten Grust and Jan Rittinger and Tom Schreiber",
title = "Avalanche-safe {LINQ} compilation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "162--172",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2010:TCF,
author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Nan Tang
and Wenyuan Yu",
title = "Towards certain fixes with editing rules and master
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "173--184",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Herschel:2010:EMA,
author = "Melanie Herschel and Mauricio A. Hern{\'a}ndez",
title = "Explaining missing answers to {SPJUA} queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "185--196",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Beskales:2010:SRF,
author = "George Beskales and Ihab F. Ilyas and Lukasz Golab",
title = "Sampling the repairs of functional dependency
violations under hard constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "197--207",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Menestrina:2010:EER,
author = "David Menestrina and Steven Euijong Whang and Hector
Garcia-Molina",
title = "Evaluating entity resolution results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "208--219",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2010:HPD,
author = "Badrish Chandramouli and Jonathan Goldstein and David
Maier",
title = "High-performance dynamic pattern matching over
disordered streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "220--231",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Botan:2010:SMA,
author = "Irina Botan and Roozbeh Derakhshan and Nihal Dindar
and Laura Haas and Ren{\'e}e J. Miller and Nesime
Tatbul",
title = "{SECRET}: a model for analysis of the execution
semantics of stream processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "232--243",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2010:RPS,
author = "Haopeng Zhang and Yanlei Diao and Neil Immerman",
title = "Recognizing patterns in streams with imprecise
timestamps",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "244--255",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Neumann:2010:XRF,
author = "Thomas Neumann and Gerhard Weikum",
title = "{x-RDF-3X}: fast querying, high update rates, and
consistency for {RDF} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "256--263",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2010:GPM,
author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Nan Tang
and Yinghui Wu and Yunpeng Wu",
title = "Graph pattern matching: from intractable to polynomial
time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "264--275",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yildirim:2010:GSR,
author = "Hilmi Yildirim and Vineet Chaoji and Mohammed J.
Zaki",
title = "{GRAIL}: scalable reachability index for large
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "276--284",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bu:2010:HEI,
author = "Yingyi Bu and Bill Howe and Magdalena Balazinska and
Michael D. Ernst",
title = "{HaLoop}: efficient iterative data processing on large
clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "285--296",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2010:IVV,
author = "Michael Benedikt and Georg Gottlob",
title = "The impact of virtual views on containment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "297--308",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Terwilliger:2010:UET,
author = "James F. Terwilliger and Lois M. L. Delcambre and
David Maier and Jeremy Steinhauer and Scott Britell",
title = "Updatable and evolvable transforms for virtual
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "309--319",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2010:NCM,
author = "Daniel Deutch and Ohad Greenshpan and Tova Milo",
title = "Navigating in complex mashed-up applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "320--329",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Melnik:2010:DIA,
author = "Sergey Melnik and Andrey Gubarev and Jing Jing Long
and Geoffrey Romer and Shiva Shivakumar and Matt Tolton
and Theo Vassilakis",
title = "{Dremel}: interactive analysis of {Web}-scale
datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "330--339",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2010:GQO,
author = "Peixiang Zhao and Jiawei Han",
title = "On graph query optimization in large networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "340--351",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Martinenghi:2010:PRJ,
author = "Davide Martinenghi and Marco Tagliasacchi",
title = "Proximity rank join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "352--363",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vlachou:2010:IMI,
author = "Akrivi Vlachou and Christos Doulkeridis and Kjetil
N{\o}rv{\aa}g and Yannis Kotidis",
title = "Identifying the most influential data objects with
reverse top-$k$ queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "364--372",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2010:RTP,
author = "Xin Cao and Gao Cong and Christian S. Jensen",
title = "Retrieving top-$k$ prestige-based relevant spatial
{Web} objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "373--384",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2010:PLF,
author = "Lei Li and B. Aditya Prakash and Christos Faloutsos",
title = "Parsimonious linear fingerprinting for time series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "385--396",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2010:HTM,
author = "Rui Zhang and Martin Stradling",
title = "The {HV-tree}: a memory hierarchy aware version
index",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "397--408",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pramanik:2010:TRQ,
author = "Sakti Pramanik and Alok Watve and Chad R. Meiners and
Alex Liu",
title = "Transforming range queries to equivalent box queries
to optimize page access",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "409--416",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2010:RLU,
author = "Songtao Guo and Xin Luna Dong and Divesh Srivastava
and Remi Zajac",
title = "Record linkage with uniqueness constraints and
erroneous values",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "417--428",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ioannou:2010:FEA,
author = "Ekaterini Ioannou and Wolfgang Nejdl and Claudia
Nieder{\'e}e and Yannis Velegrakis",
title = "On-the-fly entity-aware query processing in the
presence of linkage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "429--438",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yakout:2010:BBR,
author = "Mohamed Yakout and Ahmed K. Elmagarmid and Hazem
Elmeleegy and Mourad Ouzzani and Alan Qi",
title = "Behavior based record linkage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "439--448",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2010:IFC,
author = "Wook-Shin Han and Jinsoo Lee and Minh-Duc Pham and
Jeffrey Xu Yu",
title = "{iGraph}: a framework for comparisons of disk-based
graph indexing techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "449--459",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schad:2010:RMC,
author = "J{\"o}rg Schad and Jens Dittrich and Jorge-Arnulfo
Quian{\'e}-Ruiz",
title = "Runtime measurements in the cloud: observing,
analyzing, and reducing variance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "460--471",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jiang:2010:PMD,
author = "Dawei Jiang and Beng Chin Ooi and Lei Shi and Sai Wu",
title = "The performance of {MapReduce}: an in-depth study",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "472--483",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kopcke:2010:EER,
author = "Hanna K{\"o}pcke and Andreas Thor and Erhard Rahm",
title = "Evaluation of entity resolution approaches on
real-world match problems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "484--493",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nykiel:2010:MSA,
author = "Tomasz Nykiel and Michalis Potamias and Chaitanya
Mishra and George Kollios and Nick Koudas",
title = "{MRShare}: sharing across multiple queries in
{MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "494--505",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vo:2010:TET,
author = "Hoang Tam Vo and Chun Chen and Beng Chin Ooi",
title = "Towards elastic transactional cloud storage with range
query support",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "506--514",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dittrich:2010:HMY,
author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz and
Alekh Jindal and Yagiz Kargin and Vinay Setty and
J{\"o}rg Schad",
title = "{Hadoop++}: making a yellow elephant run like a
cheetah (without it even noticing)",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "515--529",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bruno:2010:SLR,
author = "Nicolas Bruno and Vivek Narasayya and Ravi
Ramamurthy",
title = "Slicing long-running queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "530--541",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tzoumas:2010:SAH,
author = "Kostas Tzoumas and Amol Deshpande and Christian S.
Jensen",
title = "Sharing-aware horizontal partitioning for exploiting
correlations during query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "542--553",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cali:2010:APO,
author = "Andrea Cal{\`\i} and Georg Gottlob and Andreas
Pieris",
title = "Advanced processing for ontological queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "554--565",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Parameswaran:2010:TWC,
author = "Aditya Parameswaran and Hector Garcia-Molina and Anand
Rajaraman",
title = "Towards the {Web} of concepts: extracting concepts
from large datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "566--577",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gulhane:2010:ECR,
author = "Pankaj Gulhane and Rajeev Rastogi and Srinivasan H.
Sengamedu and Ashwin Tengli",
title = "Exploiting content redundancy for {Web} information
extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "578--587",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2010:ARR,
author = "Bin Liu and Laura Chiticariu and Vivian Chu and H. V.
Jagadish and Frederick R. Reiss",
title = "Automatic rule refinement for information extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "588--597",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pang:2010:ETS,
author = "HweeHwa Pang and Xuhua Ding and Xiaokui Xiao",
title = "Embellishing text search queries to protect user
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "598--607",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaytor:2010:SDR,
author = "Rhonda Chaytor and Ke Wang",
title = "Small domain randomization: same privacy, more
utility",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "608--618",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadopoulos:2010:NNS,
author = "Stavros Papadopoulos and Spiridon Bakiras and Dimitris
Papadias",
title = "Nearest neighbor search with strong location privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "619--629",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimura:2010:UPI,
author = "Hideaki Kimura and Samuel Madden and Stanley B.
Zdonik",
title = "{UPI}: a primary index for uncertain databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "630--637",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2010:RCP,
author = "Jian Li and Amol Deshpande",
title = "Ranking continuous probabilistic datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "638--649",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lian:2010:SSJ,
author = "Xiang Lian and Lei Chen",
title = "Set similarity join on probabilistic data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "650--659",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Woods:2010:CED,
author = "Louis Woods and Jens Teubner and Gustavo Alonso",
title = "Complex event detection at wire speed with {FPGAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "660--669",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2010:DCG,
author = "Wenbin Fang and Bingsheng He and Qiong Luo",
title = "Database compression on graphics processors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "670--680",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Johnson:2010:ASA,
author = "Ryan Johnson and Ippokratis Pandis and Radu Stoica and
Manos Athanassoulis and Anastasia Ailamaki",
title = "{Aether}: a scalable approach to logging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "681--692",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Macropol:2010:SDB,
author = "Kathy Macropol and Ambuj Singh",
title = "Scalable discovery of best clusters on large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "693--702",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Smola:2010:APT,
author = "Alexander Smola and Shravan Narayanamurthy",
title = "An architecture for parallel topic models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "703--710",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ganti:2010:KFI,
author = "Venkatesh Ganti and Yeye He and Dong Xin",
title = "{Keyword++}: a framework to improve keyword search
over entity databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "711--722",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2010:SMR,
author = "Zhenhui Li and Bolin Ding and Jiawei Han and Roland
Kays",
title = "{Swarm}: mining relaxed temporal moving object
clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "723--734",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2010:AUP,
author = "Su Chen and Beng Chin Ooi and Zhenjie Zhang",
title = "An adaptive updating protocol for reducing moving
object database workload",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "735--746",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kellaris:2010:SPC,
author = "Georgios Kellaris and Kyriakos Mouratidis",
title = "Shortest path computation on air indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "747--757",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2010:EES,
author = "Jia Xu and Zhenjie Zhang and Anthony K. H. Tung and Ge
Yu",
title = "Efficient and effective similarity search over
probabilistic data based on {Earth Mover's Distance}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "758--769",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2010:PXM,
author = "Michael Benedikt and Evgeny Kharlamov and Dan Olteanu
and Pierre Senellart",
title = "Probabilistic {XML} via {Markov Chains}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "770--781",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arumugam:2010:MRR,
author = "Subi Arumugam and Fei Xu and Ravi Jampani and
Christopher Jermaine and Luis L. Perez and Peter J.
Haas",
title = "{MCDB-R}: risk analysis in the database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "782--793",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wick:2010:SPD,
author = "Michael Wick and Andrew McCallum and Gerome Miklau",
title = "Scalable probabilistic databases with factor graphs
and {MCMC}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "794--804",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2010:MCF,
author = "Meihui Zhang and Marios Hadjieleftheriou and Beng Chin
Ooi and Cecilia M. Procopiuc and Divesh Srivastava",
title = "On multi-column foreign key discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "805--814",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheng:2010:EEE,
author = "Reynold Cheng and Eric Lo and Xuan S. Yang and
Ming-Hay Luk and Xiang Li and Xike Xie",
title = "Explore or exploit?: effective strategies for
disambiguating large databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "815--825",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Soliman:2010:BRM,
author = "Mohamed A. Soliman and Ihab F. Ilyas and Mina Saleeb",
title = "Building ranked mashups of unstructured sources with
uncertain information",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "826--837",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Raissi:2010:CCS,
author = "Chedy Ra{\"\i}ssi and Jian Pei and Thomas Kister",
title = "Computing closed skycubes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "838--847",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lo:2010:GDQ,
author = "Eric Lo and Nick Cheng and Wing-Kai Hon",
title = "Generating databases for query workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "848--859",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2010:PTJ,
author = "Minji Wu and Laure Berti-{\'E}quille and Am{\'e}lie
Marian and Cecilia M. Procopiuc and Divesh Srivastava",
title = "Processing top-$k$ join queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "860--870",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Martinez-Palau:2010:TWR,
author = "Xavier Martinez-Palau and David Dominguez-Sal and
Josep Lluis Larriba-Pey",
title = "Two-way replacement selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "871--881",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Maneth:2010:XWQ,
author = "Sebastian Maneth and Kim Nguyen",
title = "{XPath} whole query optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "882--893",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Grimsmo:2010:FOT,
author = "Nils Grimsmo and Truls A. Bj{\o}rklund and Magnus Lie
Hetland",
title = "Fast optimal twig joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "894--905",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2010:DIX,
author = "Michael Benedikt and James Cheney",
title = "Destabilizers and independence of {XML} updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "906--917",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2010:SWH,
author = "Ziyang Liu and Qihong Shao and Yi Chen",
title = "Searching workflows with hierarchical views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "918--927",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pandis:2010:DOT,
author = "Ippokratis Pandis and Ryan Johnson and Nikos
Hardavellas and Anastasia Ailamaki",
title = "Data-oriented transaction execution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "928--939",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2010:OTQ,
author = "Daniel Deutch and Tova Milo and Neoklis Polyzotis and
Tom Yam",
title = "Optimal top-$k$ query evaluation for weighted business
processes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "940--951",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2010:BSM,
author = "Guozhang Wang and Marcos Vaz Salles and Benjamin
Sowell and Xun Wang and Tuan Cao and Alan Demers and
Johannes Gehrke and Walker White",
title = "Behavioral simulations in {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "952--963",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ge:2010:TSS,
author = "Tingjian Ge and Stan Zdonik",
title = "{A*-tree}: a structure for storage and modeling of
uncertain multidimensional arrays",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "964--974",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aggarwal:2010:DPM,
author = "Charu C. Aggarwal and Yao Li and Philip S. Yu and
Ruoming Jin",
title = "On dense pattern mining in graph streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "975--984",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yiu:2010:EPD,
author = "Man Lung Yiu and Leong Hou U. and Simonas Saltenis and
Kostas Tzoumas",
title = "Efficient proximity detection among mobile users via
self-tuning policies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "985--996",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Potamias:2010:KNN,
author = "Michalis Potamias and Francesco Bonchi and Aristides
Gionis and George Kollios",
 title = "{$k$}-nearest neighbors in uncertain graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "997--1008",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2010:MSS,
author = "Xin Cao and Gao Cong and Christian S. Jensen",
title = "Mining significant semantic locations from {GPS}
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1009--1020",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hay:2010:BAD,
author = "Michael Hay and Vibhor Rastogi and Gerome Miklau and
Dan Suciu",
title = "Boosting the accuracy of differentially private
histograms through consistency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1021--1032",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2010:UIP,
author = "Jianneng Cao and Panagiotis Karras and Chedy
Ra{\"\i}ssi and Kian-Lee Tan",
title = "$ \rho $-uncertainty: inference-proof transaction
anonymization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1033--1044",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2010:MMM,
author = "Graham Cormode and Divesh Srivastava and Ninghui Li
and Tiancheng Li",
title = "Minimizing minimality and maximizing utility:
analyzing method-based attacks on anonymized data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1045--1056",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2010:QPI,
author = "Daisy Zhe Wang and Michael J. Franklin and Minos
Garofalakis and Joseph M. Hellerstein",
title = "Querying probabilistic information extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1057--1067",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sen:2010:ROF,
author = "Prithviraj Sen and Amol Deshpande and Lise Getoor",
title = "Read-once functions and query evaluation in
probabilistic databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1068--1079",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2010:FUD,
author = "Parag Agrawal and Anish Das Sarma and Jeffrey Ullman
and Jennifer Widom",
title = "Foundations of uncertain-data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1080--1090",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mathioudakis:2010:IAD,
author = "Michael Mathioudakis and Nilesh Bansal and Nick
Koudas",
title = "Identifying, attributing and describing spatial
bursts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1091--1102",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimura:2010:CCA,
author = "Hideaki Kimura and George Huo and Alexander Rasin and
Samuel Madden and Stanley B. Zdonik",
title = "{CORADD}: correlation aware database designer for
materialized views and indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1103--1113",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nanongkai:2010:RMR,
author = "Danupon Nanongkai and Atish Das Sarma and Ashwin Lall
and Richard J. Lipton and Jun Xu",
title = "Regret-minimizing representative databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1114--1124",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arai:2010:ACA,
author = "Benjamin Arai and Gautam Das and Dimitrios Gunopulos
and Vagelis Hristidis and Nick Koudas",
title = "An access cost-aware approach for object retrieval
over multiple sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1125--1136",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abhirama:2010:SPC,
author = "M. Abhirama and Sourjya Bhaumik and Atreyee Dey and
Harsh Shrimal and Jayant R. Haritsa",
title = "On the stability of plan costs and the costs of plan
stability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1137--1148",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Herodotou:2010:XST,
author = "Herodotos Herodotou and Shivnath Babu",
title = "{Xplus}: a {SQL}-tuning-aware query optimizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1149--1160",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2010:GHR,
author = "Wenfei Fan and Jianzhong Li and Shuai Ma and Hongzhi
Wang and Yinghui Wu",
title = "Graph homomorphism revisited for graph matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1161--1172",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kandhan:2010:SFS,
author = "Ramakrishnan Kandhan and Nikhil Teletia and Jignesh M.
Patel",
title = "{SigMatch}: fast and scalable multi-pattern matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1173--1184",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2010:SSI,
author = "Shijie Zhang and Jiong Yang and Wei Jin",
title = "{SAPPER}: subgraph indexing and approximate matching
in large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1185--1194",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2010:TIS,
author = "Yinan Li and Bingsheng He and Robin Jun Yang and Qiong
Luo and Ke Yi",
title = "Tree indexing on solid state drives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1195--1206",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2010:EBT,
author = "Sai Wu and Dawei Jiang and Beng Chin Ooi and Kun-Lung
Wu",
title = "Efficient {B-tree} based indexing for cloud data
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1207--1218",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2010:TJE,
author = "Jiannan Wang and Jianhua Feng and Guoliang Li",
title = "{Trie-join}: efficient trie-based string similarity
joins with edit-distance constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1219--1230",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sharifzadeh:2010:VTR,
author = "Mehdi Sharifzadeh and Cyrus Shahabi",
title = "{VoR-tree}: {R-trees} with {Voronoi} diagrams for
efficient processing of spatial nearest neighbor
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1231--1242",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deepak:2010:ERR,
author = "P. Deepak and Prasad M. Deshpande",
title = "Efficient {RkNN} retrieval with arbitrary non-metric
similarity measures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1243--1254",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2010:ESE,
author = "Shiming Zhang and Nikos Mamoulis and David W. Cheung
and Ben Kao",
title = "Efficient skyline evaluation over partially ordered
domains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1255--1266",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wei:2010:AHO,
author = "Mingzhu Wei and Elke A. Rundensteiner and Murali
Mani",
title = "Achieving high output quality under limited resources
through structure-based spilling in {XML} streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1267--1278",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mihaylov:2010:DJO,
author = "Svilen R. Mihaylov and Marie Jacob and Zachary G. Ives
and Sudipto Guha",
title = "Dynamic join optimization in multi-hop wireless sensor
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1279--1290",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akdere:2010:DSC,
author = "Mert Akdere and U{\u{g}}ur {\c{C}}etintemel and Eli
Upfal",
title = "Database-support for continuous prediction queries
over streaming data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1291--1301",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tran:2010:CAU,
author = "Thanh T. L. Tran and Andrew McGregor and Yanlei Diao
and Liping Peng and Anna Liu",
title = "Conditioning and aggregating uncertain data streams:
going beyond expectations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1302--1313",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Glavic:2010:TUB,
author = "Boris Glavic and Gustavo Alonso and Ren{\'e}e J.
Miller and Laura M. Haas",
title = "{TRAMP}: understanding the behavior of schema mappings
through provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1314--1325",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Whang:2010:ERE,
author = "Steven Euijong Whang and Hector Garcia-Molina",
title = "Entity resolution with evolving rules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1326--1337",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Limaye:2010:ASW,
author = "Girija Limaye and Sunita Sarawagi and Soumen
Chakrabarti",
title = "Annotating and searching {Web} tables using entities,
types and relationships",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1338--1347",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bedathur:2010:IPM,
author = "Srikanta Bedathur and Klaus Berberich and Jens
Dittrich and Nikos Mamoulis and Gerhard Weikum",
title = "Interesting-phrase mining for ad-hoc text analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1348--1357",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2010:GDC,
author = "Xin Luna Dong and Laure Berti-{\'E}quille and Yifan Hu and
Divesh Srivastava",
title = "Global detection of complex copying relationships
between sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1358--1369",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeCapitanidiVimercati:2010:FLA,
author = "Sabrina {De Capitani di Vimercati} and Sara Foresti
and Sushil Jajodia and Stefano Paraboschi and
Pierangela Samarati",
title = "Fragments and loose associations: respecting privacy
in data publishing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1370--1381",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fusco:2010:NFF,
author = "Francesco Fusco and Marc Ph. Stoecklin and Michail
Vlachos",
title = "{NET-FLi}: on-the-fly compression, archiving and
indexing of streaming network traffic",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1382--1393",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2010:SRQ,
author = "Qiong Zou and Huayong Wang and Robert Soul{\'e} and
Martin Hirzel and Henrique Andrade and Bu{\u{g}}ra
Gedik and Kun-Lung Wu",
title = "From a stream of relational queries to distributed
stream processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1394--1405",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mah:2010:UUA,
author = "James T. L. Mah and Danny C. C. Poo and Shaojiang
Cai",
title = "{UASMAs} (universal automated {SNP} mapping
algorithms): a set of algorithms to instantaneously map
{SNPs} in real time to aid functional {SNP} discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1406--1413",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Debnath:2010:FHT,
author = "Biplob Debnath and Sudipta Sengupta and Jin Li",
title = "{FlashStore}: high throughput persistent key--value
store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1414--1425",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xin:2010:MDA,
author = "Reynold S. Xin and William McLaren and Patrick
Dantressangle and Steve Schormann and Sam Lightstone
and Maria Schwenger",
title = "{MEET DB2}: automated database migration evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1426--1434",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Canim:2010:SBE,
author = "Mustafa Canim and George A. Mihaila and Bishwaranjan
Bhattacharjee and Kenneth A. Ross and Christian A.
Lang",
title = "{SSD} bufferpool extensions for database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1435--1446",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Loboz:2010:DWM,
author = "Charles Loboz and Slawek Smyl and Suman Nath",
title = "{DataGarage}: warehousing massive performance data on
commodity servers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1447--1458",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2010:CHP,
author = "Songting Chen",
title = "{Cheetah}: a high performance, custom data warehouse
on top of {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1459--1468",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Orair:2010:DBO,
author = "Gustavo H. Orair and Carlos H. C. Teixeira and Wagner
{Meira, Jr.} and Ye Wang and Srinivasan Parthasarathy",
title = "Distance-based outlier detection: consolidation and
renewed bearing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1469--1480",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2010:ALM,
author = "Young-Seok Kim and Heegyu Jin and Kyoung-Gu Woo",
title = "Adaptive logging for mobile device",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1481--1492",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pesti:2010:RSL,
author = "Peter Pesti and Ling Liu and Bhuvan Bamba and Arun
Iyengar and Matt Weber",
title = "{RoadTrack}: scaling location updates for mobile
clients on road networks with query awareness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1493--1504",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Si:2010:CID,
author = "Xiance Si and Edward Y. Chang and Zolt{\'a}n
Gy{\"o}ngyi and Maosong Sun",
title = "{Confucius} and its intelligent disciples: integrating
social with search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1505--1516",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haritsa:2010:PDQ,
author = "Jayant R. Haritsa",
title = "The {Picasso} database query optimizer visualizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1517--1520",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2010:CED,
author = "Ziyang Liu and Sivaramakrishnan Natarajan and Bin He
and Hui-I Hsiao and Yi Chen",
title = "{CODS}: evolving data efficiently and scalably in
column oriented databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1521--1524",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sadoghi:2010:EEP,
author = "Mohammad Sadoghi and Martin Labrecque and Harsh Singh
and Warren Shum and Hans-Arno Jacobsen",
title = "Efficient event processing through reconfigurable
hardware for algorithmic trading",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1525--1528",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Levandoski:2010:CCP,
author = "Justin J. Levandoski and Mohamed F. Mokbel and Mohamed
E. Khalefa",
title = "{CareDB}: a context and preference-aware
location-based database system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1529--1532",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kossmann:2010:CMC,
author = "Donald Kossmann and Tim Kraska and Simon Loesing and
Stephan Merkli and Raman Mittal and Flavio
Pfaffhauser",
title = "{Cloudy}: a modular cloud storage system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1533--1536",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kazemitabar:2010:GSQ,
author = "Seyed Jalal Kazemitabar and Ugur Demiryurek and
Mohamed Ali and Afsin Akdogan and Cyrus Shahabi",
title = "Geospatial stream query processing using {Microsoft
SQL Server StreamInsight}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1537--1540",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dyreson:2010:UXT,
author = "Curtis E. Dyreson and Sourav S. Bhowmick and
Kirankanth Mallampalli",
title = "Using {XMorph} to transform {XML} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1541--1544",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2010:ACE,
author = "Di Wang and Elke A. Rundensteiner and Han Wang and
Richard T. {Ellison III}",
title = "Active complex event processing: applications in
real-time health care",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1545--1548",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schreiber:2010:TNP,
author = "Tom Schreiber and Simone Bonetti and Torsten Grust and
Manuel Mayr and Jan Rittinger",
title = "Thirteen new players in the team: a {FERRY}-based
{LINQ} to {SQL} provider",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1549--1552",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abiteboul:2010:AEC,
author = "Serge Abiteboul and Pierre Bourhis and Bogdan Marinoiu
and Alban Galland",
title = "{AXART}: enabling collaborative work with {AXML}
artifacts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1553--1556",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{McConnell:2010:IAF,
author = "Christopher McConnell and Fan Ping and Jeong-Hyon
Hwang",
title = "{iFlow}: an approach for fast and reliable
{Internet-scale} stream processing utilizing detouring
and replication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1557--1560",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kantere:2010:PCT,
author = "Verena Kantere and Maher Manoubi and Iluju Kiringa and
Timos Sellis and John Mylopoulos",
title = "Peer coordination through distributed triggers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1561--1564",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2010:SSY,
author = "Hao Wu and Guoliang Li and Chen Li and Lizhu Zhou",
title = "{Seaform}: search-as-you-type in forms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1565--1568",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Strotgen:2010:TSE,
author = "Jannik Str{\"o}tgen and Michael Gertz",
title = "{TimeTrails}: a system for exploring spatio-temporal
information in documents",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1569--1572",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pound:2010:QEF,
author = "Jeffrey Pound and Ihab F. Ilyas and Grant Weddell",
title = "{QUICK}: expressive and flexible search over knowledge
bases and text collections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1573--1576",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kwietniewski:2010:TXD,
author = "Marcin Kwietniewski and Jarek Gryz and Stephanie
Hazlewood and Paul {Van Run}",
title = "Transforming {XML} documents as schemas evolve",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1577--1580",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2010:XCT,
author = "Ziyang Liu and Sivaramakrishnan Natarajan and Peng Sun
and Stephen Booher and Tim Meehan and Robert Winkler
and Yi Chen",
title = "{XSACT}: a comparison tool for structured search
results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1581--1584",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abdessalem:2010:OLT,
author = "Talel Abdessalem and Bogdan Cautis and Nora
Derouiche",
title = "{ObjectRunner}: lightweight, targeted extraction and
querying of structured {Web} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1585--1588",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elbassuoni:2010:RRW,
author = "Shady Elbassuoni and Katja Hose and Steffen Metzger
and Ralf Schenkel",
title = "{ROXXI}: {Reviving} witness {dOcuments} to {eXplore
eXtracted Information}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1589--1592",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Termehchy:2010:EUD,
author = "Arash Termehchy and Marianne Winslett",
title = "{EXTRUCT}: using deep structural information in {XML}
keyword search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1593--1596",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akbarnejad:2010:SQR,
author = "Javad Akbarnejad and Gloria Chatzopoulou and Magdalini
Eirinaki and Suju Koshy and Sarika Mittal and Duc On
and Neoklis Polyzotis and Jothi S. Vindhiya Varman",
title = "{SQL QueRIE} recommendations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1597--1600",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ang:2010:PCM,
author = "Hock Hee Ang and Vivekanand Gopalkrishnan and Wee
Keong Ng and Steven C. H. Hoi",
title = "{P2PDocTagger}: content management through automated
{P2P} collaborative tagging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1601--1604",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Setty:2010:IEI,
author = "Vinay Setty and Srikanta Bedathur and Klaus Berberich
and Gerhard Weikum",
title = "{InZeit}: efficiently identifying insightful time
points",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1605--1608",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2010:IIT,
author = "Aixin Sun and Sourav S. Bhowmick and Yao Liu",
title = "{iAVATAR}: an interactive tool for finding and
visualizing visual-representative tags in image
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1609--1612",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kabisch:2010:DWI,
author = "Thomas Kabisch and Eduard C. Dragut and Clement Yu and
Ulf Leser",
title = "Deep {Web} integration with {VisQI}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1613--1616",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2010:SST,
author = "Xin Luna Dong and Laure Berti-{\'E}quille and Yifan Hu
and Divesh Srivastava",
title = "{SOLOMON}: seeking the truth via copying detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1617--1620",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hentschel:2010:JTD,
author = "Martin Hentschel and Laura Haas and Ren{\'e}e J.
Miller",
title = "Just-in-time data integration in action",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1621--1624",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexandrov:2010:MPD,
author = "Alexander Alexandrov and Max Heimel and Volker Markl
and Dominic Battr{\'e} and Fabian Hueske and Erik
Nijkamp and Stephan Ewen and Odej Kao and Daniel
Warneke",
title = "Massively parallel data analysis with {PACTs} on
{Nephele}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1625--1628",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Middelfart:2010:UST,
author = "Morten Middelfart and Torben Bach Pedersen",
title = "Using sentinel technology in the {TARGIT BI} suite",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1629--1632",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gunnemann:2010:CIC,
author = "Stephan G{\"u}nnemann and Ines F{\"a}rber and Hardy
Kremer and Thomas Seidl",
title = "{CoDA}: interactive cluster based concept discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1633--1636",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bergamaschi:2010:KSK,
author = "Sonia Bergamaschi and Elton Domnori and Francesco
Guerra and Mirko Orsini and Raquel {Trillo Lado} and
Yannis Velegrakis",
title = "{Keymantic}: semantic keyword-based searching in data
integration systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1637--1640",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Golab:2010:DAE,
author = "Lukasz Golab and Howard Karloff and Flip Korn and
Divesh Srivastava",
title = "Data {Auditor}: exploring data quality and semantics
using pattern tableaux",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1641--1644",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nori:2010:DCP,
author = "Anil K. Nori",
title = "Distributed caching platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1645--1646",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2010:BDC,
author = "Divyakant Agrawal and Sudipto Das and Amr {El
Abbadi}",
title = "Big data and cloud computing: new wine or just new
bottles?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1647--1648",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Samet:2010:TSS,
author = "Hanan Samet",
title = "Techniques for similarity searching in multimedia
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1649--1650",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Etzion:2010:EPP,
author = "Opher Etzion",
title = "Event processing: past, present and future",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1651--1652",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Renz:2010:SSM,
author = "Matthias Renz and Reynold Cheng and Hans-Peter
Kriegel",
title = "Similarity search and mining in uncertain databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1653--1654",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Muthukrishnan:2010:DMM,
author = "S. Muthukrishnan",
title = "Data management and mining in {Internet} ad systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "3",
number = "1--2",
pages = "1655--1656",
month = sep,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:02 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kling:2010:GEE,
author = "Patrick Kling and M. Tamer {\"O}zsu and Khuzaima
Daudjee",
title = "Generating efficient execution plans for vertically
partitioned {XML} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "1",
pages = "1--11",
month = oct,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lian:2010:GFH,
author = "Xiang Lian and Lei Chen",
title = "A generic framework for handling uncertain data with
local correlations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "1",
pages = "12--21",
month = oct,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khoussainova:2010:SCA,
author = "Nodira Khoussainova and YongChul Kwon and Magdalena
Balazinska and Dan Suciu",
title = "{SnipSuggest}: context-aware autocompletion for
{SQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "1",
pages = "22--33",
month = oct,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meliou:2010:CCR,
author = "Alexandra Meliou and Wolfgang Gatterbauer and
Katherine F. Moore and Dan Suciu",
title = "The complexity of causality and responsibility for
query answers and non-answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "1",
pages = "34--45",
month = oct,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sagy:2010:DTQ,
author = "Guy Sagy and Daniel Keren and Izchak Sharfman and
Assaf Schuster",
title = "Distributed threshold querying of general functions by
a difference of monotonic representation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "46--57",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2010:TBD,
author = "Nan Wang and Jingbo Zhang and Kian-Lee Tan and Anthony
K. H. Tung",
title = "On triangulation-based dense neighborhood graph
discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "58--68",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rice:2010:GIR,
author = "Michael Rice and Vassilis J. Tsotras",
title = "Graph indexing of road networks for shortest path
queries with label restrictions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "69--80",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qian:2010:CUF,
author = "Li Qian and Kristen LeFevre and H. V. Jagadish",
title = "{CRIUS}: user-friendly database design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "81--92",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rocha-Junior:2010:EPT,
author = "Jo{\~a}o B. Rocha-Junior and Akrivi Vlachou and
Christos Doulkeridis and Kjetil N{\o}rv{\aa}g",
title = "Efficient processing of top-$k$ spatial preference
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "93--104",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Grund:2010:HMM,
author = "Martin Grund and Jens Kr{\"u}ger and Hasso Plattner
and Alexander Zeier and Philippe Cudr{\'e}-Mauroux and
Samuel Madden",
title = "{HYRISE}: a main memory hybrid storage engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "105--116",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Curino:2010:URI,
author = "Carlo A. Curino and Hyun Jin Moon and Alin Deutsch and
Carlo Zaniolo",
title = "Update rewriting and integrity constraint maintenance
in a schema evolution support system: {PRISM++}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "117--128",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Oro:2010:SEX,
author = "Ermelinda Oro and Massimo Ruffolo and Steffen Staab",
title = "{SXPath}: extending {XPath} towards spatial querying
on {Web} documents",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "129--140",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2010:PPP,
author = "Mingxuan Yuan and Lei Chen and Philip S. Yu",
title = "Personalized privacy protection in social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "2",
pages = "141--150",
month = nov,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:15 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Toda:2010:PAA,
author = "Guilherme A. Toda and Eli Cortez and Altigran S. da
Silva and Edleno de Moura",
title = "A probabilistic approach for automatically filling
form-based {Web} interfaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "3",
pages = "151--160",
month = dec,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:16 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadimitriou:2010:OUB,
author = "Panagiotis Papadimitriou and Hector Garcia-Molina and
Ali Dasdan and Santanu Kolay",
title = "Output {URL} bidding",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "3",
pages = "161--172",
month = dec,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:16 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bahmani:2010:FIP,
author = "Bahman Bahmani and Abdur Chowdhury and Ashish Goel",
title = "Fast incremental and personalized {PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "3",
pages = "173--184",
month = dec,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:16 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we analyze the efficiency of Monte
Carlo methods for incremental computation of PageRank,
personalized PageRank, and similar random walk based
methods (with focus on SALSA), on large-scale
dynamically evolving social networks. We assume that
the graph of friendships is stored in distributed
shared memory, as is the case for large social networks
such as Twitter.\par
For global PageRank, we assume that the social network
has $n$ nodes, and $m$ adversarially chosen edges
arrive in a random order. We show that with a reset
probability of $ \epsilon $, the expected total work
needed to maintain an accurate estimate (using the
Monte Carlo method) of the PageRank of every node at
all times is $ O(n \ln m / \epsilon^2)$. This is
significantly better than all known bounds for
incremental PageRank. For instance, if we naively
recompute the PageRanks as each edge arrives, the
simple power iteration method needs $ \Omega (m^2 / \ln
(1 / (1 - \epsilon)))$ total time and the Monte Carlo
method needs $ O(m n / \epsilon)$ total time; both are
prohibitively expensive. We also show that we can
handle deletions equally efficiently.\par
We then study the computation of the top $k$
personalized PageRanks starting from a seed node,
assuming that personalized PageRanks follow a power-law
with exponent $ < 1$. We show that if we store $ R > q
\ln n$ random walks starting from every node for large
enough constant $q$ (using the approach outlined for
global PageRank), then the expected number of calls
made to the distributed social network database is $
O(k / (R^{(1 - \alpha) / \alpha }))$. We also present
experimental results from the social networking site,
Twitter, verifying our assumptions and analyses. The
overall result is that this algorithm is fast enough
for real-time queries over a dynamic social network.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2010:QES,
author = "Jongwuk Lee and Seung-won Hwang",
title = "{QSkycube}: efficient skycube computation using
point-based space partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "3",
pages = "185--196",
month = dec,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:16 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2010:ZEI,
author = "Bin Liu and Chee-Yong Chan",
title = "{ZINC}: efficient indexing for skyline computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "3",
pages = "197--207",
month = dec,
year = "2010",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:16 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rastogi:2011:LSC,
author = "Vibhor Rastogi and Nilesh Dalvi and Minos
Garofalakis",
title = "Large-scale collective entity matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "4",
pages = "208--218",
month = jan,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:17 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dalvi:2011:AWL,
author = "Nilesh Dalvi and Ravi Kumar and Mohamed Soliman",
title = "Automatic wrappers for large scale {Web} extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "4",
pages = "219--230",
month = jan,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:17 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2011:FSM,
author = "Xintian Yang and Srinivasan Parthasarathy and P.
Sadayappan",
title = "Fast sparse matrix-vector multiplication on {GPUs}:
implications for graph mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "4",
pages = "231--242",
month = jan,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:17 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rao:2011:UPB,
author = "Jun Rao and Eugene J. Shekita and Sandeep Tata",
title = "Using {Paxos} to build a scalable, consistent, and
highly available datastore",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "4",
pages = "243--254",
month = jan,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:17 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2011:FSI,
author = "Bolin Ding and Arnd Christian K{\"o}nig",
title = "Fast set intersection in memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "4",
pages = "255--266",
month = jan,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:17 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Parameswaran:2011:HAG,
author = "Aditya Parameswaran and Anish Das Sarma and Hector
Garcia-Molina and Neoklis Polyzotis and Jennifer
Widom",
title = "Human-assisted graph search: it's okay to ask
questions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "267--278",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yakout:2011:GDR,
author = "Mohamed Yakout and Ahmed K. Elmagarmid and Jennifer
Neville and Mourad Ouzzani and Ihab F. Ilyas",
title = "Guided data repair",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "279--289",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Venetis:2011:HLD,
author = "Petros Venetis and Hector Gonzalez and Christian S.
Jensen and Alon Halevy",
title = "Hyper-local, directions-based ranking of places",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "290--301",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koc:2011:IMC,
author = "M. Levent Koc and Christopher R{\'e}",
title = "Incrementally maintaining classification using an
{RDBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "302--313",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2011:HTT,
author = "Bingsheng He and Jeffrey Xu Yu",
title = "High-throughput transaction executions on graphics
processors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "314--325",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2011:DIQ,
author = "Zhao Cao and Charles Sutton and Yanlei Diao and
Prashant Shenoy",
title = "Distributed inference and query processing for {RFID}
tracking and monitoring",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "5",
pages = "326--337",
month = feb,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:55:18 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2011:SJS,
author = "Hongrae Lee and Raymond T. Ng and Kyuseok Shim",
title = "Similarity join size estimation using locality
sensitive hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "338--349",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2011:QEB,
author = "Ziyang Liu and Sivaramakrishnan Natarajan and Yi
Chen",
title = "Query expansion based on clustered results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "350--361",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dash:2011:CSP,
author = "Debabrata Dash and Neoklis Polyzotis and Anastasia
Ailamaki",
title = "{CoPhy}: a scalable, portable, and interactive index
advisor for large workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "362--372",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Niu:2011:TSS,
author = "Feng Niu and Christopher R{\'e} and AnHai Doan and
Jude Shavlik",
title = "{Tuffy}: scaling up statistical inference in {Markov}
logic networks using an {RDBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "373--384",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jahani:2011:AOM,
author = "Eaman Jahani and Michael J. Cafarella and Christopher
R{\'e}",
title = "Automatic optimization for {MapReduce} programs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "385--396",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2011:STG,
author = "De-Nian Yang and Yi-Ling Chen and Wang-Chien Lee and
Ming-Syan Chen",
title = "On social-temporal group query with acquaintance
constraint",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "6",
pages = "397--408",
month = mar,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 13 14:45:07 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nguyen:2011:SPO,
author = "Hoa Nguyen and Ariel Fuxman and Stelios Paparizos and
Juliana Freire and Rakesh Agrawal",
title = "Synthesizing products for online catalogs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "409--418",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Floratou:2011:COS,
author = "Avrilia Floratou and Jignesh M. Patel and Eugene J.
Shekita and Sandeep Tata",
title = "Column-oriented storage techniques for {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "419--429",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lomet:2011:IPC,
author = "David Lomet and Kostas Tzoumas and Michael Zwilling",
title = "Implementing performance competitive logical
recovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "430--439",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Machanavajjhala:2011:PSR,
author = "Ashwin Machanavajjhala and Aleksandra Korolova and
Atish Das Sarma",
title = "Personalized social recommendations: accurate or
private?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "440--450",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Capannini:2011:EDW,
author = "Gabriele Capannini and Franco Maria Nardini and
Raffaele Perego and Fabrizio Silvestri",
title = "Efficient diversification of {Web} search results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "451--459",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeFrancisciMorales:2011:SCM,
author = "Gianmarco {De Francisci Morales} and Aristides Gionis
and Mauro Sozio",
title = "Social content matching in {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "7",
pages = "460--469",
month = apr,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Jun 7 19:31:12 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ao:2011:EPL,
author = "Naiyong Ao and Fan Zhang and Di Wu and Douglas S.
Stones and Gang Wang and Xiaoguang Liu and Jing Liu and
Sheng Lin",
title = "Efficient parallel lists intersection and index
compression algorithms using graphics processing
units",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "8",
pages = "470--481",
month = may,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:33 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2011:GAS,
author = "Lei Zou and Jinghui Mo and Lei Chen and M. Tamer
{\"O}zsu and Dongyan Zhao",
title = "{gStore}: answering {SPARQL} queries via subgraph
matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "8",
pages = "482--493",
month = may,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:33 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2011:ALE,
author = "Sudipto Das and Shoji Nishimura and Divyakant Agrawal
and Amr {El Abbadi}",
title = "{Albatross}: lightweight elasticity in shared storage
databases for the cloud using live data migration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "8",
pages = "494--505",
month = may,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:33 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nutanong:2011:IHD,
author = "Sarana Nutanong and Edwin H. Jacox and Hanan Samet",
title = "An incremental {Hausdorff} distance calculation
algorithm",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "8",
pages = "506--517",
month = may,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:33 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Blaustein:2011:SPP,
author = "Barbara Blaustein and Adriane Chapman and Len Seligman
and M. David Allen and Arnon Rosenthal",
title = "Surrogate parenthood: protected and informative
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "8",
pages = "518--525",
month = may,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:33 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Venetis:2011:RST,
author = "Petros Venetis and Alon Halevy and Jayant Madhavan and
Marius Pasca and Warren Shen and Fei Wu and Gengxin
Miao and Chung Wu",
title = "Recovering semantics of tables on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "528--538",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Neumann:2011:ECE,
author = "Thomas Neumann",
title = "Efficiently compiling efficient query plans for modern
hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "539--550",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jin:2011:DCR,
author = "Ruoming Jin and Lin Liu and Bolin Ding and Haixun
Wang",
title = "Distance-constraint reachability computation in
uncertain graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "551--562",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chi:2011:IIC,
author = "Yun Chi and Hyun Jin Moon and Hakan
Hacig{\"u}m{\"u}s",
title = "{iCBS}: incremental cost-based scheduling under
piecewise linear {SLAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "563--574",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eltabakh:2011:CFD,
author = "Mohamed Y. Eltabakh and Yuanyuan Tian and Fatma
{\"O}zcan and Rainer Gemulla and Aljoscha Krettek and
John McPherson",
title = "{CoHadoop}: flexible data placement and its
exploitation in {Hadoop}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "575--585",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Idreos:2011:MWC,
author = "Stratos Idreos and Stefan Manegold and Harumi Kuno and
Goetz Graefe",
title = "Merging what's cracked, cracking what's merged:
adaptive indexing in main-memory column-stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "9",
pages = "586--597",
month = jun,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2011:PTR,
author = "Chonghai Wang and Li Yan Yuan and Jia-Huai You and
Osmar R. Za{\"\i}ane and Jian Pei",
title = "On pruning for top-$k$ ranking in uncertain
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "598--609",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pandis:2011:PPL,
author = "Ippokratis Pandis and Pinar T{\"o}z{\"u}n and Ryan
Johnson and Anastasia Ailamaki",
title = "{PLP}: page latch-free shared-everything {OLTP}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "610--621",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2011:EMH,
author = "Jiannan Wang and Guoliang Li and Jeffrey Xu Yu and
Jianhua Feng",
title = "Entity matching: how similar is similar?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "622--633",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2011:ACE,
author = "Di Wang and Elke A. Rundensteiner and Richard T.
{Ellison III}",
title = "Active complex event processing over event streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "634--645",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Budak:2011:STA,
author = "Ceren Budak and Divyakant Agrawal and Amr {El
Abbadi}",
title = "Structural trend analysis for online social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "646--656",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimura:2011:CAP,
author = "Hideaki Kimura and Vivek Narasayya and Manoj Syamala",
title = "Compression aware physical database design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "657--668",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bernecker:2011:EPR,
author = "Thomas Bernecker and Tobias Emrich and Hans-Peter
Kriegel and Matthias Renz and Stefan Zankl and Andreas
Z{\"u}fle",
title = "Efficient probabilistic reverse nearest neighbor query
processing on uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "669--680",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kargar:2011:KSG,
author = "Mehdi Kargar and Aijun An",
title = "Keyword search in graphs: finding $r$-cliques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "4",
number = "10",
pages = "681--692",
month = jul,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 5 17:23:34 MDT 2011",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fabbri:2011:EBA,
author = "Daniel Fabbri and Kristen LeFevre",
title = "Explanation-based auditing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "1--12",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To comply with emerging privacy laws and regulations,
it has become common for applications like electronic
health records systems (EHRs) to collect access logs,
which record each time a user (e.g., a hospital
employee) accesses a piece of sensitive data (e.g., a
patient record). Using the access log, it is easy to
answer simple queries (e.g., Who accessed Alice's
medical record?), but this often does not provide
enough information. In addition to learning who
accessed their medical records, patients will likely
want to understand why each access occurred. In this
paper, we introduce the problem of generating
explanations for individual records in an access log.
The problem is motivated by user-centric auditing
applications, and it also provides a novel approach to
misuse detection. We develop a framework for modeling
explanations which is based on a fundamental
observation: For certain classes of databases,
including EHRs, the reason for most data accesses can
be inferred from data stored elsewhere in the database.
For example, if Alice has an appointment with Dr. Dave,
this information is stored in the database, and it
explains why Dr. Dave looked at Alice's record. Large
numbers of data accesses can be explained using general
forms called explanation templates. Rather than
requiring an administrator to manually specify
explanation templates, we propose a set of algorithms
for automatically discovering frequent templates from
the database (i.e., those that explain a large number
of accesses). We also propose techniques for inferring
collaborative user groups, which can be used to enhance
the quality of the discovered explanations. Finally, we
have evaluated our proposed techniques using an access
log and data from the University of Michigan Health
System. Our results demonstrate that in practice we can
provide explanations for over 94\% of data accesses in
the log.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marcus:2011:HPS,
author = "Adam Marcus and Eugene Wu and David Karger and Samuel
Madden and Robert Miller",
title = "Human-powered sorts and joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "13--24",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourcing markets like Amazon's Mechanical Turk
(MTurk) make it possible to task people with small
jobs, such as labeling images or looking up phone
numbers, via a programmatic interface. MTurk tasks for
processing datasets with humans are currently designed
with significant reimplementation of common workflows
and ad-hoc selection of parameters such as price to pay
per task. We describe how we have integrated crowds
into a declarative workflow engine called Qurk to
reduce the burden on workflow designers. In this paper,
we focus on how to use humans to compare items for
sorting and joining data, two of the most common
operations in DBMSs. We describe our basic query
interface and the user interface of the tasks we post
to MTurk. We also propose a number of optimizations,
including task batching, replacing pairwise comparisons
with numerical ratings, and pre-filtering tables before
joining them, which dramatically reduce the overall
cost of running sorts and joins on the crowd. In an
experiment joining two sets of images, we reduce the
overall cost from \$67 in a naive implementation to
about \$3, without substantially affecting accuracy or
latency. In an end-to-end experiment, we reduced cost
by a factor of 14.5.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2011:VCS,
author = "Graham Cormode and Justin Thaler and Ke Yi",
title = "Verifying computations with streaming interactive
proofs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "25--36",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "When computation is outsourced, the data owner would
like to be assured that the desired computation has
been performed correctly by the service provider. In
theory, proof systems can give the necessary assurance,
but prior work is not sufficiently scalable or
practical. In this paper, we develop new proof
protocols for verifying computations which are
streaming in nature: the verifier (data owner) needs
only logarithmic space and a single pass over the
input, and after observing the input follows a simple
protocol with a prover (service provider) that takes
logarithmic communication spread over a logarithmic
number of rounds. These ensure that the computation is
performed correctly: that the service provider has not
made any errors or missed out some data. The guarantee
is very strong: even if the service provider
deliberately tries to cheat, there is only vanishingly
small probability of doing so undetected, while a
correct computation is always accepted. We first
observe that some theoretical results can be modified
to work with streaming verifiers, showing that there
are efficient protocols for problems in the complexity
classes NP and NC. Our main results then seek to bridge
the gap between theory and practice by developing
usable protocols for a variety of problems of central
importance in streaming and database processing. All
these problems require linear space in the traditional
streaming model, and therefore our protocols
demonstrate that adding a prover can exponentially
reduce the effort needed by the verifier. Our
experimental results show that our protocols are
practical and scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2011:MOI,
author = "Dan Lin and Christian S. Jensen and Rui Zhang and Lu
Xiao and Jiaheng Lu",
title = "A moving-object index for efficient query processing
with peer-wise location privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "37--48",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the growing use of location-based services,
location privacy attracts increasing attention from
users, industry, and the research community. While
considerable effort has been devoted to inventing
techniques that prevent service providers from knowing
a user's exact location, relatively little attention
has been paid to enabling so-called peer-wise
privacy---the protection of a user's location from
unauthorized peer users. This paper identifies an
important efficiency problem in existing peer-privacy
approaches that simply apply a filtering step to
identify users that are located in a query range, but
that do not want to disclose their location to the
querying peer. To solve this problem, we propose a
novel, privacy-policy enabled index called the PEB-tree
that seamlessly integrates location proximity and
policy compatibility. We propose efficient algorithms
that use the PEB-tree for processing privacy-aware
range and $k$ NN queries. Extensive experiments suggest
that the PEB-tree enables efficient query processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mansour:2011:EES,
author = "Essam Mansour and Amin Allam and Spiros Skiadopoulos
and Panos Kalnis",
title = "{ERA}: efficient serial and parallel suffix tree
construction for very long strings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "49--60",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The suffix tree is a data structure for indexing
strings. It is used in a variety of applications such
as bioinformatics, time series analysis, clustering,
text editing and data compression. However, when the
string and the resulting suffix tree are too large to
fit into the main memory, most existing construction
algorithms become very inefficient. This paper presents
a disk-based suffix tree construction method, called
Elastic Range (ERa), which works efficiently with very
long strings that are much larger than the available
memory. ERa partitions the tree construction process
horizontally and vertically and minimizes I/Os by
dynamically adjusting the horizontal partitions
independently for each vertical partition, based on the
evolving shape of the tree and the available memory.
Where appropriate, ERa also groups vertical partitions
together to amortize the I/O cost. We developed a
serial version; a parallel version for shared-memory
and shared-disk multi-core systems; and a parallel
version for shared-nothing architectures. ERa indexes
the entire human genome in 19 minutes on an ordinary
desktop computer. For comparison, the fastest existing
method needs 15 minutes using 1024 CPUs on an IBM
BlueGene supercomputer.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Krueger:2011:FUR,
author = "Jens Krueger and Changkyu Kim and Martin Grund and
Nadathur Satish and David Schwalb and Jatin Chhugani
and Hasso Plattner and Pradeep Dubey and Alexander
Zeier",
title = "Fast updates on read-optimized databases using
multi-core {CPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "61--72",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Read-optimized columnar databases use differential
updates to handle writes by maintaining a separate
write-optimized delta partition which is periodically
merged with the read-optimized and compressed main
partition. This merge process introduces significant
overheads and unacceptable downtimes in update
intensive systems, aspiring to combine transactional
and analytical workloads into one system. In the first
part of the paper, we report data analyses of 12 SAP
Business Suite customer systems. In the second half, we
present an optimized merge process reducing the merge
overhead of current systems by a factor of 30. Our
linear-time merge algorithm exploits the underlying
high compute and bandwidth resources of modern
multi-core CPUs with architecture-aware optimizations
and efficient parallelization. This enables compressed
in-memory column stores to handle the transactional
update rate required by enterprise applications, while
keeping properties of read-optimized databases for
analytic-style queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goyal:2011:DBA,
author = "Amit Goyal and Francesco Bonchi and Laks V. S.
Lakshmanan",
title = "A data-based approach to social influence
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "1",
pages = "73--84",
month = sep,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:06 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Influence maximization is the problem of finding a set
of users in a social network, such that by targeting
this set, one maximizes the expected spread of
influence in the network. Most of the literature on
this topic has focused exclusively on the social graph,
overlooking historical data, i.e., traces of past
action propagations. In this paper, we study influence
maximization from a novel data-based perspective. In
particular, we introduce a new model, which we call
credit distribution, that directly leverages available
propagation traces to learn how influence flows in the
network and uses this to estimate expected influence
spread. Our approach also learns the different levels
of influence-ability of users, and it is time-aware in
the sense that it takes the temporal nature of
influence into account. We show that influence
maximization under the credit distribution model is
NP-hard and that the function that defines expected
spread under our model is submodular. Based on these,
we develop an approximation algorithm for solving the
influence maximization problem that at once enjoys high
accuracy compared to the standard approach, while being
several orders of magnitude faster and more scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pavlo:2011:PMO,
author = "Andrew Pavlo and Evan P. C. Jones and Stanley Zdonik",
title = "On predictive modeling for optimizing transaction
execution in parallel {OLTP} systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "85--96",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A new emerging class of parallel database management
systems (DBMS) is designed to take advantage of the
partitionable workloads of on-line transaction
processing (OLTP) applications [23, 20]. Transactions
in these systems are optimized to execute to completion
on a single node in a shared-nothing cluster without
needing to coordinate with other nodes or use expensive
concurrency control measures [18]. But some OLTP
applications cannot be partitioned such that all of
their transactions execute within a single-partition in
this manner. These distributed transactions access data
not stored within their local partitions and
subsequently require more heavy-weight concurrency
control protocols. Further difficulties arise when the
transaction's execution properties, such as the number
of partitions it may need to access or whether it will
abort, are not known beforehand. The DBMS could
mitigate these performance issues if it is provided
with additional information about transactions. Thus,
in this paper we present a Markov model-based approach
for automatically selecting which optimizations a DBMS
could use, namely (1) more efficient concurrency
control schemes, (2) intelligent scheduling, (3)
reduced undo logging, and (4) speculative execution. To
evaluate our techniques, we implemented our models and
integrated them into a parallel, main-memory OLTP DBMS
to show that we can improve the performance of
applications with diverse workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goasdoue:2011:VSS,
author = "Fran{\c{c}}ois Goasdou{\'e} and Konstantinos Karanasos
and Julien Leblay and Ioana Manolescu",
title = "View selection in {Semantic Web} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "97--108",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider the setting of a Semantic Web database,
containing both explicit data encoded in RDF triples,
and implicit data, implied by the RDF semantics. Based
on a query workload, we address the problem of
selecting a set of views to be materialized in the
database, minimizing a combination of query processing,
view storage, and view maintenance costs. Starting from
an existing relational view selection method, we devise
new algorithms for recommending view sets, and show
that they scale significantly beyond the existing
relational ones when adapted to the RDF context. To
account for implicit triples in query answers, we
propose a novel RDF query reformulation algorithm and
an innovative way of incorporating it into view
selection in order to avoid a combinatorial explosion
in the complexity of the selection process. The
interest of our techniques is demonstrated through a
set of experiments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jestes:2011:BWH,
author = "Jeffrey Jestes and Ke Yi and Feifei Li",
title = "Building wavelet histograms on large data in
{MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "109--120",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce is becoming the de facto framework for
storing and processing massive data, due to its
excellent scalability, reliability, and elasticity. In
many MapReduce applications, obtaining a compact
accurate summary of data is essential. Among various
data summarization tools, histograms have proven to be
particularly important and useful for summarizing data,
and the wavelet histogram is one of the most widely
used histograms. In this paper, we investigate the
problem of building wavelet histograms efficiently on
large datasets in MapReduce. We measure the efficiency
of the algorithms by both end-to-end running time and
communication cost. We demonstrate straightforward
adaptations of existing exact and approximate methods
for building wavelet histograms to MapReduce clusters
are highly inefficient. To that end, we design new
algorithms for computing exact and approximate wavelet
histograms and discuss their implementation in
MapReduce. We illustrate our techniques in Hadoop, and
compare to baseline solutions with extensive
experiments performed in a heterogeneous Hadoop cluster
of 16 nodes, using large real and synthetic datasets,
up to hundreds of gigabytes. The results suggest
significant (often orders of magnitude) performance
improvement achieved by our new algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2011:SMD,
author = "Di Yang and Elke A. Rundensteiner and Matthew O.
Ward",
title = "Summarization and matching of density-based clusters
in streaming environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "121--132",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Density-based cluster mining is known to serve a broad
range of applications ranging from stock trade analysis
to moving object monitoring. Although methods for
efficient extraction of density-based clusters have
been studied in the literature, the problem of
summarizing and matching of such clusters with
arbitrary shapes and complex cluster structures remains
unsolved. Therefore, the goal of our work is to extend
the state-of-art of density-based cluster mining in
streams from cluster extraction only to now also
support analysis and management of the extracted
clusters. Our work solves three major technical
challenges. First, we propose a novel multi-resolution
cluster summarization method, called Skeletal Grid
Summarization (SGS), which captures the key features of
density-based clusters, covering both their external
shape and internal cluster structures. Second, in order
to summarize the extracted clusters in real-time, we
present an integrated computation strategy C-SGS, which
piggybacks the generation of cluster summarizations
within the online clustering process. Lastly, we design
a mechanism to efficiently execute cluster matching
queries, which identify similar clusters for given
cluster of analyst's interest from clusters extracted
earlier in the stream history. Our experimental study
using real streaming data shows the clear superiority
of our proposed methods in both efficiency and
effectiveness for cluster summarization and cluster
matching queries to other potential alternatives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nguyen:2011:MSM,
author = "Thanh Nguyen and Viviane Moreira and Huong Nguyen and
Hoa Nguyen and Juliana Freire",
title = "Multilingual schema matching for {Wikipedia}
infoboxes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "133--144",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent research has taken advantage of Wikipedia's
multi-lingualism as a resource for cross-language
information retrieval and machine translation, as well
as proposed techniques for enriching its cross-language
structure. The availability of documents in multiple
languages also opens up new opportunities for querying
structured Wikipedia content, and in particular, to
enable answers that straddle different languages. As a
step towards supporting such queries, in this paper, we
propose a method for identifying mappings between
attributes from infoboxes that come from pages in
different languages. Our approach finds mappings in a
completely automated fashion. Because it does not
require training data, it is scalable: not only can it
be used to find mappings between many language pairs,
but it is also effective for languages that are
under-represented and lack sufficient training samples.
Another important benefit of our approach is that it
does not depend on syntactic similarity between
attribute names, and thus, it can be applied to
language pairs that have distinct morphologies. We have
performed an extensive experimental evaluation using a
corpus consisting of pages in Portuguese, Vietnamese,
and English. The results show that not only does our
approach obtain high precision and recall, but it also
outperforms state-of-the-art techniques. We also
present a case study which demonstrates that the
multilingual mappings we derive lead to substantial
improvements in answer quality and coverage for
structured queries over Wikipedia content.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2011:CFP,
author = "Guimei Liu and Haojun Zhang and Limsoon Wong",
title = "Controlling false positives in association rule
mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "2",
pages = "145--156",
month = oct,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:08 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Association rule mining is an important problem in the
data mining area. It enumerates and tests a large
number of rules on a dataset and outputs rules that
satisfy user-specified constraints. Due to the large
number of rules being tested, rules that do not
represent real systematic effect in the data can
satisfy the given constraints purely by random chance.
Hence association rule mining often suffers from a high
risk of false positive errors. There is a lack of
comprehensive study on controlling false positives in
association rule mining. In this paper, we adopt three
multiple testing correction approaches---the direct
adjustment approach, the permutation-based approach and
the holdout approach---to control false positives in
association rule mining, and conduct extensive
experiments to study their performance. Our results
show that (1) Numerous spurious rules are generated if
no correction is made. (2) The three approaches can
control false positives effectively. Among the three
approaches, the permutation-based approach has the
highest power of detecting real association rules, but
it is very computationally expensive. We employ several
techniques to reduce its cost effectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Suchanek:2011:PPA,
author = "Fabian M. Suchanek and Serge Abiteboul and Pierre
Senellart",
title = "{PARIS}: probabilistic alignment of relations,
instances, and schema",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "157--168",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "One of the main challenges that the Semantic Web faces
is the integration of a growing number of independently
designed ontologies. In this work, we present PARIS, an
approach for the automatic alignment of ontologies.
PARIS aligns not only instances, but also relations and
classes. Alignments at the instance level
cross-fertilize with alignments at the schema level.
Thereby, our system provides a truly holistic solution
to the problem of ontology alignment. The heart of the
approach is probabilistic, i.e., we measure degrees of
matchings based on probability estimates. This allows
PARIS to run without any parameter tuning. We
demonstrate the efficiency of the algorithm and its
precision through extensive experiments. In particular,
we obtain a precision of around 90\% in experiments
with some of the world's largest ontologies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ranu:2011:ATQ,
author = "Sayan Ranu and Ambuj K. Singh",
title = "Answering top-$k$ queries over a mixture of attractive
and repulsive dimensions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "169--180",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we formulate a top-$k$ query that
compares objects in a database to a user-provided query
object on a novel scoring function. The proposed
scoring function combines the idea of attractive and
repulsive dimensions into a general framework to
overcome the weakness of traditional distance or
similarity measures. We study the properties of the
proposed class of scoring functions and develop
efficient and scalable index structures that index the
isolines of the function. We demonstrate various
scenarios where the query finds application. Empirical
evaluation demonstrates a performance gain of one to
two orders of magnitude on querying time over existing
state-of-the-art top-$k$ techniques. Further, a
qualitative analysis is performed on a real dataset to
highlight the potential of the proposed query in
discovering hidden data characteristics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Armbrust:2011:PST,
author = "Michael Armbrust and Kristal Curtis and Tim Kraska and
Armando Fox and Michael J. Franklin and David A.
Patterson",
title = "{PIQL}: success-tolerant query processing in the
cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "181--192",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Newly-released web applications often succumb to a
``Success Disaster,'' where overloaded database
machines and resulting high response times destroy a
previously good user experience. Unfortunately, the
data independence provided by a traditional relational
database system, while useful for agile development,
only exacerbates the problem by hiding potentially
expensive queries under simple declarative expressions.
As a result, developers of these applications are
increasingly abandoning relational databases in favor
of imperative code written against distributed
key/value stores, losing the many benefits of data
independence in the process. Instead, we propose PIQL,
a declarative language that also provides scale
independence by calculating an upper bound on the
number of key/value store operations that will be
performed for any query. Coupled with a service level
objective (SLO) compliance prediction model and PIQL's
scalable database architecture, these bounds make it
easy for developers to write success-tolerant
applications that support an arbitrarily large number
of users while still providing acceptable performance.
In this paper, we present the PIQL query processing
system and evaluate its scale independence on hundreds
of machines using two benchmarks, TPC-W and SCADr.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2011:GQE,
author = "Peixiang Zhao and Charu C. Aggarwal and Min Wang",
title = "{gSketch}: on query estimation in graph streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "193--204",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many dynamic applications are built upon large network
infrastructures, such as social networks, communication
networks, biological networks and the Web. Such
applications create data that can be naturally modeled
as graph streams, in which edges of the underlying
graph are received and updated sequentially in a form
of a stream. It is often necessary and important to
summarize the behavior of graph streams in order to
enable effective query processing. However, the sheer
size and dynamic nature of graph streams present an
enormous challenge to existing graph management
techniques. In this paper, we propose a new graph
sketch method, gSketch, which combines well studied
synopses for traditional data streams with a sketch
partitioning technique, to estimate and optimize the
responses to basic queries on graph streams. We
consider two different scenarios for query estimation:
(1) A graph stream sample is available; (2) Both a
graph stream sample and a query workload sample are
available. Algorithms for different scenarios are
designed respectively by partitioning a global sketch
to a group of localized sketches in order to optimize
the query estimation accuracy. We perform extensive
experimental studies on both real and synthetic data
sets and demonstrate the power and robustness of
gSketch in comparison with the state-of-the-art global
sketch method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ruttenberg:2011:IEM,
author = "Brian E. Ruttenberg and Ambuj K. Singh",
title = "Indexing the earth mover's distance using normal
distributions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "205--216",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Querying uncertain data sets (represented as
probability distributions) presents many challenges due
to the large amount of data involved and the
difficulties comparing uncertainty between
distributions. The Earth Mover's Distance (EMD) has
increasingly been employed to compare uncertain data
due to its ability to effectively capture the
differences between two distributions. Computing the
EMD entails finding a solution to the transportation
problem, which is computationally intensive. In this
paper, we propose a new lower bound to the EMD and an
index structure to significantly improve the
performance of EMD-based $K$-nearest neighbor ($K$-NN)
queries on uncertain databases. We propose a new lower
bound to the EMD that approximates the EMD on a
projection vector. Each distribution is projected onto
a vector and approximated by a normal distribution, as
well as an accompanying error term. We then represent
each normal as a point in a Hough transformed space. We
then use the concept of stochastic dominance to
implement an efficient index structure in the
transformed space. We show that our method
significantly decreases $K$-NN query time on uncertain
databases. The index structure also scales well with
database cardinality. It is well suited for
heterogeneous data sets, helping to keep EMD based
queries tractable as uncertain data sets become larger
and more complex.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qumsiyeh:2011:GER,
author = "Rani Qumsiyeh and Maria S. Pera and Yiu-Kai Ng",
title = "Generating exact- and ranked partially-matched answers
to questions in advertisements",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "217--228",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Taking advantage of the Web, many advertisements (ads
for short) websites, which aspire to increase client's
transactions and thus profits, offer searching tools
which allow users to (i) post keyword queries to
capture their information needs or (ii) invoke
form-based interfaces to create queries by selecting
search options, such as a price range, filled-in
entries, check boxes, or drop-down menus. These search
mechanisms, however, are inadequate, since they cannot
be used to specify a natural-language query with rich
syntactic and semantic content, which can only be
handled by a question answering (QA) system.
Furthermore, existing ads websites are incapable of
evaluating arbitrary Boolean queries or retrieving
partially-matched answers that might be of interest to
the user whenever a user's search yields only a few or
no results at all. In solving these problems, we
present a QA system for ads, called CQAds, which (i)
allows users to post a natural-language question Q for
retrieving relevant ads, if they exist, (ii) identifies
ads as answers that partially-match the requested
information expressed in Q, if insufficient or no
answers to Q can be retrieved, which are ordered using
a similarity-ranking approach, and (iii) analyzes
incomplete or ambiguous questions to perform the ``best
guess'' in retrieving answers that ``best match'' the
selection criteria specified in Q. CQAds is also
equipped with a Boolean model to evaluate Boolean
operators that are either explicitly or implicitly
specified in Q, i.e., with or without Boolean operators
specified by the users, respectively. CQAds is easy to
use, scalable to all ads domains, and more powerful
than search tools provided by existing ads websites,
since its query-processing strategy retrieves relevant
ads of higher quality and quantity. We have verified
the accuracy of CQAds in retrieving ads on eight ads
domains and compared its ranking strategy with other
well-known ranking approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fakas:2011:SOS,
author = "Georgios J. Fakas and Zhi Cai and Nikos Mamoulis",
title = "Size-$l$ object summaries for relational keyword
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "229--240",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A previously proposed keyword search paradigm
produces, as a query result, a ranked list of Object
Summaries (OSs). An OS is a tree structure of related
tuples that summarizes all data held in a relational
database about a particular Data Subject (DS). However,
some of these OSs are very large in size and therefore
unfriendly to users that initially prefer synoptic
information before proceeding to more comprehensive
information about a particular DS. In this paper, we
investigate the effective and efficient retrieval of
                 concise and informative OSs. We argue that a good
                 size-$l$ OS should be a stand-alone and meaningful
                 synopsis of the most important information about the
                 particular DS. More precisely, we define a size-$l$ OS
                 as a partial OS composed of $l$ important tuples. We
                 propose three algorithms for the efficient generation
                 of size-$l$ OSs
(in addition to the optimal approach which requires
exponential time). Experimental evaluation on DBLP and
TPC-H databases verifies the effectiveness and
efficiency of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2011:RER,
author = "Lujun Fang and Anish Das Sarma and Cong Yu and Philip
Bohannon",
title = "{REX}: explaining relationships between entity pairs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "241--252",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Knowledge bases of entities and relations (either
constructed manually or automatically) are behind many
real world search engines, including those at Yahoo!,
Microsoft, and Google. Those knowledge bases can be
viewed as graphs with nodes representing entities and
edges representing (primary) relationships, and various
studies have been conducted on how to leverage them to
answer entity seeking queries. Meanwhile, in a
complementary direction, analyses over the query logs
have enabled researchers to identify entity pairs that
are statistically correlated. Such entity relationships
are then presented to search users through the
``related searches'' feature in modern search engines.
However, entity relationships thus discovered can often
be ``puzzling'' to the users because why the entities
are connected is often indescribable. In this paper, we
propose a novel problem called entity relationship
explanation, which seeks to explain why a pair of
entities are connected, and solve this challenging
problem by integrating the above two complementary
approaches, i.e., we leverage the knowledge base to
``explain'' the connections discovered between entity
pairs. More specifically, we present REX, a system that
takes a pair of entities in a given knowledge base as
input and efficiently identifies a ranked list of
relationship explanations. We formally define
relationship explanations and analyze their desirable
properties. Furthermore, we design and implement
algorithms to efficiently enumerate and rank all
relationship explanations based on multiple measures of
``interestingness.'' We perform extensive experiments
over real web-scale data gathered from DBpedia and a
commercial search engine, demonstrating the efficiency
and scalability of REX. We also perform user studies to
corroborate the effectiveness of explanations generated
by REX.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2011:PJP,
author = "Guoliang Li and Dong Deng and Jiannan Wang and Jianhua
Feng",
  title =        "{Pass-Join}: a partition-based method for similarity
                 joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "253--264",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As an essential operation in data cleaning, the
similarity join has attracted considerable attention
from the database community. In this paper, we study
string similarity joins with edit-distance constraints,
which find similar string pairs from two large sets of
strings whose edit distance is within a given
threshold. Existing algorithms are efficient either for
short strings or for long strings, and there is no
algorithm that can efficiently and adaptively support
both short strings and long strings. To address this
problem, we propose a partition-based method called
Pass-Join. Pass-Join partitions a string into a set of
segments and creates inverted indices for the segments.
Then for each string, Pass-Join selects some of its
substrings and uses the selected substrings to find
candidate pairs using the inverted indices. We devise
efficient techniques to select the substrings and prove
that our method can minimize the number of selected
substrings. We develop novel pruning techniques to
efficiently verify the candidate pairs. Experimental
results show that our algorithms are efficient for both
short strings and long strings, and outperform
state-of-the-art methods on real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hoobin:2011:RLZ,
author = "Christopher Hoobin and Simon J. Puglisi and Justin
Zobel",
title = "Relative {Lempel--Ziv} factorization for efficient
storage and retrieval of {Web} collections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "3",
pages = "265--273",
month = nov,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:09 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Compression techniques that support fast random access
are a core component of any information system. Current
state-of-the-art methods group documents into
fixed-sized blocks and compress each block with a
general-purpose adaptive algorithm such as gzip. Random
access to a specific document then requires
decompression of a block. The choice of block size is
critical: it trades between compression effectiveness
and document retrieval times. In this paper we present
a scalable compression method for large document
collections that allows fast random access. We build a
representative sample of the collection and use it as a
                 dictionary in an LZ77-like encoding of the rest of the
collection, relative to the dictionary. We demonstrate
on large collections, that using a dictionary as small
as 0.1\% of the collection size, our algorithm is
dramatically faster than previous methods, and in
general gives much better compression.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2011:TCE,
author = "Ning Zhang and Junichi Tatemura and Jignesh M. Patel
and Hakan Hacig{\"u}m{\"u}s",
title = "Towards cost-effective storage provisioning for
{DBMSs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "274--285",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data center operators face a bewildering set of
choices when considering how to provision resources on
machines with complex I/O subsystems. Modern I/O
subsystems often have a rich mix of fast, high
performing, but expensive SSDs sitting alongside with
cheaper but relatively slower (for random accesses)
traditional hard disk drives. The data center operators
need to determine how to provision the I/O resources
for specific workloads so as to abide by existing
Service Level Agreements (SLAs), while minimizing the
total operating cost (TOC) of running the workload,
where the TOC includes the amortized hardware costs and
the run time energy costs. The focus of this paper is
on introducing this new problem of TOC-based storage
allocation, cast in a framework that is compatible with
traditional DBMS query optimization and query
processing architecture. We also present a
heuristic-based solution to this problem, called DOT.
We have implemented DOT in PostgreSQL, and experiments
using TPC-H and TPC-C demonstrate significant TOC
reduction by DOT in various settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Roh:2011:BTI,
author = "Hongchan Roh and Sanghyun Park and Sungho Kim and
Mincheol Shin and Sang-Won Lee",
title = "{B+}-tree index optimization by exploiting internal
parallelism of flash-based solid state drives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "286--297",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Previous research addressed the potential problems of
                 the hard-disk-oriented design of DBMSs on flashSSDs. In
this paper, we focus on exploiting potential benefits
of flashSSDs. First, we examine the internal
parallelism issues of flashSSDs by conducting
benchmarks to various flashSSDs. Then, we suggest
algorithm-design principles in order to best benefit
from the internal parallelism. We present a new I/O
request concept, called psync I/O that can exploit the
internal parallelism of flashSSDs in a single process.
Based on these ideas, we introduce B+-tree optimization
methods in order to utilize internal parallelism. By
integrating the results of these methods, we present a
B+-tree variant, PIO B-tree. We confirmed that each
optimization method substantially enhances the index
performance. Consequently, PIO B-tree enhanced
B+-tree's insert performance by a factor of up to 16.3,
while improving point-search performance by a factor of
1.2. The range search of PIO B-tree was up to 5 times
faster than that of the B+-tree. Moreover, PIO B-tree
outperformed other flash-aware indexes in various
synthetic workloads. We also confirmed that PIO B-tree
outperforms B+-tree in index traces collected inside
the PostgreSQL DBMS with TPC-C benchmark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Larson:2011:HPC,
author = "Per-{\AA}ke Larson and Spyros Blanas and Cristian
Diaconu and Craig Freedman and Jignesh M. Patel and
Mike Zwilling",
title = "High-performance concurrency control mechanisms for
main-memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "298--309",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A database system optimized for in-memory storage can
support much higher transaction rates than current
systems. However, standard concurrency control methods
used today do not scale to the high transaction rates
achievable by such systems. In this paper we introduce
two efficient concurrency control methods specifically
designed for main-memory databases. Both use
multiversioning to isolate read-only transactions from
updates but differ in how atomicity is ensured: one is
optimistic and one is pessimistic. To avoid expensive
context switching, transactions never block during
normal processing but they may have to wait before
commit to ensure correct serialization ordering. We
also implemented a main-memory optimized version of
single-version locking. Experimental results show that
while single-version locking works well when
transactions are short and contention is low
performance degrades under more demanding conditions.
The multiversion schemes have higher overhead but are
much less sensitive to hotspots and the presence of
long-running transactions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ma:2011:CTG,
author = "Shuai Ma and Yang Cao and Wenfei Fan and Jinpeng Huai
and Tianyu Wo",
title = "Capturing topology in graph pattern matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "310--321",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph pattern matching is often defined in terms of
                 subgraph isomorphism, an NP-complete problem. To lower
its complexity, various extensions of graph simulation
have been considered instead. These extensions allow
pattern matching to be conducted in cubic-time.
However, they fall short of capturing the topology of
data graphs, i.e., graphs may have a structure
drastically different from pattern graphs they match,
and the matches found are often too large to understand
and analyze. To rectify these problems, this paper
proposes a notion of strong simulation, a revision of
graph simulation, for graph pattern matching. (1) We
identify a set of criteria for preserving the topology
of graphs matched. We show that strong simulation
preserves the topology of data graphs and finds a
bounded number of matches. (2) We show that strong
simulation retains the same complexity as earlier
extensions of simulation, by providing a cubic-time
algorithm for computing strong simulation. (3) We
present the locality property of strong simulation,
which allows us to effectively conduct pattern matching
on distributed graphs. (4) We experimentally verify the
effectiveness and efficiency of these algorithms, using
real-life data and synthetic data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kumar:2011:PMO,
author = "Arun Kumar and Christopher R{\'e}",
title = "Probabilistic management of {OCR} data using an
{RDBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "322--333",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The digitization of scanned forms and documents is
changing the data sources that enterprises manage. To
integrate these new data sources with enterprise data,
the current state-of-the-art approach is to convert the
images to ASCII text using optical character
recognition (OCR) software and then to store the
resulting ASCII text in a relational database. The OCR
problem is challenging, and so the output of OCR often
contains errors. In turn, queries on the output of OCR
may fail to retrieve relevant answers. State-of-the-art
OCR programs, e.g., the OCR powering Google Books, use
a probabilistic model that captures many alternatives
during the OCR process. Only when the results of OCR
are stored in the database, do these approaches discard
the uncertainty. In this work, we propose to retain the
probabilistic models produced by OCR process in a
relational database management system. A key technical
challenge is that the probabilistic data produced by
OCR software is very large (a single book blows up to
2GB from 400kB as ASCII). As a result, a baseline
solution that integrates these models with an RDBMS is
over 1000x slower versus standard text processing for
single table select-project queries. However, many
applications may have quality-performance needs that
are in between these two extremes of ASCII and the
complete model output by the OCR software. Thus, we
propose a novel approximation scheme called Staccato
that allows a user to trade recall for query
performance. Additionally, we provide a formal analysis
of our scheme's properties, and describe how we
integrate our scheme with standard-RDBMS text
indexing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pawlik:2011:RRA,
author = "Mateusz Pawlik and Nikolaus Augsten",
title = "{RTED}: a robust algorithm for the tree edit
distance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "334--345",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider the classical tree edit distance between
ordered labeled trees, which is defined as the
minimum-cost sequence of node edit operations that
transform one tree into another. The state-of-the-art
solutions for the tree edit distance are not
satisfactory. The main competitors in the field either
have optimal worst-case complexity, but the worst case
happens frequently, or they are very efficient for some
tree shapes, but degenerate for others. This leads to
unpredictable and often infeasible runtimes. There is
no obvious way to choose between the algorithms. In
this paper we present RTED, a robust tree edit distance
algorithm. The asymptotic complexity of RTED is smaller
or equal to the complexity of the best competitors for
any input instance, i.e., RTED is both efficient and
worst-case optimal. We introduce the class of LRH
(Left-Right-Heavy) algorithms, which includes RTED and
the fastest tree edit distance algorithms presented in
literature. We prove that RTED outperforms all
previously proposed LRH algorithms in terms of runtime
complexity. In our experiments on synthetic and real
world data we empirically evaluate our solution and
compare it to the state-of-the-art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amsterdamer:2011:PLP,
author = "Yael Amsterdamer and Susan B. Davidson and Daniel
Deutch and Tova Milo and Julia Stoyanovich and Val
Tannen",
  title =        "Putting lipstick on {Pig}: enabling database-style
                 workflow provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "346--357",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Workflow provenance typically assumes that each module
is a ``black-box'', so that each output depends on all
inputs (coarse-grained dependencies). Furthermore, it
does not model the internal state of a module, which
can change between repeated executions. In practice,
however, an output may depend on only a small subset of
the inputs (fine-grained dependencies) as well as on
the internal state of the module. We present a novel
provenance framework that marries database-style and
workflow-style provenance, by using Pig Latin to expose
the functionality of modules, thus capturing internal
state and fine-grained dependencies. A critical
ingredient in our solution is the use of a novel form
of provenance graph that models module invocations and
yields a compact representation of fine-grained
workflow provenance. It also enables a number of novel
graph transformation operations, allowing to choose the
desired level of granularity in provenance querying
(ZoomIn and ZoomOut), and supporting ``what-if''
workflow analytic queries. We implemented our approach
in the Lipstick system and developed a benchmark in
support of a systematic performance evaluation. Our
results demonstrate the feasibility of tracking and
querying fine-grained workflow provenance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2011:RAS,
author = "Jun Gao and Ruoming Jin and Jiashuai Zhou and Jeffrey
Xu Yu and Xiao Jiang and Tengjiao Wang",
title = "Relational approach for shortest path discovery over
large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "358--369",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the rapid growth of large graphs, we cannot
assume that graphs can still be fully loaded into
memory, thus the disk-based graph operation is
inevitable. In this paper, we take the shortest path
discovery as an example to investigate the technique
issues when leveraging existing infrastructure of
relational database (RDB) in the graph data management.
Based on the observation that a variety of graph search
queries can be implemented by iterative operations
including selecting frontier nodes from visited nodes,
making expansion from the selected frontier nodes, and
merging the expanded nodes into the visited ones, we
introduce a relational FEM framework with three
corresponding operators to implement graph search tasks
in the RDB context. We show new features such as window
function and merge statement introduced by recent SQL
standards can not only simplify the expression but also
improve the performance of the FEM framework. In
addition, we propose two optimization strategies
specific to shortest path discovery inside the FEM
framework. First, we take a bi-directional set
Dijkstra's algorithm in the path finding. The
bi-directional strategy can reduce the search space,
and set Dijkstra's algorithm finds the shortest path in
a set-at-a-time fashion. Second, we introduce an index
named SegTable to preserve the local shortest segments,
and exploit SegTable to further improve the
performance. The final extensive experimental results
illustrate our relational approach with the
optimization strategies achieves high scalability and
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Barsky:2011:MFC,
author = "Marina Barsky and Sangkyum Kim and Tim Weninger and
Jiawei Han",
title = "Mining flipping correlations from large datasets with
taxonomies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "370--381",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we introduce a new type of pattern --- a
flipping correlation pattern. The flipping patterns are
obtained from contrasting the correlations between
items at different levels of abstraction. They
represent surprising correlations, both positive and
negative, which are specific for a given abstraction
level, and which ``flip'' from positive to negative and
vice versa when items are generalized to a higher level
of abstraction. We design an efficient algorithm for
finding flipping correlations, the Flipper algorithm,
which outperforms na{\"\i}ve pattern mining methods by
several orders of magnitude. We apply Flipper to
real-life datasets and show that the discovered
patterns are non-redundant, surprising and actionable.
Flipper finds strong contrasting correlations in
itemsets with low-to-medium support, while existing
techniques cannot handle the pattern discovery in this
frequency range.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Konig:2011:SAT,
author = "Arnd Christian K{\"o}nig and Bolin Ding and Surajit
Chaudhuri and Vivek Narasayya",
title = "A statistical approach towards robust progress
estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "4",
pages = "382--393",
month = dec,
year = "2011",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:11 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The need for accurate SQL progress estimation in the
context of decision support administration has led to a
number of techniques proposed for this task.
Unfortunately, no single one of these progress
estimators behaves robustly across the variety of SQL
queries encountered in practice, meaning that each
technique performs poorly for a significant fraction of
queries. This paper proposes a novel estimator
selection framework that uses a statistical model to
characterize the sets of conditions under which certain
estimators outperform others, leading to a significant
increase in estimation robustness. The generality of
this framework also enables us to add a number of novel
``special purpose'' estimators which increase accuracy
further. Most importantly, the resulting model
generalizes well to queries very different from the
ones used to train it. We validate our findings using a
large number of industrial real-life and benchmark
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2012:RSA,
author = "Yizhou Sun and Charu C. Aggarwal and Jiawei Han",
title = "Relation strength-aware clustering of heterogeneous
information networks with incomplete attributes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "394--405",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the rapid development of online social media,
online shopping sites and cyber-physical systems,
heterogeneous information networks have become
increasingly popular and content-rich over time. In
many cases, such networks contain multiple types of
objects and links, as well as different kinds of
attributes. The clustering of these objects can provide
useful insights in many applications. However, the
clustering of such networks can be challenging since
(a) the attribute values of objects are often
incomplete, which implies that an object may carry only
partial attributes or even no attributes to correctly
label itself; and (b) the links of different types may
carry different kinds of semantic meanings, and it is a
difficult task to determine the nature of their
relative importance in helping the clustering for a
given purpose. In this paper, we address these
challenges by proposing a model-based clustering
algorithm. We design a probabilistic model which
clusters the objects of different types into a common
hidden space, by using a user-specified set of
attributes, as well as the links from different
relations. The strengths of different types of links
are automatically learned, and are determined by the
given purpose of clustering. An iterative algorithm is
designed for solving the clustering problem, in which
the strengths of different types of links and the
quality of clustering results mutually enhance each
other. Our experimental results on real and synthetic
data sets demonstrate the effectiveness and efficiency
of the algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2012:SPD,
author = "Lingkun Wu and Xiaokui Xiao and Dingxiong Deng and Gao
Cong and Andy Diwen Zhu and Shuigeng Zhou",
title = "Shortest path and distance queries on road networks:
an experimental evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "406--417",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Computing the shortest path between two given
locations in a road network is an important problem
that finds applications in various map services and
commercial navigation products. The state-of-the-art
solutions for the problem can be divided into two
categories: spatial-coherence-based methods and
vertex-importance-based approaches. The two categories
of techniques, however, have not been compared
systematically under the same experimental framework,
as they were developed from two independent lines of
research that do not refer to each other. This renders
it difficult for a practitioner to decide which
technique should be adopted for a specific application.
Furthermore, the experimental evaluation of the
existing techniques, as presented in previous work,
falls short in several aspects. Some methods were
tested only on small road networks with up to one
hundred thousand vertices; some approaches were
evaluated using distance queries (instead of shortest
path queries), namely, queries that ask only for the
length of the shortest path; a state-of-the-art
technique was examined based on a faulty implementation
that led to incorrect query results. To address the
above issues, this paper presents a comprehensive
comparison of the most advanced spatial-coherence-based
and vertex-importance-based approaches. Using a variety
of real road networks with up to twenty million
vertices, we evaluated each technique in terms of its
preprocessing time, space consumption, and query
efficiency (for both shortest path and distance
queries). Our experimental results reveal the
characteristics of different techniques, based on which
we provide guidelines on selecting appropriate methods
for various scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Erdos:2012:FPP,
author = "D{\'o}ra Erd{\H{o}}s and Vatche Ishakian and Andrei
Lapets and Evimaria Terzi and Azer Bestavros",
title = "The filter-placement problem and its application to
minimizing information multiplicity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "418--429",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In many information networks, data items --- such as
updates in social networks, news flowing through
interconnected RSS feeds and blogs, measurements in
sensor networks, route updates in ad-hoc networks ---
propagate in an uncoordinated manner: nodes often relay
information they receive to neighbors, independent of
whether or not these neighbors received the same
information from other sources. This uncoordinated data
dissemination may result in significant, yet
unnecessary communication and processing overheads,
ultimately reducing the utility of information
networks. To alleviate the negative impacts of this
information multiplicity phenomenon, we propose that a
subset of nodes (selected at key positions in the
network) carry out additional information filtering
functionality. Thus, nodes are responsible for the
removal (or significant reduction) of the redundant
data items relayed through them. We refer to such nodes
as filters. We formally define the Filter Placement
problem as a combinatorial optimization problem, and
study its computational complexity for different types
of graphs. We also present polynomial-time
approximation algorithms and scalable heuristics for
the problem. Our experimental results, which we
obtained through extensive simulations on synthetic and
real-world information flow networks, suggest that in
many settings a relatively small number of filters are
fairly effective in removing a large fraction of
redundant information.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Satuluri:2012:BLS,
author = "Venu Satuluri and Srinivasan Parthasarathy",
title = "{Bayesian} locality sensitive hashing for fast
similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "430--441",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a collection of objects and an associated
similarity measure, the all-pairs similarity search
problem asks us to find all pairs of objects with
similarity greater than a certain user-specified
threshold. Locality-sensitive hashing (LSH) based
methods have become a very popular approach for this
problem. However, most such methods only use LSH for
the first phase of similarity search --- i.e. efficient
indexing for candidate generation. In this paper, we
present BayesLSH, a principled Bayesian algorithm for
the subsequent phase of similarity search ---
performing candidate pruning and similarity estimation
using LSH. A simpler variant, BayesLSH-Lite, which
calculates similarities exactly, is also presented. Our
algorithms are able to quickly prune away a large
majority of the false positive candidate pairs, leading
to significant speedups over baseline approaches. For
BayesLSH, we also provide probabilistic guarantees on
the quality of the output, both in terms of accuracy
and recall. Finally, the quality of BayesLSH's output
can be easily tuned and does not require any manual
setting of the number of hashes to use for similarity
estimation, unlike standard approaches. For two
state-of-the-art candidate generation algorithms,
AllPairs and LSH, BayesLSH enables significant
speedups, typically in the range 2x--20x for a wide
variety of datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fujiwara:2012:FET,
author = "Yasuhiro Fujiwara and Makoto Nakatsuji and Makoto
Onizuka and Masaru Kitsuregawa",
title = "Fast and exact top-$k$ search for random walk with
restart",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "442--453",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs are fundamental data structures and have been
employed for centuries to model real-world systems and
phenomena. Random walk with restart (RWR) provides a
good proximity score between two nodes in a graph, and
it has been successfully used in many applications such
as automatic image captioning, recommender systems, and
link prediction. The goal of this work is to find nodes
that have top-$k$ highest proximities for a given node.
Previous approaches to this problem find nodes
efficiently at the expense of exactness. The main
motivation of this paper is to answer, in the
affirmative, the question, `Is it possible to improve
the search time without sacrificing the exactness?'.
Our solution, K-dash, is based on two ideas: (1) It
computes the proximity of a selected node efficiently
by sparse matrices, and (2) It skips unnecessary
proximity computations when searching for the top-$k$
nodes. Theoretical analyses show that K-dash guarantees
result exactness. We perform comprehensive experiments
to verify the efficiency of K-dash. The results show
that K-dash can find top-$k$ nodes significantly faster
than the previous approaches while it guarantees
exactness.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bahmani:2012:DSS,
author = "Bahman Bahmani and Ravi Kumar and Sergei
Vassilvitskii",
title = "Densest subgraph in streaming and {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "454--465",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of finding locally dense components of a
graph is an important primitive in data analysis, with
wide-ranging applications from community mining to spam
detection and the discovery of biological network
modules. In this paper we present new algorithms for
finding the densest subgraph in the streaming model.
For any $ \epsilon > 0 $, our algorithms make $
O(\log_{1 + \epsilon } n) $ passes over the input and
find a subgraph whose density is guaranteed to be
within a factor $ 2 (1 + \epsilon) $ of the optimum.
Our algorithms are also easily parallelizable and we
illustrate this by realizing them in the MapReduce
model. In addition we perform extensive experimental
evaluation on massive real-world graphs showing the
performance and scalability of our algorithms in
practice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Silva:2012:MAS,
author = "Arlei Silva and Wagner {Meira, Jr.} and Mohammed J.
Zaki",
title = "Mining attribute-structure correlated patterns in
large attributed graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "466--477",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this work, we study the correlation between
attribute sets and the occurrence of dense subgraphs in
large attributed graphs, a task we call structural
correlation pattern mining. A structural correlation
pattern is a dense subgraph induced by a particular
attribute set. Existing methods are not able to extract
relevant knowledge regarding how vertex attributes
interact with dense subgraphs. Structural correlation
pattern mining combines aspects of frequent itemset and
quasi-clique mining problems. We propose statistical
significance measures that compare the structural
correlation of attribute sets against their expected
values using null models. Moreover, we evaluate the
interestingness of structural correlation patterns in
terms of size and density. An efficient algorithm that
combines search and pruning strategies in the
identification of the most relevant structural
correlation patterns is presented. We apply our method
for the analysis of three real-world attributed graphs:
a collaboration, a music, and a citation network,
verifying that it provides valuable knowledge in a
feasible time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schnaitter:2012:SAI,
author = "Karl Schnaitter and Neoklis Polyzotis",
title = "Semi-automatic index tuning: keeping {DBAs} in the
loop",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "478--489",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To obtain a high level of system performance, a
database administrator (DBA) must choose a set of
indices that is appropriate for the workload. The
system can aid in this challenging task by providing
recommendations for the index configuration. We propose
a new index recommendation technique, termed
semi-automatic tuning, that keeps the DBA ``in the
loop'' by generating recommendations that use feedback
about the DBA's preferences. The technique also works
online, which avoids the limitations of commercial
tools that require the workload to be known in advance.
The foundation of our approach is the Work Function
Algorithm, which can solve a wide variety of online
optimization problems with strong competitive
guarantees. We present an experimental analysis that
validates the benefits of semi-automatic tuning in a
wide variety of conditions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fink:2012:APD,
author = "Robert Fink and Larisa Han and Dan Olteanu",
title = "Aggregation in probabilistic databases via knowledge
compilation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "5",
pages = "490--501",
month = jan,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:13 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a query evaluation technique for
positive relational algebra queries with aggregates on
a representation system for probabilistic data based on
the algebraic structures of semiring and semimodule.
The core of our evaluation technique is a procedure
that compiles semimodule and semiring expressions into
so-called decomposition trees, for which the
computation of the probability distribution can be done
in time linear in the product of the sizes of the
probability distributions represented by its nodes. We
give syntactic characterisations of tractable queries
with aggregates by exploiting the connection between
query tractability and polynomial-time decomposition
trees. A prototype of the technique is incorporated in
the probabilistic database engine SPROUT. We report on
performance experiments with custom datasets and TPC-H
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Halim:2012:SDC,
author = "Felix Halim and Stratos Idreos and Panagiotis Karras
and Roland H. C. Yap",
title = "Stochastic database cracking: towards robust adaptive
indexing in main-memory column-stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "502--513",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern business applications and scientific databases
call for inherently dynamic data storage environments.
Such environments are characterized by two challenging
features: (a) they have little idle system time to
devote on physical design; and (b) there is little, if
any, a priori workload knowledge, while the query and
data workload keeps changing dynamically. In such
environments, traditional approaches to index building
and maintenance cannot apply. Database cracking has
been proposed as a solution that allows on-the-fly
physical data reorganization, as a collateral effect of
query processing. Cracking aims to continuously and
automatically adapt indexes to the workload at hand,
without human intervention. Indexes are built
incrementally, adaptively, and on demand. Nevertheless,
as we show, existing adaptive indexing methods fail to
deliver workload-robustness; they perform much better
with random workloads than with others. This frailty
derives from the inelasticity with which these
approaches interpret each query as a hint on how data
should be stored. Current cracking schemes blindly
reorganize the data within each query's range, even if
that results into successive expensive operations with
minimal indexing benefit. In this paper, we introduce
stochastic cracking, a significantly more resilient
approach to adaptive indexing. Stochastic cracking also
uses each query as a hint on how to reorganize data,
but not blindly so; it gains resilience and avoids
performance bottlenecks by deliberately applying
certain arbitrary choices in its decision-making.
Thereby, we bring adaptive indexing forward to a mature
formulation that confers the workload-robustness
previous approaches lacked. Our extensive experimental
study verifies that stochastic cracking maintains the
desired properties of original database cracking while
at the same time it performs well with diverse
realistic workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2012:AMA,
author = "Chao Li and Gerome Miklau",
title = "An adaptive mechanism for accurate query answering
under differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "514--525",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a novel mechanism for answering sets of
counting queries under differential privacy. Given a
workload of counting queries, the mechanism
automatically selects a different set of ``strategy''
queries to answer privately, using those answers to
derive answers to the workload. The main algorithm
proposed in this paper approximates the optimal
strategy for any workload of linear counting queries.
With no cost to the privacy guarantee, the mechanism
improves significantly on prior approaches and achieves
near-optimal error for many workloads, when applied
under $ (\epsilon, \delta)$-differential privacy. The
result is an adaptive mechanism which can help users
achieve good utility without requiring that they reason
carefully about the best formulation of their task.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Giannikis:2012:SKO,
author = "Georgios Giannikis and Gustavo Alonso and Donald
Kossmann",
title = "{SharedDB}: killing one thousand queries with one
stone",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "526--537",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional database systems are built around the
query-at-a-time model. This approach tries to optimize
performance in a best-effort way. Unfortunately, best
effort is not good enough for many modern applications.
These applications require response time guarantees in
high load situations. This paper describes the design
of a new database architecture that is based on
batching queries and shared computation across possibly
hundreds of concurrent queries and updates. Performance
experiments with the TPC-W benchmark show that the
performance of our implementation, SharedDB, is indeed
robust across a wide range of dynamic workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Selke:2012:PBC,
author = "Joachim Selke and Christoph Lofi and Wolf-Tilo Balke",
title = "Pushing the boundaries of crowd-enabled databases with
query-driven schema expansion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "538--549",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "By incorporating human workers into the query
execution process crowd-enabled databases facilitate
intelligent, social capabilities like completing
missing data at query time or performing cognitive
operators. But despite all their flexibility,
crowd-enabled databases still maintain rigid schemas.
In this paper, we extend crowd-enabled databases by
flexible query-driven schema expansion, allowing the
addition of new attributes to the database at query
time. However, the number of crowd-sourced mini-tasks
to fill in missing values may often be prohibitively
large and the resulting data quality is doubtful.
Instead of simple crowd-sourcing to obtain all values
individually, we leverage the user-generated data found
in the Social Web: By exploiting user ratings we build
perceptual spaces, i.e., highly-compressed
representations of opinions, impressions, and
perceptions of large numbers of users. Using few
training samples obtained by expert crowd sourcing, we
then can extract all missing data automatically from
the perceptual space with high quality and at low
costs. Extensive experiments show that our approach can
boost both performance and quality of crowd-enabled
databases, while also providing the flexibility to
expand schemas in a query-driven fashion.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2012:BAD,
author = "Bo Zhao and Benjamin I. P. Rubinstein and Jim Gemmell
and Jiawei Han",
title = "A {Bayesian} approach to discovering truth from
conflicting sources for data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "550--561",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In practical data integration systems, it is common
for the data sources being integrated to provide
conflicting information about the same entity.
Consequently, a major challenge for data integration is
to derive the most complete and accurate integrated
records from diverse and sometimes conflicting sources.
We term this challenge the truth finding problem. We
observe that some sources are generally more reliable
than others, and therefore a good model of source
quality is the key to solving the truth finding
problem. In this work, we propose a probabilistic
graphical model that can automatically infer true
records and source quality without any supervision. In
contrast to previous methods, our principled approach
leverages a generative process of two types of errors
(false positive and false negative) by modeling two
different aspects of source quality. In so doing, ours
is also the first approach designed to merge
multi-valued attribute types. Our method is scalable,
due to an efficient sampling-based inference algorithm
that needs very few iterations in practice and enjoys
linear time complexity, with an even faster incremental
variant. Experiments on two real world datasets show
that our new method outperforms existing
state-of-the-art approaches to the truth finding
problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Upadhyaya:2012:HPS,
author = "Prasang Upadhyaya and Magdalena Balazinska and Dan
Suciu",
title = "How to price shared optimizations in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "562--573",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-management-as-a-service systems are increasingly
being used in collaborative settings, where multiple
users access common datasets. Cloud providers have the
choice to implement various optimizations, such as
indexing or materialized views, to accelerate queries
over these datasets. Each optimization carries a cost
and may benefit multiple users. This creates a major
challenge: how to select which optimizations to perform
and how to share their cost among users. The problem is
especially challenging when users are selfish and will
only report their true values for different
optimizations if doing so maximizes their utility. In
this paper, we present a new approach for selecting and
pricing shared optimizations by using Mechanism Design.
We first show how to apply the Shapley Value Mechanism
to the simple case of selecting and pricing additive
optimizations, assuming an offline game where all users
access the service for the same time-period. Second, we
extend the approach to online scenarios where users
come and go. Finally, we consider the case of
substitutive optimizations. We show analytically that
our mechanisms induce truthfulness and recover the
optimization costs. We also show experimentally that
our mechanisms yield higher utility than the
state-of-the-art approach based on regret
accumulation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Angel:2012:DSM,
author = "Albert Angel and Nikos Sarkas and Nick Koudas and
Divesh Srivastava",
title = "Dense subgraph maintenance under streaming edge weight
updates for real-time story identification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "574--585",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent years have witnessed an unprecedented
proliferation of social media. People around the globe
author, every day, millions of blog posts, micro-blog
posts, social network status updates, etc. This rich
stream of information can be used to identify, on an
ongoing basis, emerging stories, and events that
capture popular attention. Stories can be identified
via groups of tightly-coupled real-world entities,
namely the people, locations, products, etc., that are
involved in the story. The sheer scale, and rapid
evolution of the data involved necessitate highly
efficient techniques for identifying important stories
at every point of time. The main challenge in real-time
story identification is the maintenance of dense
subgraphs (corresponding to groups of tightly-coupled
entities) under streaming edge weight updates
(resulting from a stream of user-generated content).
This is the first work to study the efficient
maintenance of dense subgraphs under such streaming
edge weight updates. For a wide range of definitions of
density, we derive theoretical results regarding the
magnitude of change that a single edge weight update
can cause. Based on these, we propose a novel
algorithm, DynDens, which outperforms adaptations of
existing techniques to this setting, and yields
meaningful results. Our approach is validated by a
thorough experimental evaluation on large-scale real
and synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elghandour:2012:RRR,
author = "Iman Elghandour and Ashraf Aboulnaga",
title = "{ReStore}: reusing results of {MapReduce} jobs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "6",
pages = "586--597",
month = feb,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 24 07:52:15 MDT 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analyzing large scale data has emerged as an important
activity for many organizations in the past few years.
This large scale data analysis is facilitated by the
MapReduce programming and execution model and its
implementations, most notably Hadoop. Users of
MapReduce often have analysis tasks that are too
complex to express as individual MapReduce jobs.
Instead, they use high-level query languages such as
Pig, Hive, or Jaql to express their complex tasks. The
compilers of these languages translate queries into
workflows of MapReduce jobs. Each job in these
workflows reads its input from the distributed file
system used by the MapReduce system and produces output
that is stored in this distributed file system and read
as input by the next job in the workflow. The current
practice is to delete these intermediate results from
the distributed file system at the end of executing the
workflow. One way to improve the performance of
workflows of MapReduce jobs is to keep these
intermediate results and reuse them for future
workflows submitted to the system. In this paper, we
present ReStore, a system that manages the storage and
reuse of such intermediate results. ReStore can reuse
the output of whole MapReduce jobs that are part of a
workflow, and it can also create additional reuse
opportunities by materializing and storing the output
of query execution operators that are executed within a
MapReduce job. We have implemented ReStore as an
extension to the Pig dataflow system on top of Hadoop,
and we experimentally demonstrate significant speedups
on queries from the PigMix benchmark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khoussainova:2012:PDM,
author = "Nodira Khoussainova and Magdalena Balazinska and Dan
Suciu",
title = "{PerfXplain}: debugging {MapReduce} job performance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "598--609",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While users today have access to many tools that
assist in performing large scale data analysis tasks,
understanding the performance characteristics of their
parallel computations, such as MapReduce jobs, remains
difficult. We present PerfXplain, a system that enables
users to ask questions about the relative performances
(i.e., runtimes) of pairs of MapReduce jobs. PerfXplain
provides a new query language for articulating
performance queries and an algorithm for generating
explanations from a log of past MapReduce job
executions. We formally define the notion of an
explanation together with three metrics, relevance,
precision, and generality, that measure explanation
quality. We present the explanation-generation
algorithm based on techniques related to decision-tree
building. We evaluate the approach on a log of past
executions on Amazon EC2, and show that our approach
can generate quality explanations, outperforming two
na{\"\i}ve explanation-generation methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gullo:2012:UCB,
author = "Francesco Gullo and Andrea Tagarelli",
title = "Uncertain centroid based partitional clustering of
uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "610--621",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Clustering uncertain data has emerged as a challenging
task in uncertain data management and mining. Thanks to
a computational complexity advantage over other
clustering paradigms, partitional clustering has been
particularly studied and a number of algorithms have
been developed. While existing proposals differ mainly
in the notions of cluster centroid and clustering
objective function, little attention has been given to
an analysis of their characteristics and limits. In
this work, we theoretically investigate major existing
methods of partitional clustering, and alternatively
propose a well-founded approach to clustering uncertain
data based on a novel notion of cluster centroid. A
cluster centroid is seen as an uncertain object defined
in terms of a random variable whose realizations are
derived based on all deterministic representations of
the objects to be clustered. As demonstrated
theoretically and experimentally, this allows for
better representing a cluster of uncertain objects,
thus supporting a consistently improved clustering
performance while maintaining comparable efficiency
with existing partitional clustering algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bahmani:2012:SM,
author = "Bahman Bahmani and Benjamin Moseley and Andrea Vattani
and Ravi Kumar and Sergei Vassilvitskii",
title = "Scalable $k$-means$ + + $",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "622--633",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over half a century old and showing no signs of aging,
$k$-means remains one of the most popular data
processing algorithms. As is well-known, a proper
initialization of $k$-means is crucial for obtaining a
good final solution. The recently proposed $k$-means++
initialization algorithm achieves this, obtaining an
initial set of centers that is provably close to the
optimum solution. A major downside of the $k$-means++
is its inherent sequential nature, which limits its
applicability to massive data: one must make $k$ passes
over the data to find a good initial set of centers. In
this work we show how to drastically reduce the number
of passes needed to obtain, in parallel, a good
initialization. This is unlike prevailing efforts on
parallelizing $k$-means that have mostly focused on the
post-initialization phases of $k$-means. We prove that
our proposed initialization algorithm $k$-means||
obtains a nearly optimal solution after a logarithmic
number of passes, and then show that in practice a
constant number of passes suffices. Experimental
evaluation on real-world large-scale data demonstrates
that $k$-means|| outperforms $k$-means++ in both
sequential and parallel settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2012:QSA,
author = "Michael Benedikt and Pierre Bourhis and Clemens Ley",
title = "Querying schemas with access restrictions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "634--645",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study verification of systems whose transitions
consist of accesses to a Web-based data-source. An
access is a lookup on a relation within a relational
database, fixing values for a set of positions in the
relation. For example, a transition can represent
access to a Web form, where the user is restricted to
filling in values for a particular set of fields. We
look at verifying properties of a schema describing the
possible accesses of such a system. We present a
language where one can describe the properties of an
access path, and also specify additional restrictions
on accesses that are enforced by the schema. Our main
property language, AccLTL, is based on a first-order
extension of linear-time temporal logic, interpreting
access paths as sequences of relational structures. We
also present a lower-level automaton model, A-automata,
which AccLTL specifications can compile into. We show
that AccLTL and A-automata can express static analysis
problems related to ``querying with limited access
patterns'' that have been studied in the database
literature in the past, such as whether an access is
relevant to answering a query, and whether two queries
are equivalent in the accessible data they can return.
We prove decidability and complexity results for
several restrictions and variants of AccLTL, and
explain which properties of paths can be expressed in
each restriction.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Graefe:2012:DDR,
author = "Goetz Graefe and Harumi Kuno",
title = "Definition, detection, and recovery of single-page
failures, a fourth class of database failures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "646--655",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The three traditional failure classes are system,
media, and transaction failures. Sometimes, however,
modern storage exhibits failures that differ from all
of those. In order to capture and describe such cases,
single-page failures are introduced as a fourth failure
class. This class encompasses all failures to read a
data page correctly and with plausible contents despite
all correction attempts in lower system levels.
Efficient recovery seems to require a new data
structure called the page recovery index. Its
transactional maintenance can be accomplished writing
the same number of log records as today's efficient
implementations of logging and recovery. Detection and
recovery of a single-page failure can be sufficiently
fast that the affected data access is merely delayed,
without the need to abort the transaction.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Graefe:2012:CCA,
author = "Goetz Graefe and Felix Halim and Stratos Idreos and
Harumi Kuno and Stefan Manegold",
title = "Concurrency control for adaptive indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "656--667",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Adaptive indexing initializes and optimizes indexes
incrementally, as a side effect of query processing.
The goal is to achieve the benefits of indexes while
hiding or minimizing the costs of index creation.
However, index-optimizing side effects seem to turn
read-only queries into update transactions that might,
for example, create lock contention. This paper studies
concurrency control in the context of adaptive
indexing. We show that the design and implementation of
adaptive indexing rigorously separates index structures
from index contents; this relaxes the constraints and
requirements during adaptive indexing compared to those
of traditional index updates. Our design adapts to the
fact that an adaptive index is refined continuously,
and exploits any concurrency opportunities in a dynamic
way. A detailed experimental analysis demonstrates that
(a) adaptive indexing maintains its adaptive properties
even when running concurrent queries, (b) adaptive
indexing can exploit the opportunity for parallelism
due to concurrent queries, (c) the number of
concurrency conflicts and any concurrency
administration overheads follow an adaptive behavior,
decreasing as the workload evolves and adapting to the
workload needs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2012:CSB,
author = "Qiang Zeng and Hai Zhuge",
title = "Comments on {``Stack-based Algorithms for Pattern
Matching on DAGs''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "668--679",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The paper ``Stack-based Algorithms for Pattern
Matching on DAGs'' generalizes the classical holistic
twig join algorithms and proposes PathStackD,
TwigStackD and DagStackD to respectively evaluate path,
twig and DAG pattern queries on directed acyclic
graphs. In this paper, we investigate the major results
of that paper, pointing out several discrepancies and
proposing solutions to resolving them. We show that the
original algorithms do not find particular types of
query solutions that are common in practice. We also
analyze the effect of an underlying assumption on the
correctness of the algorithms and discuss the
pre-filtering process that the original work proposes
to prune redundant nodes. Our experimental study on
both real and synthetic data substantiates our
conclusions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dalvi:2012:ASD,
author = "Nilesh Dalvi and Ashwin Machanavajjhala and Bo Pang",
 title = "An analysis of structured data on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "7",
pages = "680--691",
month = mar,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:09 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we analyze the nature and distribution
of structured data on the Web. Web-scale information
extraction, or the problem of creating structured
tables using extraction from the entire web, is
gathering lots of research interest. We perform a study
to understand and quantify the value of Web-scale
extraction, and how structured information is
distributed amongst top aggregator websites and tail
sites for various interesting domains. We believe this
is the first study of its kind, and gives us new
insights for information extraction over the Web.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mouratidis:2012:SPC,
author = "Kyriakos Mouratidis and Man Lung Yiu",
title = "Shortest path computation with no information
leakage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "692--703",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Shortest path computation is one of the most common
queries in location-based services (LBSs). Although
particularly useful, such queries raise serious privacy
concerns. Exposing to a (potentially untrusted) LBS the
client's position and her destination may reveal
personal information, such as social habits, health
condition, shopping preferences, lifestyle choices,
etc. The only existing method for privacy-preserving
shortest path computation follows the obfuscation
paradigm; it prevents the LBS from inferring the source
and destination of the query with a probability higher
than a threshold. This implies, however, that the LBS
still deduces some information (albeit not exact) about
the client's location and her destination. In this
paper we aim at strong privacy, where the adversary
learns nothing about the shortest path query. We
achieve this via established private information
retrieval techniques, which we treat as black-box
building blocks. Experiments on real, large-scale road
networks assess the practicality of our schemes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Metwally:2012:VSJ,
author = "Ahmed Metwally and Christos Faloutsos",
 title = "{V-SMART-Join}: a scalable {MapReduce} framework for
all-pair similarity joins of multisets and vectors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "704--715",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This work proposes V-SMART-Join, a scalable
MapReduce-based framework for discovering all pairs of
similar entities. The V-SMART-Join framework is
applicable to sets, multisets, and vectors.
V-SMART-Join is motivated by the observed skew in the
underlying distributions of Internet traffic, and is a
family of 2-stage algorithms, where the first stage
computes and joins the partial results, and the second
stage computes the similarity exactly for all candidate
pairs. The V-SMART-Join algorithms are very efficient
and scalable in the number of entities, as well as
their cardinalities. They were up to 30 times faster
than the state of the art algorithm, VCL, when compared
on a real dataset of a small size. We also established
the scalability of the proposed algorithms by running
them on a dataset of a realistic size, on which VCL
never succeeded to finish. Experiments were run using
real datasets of IPs and cookies, where each IP is
represented as a multiset of cookies, and the goal is
to discover similar IPs to identify Internet proxies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Low:2012:DGF,
author = "Yucheng Low and Danny Bickson and Joseph Gonzalez and
Carlos Guestrin and Aapo Kyrola and Joseph M.
Hellerstein",
title = "{Distributed GraphLab}: a framework for machine
learning and data mining in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "716--727",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While high-level data parallel frameworks, like
MapReduce, simplify the design and implementation of
large-scale data processing systems, they do not
naturally or efficiently support many important data
mining and machine learning algorithms and can lead to
inefficient learning systems. To help fill this
critical void, we introduced the GraphLab abstraction
which naturally expresses asynchronous, dynamic,
graph-parallel computation while ensuring data
consistency and achieving a high degree of parallel
performance in the shared-memory setting. In this
paper, we extend the GraphLab framework to the
substantially more challenging distributed setting
while preserving strong data consistency guarantees. We
develop graph based extensions to pipelined locking and
data versioning to reduce network congestion and
mitigate the effect of network latency. We also
introduce fault tolerance to the GraphLab abstraction
using the classic Chandy-Lamport snapshot algorithm and
demonstrate how it can be easily implemented by
exploiting the GraphLab abstraction itself. Finally, we
evaluate our distributed implementation of the GraphLab
abstraction on a large Amazon EC2 deployment and show
1-2 orders of magnitude performance gains over
Hadoop-based implementations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2012:ALO,
author = "Qiang Zeng and Xiaorui Jiang and Hai Zhuge",
title = "Adding logical operators to tree pattern queries on
graph-structured data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "728--739",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data are increasingly modeled as graphs for
expressing complex relationships, the tree pattern
query on graph-structured data becomes an important
type of queries in real-world applications. Most
practical query languages, such as XQuery and SPARQL,
support logical expressions using logical-AND/OR/NOT
operators to define structural constraints of tree
patterns. In this paper, (1) we propose generalized
tree pattern queries (GTPQs) over graph-structured
data, which fully support propositional logic of
structural constraints. (2) We make a thorough study of
fundamental problems including satisfiability,
containment and minimization, and analyze the
computational complexity and the decision procedures of
these problems. (3) We propose a compact graph
representation of intermediate results and a pruning
approach to reduce the size of intermediate results and
the number of join operations --- two factors that
often impair the efficiency of traditional algorithms
for evaluating tree pattern queries. (4) We present an
efficient algorithm for evaluating GTPQs using 3-hop as
the underlying reachability index. (5) Experiments on
both real-life and synthetic data sets demonstrate the
effectiveness and efficiency of our algorithm, from
several times to orders of magnitude faster than
state-of-the-art algorithms in terms of evaluation
time, even for traditional tree pattern queries with
only conjunctive operations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Singh:2012:LSS,
author = "Rishabh Singh and Sumit Gulwani",
title = "Learning semantic string transformations from
examples",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "740--751",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We address the problem of performing semantic
transformations on strings, which may represent a
variety of data types (or their combination) such as a
column in a relational table, time, date, currency,
etc. Unlike syntactic transformations, which are based
on regular expressions and which interpret a string as
a sequence of characters, semantic transformations
additionally require exploiting the semantics of the
data type represented by the string, which may be
encoded as a database of relational tables. Manually
performing such transformations on a large collection
of strings is error prone and cumbersome, while
programmatic solutions are beyond the skill-set of
end-users. We present a programming by example
technology that allows end-users to automate such
repetitive tasks. We describe an expressive
transformation language for semantic manipulation that
combines table lookup operations and syntactic
manipulations. We then present a synthesis algorithm
that can learn all transformations in the language that
are consistent with the user-provided set of
input-output examples. We have implemented this
technology as an add-in for the Microsoft Excel
Spreadsheet system and have evaluated it successfully
over several benchmarks picked from various Excel
help-forums.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2012:CDD,
author = "Changbin Liu and Lu Ren and Boon Thau Loo and Yun Mao
and Prithwish Basu",
title = "{Cologne}: a declarative distributed constraint
optimization platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "752--763",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents Cologne, a declarative
optimization platform that enables constraint
optimization problems (COPs) to be declaratively
specified and incrementally executed in distributed
systems. Cologne integrates a declarative networking
engine with an off-the-shelf constraint solver. We have
developed the Colog language that combines distributed
Datalog used in declarative networking with language
constructs for specifying goals and constraints used in
COPs. Cologne uses novel query processing strategies
for processing Colog programs, by combining the use of
bottom-up distributed Datalog evaluation with top-down
goal-oriented constraint solving. Using case studies
based on cloud and wireless network optimizations, we
demonstrate that Cologne (1) can flexibly support a
wide range of policy-based optimizations in distributed
systems, (2) results in orders of magnitude less code
compared to imperative implementations, and (3) is
highly efficient with low overhead and fast convergence
times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2012:OBA,
author = "Yi Zhang and Jun Yang",
title = "Optimizing {I/O} for big array analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "764--775",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big array analytics is becoming indispensable in
answering important scientific and business questions.
Most analysis tasks consist of multiple steps, each
making one or multiple passes over the arrays to be
analyzed and generating intermediate results. In the
big data setting, I/O optimization is a key to
efficient analytics. In this paper, we develop a
framework and techniques for capturing a broad range of
analysis tasks expressible in nested-loop forms,
representing them in a declarative way, and optimizing
their I/O by identifying sharing opportunities.
Experiment results show that our optimizer is capable
of finding execution plans that exploit nontrivial I/O
sharing opportunities with significant savings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bailis:2012:PBS,
author = "Peter Bailis and Shivaram Venkataraman and Michael J.
Franklin and Joseph M. Hellerstein and Ion Stoica",
title = "Probabilistically bounded staleness for practical
partial quorums",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "8",
pages = "776--787",
month = apr,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:10 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data store replication results in a fundamental
trade-off between operation latency and data
consistency. In this paper, we examine this trade-off
in the context of quorum-replicated data stores. Under
partial, or non-strict quorum replication, a data store
waits for responses from a subset of replicas before
answering a query, without guaranteeing that read and
write replica sets intersect. As deployed in practice,
these configurations provide only basic eventual
consistency guarantees, with no limit to the recency of
data returned. However, anecdotally, partial quorums
are often ``good enough'' for practitioners given their
latency benefits. In this work, we explain why partial
quorums are regularly acceptable in practice, analyzing
both the staleness of data they return and the latency
benefits they offer. We introduce Probabilistically
Bounded Staleness (PBS) consistency, which provides
expected bounds on staleness with respect to both
versions and wall clock time. We derive a closed-form
solution for versioned staleness as well as model
real-time staleness for representative Dynamo-style
systems under internet-scale production workloads.
Using PBS, we measure the latency-consistency trade-off
for partial quorum systems. We quantitatively
demonstrate how eventually consistent systems
frequently return consistent data within tens of
milliseconds while offering significant latency
benefits.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2012:ESM,
author = "Zhao Sun and Hongzhi Wang and Haixun Wang and Bin Shao
and Jianzhong Li",
title = "Efficient subgraph matching on billion node graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "788--799",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to handle large scale graph data is
crucial to an increasing number of applications. Much
work has been dedicated to supporting basic graph
operations such as subgraph matching, reachability,
regular expression matching, etc. In many cases, graph
indices are employed to speed up query processing.
Typically, most indices require either super-linear
indexing time or super-linear indexing space.
Unfortunately, for very large graphs, super-linear
approaches are almost always infeasible. In this paper,
we study the problem of subgraph matching on
billion-node graphs. We present a novel algorithm that
supports efficient subgraph matching for graphs
deployed on a distributed memory store. Instead of
relying on super-linear indices, we use efficient graph
exploration and massive parallel computing for query
processing. Our experimental results demonstrate the
feasibility of performing subgraph matching on
web-scale graph data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2012:ESS,
author = "Ye Yuan and Guoren Wang and Lei Chen and Haixun Wang",
title = "Efficient subgraph similarity search on large
probabilistic graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "800--811",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many studies have been conducted on seeking the
efficient solution for subgraph similarity search over
certain (deterministic) graphs due to its wide
application in many fields, including bioinformatics,
social network analysis, and Resource Description
Framework (RDF) data management. All these works assume
that the underlying data are certain. However, in
reality, graphs are often noisy and uncertain due to
various factors, such as errors in data extraction,
inconsistencies in data integration, and privacy
preserving purposes. Therefore, in this paper, we study
subgraph similarity search on large probabilistic graph
databases. Different from previous works assuming that
edges in an uncertain graph are independent of each
other, we study the uncertain graphs where edges'
occurrences are correlated. We formally prove that
subgraph similarity search over probabilistic graphs is
\#P-complete, thus, we employ a filter-and-verify
framework to speed up the search. In the filtering
phase, we develop tight lower and upper bounds of
subgraph similarity probability based on a
probabilistic matrix index, PMI. PMI is composed of
discriminative subgraph features associated with tight
lower and upper bounds of subgraph isomorphism
probability. Based on PMI, we can sort out a large
number of probabilistic graphs and maximize the pruning
capability. During the verification phase, we develop
an efficient sampling algorithm to validate the
remaining candidates. The efficiency of our proposed
solutions has been verified through extensive
experiments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2012:TDM,
author = "Jia Wang and James Cheng",
title = "Truss decomposition in massive networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "812--823",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The $k$-truss is a type of cohesive subgraphs proposed
recently for the study of networks. While the problem
of computing most cohesive subgraphs is NP-hard, there
exists a polynomial time algorithm for computing
$k$-truss. Compared with $k$-core which is also
efficient to compute, $k$-truss represents the ``core''
of a $k$-core that keeps the key information of, while
filtering out less important information from, the
$k$-core. However, existing algorithms for computing
$k$-truss are inefficient for handling today's massive
networks. We first improve the existing in-memory
algorithm for computing $k$-truss in networks of
moderate size. Then, we propose two I/O-efficient
algorithms to handle massive networks that cannot fit
in main memory. Our experiments on real datasets verify
the efficiency of our algorithms and the value of
$k$-truss.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Fan:2012:SST): PVLDB 5(9):824--835, May 2012.
%%% DOI field absent (other entries in this file carry one) --- TODO:
%%% confirm and add. Also verify the system name casing in the title
%%% ({Seal} vs. {SEAL}) against the published paper before changing it.
@Article{Fan:2012:SST,
author = "Ju Fan and Guoliang Li and Lizhu Zhou and Shanshan
Chen and Jun Hu",
title = "{Seal}: spatio-textual similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "824--835",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Location-based services (LBS) have become more and
more ubiquitous recently. Existing methods focus on
finding relevant points-of-interest (POIs) based on
users' locations and query keywords. Nowadays, modern
LBS applications generate a new kind of spatio-textual
data, regions-of-interest (ROIs), containing
region-based spatial information and textual
description, e.g., mobile user profiles with active
regions and interest tags. To satisfy search
requirements on ROIs, we study a new research problem,
called spatio-textual similarity search: Given a set of
ROIs and a query ROI, we find the similar ROIs by
considering spatial overlap and textual similarity.
Spatio-textual similarity search has many important
applications, e.g., social marketing in location-aware
social networks. It calls for an efficient search
method to support large scales of spatio-textual data
in LBS systems. To this end, we introduce a
filter-and-verification framework to compute the
answers. In the filter step, we generate signatures for
the ROIs and the query, and utilize the signatures to
generate candidates whose signatures are similar to
that of the query. In the verification step, we verify
the candidates and identify the final answers. To
achieve high performance, we generate effective
high-quality signatures, and devise efficient filtering
algorithms as well as pruning techniques. Experimental
results on real and synthetic datasets show that our
method achieves high performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Lappas:2012:SBT): PVLDB 5(9):836--847, May 2012.
%%% No DOI recorded, unlike earlier entries in this file --- TODO:
%%% confirm the DOI and add it.
@Article{Lappas:2012:SBT,
author = "Theodoros Lappas and Marcos R. Vieira and Dimitrios
Gunopulos and Vassilis J. Tsotras",
title = "On the spatiotemporal burstiness of terms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "836--847",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Thousands of documents are made available to the users
via the web on a daily basis. One of the most
extensively studied problems in the context of such
document streams is burst identification. Given a term
t, a burst is generally exhibited when an unusually
high frequency is observed for t. While spatial and
temporal burstiness have been studied individually in
the past, our work is the first to simultaneously track
and measure spatiotemporal term burstiness. In
addition, we use the mined burstiness information
toward an efficient document-search engine: given a
user's query of terms, our engine returns a ranked list
of documents discussing influential events with a
strong spatiotemporal impact. We demonstrate the
efficiency of our methods with an extensive
experimental evaluation on real and synthetic
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Shirani-Mehr:2012:ERQ): PVLDB 5(9):848--859, May 2012.
%%% DOI field absent here although other entries in this file record
%%% one --- TODO: confirm and add.
@Article{Shirani-Mehr:2012:ERQ,
author = "Houtan Shirani-Mehr and Farnoush Banaei-Kashani and
Cyrus Shahabi",
title = "Efficient reachability query evaluation in large
spatiotemporal contact datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "848--859",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the advent of reliable positioning technologies
and prevalence of location-based services, it is now
feasible to accurately study the propagation of items
such as infectious viruses, sensitive information
pieces, and malwares through a population of moving
objects, e.g., individuals, mobile devices, and
vehicles. In such application scenarios, an item passes
between two objects when the objects are sufficiently
close (i.e., when they are, so-called, in contact), and
hence once an item is initiated, it can penetrate the
object population through the evolving network of
contacts among objects, termed contact network. In this
paper, for the first time we define and study
reachability queries in large (i.e., disk-resident)
contact datasets which record the movement of a
(potentially large) set of objects moving in a spatial
environment over an extended time period. A
reachability query verifies whether two objects are
``reachable'' through the evolving contact network
represented by such contact datasets. We propose two
contact-dataset indexes that enable efficient
evaluation of such queries despite the potentially
humongous size of the contact datasets. With the first
index, termed ReachGrid, at the query time only a small
necessary portion of the contact network which is
required for reachability evaluation is constructed and
traversed. With the second approach, termed ReachGraph,
we precompute reachability at different scales and
leverage these precalculations at the query time for
efficient query processing. We optimize the placement
of both indexes on disk to enable efficient index
traversal during query processing. We study the pros
and cons of our proposed approaches by performing
extensive experiments with both real and synthetic
data. Based on our experimental results, our proposed
approaches outperform existing reachability query
processing techniques in contact networks by 76\% on
average.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Nguyen:2012:BMO): PVLDB 5(9):860--871, May 2012.
%%% No DOI field, unlike earlier entries in this file --- TODO: confirm
%%% the DOI and add it.
@Article{Nguyen:2012:BMO,
author = "Thi Nguyen and Zhen He and Rui Zhang and Phillip
Ward",
title = "Boosting moving object indexing through velocity
partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "860--871",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There have been intense research interests in moving
object indexing in the past decade. However, existing
work did not exploit the important property of skewed
velocity distributions. In many real world scenarios,
objects travel predominantly along only a few
directions. Examples include vehicles on road networks,
flights, people walking on the streets, etc. The search
space for a query is heavily dependent on the velocity
distribution of the objects grouped in the nodes of an
index tree. Motivated by this observation, we propose
the velocity partitioning (VP) technique, which
exploits the skew in velocity distribution to speed up
query processing using moving object indexes. The VP
technique first identifies the ``dominant velocity axes
(DVAs)'' using a combination of principal components
analysis (PCA) and $k$-means clustering. Then, a moving
object index (e.g., a TPR-tree) is created based on
each DVA, using the DVA as an axis of the underlying
coordinate system. An object is maintained in the index
whose DVA is closest to the object's current moving
direction. Thus, all the objects in an index are moving
in a near 1-dimensional space instead of a
2-dimensional space. As a result, the expansion of the
search space with time is greatly reduced, from a
quadratic function of the maximum speed (of the objects
in the search range) to a near linear function of the
maximum speed. The VP technique can be applied to a
wide range of moving object index structures. We have
implemented the VP technique on two representative
ones, the TPR*-tree and the B$^x$-tree. Extensive
experiments validate that the VP technique consistently
improves the performance of those index structures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Bidoit-Tollu:2012:TBD): PVLDB 5(9):872--883, May 2012.
%%% DOI field absent (earlier entries in this file carry one) --- TODO:
%%% confirm and add.
@Article{Bidoit-Tollu:2012:TBD,
author = "Nicole Bidoit-Tollu and Dario Colazzo and Federico
Ulliana",
title = "Type-based detection of {XML} query-update
independence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "872--883",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a novel static analysis technique
to detect XML query-update independence, in the
presence of a schema. Rather than types, our system
infers chains of types. Each chain represents a path
that can be traversed on a valid document during
query/update evaluation. The resulting independence
analysis is precise, although it raises a challenging
issue: recursive schemas may lead to inference of
infinitely many chains. A sound and complete
approximation technique ensuring a finite analysis in
any case is presented, together with an efficient
implementation performing the chain-based analysis in
polynomial space and time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Sowell:2012:MSD): PVLDB 5(9):884--895, May 2012.
%%% No DOI recorded, unlike earlier entries in this file --- TODO:
%%% confirm the DOI and add it.
@Article{Sowell:2012:MSD,
author = "Benjamin Sowell and Wojciech Golab and Mehul A. Shah",
title = "{Minuet}: a scalable distributed multiversion
{B}-tree",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "884--895",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data management systems have traditionally been
designed to support either long-running analytics
queries or short-lived transactions, but an increasing
number of applications need both. For example, online
games, socio-mobile apps, and e-commerce sites need to
not only maintain operational state, but also analyze
that data quickly to make predictions and
recommendations that improve user experience. In this
paper, we present Minuet, a distributed, main-memory
B-tree that supports both transactions and
copy-on-write snapshots for in-situ analytics. Minuet
uses main-memory storage to enable low-latency
transactional operations as well as analytics queries
without compromising transaction performance. In
addition to supporting read-only analytics queries on
snapshots, Minuet supports writable clones, so that
users can create branching versions of the data. This
feature can be quite useful, e.g. to support complex
``what-if'' analysis or to facilitate wide-area
replication. Our experiments show that Minuet
outperforms a commercial main-memory database in many
ways. It scales to hundreds of cores and TBs of memory,
and can process hundreds of thousands of B-tree
operations per second while executing long-running
scans.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Yin:2012:CLT): PVLDB 5(9):896--907, May 2012.
%%% DOI field absent (other entries in this file record one) --- TODO:
%%% confirm and add. Abstract text is quoted as published; do not
%%% copy-edit it.
@Article{Yin:2012:CLT,
author = "Hongzhi Yin and Bin Cui and Jing Li and Junjie Yao and
Chen Chen",
title = "Challenging the long tail recommendation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "9",
pages = "896--907",
month = may,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:11 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The success of ``infinite-inventory'' retailers such
as Amazon.com and Netflix has been largely attributed
to a ``long tail'' phenomenon. Although the majority of
their inventory is not in high demand, these niche
products, unavailable at limited-inventory competitors,
generate a significant fraction of total revenue in
aggregate. In addition, tail product availability can
boost head sales by offering consumers the convenience
of ``one-stop shopping'' for both their mainstream and
niche tastes. However, most of existing recommender
systems, especially collaborative filter based methods,
can not recommend tail products due to the data
sparsity issue. It has been widely acknowledged that to
recommend popular products is easier yet more trivial
while to recommend long tail products adds more novelty
yet it is also a more challenging task. In this paper,
we propose a novel suite of graph-based algorithms for
the long tail recommendation. We first represent
user-item information with undirected edge-weighted
graph and investigate the theoretical foundation of
applying Hitting Time algorithm for long tail item
recommendation. To improve recommendation diversity and
accuracy, we extend Hitting Time and propose efficient
Absorbing Time algorithm to help users find their
favorite long tail items. Finally, we refine the
Absorbing Time algorithm and propose two entropy-biased
Absorbing Cost algorithms to distinguish the variation
on different user-item rating pairs, which further
enhances the effectiveness of long tail recommendation.
Empirical experiments on two real life datasets show
that our proposed algorithms are effective to recommend
long tail items and outperform state-of-the-art
recommendation techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Pimplikar:2012:ATQ): first entry of PVLDB 5(10),
%%% June 2012, pages 908--919. No DOI field, unlike earlier entries in
%%% this file --- TODO: confirm the DOI and add it.
@Article{Pimplikar:2012:ATQ,
author = "Rakesh Pimplikar and Sunita Sarawagi",
title = "Answering table queries on the {Web} using column
keywords",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "908--919",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present the design of a structured search engine
which returns a multi-column table in response to a
query consisting of keywords describing each of its
columns. We answer such queries by exploiting the
millions of tables on the Web because these are much
richer sources of structured knowledge than free-format
text. However, a corpus of tables harvested from
arbitrary HTML web pages presents huge challenges of
diversity and redundancy not seen in centrally edited
knowledge bases. We concentrate on one concrete task in
this paper. Given a set of Web tables T$_1$,\ldots{},
T$_n$, and a query Q with q sets of keywords
Q$_1$,\ldots{}, Q$_q$, decide for each T$_i$ if it is
relevant to Q and if so, identify the mapping between
the columns of T$_i$ and query columns. We represent
this task as a graphical model that jointly maps all
tables by incorporating diverse sources of clues
spanning matches in different parts of the table,
corpus-wide co-occurrence statistics, and content
overlap across table columns. We define a novel query
segmentation model for matching keywords to table
columns, and a robust mechanism of exploiting content
overlap across table columns. We design efficient
inference algorithms based on bipartite matching and
constrained graph cuts to solve the joint labeling
task. Experiments on a workload of 59 queries over a 25
million web table corpus shows significant boost in
accuracy over baseline IR methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Goodrich:2012:EVW): PVLDB 5(10):920--931, June 2012.
%%% DOI field absent (earlier entries in this file carry one) --- TODO:
%%% confirm and add.
@Article{Goodrich:2012:EVW,
author = "Michael T. Goodrich and Charalampos Papamanthou and
Duy Nguyen and Roberto Tamassia and Cristina Videira
Lopes and Olga Ohrimenko and Nikos Triandopoulos",
title = "Efficient verification of web-content searching
through authenticated web crawlers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "920--931",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider the problem of verifying the correctness
and completeness of the result of a keyword search. We
introduce the concept of an authenticated web crawler
and present its design and prototype implementation. An
authenticated web crawler is a trusted program that
computes a specially-crafted signature over the web
contents it visits. This signature enables (i) the
verification of common Internet queries on web pages,
such as conjunctive keyword searches---this guarantees
that the output of a conjunctive keyword search is
correct and complete; (ii) the verification of the
content returned by such Internet queries---this
guarantees that web data is authentic and has not been
maliciously altered since the computation of the
signature by the crawler. In our solution, the search
engine returns a cryptographic proof of the query
result. Both the proof size and the verification time
are proportional only to the sizes of the query
description and the query result, but do not depend on
the number or sizes of the web pages over which the
search is performed. As we experimentally demonstrate,
the prototype implementation of our system provides a
low communication overhead between the search engine
and the user, and fast verification of the returned
results by the user.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Blunschi:2012:SGS): PVLDB 5(10):932--943, June 2012.
%%% No DOI recorded, unlike earlier entries in this file --- TODO:
%%% confirm the DOI and add it.
@Article{Blunschi:2012:SGS,
author = "Lukas Blunschi and Claudio Jossen and Donald Kossmann
and Magdalini Mori and Kurt Stockinger",
title = "{SODA}: generating {SQL} for business users",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "932--943",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The purpose of data warehouses is to enable business
analysts to make better decisions. Over the years the
technology has matured and data warehouses have become
extremely successful. As a consequence, more and more
data has been added to the data warehouses and their
schemas have become increasingly complex. These systems
still work great in order to generate pre-canned
reports. However, with their current complexity, they
tend to be a poor match for non tech-savvy business
analysts who need answers to ad-hoc queries that were
not anticipated. This paper describes the design,
implementation, and experience of the SODA system
(Search over DAta Warehouse). SODA bridges the gap
between the business needs of analysts and the
technical complexity of current data warehouses. SODA
enables a Google-like search experience for data
warehouses by taking keyword queries of business users
and automatically generating executable SQL. The key
idea is to use a graph pattern matching algorithm that
uses the metadata model of the data warehouse. Our
results with real data from a global player in the
financial services industry show that SODA produces
queries with high precision and recall, and makes it
much easier for business users to interactively explore
highly-complex data warehouses.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Terrovitis:2012:PPD): PVLDB 5(10):944--955, June 2012.
%%% DOI field absent (earlier entries in this file carry one) --- TODO:
%%% confirm and add.
@Article{Terrovitis:2012:PPD,
author = "Manolis Terrovitis and Nikos Mamoulis and John
Liagouris and Spiros Skiadopoulos",
title = "Privacy preservation by disassociation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "944--955",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this work, we focus on protection against identity
disclosure in the publication of sparse
multidimensional data. Existing multidimensional
anonymization techniques (a) protect the privacy of
users either by altering the set of quasi-identifiers
of the original data (e.g., by generalization or
suppression) or by adding noise (e.g., using
differential privacy) and/or (b) assume a clear
distinction between sensitive and non-sensitive
information and sever the possible linkage. In many
real world applications the above techniques are not
applicable. For instance, consider web search query
logs. Suppressing or generalizing anonymization methods
would remove the most valuable information in the
dataset: the original query terms. Additionally, web
search query logs contain millions of query terms which
cannot be categorized as sensitive or non-sensitive
since a term may be sensitive for a user and
non-sensitive for another. Motivated by this
observation, we propose an anonymization technique
termed disassociation that preserves the original terms
but hides the fact that two or more different terms
appear in the same record. We protect the users'
privacy by disassociating record terms that participate
in identifying combinations. This way the adversary
cannot associate with high probability a record with a
rare combination of terms. To the best of our
knowledge, our proposal is the first to employ such a
technique to provide protection against identity
disclosure. We propose an anonymization algorithm based
on our approach and evaluate its performance on real
and synthetic datasets, comparing it against other
state-of-the-art methods based on generalization and
differential privacy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Kanagal:2012:SRS): PVLDB 5(10):956--967, June 2012.
%%% No DOI field, unlike earlier entries in this file --- TODO: confirm
%%% the DOI and add it.
@Article{Kanagal:2012:SRS,
author = "Bhargav Kanagal and Amr Ahmed and Sandeep Pandey and
Vanja Josifovski and Jeff Yuan and Lluis Garcia-Pueyo",
title = "Supercharging recommender systems using taxonomies for
learning user purchase behavior",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "956--967",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recommender systems based on latent factor models have
been effectively used for understanding user interests
and predicting future actions. Such models work by
projecting the users and items into a smaller
dimensional space, thereby clustering similar users and
items together and subsequently compute similarity
between unknown user-item pairs. When user-item
interactions are sparse (sparsity problem) or when new
items continuously appear (cold start problem), these
models perform poorly. In this paper, we exploit the
combination of taxonomies and latent factor models to
mitigate these issues and improve recommendation
accuracy. We observe that taxonomies provide structure
similar to that of a latent factor model: namely, it
imposes human-labeled categories (clusters) over items.
This leads to our proposed taxonomy-aware latent factor
model (TF) which combines taxonomies and latent factors
using additive models. We develop efficient algorithms
to train the TF models, which scales to large number of
users/items and develop scalable
inference/recommendation algorithms by exploiting the
structure of the taxonomy. In addition, we extend the
TF model to account for the temporal dynamics of user
interests using high-order Markov chains. To deal with
large-scale data, we develop a parallel multi-core
implementation of our TF model. We empirically evaluate
the TF model for the task of predicting user purchases
using a real-world shopping dataset spanning more than
a million users and products. Our experiments
demonstrate the benefits of using our TF models over
existing approaches, in terms of both prediction
accuracy and running time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Ahmad:2012:DHO): PVLDB 5(10):968--979, June 2012.
%%% DOI field absent (earlier entries in this file record one) ---
%%% TODO: confirm and add.
@Article{Ahmad:2012:DHO,
author = "Yanif Ahmad and Oliver Kennedy and Christoph Koch and
Milos Nikolic",
title = "{DBToaster}: higher-order delta processing for
dynamic, frequently fresh views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "968--979",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Applications ranging from algorithmic trading to
scientific data analysis require realtime analytics
based on views over databases that change at very high
rates. Such views have to be kept fresh at low
maintenance cost and latencies. At the same time, these
views have to support classical SQL, rather than window
semantics, to enable applications that combine current
with aged or historical data. In this paper, we present
viewlet transforms, a recursive finite differencing
technique applied to queries. The viewlet transform
materializes a query and a set of its higher-order
deltas as views. These views support each other's
incremental maintenance, leading to a reduced overall
view maintenance cost. The viewlet transform of a query
admits efficient evaluation, the elimination of certain
expensive query operations, and aggressive
parallelization. We develop viewlet transforms into a
workable query execution technique, present a heuristic
and cost-based optimization framework, and report on
experiments with a prototype dynamic data management
system that combines viewlet transforms with an
optimizing compilation technique. The system supports
tens of thousands of complete view refreshes a second
for a wide range of queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Agarwal:2012:RTD): PVLDB 5(10):980--991, June 2012.
%%% No DOI recorded, unlike earlier entries in this file --- TODO:
%%% confirm the DOI and add it.
@Article{Agarwal:2012:RTD,
author = "Manoj K. Agarwal and Krithi Ramamritham and Manish
Bhide",
title = "Real time discovery of dense clusters in highly
dynamic graphs: identifying real world events in highly
dynamic environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "980--991",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Due to their real time nature, microblog streams are a
rich source of dynamic information, for example, about
emerging events. Existing techniques for discovering
such events from a microblog stream in real time (such
as Twitter trending topics), have several lacunae when
used for discovering emerging events; extant graph
based event detection techniques are not practical in
microblog settings due to their complexity; and
conventional techniques, which have been developed for
blogs, web-pages, etc., involving the use of keyword
search, are only useful for finding information about
known events. Hence, in this paper, we present
techniques to discover events that are unraveling in
microblog message streams in real time so that such
events can be reported as soon as they occur. We model
the problem as discovering dense clusters in highly
dynamic graphs. Despite many recent advances in graph
analysis, ours is the first technique to identify dense
clusters in massive and highly dynamic graphs in real
time. Given the characteristics of microblog streams,
in order to find clusters without missing any events,
we propose and exploit a novel graph property which we
call short-cycle property. Our algorithms find these
clusters efficiently in spite of rapid changes to the
microblog streams. Further we present a novel ranking
function to identify the important events. Besides
proving the correctness of our algorithms we show their
practical utility by evaluating them using real world
microblog data. These demonstrate our technique's
ability to discover, with high precision and recall,
emerging events in high intensity data streams in real
time. Many recent web applications create data which
can be represented as massive dynamic graphs. Our
technique can be easily extended to discover, in real
time, interesting patterns in such graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Review note (Papapetrou:2012:SBQ): PVLDB 5(10):992--1003, June
%%% 2012. DOI field absent (earlier entries in this file carry one) ---
%%% TODO: confirm and add.
@Article{Papapetrou:2012:SBQ,
author = "Odysseas Papapetrou and Minos Garofalakis and Antonios
Deligiannakis",
title = "Sketch-based querying of distributed sliding-window
data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "992--1003",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While traditional data-management systems focus on
evaluating single, ad-hoc queries over static data sets
in a centralized setting, several emerging applications
require (possibly, continuous) answers to queries on
dynamic data that is widely distributed and constantly
updated. Furthermore, such query answers often need to
discount data that is ``stale'', and operate solely on
a sliding window of recent data arrivals (e.g., data
updates occurring over the last 24 hours). Such
distributed data streaming applications mandate novel
algorithmic solutions that are both time- and
space-efficient (to manage high-speed data streams),
and also communication-efficient (to deal with physical
data distribution). In this paper, we consider the
problem of complex query answering over distributed,
high-dimensional data streams in the sliding-window
model. We introduce a novel sketching technique (termed
ECM-sketch) that allows effective summarization of
streaming data over both time-based and count-based
sliding windows with probabilistic accuracy guarantees.
Our sketch structure enables point as well as
inner-product queries, and can be employed to address a
broad range of problems, such as maintaining frequency
statistics, finding heavy hitters, and computing
quantiles in the sliding-window model. Focusing on
distributed environments, we demonstrate how
ECM-sketches of individual, local streams can be
composed to generate a (low-error) ECM-sketch summary
of the order-preserving aggregation of all streams;
furthermore, we show how ECM-sketches can be exploited
for continuous monitoring of sliding-window queries
over distributed streams. Our extensive experimental
study with two real-life data sets validates our
theoretical claims and verifies the effectiveness of
our techniques. To the best of our knowledge, ours is
the first work to address efficient, guaranteed-error
complex query answering over distributed data streams
in the sliding-window model.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vo:2012:LSL,
  author = {Hoang Tam Vo and Sheng Wang and Divyakant Agrawal and
    Gang Chen and Beng Chin Ooi},
  title = {{LogBase}: a scalable log-structured database system
    in the cloud},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1004--1015},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Numerous applications such as financial transactions
    (e.g., stock trading) are write-heavy in nature. The
    shift from reads to writes in web applications has also
    been accelerating in recent years. Write-ahead-logging
    is a common approach for providing recovery capability
    while improving performance in most storage systems.
    However, the separation of log and application data
    incurs write overheads observed in write-heavy
    environments and hence adversely affects the write
    throughput and recovery time in the system. In this
    paper, we introduce LogBase --- a scalable
    log-structured database system that adopts log-only
    storage for removing the write bottleneck and
    supporting fast system recovery. It is designed to be
    dynamically deployed on commodity clusters to take
    advantage of elastic scaling property of cloud
    environments. LogBase provides in-memory multiversion
    indexes for supporting efficient access to data
    maintained in the log. LogBase also supports
    transactions that bundle read and write operations
    spanning across multiple records. We implemented the
    proposed system and compared it with HBase and a
    disk-based log-structured record-oriented system
    modeled after RAMCloud. The experimental results show
    that LogBase is able to provide sustained write
    throughput, efficient data access out of the cache, and
    effective system recovery.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Lu:2012:EPN,
author = "Wei Lu and Yanyan Shen and Su Chen and Beng Chin Ooi",
title = "Efficient processing of $k$ nearest neighbor joins
using {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "10",
pages = "1016--1027",
month = jun,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:13 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "$k$ nearest neighbor join ($k$ NN join), designed to
find $k$ nearest neighbors from a dataset S for every
object in another dataset R, is a primitive operation
widely adopted by many data mining applications. As a
combination of the $k$ nearest neighbor query and the
join operation, $k$ NN join is an expensive operation.
Given the increasing volume of data, it is difficult to
perform a $k$ NN join on a centralized machine
efficiently. In this paper, we investigate how to
perform $k$ NN join using MapReduce which is a
well-accepted framework for data-intensive applications
over clusters of computers. In brief, the mappers
cluster objects into groups; the reducers perform the
$k$ NN join on each group of objects separately. We
design an effective mapping mechanism that exploits
pruning rules for distance filtering, and hence reduces
both the shuffling and computational costs. To reduce
the shuffling cost, we propose two approximate
algorithms to minimize the number of replicas.
Extensive experiments on our in-house cluster
demonstrate that our proposed methods are efficient,
robust and scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Laptev:2012:EAR,
  author = {Nikolay Laptev and Kai Zeng and Carlo Zaniolo},
  title = {Early accurate results for advanced analytics on
    {MapReduce}},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1028--1039},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Approximate results based on samples often provide the
    only way in which advanced analytical applications on
    very massive data sets can satisfy their time and
    resource constraints. Unfortunately, methods and tools
    for the computation of accurate early results are
    currently not supported in MapReduce-oriented systems
    although these are intended for 'big data'. Therefore,
    we proposed and implemented a non-parametric extension
    of Hadoop which allows the incremental computation of
    early results for arbitrary work-flows, along with
    reliable on-line estimates of the degree of accuracy
    achieved so far in the computation. These estimates are
    based on a technique called bootstrapping that has been
    widely employed in statistics and can be applied to
    arbitrary functions and data distributions. In this
    paper, we describe our Early Accurate Result Library
    (EARL) for Hadoop that was designed to minimize the
    changes required to the MapReduce framework. Various
    tests of EARL of Hadoop are presented to characterize
    the frequent situations where EARL can provide major
    speed-ups over the current version of Hadoop.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Liu:2012:CCD,
  author = {Xuan Liu and Meiyu Lu and Beng Chin Ooi and Yanyan
    Shen and Sai Wu and Meihui Zhang},
  title = {{CDAS}: a crowdsourcing data analytics system},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1040--1051},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Some complex problems, such as image tagging and
    natural language processing, are very challenging for
    computers, where even state-of-the-art technology is
    yet able to provide satisfactory accuracy. Therefore,
    rather than relying solely on developing new and better
    algorithms to handle such tasks, we look to the
    crowdsourcing solution --- employing human
    participation --- to make good the shortfall in current
    technology. Crowdsourcing is a good supplement to many
    computer tasks. A complex job may be divided into
    computer-oriented tasks and human-oriented tasks, which
    are then assigned to machines and humans respectively.
    To leverage the power of crowdsourcing, we design and
    implement a Crowdsourcing Data Analytics System, CDAS.
    CDAS is a framework designed to support the deployment
    of various crowdsourcing applications. The core part of
    CDAS is a quality-sensitive answering model, which
    guides the crowdsourcing engine to process and monitor
    the human tasks. In this paper, we introduce the
    principles of our quality-sensitive model. To satisfy
    user required accuracy, the model guides the
    crowdsourcing query engine for the design and
    processing of the corresponding crowdsourcing jobs. It
    provides an estimated accuracy for each generated
    result based on the human workers' historical
    performances. When verifying the quality of the result,
    the model employs an online strategy to reduce waiting
    time. To show the effectiveness of the model, we
    implement and deploy two analytics jobs on CDAS, a
    twitter sentiment analytics job and an image tagging
    job. We use real Twitter and Flickr data as our queries
    respectively. We compare our approaches with
    state-of-the-art classification and image annotation
    techniques. The results show that the human-assisted
    methods can indeed achieve a much higher accuracy. By
    embedding the quality-sensitive model into
    crowdsourcing query engine, we effectively reduce the
    processing cost while maintaining the required query
    answer quality.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Sachan:2012:MSS,
  author = {Mayank Sachan and Arnab Bhattacharya},
  title = {Mining statistically significant substrings using the
    chi-square statistic},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1052--1063},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {The problem of identification of statistically
    significant patterns in a sequence of data has been
    applied to many domains such as intrusion detection
    systems, financial models, web-click records, automated
    monitoring systems, computational biology, cryptology,
    and text analysis. An observed pattern of events is
    deemed to be statistically significant if it is
    unlikely to have occurred due to randomness or chance
    alone. We use the chi-square statistic as a
    quantitative measure of statistical significance. Given
    a string of characters generated from a memoryless
    Bernoulli model, the problem is to identify the
    substring for which the empirical distribution of
    single letters deviates the most from the distribution
    expected from the generative Bernoulli model. This
    deviation is captured using the chi-square measure. The
    most significant substring (MSS) of a string is thus
    defined as the substring having the highest chi-square
    value. Till date, to the best of our knowledge, there
    does not exist any algorithm to find the MSS in better
    than $ O(n^2) $ time, where $n$ denotes the length of
    the string. In this paper, we propose an algorithm to
    find the most significant substring, whose running time
    is $ O(n^{3 / 2})$ with high probability. We also study
    some variants of this problem such as finding the
    top-$t$ set, finding all substrings having chi-square
    greater than a fixed threshold and finding the MSS
    among substrings greater than a given length. We
    experimentally demonstrate the asymptotic behavior of
    the MSS on varying the string size and alphabet size.
    We also describe some applications of our algorithm on
    cryptology and real world data from finance and sports.
    Finally, we compare our technique with the existing
    heuristics for finding the MSS.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Albutiu:2012:MPS,
  author = {Martina-Cezara Albutiu and Alfons Kemper and Thomas
    Neumann},
  title = {Massively parallel sort-merge joins in main memory
    multi-core database systems},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1064--1075},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Two emerging hardware trends will dominate the
    database system technology in the near future:
    increasing main memory capacities of several TB per
    server and massively parallel multi-core processing.
    Many algorithmic and control techniques in current
    database technology were devised for disk-based systems
    where I/O dominated the performance. In this work we
    take a new look at the well-known sort-merge join
    which, so far, has not been in the focus of research in
    scalable massively parallel multi-core data processing
    as it was deemed inferior to hash joins. We devise a
    suite of new massively parallel sort-merge (MPSM) join
    algorithms that are based on partial partition-based
    sorting. Contrary to classical sort-merge joins, our
    MPSM algorithms do not rely on a hard to parallelize
    final merge step to create one complete sort order.
    Rather they work on the independently created runs in
    parallel. This way our MPSM algorithms are NUMA-affine
    as all the sorting is carried out on local memory
    partitions. An extensive experimental evaluation on a
    modern 32-core machine with one TB of main memory
    proves the competitive performance of MPSM on large
    main memory databases with billions of objects. It
    scales (almost) linearly in the number of employed
    cores and clearly outperforms competing hash join
    proposals --- in particular it outperforms the
    ``cutting-edge'' Vectorwise parallel query engine by a
    factor of four.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Luo:2012:HDH,
  author = {Tian Luo and Rubao Lee and Michael Mesnier and Feng
    Chen and Xiaodong Zhang},
  title = {{hStorage-DB}: heterogeneity-aware data management to
    exploit the full capability of hybrid storage systems},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {10},
  pages = {1076--1087},
  month = jun,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:13 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {As storage systems become increasingly heterogeneous
    and complex, it adds burdens on DBAs, causing
    suboptimal performance even after a lot of human
    efforts have been made. In addition, existing
    monitoring-based storage management by access pattern
    detections has difficulties to handle workloads that
    are highly dynamic and concurrent. To achieve high
    performance by best utilizing heterogeneous storage
    devices, we have designed and implemented a
    heterogeneity-aware software framework for DBMS storage
    management called hStorage-DB, where semantic
    information that is critical for storage I/O is
    identified and passed to the storage manager. According
    to the collected semantic information, requests are
    classified into different types. Each type is assigned
    a proper QoS policy supported by the underlying storage
    system, so that every request will be served with a
    suitable storage device. With hStorage-DB, we can well
    utilize semantic information that cannot be detected
    through data access monitoring but is particularly
    important for a hybrid storage system. To show the
    effectiveness of hStorage-DB, we have implemented a
    system prototype that consists of an I/O request
    classification enabled DBMS, and a hybrid storage
    system that is organized into a two-level caching
    hierarchy. Our performance evaluation shows that
    hStorage-DB can automatically make proper decisions for
    data allocation in different storage devices and make
    substantial performance improvements in a
    cost-efficient way.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Choi:2012:SAM,
  author = {Dong-Wan Choi and Chin-Wan Chung and Yufei Tao},
  title = {A scalable algorithm for maximizing range sum in
    spatial databases},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1088--1099},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {This paper investigates the MaxRS problem in spatial
    databases. Given a set O of weighted points and a
    rectangular region r of a given size, the goal of the
    MaxRS problem is to find a location of r such that the
    sum of the weights of all the points covered by r is
    maximized. This problem is useful in many
    location-based applications such as finding the best
    place for a new franchise store with a limited delivery
    range and finding the most attractive place for a
    tourist with a limited reachable range. However, the
    problem has been studied mainly in theory,
    particularly, in computational geometry. The existing
    algorithms from the computational geometry community
    are in-memory algorithms which do not guarantee the
    scalability. In this paper, we propose a scalable
    external-memory algorithm (ExactMaxRS) for the MaxRS
    problem, which is optimal in terms of the I/O
    complexity. Furthermore, we propose an approximation
    algorithm (ApproxMaxCRS) for the MaxCRS problem that is
    a circle version of the MaxRS problem. We prove the
    correctness and optimality of the ExactMaxRS algorithm
    along with the approximation bound of the ApproxMaxCRS
    algorithm. From extensive experimental results, we show
    that the ExactMaxRS algorithm is two orders of
    magnitude faster than methods adapted from existing
    algorithms, and the approximation bound in practice is
    much better than the theoretical bound of the
    ApproxMaxCRS algorithm.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Aly:2012:SQT,
  author = {Ahmed M. Aly and Walid G. Aref and Mourad Ouzzani},
  title = {Spatial queries with two {kNN} predicates},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1100--1111},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {The widespread use of location-aware devices has led
    to countless location-based services in which a user
    query can be arbitrarily complex, i.e., one that embeds
    multiple spatial selection and join predicates. Amongst
    these predicates, the $k$-Nearest-Neighbor ($k$ NN)
    predicate stands as one of the most important and
    widely used predicates. Unlike related research, this
    paper goes beyond the optimization of queries with
    single $k$ NN predicates, and shows how queries with
    two $k$ NN predicates can be optimized. In particular,
    the paper addresses the optimization of queries with:
    (i) two $k$ NN-select predicates, (ii) two $k$ NN-join
    predicates, and (iii) one $k$ NN-join predicate and one
    $k$ NN-select predicate. For each type of queries,
    conceptually correct query evaluation plans (QEPs) and
    new algorithms that optimize the query execution time
    are presented. Experimental results demonstrate that
    the proposed algorithms outperform the conceptually
    correct QEPs by orders of magnitude.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Sheng:2012:OAC,
  author = {Cheng Sheng and Nan Zhang and Yufei Tao and Xin Jin},
  title = {Optimal algorithms for crawling a hidden database in
    the web},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1112--1123},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {A hidden database refers to a dataset that an
    organization makes accessible on the web by allowing
    users to issue queries through a search interface. In
    other words, data acquisition from such a source is not
    by following static hyper-links. Instead, data are
    obtained by querying the interface, and reading the
    result page dynamically generated. This, with other
    facts such as the interface may answer a query only
    partially, has prevented hidden databases from being
    crawled effectively by existing search engines. This
    paper remedies the problem by giving algorithms to
    extract all the tuples from a hidden database. Our
    algorithms are provably efficient, namely, they
    accomplish the task by performing only a small number
    of queries, even in the worst case. We also establish
    theoretical results indicating that these algorithms
    are asymptotically optimal --- i.e., it is impossible
    to improve their efficiency by more than a constant
    factor. The derivation of our upper and lower bound
    results reveals significant insight into the
    characteristics of the underlying problem. Extensive
    experiments confirm the proposed techniques work very
    well on all the real datasets examined.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Qin:2012:DTR,
author = "Lu Qin and Jeffrey Xu Yu and Lijun Chang",
title = "Diversifying top-$k$ results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1124--1135",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Top-$k$ query processing finds a list of $k$ results
that have largest scores w.r.t the user given query,
with the assumption that all the $k$ results are
independent to each other. In practice, some of the
top-$k$ results returned can be very similar to each
other. As a result some of the top-$k$ results returned
are redundant. In the literature, diversified top-$k$
search has been studied to return $k$ results that take
both score and diversity into consideration. Most
existing solutions on diversified top-$k$ search assume
that scores of all the search results are given, and
some works solve the diversity problem on a specific
problem and can hardly be extended to general cases. In
this paper, we study the diversified top-$k$ search
problem. We define a general diversified top-$k$ search
problem that only considers the similarity of the
search results themselves. We propose a framework, such
that most existing solutions for top-$k$ query
processing can be extended easily to handle diversified
top-$k$ search, by simply applying three new functions,
a sufficient stop condition sufficient(), a necessary
stop condition necessary(), and an algorithm for
diversified top-$k$ search on the current set of
generated results, div-search-current(). We propose
three new algorithms, namely, div-astar, div-dp, and
div-cut to solve the div-search-current() problem.
div-astar is an A* based algorithm, div-dp is an
algorithm that decomposes the results into components
which are searched using div-astar independently and
combined using dynamic programming. div-cut further
decomposes the current set of generated results using
cut points and combines the results using sophisticated
operations. We conducted extensive performance studies
using two real datasets, enwiki and reuters. Our
div-cut algorithm finds the optimal solution for
diversified top-$k$ search problem in seconds even for
$k$ as large as 2,000.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2012:KAO,
  author = {Xin Cao and Lisi Chen and Gao Cong and Xiaokui Xiao},
  title = {Keyword-aware optimal route search},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1136--1147},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Identifying a preferable route is an important problem
    that finds applications in map services. When a user
    plans a trip within a city, the user may want to find
    ``a most popular route such that it passes by shopping
    mall, restaurant, and pub, and the travel time to and
    from his hotel is within 4 hours.'' However, none of
    the algorithms in the existing work on route planning
    can be used to answer such queries. Motivated by this,
    we define the problem of keyword-aware optimal route
    query, denoted by KOR, which is to find an optimal
    route such that it covers a set of user-specified
    keywords, a specified budget constraint is satisfied,
    and an objective score of the route is optimal. The
    problem of answering KOR queries is NP-hard. We devise
    an approximation algorithm OSScaling with provable
    approximation bounds. Based on this algorithm, another
    more efficient approximation algorithm BucketBound is
    proposed. We also design a greedy approximation
    algorithm. Results of empirical studies show that all
    the proposed algorithms are capable of answering KOR
    queries efficiently, while the BucketBound and Greedy
    algorithms run faster. The empirical studies also offer
    insight into the accuracy of the proposed algorithms.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Cautis:2012:AQU,
  author = {Bogdan Cautis and Evgeny Kharlamov},
  title = {Answering queries using views over probabilistic
    {XML}: complexity and tractability},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1148--1159},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {We study the complexity of query answering using views
    in a probabilistic XML setting, identifying large
    classes of XPath queries --- with child and descendant
    navigation and predicates --- for which there are
    efficient (PTime) algorithms. We consider this problem
    under the two possible semantics for XML query results:
    with persistent node identifiers and in their absence.
    Accordingly, we consider rewritings that can exploit a
    single view, by means of compensation, and rewritings
    that can use multiple views, by means of intersection.
    Since in a probabilistic setting queries return answers
    with probabilities, the problem of rewriting goes
    beyond the classic one of retrieving XML answers from
    views. For both semantics of XML queries, we show that,
    even when XML answers can be retrieved from views,
    their probabilities may not be computable. For
    rewritings that use only compensation, we describe a
    PTime decision procedure, based on easily verifiable
    criteria that distinguish between the feasible cases
    --- when probabilistic XML results are computable ---
    and the unfeasible ones. For rewritings that can use
    multiple views, with compensation and intersection, we
    identify the most permissive conditions that make
    probabilistic rewriting feasible, and we describe an
    algorithm that is sound in general, and becomes
    complete under fairly permissive restrictions, running
    in PTime modulo worst-case exponential time equivalence
    tests. This is the best we can hope for since
    intersection makes query equivalence intractable
    already over deterministic data. Our algorithm runs in
    PTime whenever deterministic rewritings can be found in
    PTime.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Jha:2012:PDM,
  author = {Abhay Jha and Dan Suciu},
  title = {Probabilistic databases with {MarkoViews}},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1160--1171},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Most of the work on query evaluation in probabilistic
    databases has focused on the simple tuple-independent
    data model, where tuples are independent random events.
    Several efficient query evaluation techniques exists in
    this setting, such as safe plans, algorithms based on
    OBDDs, tree-decomposition and a variety of
    approximation algorithms. However, complex data
    analytics tasks often require complex correlations, and
    query evaluation then is significantly more expensive,
    or more restrictive. In this paper, we propose MVDB as
    a framework both for representing complex correlations
    and for efficient query evaluation. An MVDB specifies
    correlations by views, called MarkoViews, on the
    probabilistic relations and declaring the weights of
    the view's outputs. An MVDB is a (very large) Markov
    Logic Network. We make two sets of contributions.
    First, we show that query evaluation on an MVDB is
    equivalent to evaluating a Union of Conjunctive
    Query(UCQ) over a tuple-independent database. The
    translation is exact (thus allowing the techniques
    developed for tuple independent databases to be carried
    over to MVDB), yet it is novel and quite non-obvious
    (some resulting probabilities may be negative!). This
    translation in itself though may not lead to much gain
    since the translated query gets complicated as we try
    to capture more correlations. Our second contribution
    is to propose a new query evaluation strategy that
    exploits offline compilation to speed up online query
    evaluation. Here we utilize and extend our prior work
    on compilation of UCQ. We validate experimentally our
    techniques on a large probabilistic database with
    MarkoViews inferred from the DBLP data.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Mamouras:2012:CSC,
  author = {Konstantinos Mamouras and Sigal Oren and Lior Seeman
    and Lucja Kot and Johannes Gehrke},
  title = {The complexity of social coordination},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1172--1183},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Coordination is a challenging everyday task; just
    think of the last time you organized a party or a
    meeting involving several people. As a growing part of
    our social and professional life goes online, an
    opportunity for an improved coordination process
    arises. Recently, Gupta et al. proposed entangled
    queries as a declarative abstraction for data-driven
    coordination, where the difficulty of the coordination
    task is shifted from the user to the database.
    Unfortunately, evaluating entangled queries is very
    hard, and thus previous work considered only a
    restricted class of queries that satisfy safety (the
    coordination partners are fixed) and uniqueness (all
    queries need to be satisfied). In this paper we
    significantly extend the class of feasible entangled
    queries beyond uniqueness and safety. First, we show
    that we can simply drop uniqueness and still
    efficiently evaluate a set of safe entangled queries.
    Second, we show that as long as all users coordinate on
    the same set of attributes, we can give an efficient
    algorithm for coordination even if the set of queries
    does not satisfy safety. In an experimental evaluation
    we show that our algorithms are feasible for a wide
    spectrum of coordination scenarios.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Zhang:2012:EMW,
  author = {Xiaofei Zhang and Lei Chen and Min Wang},
  title = {Efficient multi-way theta-join processing using
    {MapReduce}},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {5},
  number = {11},
  pages = {1184--1195},
  month = jul,
  year = {2012},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Tue Nov 6 16:43:15 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Multi-way Theta-join queries are powerful in
    describing complex relations and therefore widely
    employed in real practices. However, existing solutions
    from traditional distributed and parallel databases for
    multi-way Theta-join queries cannot be easily extended
    to fit a shared-nothing distributed computing paradigm,
    which is proven to be able to support OLAP applications
    over immense data volumes. In this work, we study the
    problem of efficient processing of multi-way Theta-join
    queries using MapReduce from a cost-effective
    perspective. Although there have been some works using
    the (key, value) pair-based programming model to
    support join operations, efficient processing of
    multi-way Theta-join queries has never been fully
    explored. The substantial challenge lies in, given a
    number of processing units (that can run Map or Reduce
    tasks), mapping a multi-way Theta-join query to a
    number of MapReduce jobs and having them executed in a
    well scheduled sequence, such that the total processing
    time span is minimized. Our solution mainly includes
    two parts: (1) cost metrics for both single MapReduce
    job and a number of MapReduce jobs executed in a
    certain order; (2) the efficient execution of a
    chain-typed Theta-join with only one MapReduce job.
    Comparing with the query evaluation strategy proposed
    in [23] and the widely adopted Pig Latin and Hive SQL
    solutions, our method achieves significant improvement
    of the join processing efficiency.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Lim:2012:STB,
author = "Harold Lim and Herodotos Herodotou and Shivnath Babu",
title = "{Stubby}: a transformation-based optimizer for
{MapReduce} workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1196--1207",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is a growing trend of performing analysis on
large datasets using workflows composed of MapReduce
jobs connected through producer-consumer relationships
based on data. This trend has spurred the development
of a number of interfaces---ranging from program-based
to query-based interfaces---for generating MapReduce
workflows. Studies have shown that the gap in
performance can be quite large between optimized and
unoptimized workflows. However, automatic cost-based
optimization of MapReduce workflows remains a challenge
due to the multitude of interfaces, large size of the
execution plan space, and the frequent unavailability
of all types of information needed for optimization. We
introduce a comprehensive plan space for MapReduce
workflows generated by popular workflow generators. We
then propose Stubby, a cost-based optimizer that
searches selectively through the subspace of the full
plan space that can be enumerated correctly and costed
based on the information available in any given
setting. Stubby enumerates the plan space based on
plan-to-plan transformations and an efficient search
algorithm. Stubby is designed to be extensible to new
interfaces and new types of optimizations, which is a
desirable feature given how rapidly MapReduce systems
are evolving. Stubby's efficiency and effectiveness
have been evaluated using representative workflows from
many domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bao:2012:LWV,
author = "Zhuowei Bao and Susan B. Davidson and Tova Milo",
title = "Labeling workflow views with fine-grained
dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1208--1219",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper considers the problem of efficiently
answering reachability queries over views of provenance
graphs, derived from executions of workflows that may
include recursion. Such views include composite modules
and model fine-grained dependencies between module
inputs and outputs. A novel view-adaptive dynamic
labeling scheme is developed for efficient query
evaluation, in which view specifications are labeled
statically (i.e. as they are created) and data items
are labeled dynamically as they are produced during a
workflow execution. Although the combination of
fine-grained dependencies and recursive workflows
entail, in general, long (linear-size) data labels, we
show that for a large natural class of workflows and
views, labels are compact (logarithmic-size) and
reachability queries can be evaluated in constant time.
Experimental results demonstrate the benefit of this
approach over the state-of-the-art technique when
applied for labeling multiple views.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Szlichta:2012:FOD,
author = "Jaros{\l}aw Szlichta and Parke Godfrey and Jarek
Gryz",
title = "Fundamentals of order dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1220--1231",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Dependencies have played a significant role in
database design for many years. They have also been
shown to be useful in query optimization. In this
paper, we discuss dependencies between
lexicographically ordered sets of tuples. We introduce
formally the concept of order dependency and present a
set of axioms (inference rules) for them. We show how
query rewrites based on these axioms can be used for
query optimization. We present several interesting
theorems that can be derived using the inference rules.
We prove that functional dependencies are subsumed by
order dependencies and that our set of axioms for order
dependencies is sound and complete.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bakibayev:2012:FQE,
author = "Nurzhan Bakibayev and Dan Olteanu and Jakub
Z{\'a}vodn{\'y}",
title = "{FDB}: a query engine for factorised relational
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1232--1243",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Factorised databases are relational databases that use
compact factorised representations at the physical
layer to reduce data redundancy and boost query
performance. This paper introduces FDB, an in-memory
query engine for select-project-join queries on
factorised databases. Key components of FDB are novel
algorithms for query optimisation and evaluation that
exploit the succinctness brought by data factorisation.
Experiments show that for data sets with many-to-many
relationships FDB can outperform relational engines by
orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2012:OAW,
author = "Yu Cao and Chee-Yong Chan and Jie Li and Kian-Lee
Tan",
title = "Optimization of analytic window functions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1244--1255",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analytic functions represent the state-of-the-art way
of performing complex data analysis within a single SQL
statement. In particular, an important class of
analytic functions that has been frequently used in
commercial systems to support OLAP and decision support
applications is the class of window functions. A window
function returns for each input tuple a value derived
from applying a function over a window of neighboring
tuples. However, existing window function evaluation
approaches are based on a naive sorting scheme. In this
paper, we study the problem of optimizing the
evaluation of window functions. We propose several
efficient techniques, and identify optimization
opportunities that allow us to optimize the evaluation
of a set of window functions. We have integrated our
scheme into PostgreSQL. Our comprehensive experimental
study on the TPC-DS datasets as well as synthetic
datasets and queries demonstrate significant speedup
over existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hueske:2012:OBB,
author = "Fabian Hueske and Mathias Peters and Matthias J. Sax
and Astrid Rheinl{\"a}nder and Rico Bergmann and
Aljoscha Krettek and Kostas Tzoumas",
title = "Opening the black boxes in data flow optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1256--1267",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many systems for big data analytics employ a data flow
abstraction to define parallel data processing tasks.
In this setting, custom operations expressed as
user-defined functions are very common. We address the
problem of performing data flow optimization at this
level of abstraction, where the semantics of operators
are not known. Traditionally, query optimization is
applied to queries with known algebraic semantics. In
this work, we find that a handful of properties, rather
than a full algebraic specification, suffice to
establish reordering conditions for data processing
operators. We show that these properties can be
accurately estimated for black box operators by
statically analyzing the general-purpose code of their
user-defined functions. We design and implement an
optimizer for parallel data flows that does not assume
knowledge of semantics or algebraic properties of
operators. Our evaluation confirms that the optimizer
can apply common rewritings such as selection
reordering, bushy join-order enumeration, and limited
forms of aggregation push-down, hence yielding similar
rewriting power as modern relational DBMS optimizers.
Moreover, it can optimize the operator order of
nonrelational data flows, a unique feature among
today's systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ewen:2012:SFI,
author = "Stephan Ewen and Kostas Tzoumas and Moritz Kaufmann
and Volker Markl",
title = "Spinning fast iterative data flows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1268--1279",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Parallel dataflow systems are a central part of most
analytic pipelines for big data. The iterative nature
of many analysis and machine learning algorithms,
however, is still a challenge for current systems.
While certain types of bulk iterative algorithms are
supported by novel dataflow frameworks, these systems
cannot exploit computational dependencies present in
many algorithms, such as graph algorithms. As a result,
these algorithms are inefficiently executed and have
led to specialized systems based on other paradigms,
such as message passing or shared memory. We propose a
method to integrate incremental iterations, a form of
workset iterations, with parallel dataflows. After
showing how to integrate bulk iterations into a
dataflow system and its optimizer, we present an
extension to the programming model for incremental
iterations. The extension alleviates for the lack of
mutable state in dataflows and allows for exploiting
the sparse computational dependencies inherent in many
iterative algorithms. The evaluation of a prototypical
implementation shows that those aspects lead to up to
two orders of magnitude speedup in algorithm runtime,
when exploited. In our experiments, the improved
dataflow system is highly competitive with specialized
systems while maintaining a transparent and unified
dataflow abstraction.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mihaylov:2012:RRD,
author = "Svilen R. Mihaylov and Zachary G. Ives and Sudipto
Guha",
title = "{REX}: recursive, delta-based data-centric
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1280--1291",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In today's Web and social network environments, query
workloads include ad hoc and OLAP queries, as well as
iterative algorithms that analyze data relationships
(e.g., link analysis, clustering, learning). Modern
DBMSs support ad hoc and OLAP queries, but most are not
robust enough to scale to large clusters. Conversely,
``cloud'' platforms like MapReduce execute chains of
batch tasks across clusters in a fault tolerant way,
but have too much overhead to support ad hoc queries.
Moreover, both classes of platform incur significant
overhead in executing iterative data analysis
algorithms. Most such iterative algorithms repeatedly
refine portions of their answers, until some
convergence criterion is reached. However, general
cloud platforms typically must reprocess all data in
each step. DBMSs that support recursive SQL are more
efficient in that they propagate only the changes in
each step --- but they still accumulate each
iteration's state, even if it is no longer useful.
User-defined functions are also typically harder to
write for DBMSs than for cloud platforms. We seek to
unify the strengths of both styles of platforms, with a
focus on supporting iterative computations in which
changes, in the form of deltas, are propagated from
iteration to iteration, and state is efficiently
updated in an extensible way. We present a programming
model oriented around deltas, describe how we execute
and optimize such programs in our REX runtime system,
and validate that our platform also handles failures
gracefully. We experimentally validate our techniques,
and show speedups over the competing methods ranging
from 2.5 to nearly 100 times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheng:2012:KRW,
author = "James Cheng and Zechao Shang and Hong Cheng and Haixun
Wang and Jeffrey Xu Yu",
title = "{K}-reach: who is in your small world",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1292--1303",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of answering $k$-hop reachability
queries in a directed graph, i.e., whether there exists
a directed path of length $k$, from a source query vertex
to a target query vertex in the input graph. The
problem of $k$-hop reachability is a general problem of
the classic reachability (where $ k = \infty $).
Existing indexes for processing classic reachability
queries, as well as for processing shortest path
queries, are not applicable or not efficient for
processing $k$-hop reachability queries. We propose an
index for processing $k$-hop reachability queries,
which is simple in design and efficient to construct.
Our experimental results on a wide range of real
datasets show that our index is more efficient than the
state-of-the-art indexes even for processing classic
reachability queries, for which these indexes are
primarily designed. We also show that our index is
efficient in answering $k$-hop reachability queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2012:PGD,
author = "Wenfei Fan and Xin Wang and Yinghui Wu",
title = "Performance guarantees for distributed reachability
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1304--1315",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the real world a graph is often fragmented and
distributed across different sites. This highlights the
need for evaluating queries on distributed graphs. This
paper proposes distributed evaluation algorithms for
three classes of queries: reachability for determining
whether one node can reach another, bounded
reachability for deciding whether there exists a path
of a bounded length between a pair of nodes, and
regular reachability for checking whether there exists
a path connecting two nodes such that the node labels
on the path form a string in a given regular
expression. We develop these algorithms based on
partial evaluation, to explore parallel computation.
When evaluating a query Q on a distributed graph G, we
show that these algorithms possess the following
performance guarantees, no matter how G is fragmented
and distributed: (1) each site is visited only once;
(2) the total network traffic is determined by the size
of Q and the fragmentation of G, independent of the
size of G; and (3) the response time is decided by the
largest fragment of G rather than the entire G. In
addition, we show that these algorithms can be readily
implemented in the MapReduce framework. Using synthetic
and real-life data, we experimentally verify that these
algorithms are scalable on large graphs, regardless of
how the graphs are distributed.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chubak:2012:EIQ,
author = "Pirooz Chubak and Davood Rafiei",
title = "Efficient indexing and querying over syntactically
annotated trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1316--1327",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Natural language text corpora are often available as
sets of syntactically parsed trees. A wide range of
expressive tree queries are possible over such parsed
trees that open a new avenue in searching over natural
language text. They not only allow for querying roles
and relationships within sentences, but also improve
search effectiveness compared to flat keyword queries.
One major drawback of current systems supporting
querying over parsed text is the performance of
evaluating queries over large data. In this paper we
propose a novel indexing scheme over unique subtrees as
index keys. We also propose a novel root-split coding
scheme that stores subtree structural information only
partially, thus reducing index size and improving
querying performance. Our extensive set of experiments
show that root-split coding reduces the index size of
any interval coding which stores individual node
numbers by a factor of 50\% to 80\%, depending on the
sizes of subtrees indexed. Moreover, we show that our
index using root-split coding, outperforms previous
approaches by at least an order of magnitude in terms
of the response time of queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Barany:2012:QGN,
author = "Vince B{\'a}r{\'a}ny and Balder ten Cate and Martin
Otto",
title = "Queries with guarded negation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1328--1339",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A well-established and fundamental insight in database
theory is that negation (also known as complementation)
tends to make queries difficult to process and
difficult to reason about. Many basic problems are
decidable and admit practical algorithms in the case of
unions of conjunctive queries, but become difficult or
even undecidable when queries are allowed to contain
negation. Inspired by recent results in finite model
theory, we consider a restricted form of negation,
guarded negation. We introduce a fragment of SQL,
called GN-SQL, as well as a fragment of Datalog with
stratified negation, called GN-Datalog, that allow only
guarded negation, and we show that these query
languages are computationally well behaved, in terms of
testing query containment, query evaluation, open-world
query answering, and boundedness. GN-SQL and GN-Datalog
subsume a number of well known query languages and
constraint languages, such as unions of conjunctive
queries, monadic Datalog, and frontier-guarded tgds. In
addition, an analysis of standard benchmark workloads
shows that many uses of negation in SQL in practice are
guarded.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2012:PFI,
author = "Ninghui Li and Wahbeh Qardaji and Dong Su and Jianneng
Cao",
title = "{PrivBasis}: frequent itemset mining with differential
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1340--1351",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The discovery of frequent itemsets can serve valuable
economic and research purposes. Releasing discovered
frequent itemsets, however, presents privacy
challenges. In this paper, we study the problem of how
to perform frequent itemset mining on transaction
databases while satisfying differential privacy. We
propose an approach, called PrivBasis, which leverages
a novel notion called basis sets. A $ \theta $-basis
set has the property that any itemset with frequency
higher than $ \theta $ is a subset of some basis. We
introduce algorithms for privately constructing a basis
set and then using it to find the most frequent
itemsets. Experiments show that our approach greatly
outperforms the current state of the art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2012:LRM,
author = "Ganzhao Yuan and Zhenjie Zhang and Marianne Winslett
and Xiaokui Xiao and Yin Yang and Zhifeng Hao",
title = "Low-rank mechanism: optimizing batch queries under
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1352--1363",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy is a promising privacy-preserving
paradigm for statistical query processing over
sensitive data. It works by injecting random noise into
each query result, such that it is provably hard for
the adversary to infer the presence or absence of any
individual record from the published noisy results. The
main objective in differentially private query
processing is to maximize the accuracy of the query
results, while satisfying the privacy guarantees.
Previous work, notably the matrix mechanism [16], has
suggested that processing a batch of correlated queries
as a whole can potentially achieve considerable
accuracy gains, compared to answering them
individually. However, as we point out in this paper,
the matrix mechanism is mainly of theoretical interest;
in particular, several inherent problems in its design
limit its accuracy in practice, which almost never
exceeds that of na{\"\i}ve methods. In fact, we are not
aware of any existing solution that can effectively
optimize a query batch under differential privacy.
Motivated by this, we propose the Low-Rank Mechanism
(LRM), the first practical differentially private
technique for answering batch queries with high
accuracy, based on a low rank approximation of the
workload matrix. We prove that the accuracy provided by
LRM is close to the theoretical lower bound for any
mechanism to answer a batch of queries under
differential privacy. Extensive experiments using real
data demonstrate that LRM consistently outperforms
state-of-the-art query processing solutions under
differential privacy, by large margins.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2012:FMR,
author = "Jun Zhang and Zhenjie Zhang and Xiaokui Xiao and Yin
Yang and Marianne Winslett",
title = "Functional mechanism: regression analysis under
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1364--1375",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "$ \epsilon $-differential privacy is the
state-of-the-art model for releasing sensitive
information while protecting privacy. Numerous methods
have been proposed to enforce $ \epsilon $-differential
privacy in various analytical tasks, e.g., regression
analysis. Existing solutions for regression analysis,
however, are either limited to non-standard types of
regression or unable to produce accurate regression
results. Motivated by this, we propose the Functional
Mechanism, a differentially private method designed for
a large class of optimization-based analyses. The main
idea is to enforce $ \epsilon $-differential privacy by
perturbing the objective function of the optimization
problem, rather than its results. As case studies, we
apply the functional mechanism to address two most
widely used regression models, namely, linear
regression and logistic regression. Both theoretical
analysis and thorough experimental evaluations show
that the functional mechanism is highly effective and
efficient, and it significantly outperforms existing
solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Boldi:2012:IUG,
author = "Paolo Boldi and Francesco Bonchi and Aristides Gionis
and Tamir Tassa",
title = "Injecting uncertainty in graphs for identity
obfuscation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1376--1387",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data collected nowadays by social-networking
applications create fascinating opportunities for
building novel services, as well as expanding our
understanding about social structures and their
dynamics. Unfortunately, publishing social-network
graphs is considered an ill-advised practice due to
privacy concerns. To alleviate this problem, several
anonymization methods have been proposed, aiming at
reducing the risk of a privacy breach on the published
data, while still allowing to analyze them and draw
relevant conclusions. In this paper we introduce a new
anonymization approach that is based on injecting
uncertainty in social graphs and publishing the
resulting uncertain graphs. While existing approaches
obfuscate graph data by adding or removing edges
entirely, we propose using a finer-grained perturbation
that adds or removes edges partially: this way we can
achieve the same desired level of obfuscation with
smaller changes in the data, thus maintaining higher
utility. Our experiments on real-world networks confirm
that at the same level of identity obfuscation our
method provides higher usefulness than existing
randomized methods that publish standard graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2012:PMR,
author = "Jianneng Cao and Panagiotis Karras",
title = "Publishing microdata with a robust privacy guarantee",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1388--1399",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today, the publication of microdata poses a privacy
threat. Vast research has striven to define the privacy
condition that microdata should satisfy before it is
released, and devise algorithms to anonymize the data
so as to achieve this condition. Yet, no method
proposed to date explicitly bounds the percentage of
information an adversary gains after seeing the
published data for each sensitive value therein. This
paper introduces $ \beta $-likeness, an appropriately
robust privacy model for microdata anonymization, along
with two anonymization schemes designed therefore, the
one based on generalization, and the other based on
perturbation. Our model postulates that an adversary's
confidence on the likelihood of a certain
sensitive-attribute (SA) value should not increase, in
relative difference terms, by more than a predefined
threshold. Our techniques aim to satisfy a given $
\beta $ threshold with little information loss. We
experimentally demonstrate that (i) our model provides
an effective privacy guarantee in a way that
predecessor models cannot, (ii) our generalization
scheme is more effective and efficient in its task than
methods adapting algorithms for the $k$-anonymity
model, and (iii) our perturbation method outperforms a
baseline approach. Moreover, we discuss in detail the
resistance of our model and methods to attacks proposed
in previous research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guan:2012:MTE,
author = "Ziyu Guan and Xifeng Yan and Lance M. Kaplan",
title = "Measuring two-event structural correlations on
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1400--1411",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Real-life graphs usually have various kinds of events
happening on them, e.g., product purchases in online
social networks and intrusion alerts in computer
networks. The occurrences of events on the same graph
could be correlated, exhibiting either attraction or
repulsion. Such structural correlations can reveal
important relationships between different events.
Unfortunately, correlation relationships on graph
structures are not well studied and cannot be captured
by traditional measures. In this work, we design a
novel measure for assessing two-event structural
correlations on graphs. Given the occurrences of two
events, we choose uniformly a sample of ``reference
nodes'' from the vicinity of all event nodes and employ
the Kendall's $ \tau $ rank correlation measure to
compute the average concordance of event density
changes. Significance can be efficiently assessed by $
\tau $'s nice property of being asymptotically normal
under the null hypothesis. In order to compute the
measure in large scale networks, we develop a scalable
framework using different sampling strategies. The
complexity of these strategies is analyzed. Experiments
on real graph datasets with both synthetic and real
events demonstrate that the proposed framework is not
only efficacious, but also efficient and scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jestes:2012:RLT,
author = "Jeffrey Jestes and Jeff M. Phillips and Feifei Li and
Mingwang Tang",
title = "Ranking large temporal data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1412--1423",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Ranking temporal data has not been studied until
recently, even though ranking is an important operator
(being promoted as a first-class citizen) in database
systems. However, only the instant top-$k$ queries on
temporal data were studied in prior work, where objects with the
$k$ highest scores at a query time instance t are to be
retrieved. The instant top-$k$ definition clearly comes
with limitations (sensitive to outliers, difficult to
choose a meaningful query time $t$). A more flexible
and general ranking operation is to rank objects based
on the aggregation of their scores in a query interval,
which we dub the aggregate top-$k$ query on temporal
data. For example, return the top-10 weather stations
having the highest average temperature from 10/01/2010
to 10/07/2010; find the top-20 stocks having the
largest total transaction volumes from 02/05/2011 to
02/07/2011. This work presents a comprehensive study to
this problem by designing both exact and approximate
methods (with approximation quality guarantees). We
also provide theoretical analysis on the construction
cost, the index size, the update and the query costs of
each approach. Extensive experiments on large real
datasets clearly demonstrate the efficiency, the
effectiveness, and the scalability of our methods
compared to the baseline methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Funke:2012:CTD,
author = "Florian Funke and Alfons Kemper and Thomas Neumann",
title = "Compacting transactional data in hybrid {OLTP\&OLAP}
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1424--1435",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Growing main memory sizes have facilitated database
management systems that keep the entire database in
main memory. The drastic performance improvements that
came along with these in-memory systems have made it
possible to reunite the two areas of online transaction
processing (OLTP) and online analytical processing
(OLAP): An emerging class of hybrid OLTP and OLAP
database systems allows to process analytical queries
directly on the transactional data. By offering
arbitrarily current snapshots of the transactional data
for OLAP, these systems enable real-time business
intelligence. Despite memory sizes of several Terabytes
in a single commodity server, RAM is still a precious
resource: Since free memory can be used for
intermediate results in query processing, the amount of
memory determines query performance to a large extent.
Consequently, we propose the compaction of
memory-resident databases. Compaction consists of two
tasks: First, separating the mutable working set from
the immutable ``frozen'' data. Second, compressing the
immutable data and optimizing it for efficient,
memory-consumption-friendly snapshotting. Our approach
reorganizes and compresses transactional data online
and yet hardly affects the mission-critical OLTP
throughput. This is achieved by unburdening the OLTP
threads from all additional processing and performing
these tasks asynchronously.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hall:2012:PTC,
author = "Alexander Hall and Olaf Bachmann and Robert B{\"u}ssow
and Silviu Ganceanu and Marc Nunkesser",
title = "Processing a trillion cells per mouse click",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1436--1446",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Column-oriented database systems have been a real game
changer for the industry in recent years. Highly tuned
and performant systems have evolved that provide users
with the possibility of answering ad hoc queries over
large datasets in an interactive manner. In this paper
we present the column-oriented datastore developed as
one of the central components of PowerDrill. It
combines the advantages of columnar data layout with
other known techniques (such as using composite range
partitions) and extensive algorithmic engineering on
key data structures. The main goal of the latter being
to reduce the main memory footprint and to increase the
efficiency in processing typical user queries. In this
combination we achieve large speed-ups. These enable a
highly interactive Web UI where it is common that a
single mouse click leads to processing a trillion
values in the underlying dataset.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Porobic:2012:OHI,
author = "Danica Porobic and Ippokratis Pandis and Miguel Branco
and Pinar T{\"o}z{\"u}n and Anastasia Ailamaki",
title = "{OLTP} on hardware islands",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1447--1458",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern hardware is abundantly parallel and
increasingly heterogeneous. The numerous processing
cores have nonuniform access latencies to the main
memory and to the processor caches, which causes
variability in the communication costs. Unfortunately,
database systems mostly assume that all processing
cores are the same and that microarchitecture
differences are not significant enough to appear in
critical database execution paths. As we demonstrate in
this paper, however, hardware heterogeneity does appear
in the critical path and conventional database
architectures achieve suboptimal and even worse,
unpredictable performance. We perform a detailed
performance analysis of OLTP deployments in servers
with multiple cores per CPU (multicore) and multiple
CPUs per server (multisocket). We compare different
database deployment strategies where we vary the number
and size of independent database instances running on a
single server, from a single shared-everything instance
to fine-grained shared-nothing configurations. We
quantify the impact of non-uniform hardware on various
deployments by (a) examining how efficiently each
deployment uses the available hardware resources and
(b) measuring the impact of distributed transactions
and skewed requests on different workloads. Finally, we
argue in favor of shared-nothing deployments that are
topology- and workload-aware and take advantage of fast
on-chip communication between islands of cores on the
same socket.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Patterson:2012:SSC,
author = "Stacy Patterson and Aaron J. Elmore and Faisal Nawab
and Divyakant Agrawal and Amr {El Abbadi}",
title = "Serializability, not serial: concurrency control and
availability in multi-datacenter datastores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1459--1470",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a framework for concurrency control and
availability in multi-datacenter datastores. While we
consider Google's Megastore as our motivating example,
we define general abstractions for key components,
making our solution extensible to any system that
satisfies the abstraction properties. We first develop
and analyze a transaction management and replication
protocol based on a straightforward implementation of
the Paxos algorithm. Our investigation reveals that
this protocol acts as a concurrency prevention
mechanism rather than a concurrency control mechanism.
We then propose an enhanced protocol called Paxos with
Combination and Promotion (Paxos-CP) that provides true
transaction concurrency while requiring the same per
instance message complexity as the basic Paxos
protocol. Finally, we compare the performance of Paxos
and Paxos-CP in a multi-datacenter experimental study,
and we demonstrate that Paxos-CP results in
significantly fewer aborted transactions than basic
Paxos.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheung:2012:APD,
author = "Alvin Cheung and Samuel Madden and Owen Arden and
Andrew C. Myers",
title = "Automatic partitioning of database applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1471--1482",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database-backed applications are nearly ubiquitous in
our daily lives. Applications that make many small
accesses to the database create two challenges for
developers: increased latency and wasted resources from
numerous network round trips. A well-known technique to
improve transactional database application performance
is to convert part of the application into stored
procedures that are executed on the database server.
Unfortunately, this conversion is often difficult. In
this paper we describe Pyxis, a system that takes
database-backed applications and automatically
partitions their code into two pieces, one of which is
executed on the application server and the other on the
database server. Pyxis profiles the application and
server loads, statically analyzes the code's
dependencies, and produces a partitioning that
minimizes the number of control transfers as well as
the amount of data sent during each transfer. Our
experiments using TPC-C and TPC-W show that Pyxis is
able to generate partitions with up to 3x reduction in
latency and 1.7x improvement in throughput when
compared to a traditional non-partitioned
implementation and has comparable performance to that
of a custom stored procedure implementation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2012:CCE,
author = "Jiannan Wang and Tim Kraska and Michael J. Franklin
and Jianhua Feng",
title = "{CrowdER}: crowdsourcing entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1483--1494",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity resolution is central to data integration and
data cleaning. Algorithmic approaches have been
improving in quality, but remain far from perfect.
Crowdsourcing platforms offer a more accurate but
expensive (and slow) way to bring human insight into
the process. Previous work has proposed batching
verification tasks for presentation to human workers
but even with batching, a human-only approach is
infeasible for data sets of even moderate size, due to
the large numbers of matches to be tested. Instead, we
propose a hybrid human-machine approach in which
machines are used to do an initial, coarse pass over
all the data, and people are used to verify only the
most likely matching pairs. We show that for such a
hybrid system, generating the minimum number of
verification tasks of a given size is NP-Hard, but we
develop a novel two-tiered heuristic approach for
creating batched tasks. We describe this method, and
present the results of extensive experiments on real
data sets using a popular crowdsourcing platform. The
experiments show that our hybrid approach achieves both
good efficiency and high accuracy compared to
machine-only or human-only alternatives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2012:WAJ,
author = "Caleb Chen Cao and Jieying She and Yongxin Tong and
Lei Chen",
title = "Whom to ask?: jury selection for decision making tasks
on micro-blog services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1495--1506",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is universal to see people obtain knowledge on
micro-blog services by asking others decision making
questions. In this paper, we study the Jury Selection
Problem (JSP) by utilizing crowdsourcing for decision
making tasks on micro-blog services. Specifically, the
problem is to enroll a subset of crowd under a limited
budget, whose aggregated wisdom via Majority Voting
scheme has the lowest probability of drawing a wrong
answer (Jury Error Rate, JER). Due to various individual
error-rates of the crowd, the calculation of JER is
non-trivial. Firstly, we explicitly state that JER is
the probability when the number of wrong jurors is
larger than half of the size of a jury. To avoid the
exponentially increasing calculation of JER, we propose
two efficient algorithms and an effective bounding
technique. Furthermore, we study the Jury Selection
Problem on two crowdsourcing models, one is for
altruistic users (AltrM) and the other is for
incentive-requiring users (PayM) who require extra
payment when enrolled into a task. For the AltrM model,
we prove the monotonicity of JER on individual error
rate and propose an efficient exact algorithm for JSP.
For the PayM model, we prove the NP-hardness of JSP on
PayM and propose an efficient greedy-based heuristic
algorithm. Finally, we conduct a series of experiments
to investigate the traits of JSP, and validate the
efficiency and effectiveness of our proposed algorithms
on both synthetic and real micro-blog data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2012:AAL,
author = "Xiaochun Yang and Honglei Liu and Bin Wang",
title = "{ALAE}: accelerating local alignment with affine gap
exactly in biosequence databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1507--1518",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of local alignment, which is
finding pairs of similar subsequences with gaps. The
problem exists in biosequence databases. BLAST is a
typical software for finding local alignment based on
heuristic, but could miss results. Using the
Smith-Waterman algorithm, we can find all local
alignments in $ O(m n) $ time, where $m$ and $n$ are
lengths of a query and a text, respectively. A recent
exact approach BWT-SW improves the complexity of the
Smith-Waterman algorithm under constraints, but still
much slower than BLAST. This paper takes on the
challenge of designing an accurate and efficient
algorithm for evaluating local-alignment searches,
especially for long queries. In this paper, we propose
an efficient software called ALAE to speed up BWT-SW
using a compressed suffix array. ALAE utilizes a family
of filtering techniques to prune meaningless
calculations and an algorithm for reusing score
calculations. We also give a mathematical analysis and
show that the upper bound of the total number of
calculated entries using ALAE could vary from $ 4.50
m n^{0.520} $ to $ 9.05 m n^{0.896} $ for random DNA
sequences and vary from $ 8.28 m n^{0.364} $ to $ 7.49
m n^{0.723} $ for random protein sequences. We
demonstrate the significant performance improvement of
ALAE on BWT-SW using a thorough experimental study on
real biosequences. ALAE guarantees correctness and
accelerates BLAST for most of parameters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Candan:2012:SCD,
author = "K. Sel{\c{c}}uk Candan and Rosaria Rossini and Xiaolan
Wang and Maria Luisa Sapino",
title = "{sDTW}: computing {DTW} distances using locally
relevant constraints based on salient feature
alignments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1519--1530",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many applications generate and consume temporal data
and retrieval of time series is a key processing step
in many application domains. Dynamic time warping (DTW)
distance between time series of size $N$ and $M$ is
computed relying on a dynamic programming approach
which creates and fills an $ N \times M $ grid to search for an
optimal warp path. Since this can be costly, various
heuristics have been proposed to cut away the
potentially unproductive portions of the DTW grid. In
this paper, we argue that time series often carry
structural features that can be used for identifying
locally relevant constraints to eliminate redundant
work. Relying on this observation, we propose salient
feature based sDTW algorithms which first identify
robust salient features in the given time series and
then find a consistent alignment of these to establish
the boundaries for the warp path search. More
specifically, we propose alternative fixed
core\&adaptive width, adaptive core\&fixed width, and
adaptive core\&adaptive width strategies which enforce
different constraints reflecting the high level
structural characteristics of the series in the data
set. Experiment results show that the proposed sDTW
algorithms help achieve much higher accuracy in DTW
computation and time series retrieval than fixed core
\& fixed width algorithms that do not leverage local
features of the given time series.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tauheed:2012:SPL,
author = "Farhan Tauheed and Thomas Heinis and Felix
Sch{\"u}rmann and Henry Markram and Anastasia
Ailamaki",
title = "{SCOUT}: prefetching for latent structure following
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1531--1542",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's scientists are quickly moving from in vitro to
in silico experimentation: they no longer analyze
natural phenomena in a petri dish, but instead they
build models and simulate them. Managing and analyzing
the massive amounts of data involved in simulations is
a major task. Yet, they lack the tools to efficiently
work with data of this size. One problem many
scientists share is the analysis of the massive spatial
models they build. For several types of analysis they
need to interactively follow the structures in the
spatial model, e.g., the arterial tree, neuron fibers,
etc., and issue range queries along the way. Each query
takes long to execute, and the total time for executing
a sequence of queries significantly delays data
analysis. Prefetching the spatial data reduces the
response time considerably, but known approaches do not
prefetch with high accuracy. We develop SCOUT, a
structure-aware method for prefetching data along
interactive spatial query sequences. SCOUT uses an
approximate graph model of the structures involved in
past queries and attempts to identify what particular
structure the user follows. Our experiments with
neuro-science data show that SCOUT prefetches with an
accuracy from 71\% to 92\%, which translates to a
speedup of 4x--15x. SCOUT also improves the prefetching
accuracy on datasets from other scientific domains,
such as medicine and biology.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2012:API,
author = "Kaibo Wang and Yin Huai and Rubao Lee and Fusheng Wang
and Xiaodong Zhang and Joel H. Saltz",
title = "Accelerating pathology image data cross-comparison on
{CPU--GPU} hybrid systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1543--1554",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As an important application of spatial databases in
pathology imaging analysis, cross-comparing the spatial
boundaries of a huge amount of segmented micro-anatomic
objects demands extremely data- and compute-intensive
operations, requiring high throughput at an affordable
cost. However, the performance of spatial database
systems has not been satisfactory since their
implementations of spatial operations cannot fully
utilize the power of modern parallel hardware. In this
paper, we provide a customized software solution that
exploits GPUs and multi-core CPUs to accelerate spatial
cross-comparison in a cost-effective way. Our solution
consists of an efficient GPU algorithm and a pipelined
system framework with task migration support. Extensive
experiments with real-world data sets demonstrate the
effectiveness of our solution, which improves the
performance of spatial cross-comparison by over 18
times compared with a parallelized spatial database
approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2012:RER,
author = "Jiexing Li and Arnd Christian K{\"o}nig and Vivek
Narasayya and Surajit Chaudhuri",
title = "Robust estimation of resource consumption for {SQL}
queries using statistical techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1555--1566",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to estimate resource consumption of SQL
queries is crucial for a number of tasks in a database
system such as admission control, query scheduling and
costing during query optimization. Recent work has
explored the use of statistical techniques for resource
estimation in place of the manually constructed cost
models used in query optimization. Such techniques,
which require as training data examples of resource
usage in queries, offer the promise of superior
estimation accuracy since they can account for factors
such as hardware characteristics of the system or bias
in cardinality estimates. However, the proposed
approaches lack robustness in that they do not
generalize well to queries that are different from the
training examples, resulting in significant estimation
errors. Our approach aims to address this problem by
combining knowledge of database query processing with
statistical models. We model resource-usage at the
level of individual operators, with different models
and features for each operator type, and explicitly
model the asymptotic behavior of each operator. This
results in significantly better estimation accuracy and
the ability to estimate resource usage of arbitrary
plans, even when they are very different from the
training instances. We validate our approach using
various large scale real-life and benchmark workloads
on Microsoft SQL Server.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2012:WTW,
author = "Mahashweta Das and Saravanan Thirumuruganathan and
Sihem Amer-Yahia and Gautam Das and Cong Yu",
title = "Who tags what?: an analysis framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1567--1578",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The rise of Web 2.0 is signaled by sites such as
Flickr, del.icio.us, and YouTube, and social tagging is
essential to their success. A typical tagging action
involves three components, user, item (e.g., photos in
Flickr), and tags (i.e., words or phrases). Analyzing
how tags are assigned by certain users to certain items
has important implications in helping users search for
desired information. In this paper, we explore common
analysis tasks and propose a dual mining framework for
social tagging behavior mining. This framework is
centered around two opposing measures, similarity and
diversity, being applied to one or more tagging
components, and therefore enables a wide range of
analysis scenarios such as characterizing similar users
tagging diverse items with similar tags, or diverse
users tagging similar items with diverse tags, etc. By
adopting different concrete measures for similarity and
diversity in the framework, we show that a wide range
of concrete analysis problems can be defined and they
are NP-Complete in general. We design efficient
algorithms for solving many of those problems and
demonstrate, through comprehensive experiments over
real data, that our algorithms significantly
out-perform the exact brute-force approach without
compromising analysis result quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2012:GFE,
author = "Haohan Zhu and George Kollios and Vassilis Athitsos",
title = "A generic framework for efficient and effective
subsequence retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1579--1590",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper proposes a general framework for matching
similar subsequences in both time series and string
databases. The matching results are pairs of query
subsequences and database subsequences. The framework
finds all possible pairs of similar subsequences if the
distance measure satisfies the ``consistency''
property, which is a property introduced in this paper.
We show that most popular distance functions, such as
the Euclidean distance, DTW, ERP, the Fr{\'e}chet
distance for time series, and the Hamming distance and
Levenshtein distance for strings, are all
``consistent''. We also propose a generic index
structure for metric spaces named ``reference net''.
The reference net occupies $ O(n) $ space, where $n$ is
the size of the dataset and is optimized to work well
with our framework. The experiments demonstrate the
ability of our method to improve retrieval performance
when combined with diverse distance measures. The
experiments also illustrate that the reference net
scales well in terms of space overhead and query
time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dittrich:2012:OAE,
author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz and
Stefan Richter and Stefan Schuh and Alekh Jindal and
J{\"o}rg Schad",
title = "Only aggressive elephants are fast elephants",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1591--1602",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Yellow elephants are slow. A major reason is that they
consume their inputs entirely before responding to an
elephant rider's orders. Some clever riders have
trained their yellow elephants to only consume parts of
the inputs before responding. However, the teaching
time to make an elephant do that is high. So high that
the teaching lessons often do not pay off. We take a
different approach. We make elephants aggressive; only
this will make them very fast. We propose HAIL (Hadoop
Aggressive Indexing Library), an enhancement of HDFS
and Hadoop MapReduce that dramatically improves
runtimes of several classes of MapReduce jobs. HAIL
changes the upload pipeline of HDFS in order to create
different clustered indexes on each data block replica.
An interesting feature of HAIL is that we typically
create a win-win situation: we improve both data upload
to HDFS and the runtime of the actual Hadoop MapReduce
job. In terms of data upload, HAIL improves over HDFS
by up to 60\% with the default replication factor of
three. In terms of query execution, we demonstrate that
HAIL runs up to 68x faster than Hadoop. In our
experiments, we use six clusters including physical and
EC2 clusters of up to 100 nodes. A series of
scalability experiments also demonstrates the
superiority of HAIL.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2012:MLP,
author = "Rui Li and Shengjie Wang and Kevin Chen-Chuan Chang",
title = "Multiple location profiling for users and
relationships from social network and content",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1603--1614",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Users' locations are important for many applications
such as personalized search and localized content
delivery. In this paper, we study the problem of
profiling Twitter users' locations with their following
network and tweets. We propose a multiple location
profiling model (MLP), which has three key features:
(1) it formally models how likely a user follows
another user given their locations and how likely a
user tweets a venue given his location, (2) it
fundamentally captures that a user has multiple
locations and his following relationships and tweeted
venues can be related to any of his locations, and some
of them are even noisy, and (3) it novelly utilizes the
home locations of some users as partial supervision. As
a result, MLP not only discovers users' locations
accurately and completely, but also ``explains'' each
following relationship by revealing users' true
locations in the relationship. Experiments on a
large-scale data set demonstrate those advantages.
Particularly, (1) for predicting users' home locations,
MLP successfully places 62\% users and out-performs two
state-of-the-art methods by 10\% in accuracy, (2) for
discovering users' multiple locations, MLP improves the
baseline methods by 14\% in recall, and (3) for
explaining following relationships, MLP achieves 57\%
accuracy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kang:2012:FBE,
author = "Woon-Hak Kang and Sang-Won Lee and Bongki Moon",
title = "Flash-based extended cache for higher throughput and
faster recovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1615--1626",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Considering the current price gap between disk and
flash memory drives, for applications dealing with
large scale data, it will be economically more sensible
to use flash memory drives to supplement disk drives
rather than to replace them. This paper presents FaCE,
which is a new low-overhead caching strategy that uses
flash memory as an extension to the DRAM buffer. FaCE
aims at improving the transaction throughput as well as
shortening the recovery time from a system failure. To
achieve the goals, we propose two novel algorithms for
flash cache management, namely, Multi-Version FIFO
replacement and Group Second Chance. One striking
result from FaCE is that using a small flash memory
drive as a caching device could deliver even higher
throughput than using a large flash memory drive to
store the entire database tables. This was possible due
to flash write optimization as well as disk access
reduction obtained by the FaCE caching methods. In
addition, FaCE takes advantage of the non-volatility of
flash memory to fully support database recovery by
extending the scope of a persistent database to include
the data pages stored in the flash cache. We have
implemented FaCE in the PostgreSQL open source database
server and demonstrated its effectiveness for TPC-C
benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bender:2012:DTH,
author = "Michael A. Bender and Martin Farach-Colton and Rob
Johnson and Russell Kraner and Bradley C. Kuszmaul and
Dzejla Medjedovic and Pablo Montes and Pradeep Shetty
and Richard P. Spillane and Erez Zadok",
title = "Don't thrash: how to cache your hash on flash",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1627--1637",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents new alternatives to the well-known
Bloom filter data structure. The Bloom filter, a
compact data structure supporting set insertion and
membership queries, has found wide application in
databases, storage systems, and networks. Because the
Bloom filter performs frequent random reads and writes,
it is used almost exclusively in RAM, limiting the size
of the sets it can represent. This paper first
describes the quotient filter, which supports the basic
operations of the Bloom filter, achieving roughly
comparable performance in terms of space and time, but
with better data locality. Operations on the quotient
filter require only a small number of contiguous
accesses. The quotient filter has other advantages over
the Bloom filter: it supports deletions, it can be
dynamically resized, and two quotient filters can be
efficiently merged. The paper then gives two data
structures, the buffered quotient filter and the
cascade filter, which exploit the quotient filter
advantages and thus serve as SSD-optimized alternatives
to the Bloom filter. The cascade filter has better
asymptotic I/O performance than the buffered quotient
filter, but the buffered quotient filter outperforms
the cascade filter on small to medium data sets. Both
data structures significantly outperform
recently-proposed SSD-optimized Bloom filter variants,
such as the elevator Bloom filter, buffered Bloom
filter, and forest-structured Bloom filter. In
experiments, the cascade filter and buffered quotient
filter performed insertions 8.6--11 times faster than
the fastest Bloom filter variant and performed lookups
0.94--2.56 times faster.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Isele:2012:LEL,
author = "Robert Isele and Christian Bizer",
title = "Learning expressive linkage rules using genetic
programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1638--1649",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A central problem in data integration and data
cleansing is to find entities in different data sources
that describe the same real-world object. Many existing
methods for identifying such entities rely on explicit
linkage rules which specify the conditions that
entities must fulfill in order to be considered to
describe the same real-world object. In this paper, we
present the GenLink algorithm for learning expressive
linkage rules from a set of existing reference links
using genetic programming. The algorithm is capable of
generating linkage rules which select discriminative
properties for comparison, apply chains of data
transformations to normalize property values, choose
appropriate distance measures and thresholds and
combine the results of multiple comparisons using
non-linear aggregation functions. Our experiments show
that the GenLink algorithm outperforms the
state-of-the-art genetic programming approach to
learning linkage rules recently presented by Carvalho
et al. and is capable of learning linkage rules which
achieve a similar accuracy as human written rules for
the same problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tong:2012:MFI,
author = "Yongxin Tong and Lei Chen and Yurong Cheng and Philip
S. Yu",
title = "Mining frequent itemsets over uncertain databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1650--1661",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, due to the wide applications of
uncertain data, mining frequent itemsets over uncertain
databases has attracted much attention. In uncertain
databases, the support of an itemset is a random
variable instead of a fixed occurrence counting of this
itemset. Thus, unlike the corresponding problem in
deterministic databases where the frequent itemset has
a unique definition, the frequent itemset under
uncertain environments has two different definitions so
far. The first definition, referred as the expected
support-based frequent itemset, employs the expectation
of the support of an itemset to measure whether this
itemset is frequent. The second definition, referred as
the probabilistic frequent itemset, uses the
probability of the support of an itemset to measure its
frequency. Thus, existing work on mining frequent
itemsets over uncertain databases is divided into two
different groups and no study is conducted to
comprehensively compare the two different definitions.
In addition, since no uniform experimental platform
exists, current solutions for the same definition even
generate inconsistent results. In this paper, we
firstly aim to clarify the relationship between the two
different definitions. Through extensive experiments,
we verify that the two definitions have a tight
connection and can be unified together when the size of
data is large enough. Secondly, we provide baseline
implementations of eight existing representative
algorithms and test their performances with uniform
measures fairly. Finally, according to the fair tests
over many different benchmark data sets, we clarify
several existing inconsistent conclusions and discuss
some new findings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dallachiesa:2012:UTS,
author = "Michele Dallachiesa and Besmira Nushi and Katsiaryna
Mirylenka and Themis Palpanas",
title = "Uncertain time-series similarity: return to the
basics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1662--1673",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the last years there has been a considerable
increase in the availability of continuous sensor
measurements in a wide range of application domains,
such as Location-Based Services (LBS), medical
monitoring systems, manufacturing plants and
engineering facilities to ensure efficiency, product
quality and safety, hydrologic and geologic observing
systems, pollution management, and others. Due to the
inherent imprecision of sensor observations, many
investigations have recently turned into querying,
mining and storing uncertain data. Uncertainty can also
be due to data aggregation, privacy-preserving
transforms, and error-prone mining algorithms. In this
study, we survey the techniques that have been proposed
specifically for modeling and processing uncertain time
series, an important model for temporal data. We
provide an analytical evaluation of the alternatives
that have been proposed in the literature, highlighting
the advantages and disadvantages of each approach, and
further compare these alternatives with two additional
techniques that were carefully studied before. We
conduct an extensive experimental evaluation with 17
real datasets, and discuss some surprising results,
which suggest that a fruitful research direction is to
take into account the temporal correlations in the time
series. Based on our evaluations, we also provide
guidelines useful for the practitioners in the field.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dasu:2012:SDC,
author = "Tamraparni Dasu and Ji Meng Loh",
title = "Statistical distortion: consequences of data
cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1674--1683",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce the notion of statistical distortion as
an essential metric for measuring the effectiveness of
data cleaning strategies. We use this metric to propose
a widely applicable yet scalable experimental framework
for evaluating data cleaning strategies along three
dimensions: glitch improvement, statistical distortion
and cost-related criteria. Existing metrics focus on
glitch improvement and cost, but not on the statistical
impact of data cleaning strategies. We illustrate our
framework on real world data, with a comprehensive
suite of experiments and analyses.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lang:2012:TEE,
author = "Willis Lang and Stavros Harizopoulos and Jignesh M.
Patel and Mehul A. Shah and Dimitris Tsirogiannis",
title = "Towards energy-efficient database cluster design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "11",
pages = "1684--1695",
month = jul,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:15 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Energy is a growing component of the operational cost
for many ``big data'' deployments, and hence has become
increasingly important for practitioners of large-scale
data analysis who require scale-out clusters or
parallel DBMS appliances. Although a number of recent
studies have investigated the energy efficiency of
DBMSs, none of these studies have looked at the
architectural design space of energy-efficient parallel
DBMS clusters. There are many challenges to increasing
the energy efficiency of a DBMS cluster, including
dealing with the inherent scaling inefficiency of
parallel data processing, and choosing the appropriate
energy-efficient hardware. In this paper, we
experimentally examine and analyze a number of key
parameters related to these challenges for designing
energy-efficient database clusters. We explore the
cluster design space using empirical results and
propose a model that considers the key bottlenecks to
energy efficiency in a parallel DBMS. This paper
represents a key first step in designing
energy-efficient database clusters, which is
increasingly important given the trend toward parallel
database appliances.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jensen:2012:DMS,
author = "Christian S. Jensen",
title = "Data management on the spatial web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1696--1696",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Due in part to the increasing mobile use of the web
and the proliferation of geo-positioning, the web is
fast acquiring a significant spatial aspect. Content
and users are being augmented with locations that are
used increasingly by location-based services. Studies
suggest that each week, several billion web queries are
issued that have local intent and target spatial web
objects. These are points of interest with a web
presence, and they thus have locations as well as
textual descriptions. This development has given
prominence to spatial web data management, an area ripe
with new and exciting opportunities and challenges. The
research community has embarked on inventing and
supporting new query functionality for the spatial web.
Different kinds of spatial web queries return objects
that are near a location argument and are relevant to a
text argument. To support such queries, it is important
to be able to rank objects according to their relevance
to a query. And it is important to be able to process
the queries with low latency. The talk offers an
overview of key aspects of the spatial web. Based on
recent results obtained by the speaker and his
colleagues, the talk explores new query functionality
enabled by the setting. Further, the talk offers
insight into the data management techniques capable of
supporting such functionality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dietrich:2012:DAO,
author = "Brenda Dietrich",
title = "Data analytics opportunities in a smarter planet",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1697--1697",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "New applications of computing are being enabled by
instrumentation of physical entities, aggregation of
data, and the analysis of the data. The resulting
integration of information and control permits
efficient and effective management of complex man-made
systems. Examples include transportation systems,
buildings, electrical grids, health care systems,
governments, and supply chains. Achieving this vision
requires extensive data integration and analysis, over
diverse, rapidly changing, and often uncertain data.
There are many challenges, requiring both new data
management techniques as well as new mathematics,
forcing new collaborations as the basis of the new
``Data Science''. Needs and opportunities will be
discussed in the context of specific pilots and
projects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sahin:2012:CEM,
author = "Kenan Sahin",
title = "Challenges in economic massive content storage and
management ({MCSAM}) in the era of self-organizing,
self-expanding and self-linking data clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1698--1698",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Rapid spread of social networks, global on-line
shopping, post 9/11 security oriented linking of data
bases and foremost the global adoption of smart
phones/devices, among other phenomena, are transforming
data clusters into dynamic and almost uncontrollable
entities that have their own local intelligence,
clients and objectives. The scale and rapidity of
change is such that large scale innovations in content
storage and management are urgently needed if the
diseconomies of scale and complexity are to be
mitigated. The field needs to reinvent itself.
Istanbul, a city that has reinvented itself many times
is an excellent venue to engage in such a discussion
and for me to offer suggestions and proposals that
derive from personal experiences that span academia,
start ups, R\&D firms and Bell Labs as well my early
years spent in Istanbul.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Manku:2012:AFC,
author = "Gurmeet Singh Manku and Rajeev Motwani",
title = "Approximate frequency counts over data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1699--1699",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Research in data stream algorithms has blossomed since
late 90s. The talk will trace the history of the
Approximate Frequency Counts paper, how it was
conceptualized and how it influenced data stream
research. The talk will also touch upon a recent
development: analysis of personal data streams for
improving our quality of lives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hellerstein:2012:MAL,
author = "Joseph M. Hellerstein and Christopher R{\'e} and
Florian Schoppmann and Daisy Zhe Wang and Eugene
Fratkin and Aleksander Gorajek and Kee Siong Ng and
Caleb Welton and Xixuan Feng and Kun Li and Arun
Kumar",
title = "The {MADlib} analytics library: or {MAD} skills, the
{SQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1700--1711",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MADlib is a free, open-source library of in-database
analytic methods. It provides an evolving suite of
SQL-based algorithms for machine learning, data mining
and statistics that run at scale within a database
engine, with no need for data import/export to other
tools. The goal is for MADlib to eventually serve a
role for scalable database systems that is similar to
the CRAN library for R: a community repository of
statistical methods, this time written with scale and
parallelism in mind. In this paper we introduce the
MADlib project, including the background that led to
its beginnings, and the motivation for its open-source
nature. We provide an overview of the library's
architecture and design patterns, and provide a
description of various statistical methods in that
context. We include performance and speedup results of
a core design pattern from one of those methods over
the Greenplum parallel DBMS on a modest-sized test
cluster. We then report on two initial efforts at
incorporating academic research into MADlib, which is
one of the project's goals. MADlib is freely available
at http://madlib.net, and the project is open for
contributions of both new methods, and ports to
additional database platforms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Floratou:2012:CEH,
author = "Avrilia Floratou and Nikhil Teletia and David J.
DeWitt and Jignesh M. Patel and Donghui Zhang",
title = "Can the elephants handle the {NoSQL} onslaught?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1712--1723",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this new era of ``big data'', traditional DBMSs are
under attack from two sides. At one end of the
spectrum, the use of document store NoSQL systems (e.g.
MongoDB) threatens to move modern Web 2.0 applications
away from traditional RDBMSs. At the other end of the
spectrum, big data DSS analytics that used to be the
domain of parallel RDBMSs is now under attack by
another class of NoSQL data analytics systems, such as
Hive on Hadoop. So, are the traditional RDBMSs, aka
``big elephants'', doomed as they are challenged from
both ends of this ``big data'' spectrum? In this paper,
we compare one representative NoSQL system from each
end of this spectrum with SQL Server, and analyze the
performance and scalability aspects of each of these
approaches (NoSQL vs. SQL) on two workloads (decision
support analysis and interactive data-serving) that
represent the two ends of the application spectrum. We
present insights from this evaluation and speculate on
potential trends for the future.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rabl:2012:SBD,
author = "Tilmann Rabl and Sergio G{\'o}mez-Villamor and
Mohammad Sadoghi and Victor Munt{\'e}s-Mulero and
Hans-Arno Jacobsen and Serge Mankovskii",
title = "Solving big data challenges for enterprise application
performance management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1724--1735",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As the complexity of enterprise systems increases, the
need for monitoring and analyzing such systems also
grows. A number of companies have built sophisticated
monitoring tools that go far beyond simple resource
utilization reports. For example, based on
instrumentation and specialized APIs, it is now
possible to monitor single method invocations and trace
individual transactions across geographically
distributed systems. This high-level of detail enables
more precise forms of analysis and prediction but comes
at the price of high data rates (i.e., big data). To
maximize the benefit of data monitoring, the data has
to be stored for an extended period of time for
ulterior analysis. This new wave of big data analytics
imposes new challenges especially for the application
performance monitoring systems. The monitoring data has
to be stored in a system that can sustain the high data
rates and at the same time enable an up-to-date view of
the underlying infrastructure. With the advent of
modern key--value stores, a variety of data storage
systems have emerged that are built with a focus on
scalability and high data rates as predominant in this
monitoring use case. In this work, we present our
experience and a comprehensive performance evaluation
of six modern (open-source) data stores in the context
of application performance monitoring as part of CA
Technologies initiative. We evaluated these systems
with data and workloads that can be found in
application performance monitoring, as well as, on-line
advertisement, power monitoring, and many other use
cases. We present our insights not only as performance
results but also as lessons learned and our experience
relating to the setup and configuration complexity of
these data stores in an industry setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shinnar:2012:MIP,
author = "Avraham Shinnar and David Cunningham and Vijay
Saraswat and Benjamin Herta",
title = "{M3R}: increased performance for in-memory {Hadoop}
jobs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1736--1747",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Main Memory Map Reduce (M3R) is a new implementation
of the Hadoop Map Reduce (HMR) API targeted at online
analytics on high mean-time-to-failure clusters. It
does not support resilience, and supports only those
workloads which can fit into cluster memory. In return,
it can run HMR jobs unchanged --- including jobs
produced by compilers for higher-level languages such
as Pig, Jaql, and SystemML and interactive front-ends
like IBM BigSheets --- while providing significantly
better performance than the Hadoop engine on several
workloads (e.g. 45x on some input sizes for sparse
matrix vector multiply). M3R also supports extensions
to the HMR API which can enable Map Reduce jobs to run
faster on the M3R engine, while not affecting their
performance under the Hadoop engine.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rosch:2012:SAH,
author = "Philipp R{\"o}sch and Lars Dannecker and Franz
F{\"a}rber and Gregor Hackenbroich",
title = "A storage advisor for hybrid-store databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1748--1758",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the SAP HANA database, SAP offers a
high-performance in-memory hybrid-store database.
Hybrid-store databases---that is, databases supporting
row- and column-oriented data management---are getting
more and more prominent. While the columnar management
offers high-performance capabilities for analyzing
large quantities of data, the row-oriented store can
handle transactional point queries as well as inserts
and updates more efficiently. To effectively take
advantage of both stores at the same time the novel
question whether to store the given data row- or
column-oriented arises. We tackle this problem with a
storage advisor tool that supports database
administrators at this decision. Our proposed storage
advisor recommends the optimal store based on data and
query characteristics; its core is a cost model to
estimate and compare query execution times for the
different stores. Besides a per-table decision, our
tool also considers to horizontally and vertically
partition the data and manage the partitions on
different stores. We evaluated the storage advisor for
the use in the SAP HANA database; we show the
recommendation quality as well as the benefit of having
the data in the optimal store with respect to increased
query performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Switakowski:2012:CSP,
author = "Micha{\l} {\'S}witakowski and Peter Boncz and Marcin
Zukowski",
title = "From cooperative scans to predictive buffer
management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1759--1770",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In analytical applications, database systems often
need to sustain workloads with multiple concurrent
scans hitting the same table. The Cooperative Scans
(CScans) framework, which introduces an Active Buffer
Manager (ABM) component into the database architecture,
has been the most effective and elaborate response to
this problem, and was initially developed in the X100
research prototype. We now report on the experiences of
integrating Cooperative Scans into its
industrial-strength successor, the Vectorwise database
product. During this implementation we invented a
simpler optimization of concurrent scan buffer
management, called Predictive Buffer Management (PBM).
PBM is based on the observation that in a workload with
long-running scans, the buffer manager has quite a bit
of information on the workload in the immediate future,
such that an approximation of the ideal OPT algorithm
becomes feasible. In the evaluation on both synthetic
benchmarks as well as a TPC-H throughput run we compare
the benefits of naive buffer management (LRU) versus
CScans, PBM and OPT; showing that PBM achieves benefits
close to Cooperative Scans, while incurring much lower
architectural impact.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Lee:2012:ULI,
author = "George Lee and Jimmy Lin and Chuang Liu and Andrew
Lorek and Dmitriy Ryaboy",
title = "The unified logging infrastructure for data analytics
at {Twitter}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1771--1780",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, there has been a substantial amount
of work on large-scale data analytics using
Hadoop-based platforms running on large clusters of
commodity machines. A less-explored topic is how those
data, dominated by application logs, are collected and
structured to begin with. In this paper, we present
Twitter's production logging infrastructure and its
evolution from application-specific logging to a
unified ``client events'' log format, where messages
are captured in common, well-formatted, flexible Thrift
messages. Since most analytics tasks consider the user
session as the basic unit of analysis, we
pre-materialize ``session sequences'', which are
compact summaries that can answer a large class of
common queries quickly. The development of this
infrastructure has streamlined log collection and data
analysis, thereby improving our ability to rapidly
experiment and iterate on various aspects of the
service.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed. Title wording ``point
%% in-time query'' kept verbatim -- verify against the published paper.
@Article{Talius:2012:TLB,
author = "Tomas Talius and Robin Dhamankar and Andrei Dumitrache
and Hanuma Kodavalla",
title = "Transaction log based application error recovery and
point in-time query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1781--1789",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database backups have traditionally been used as the
primary mechanism to recover from hardware and user
errors. High availability solutions maintain redundant
copies of data that can be used to recover from most
failures except user or application errors. Database
backups are neither space nor time efficient for
recovering from user errors which typically occur in
the recent past and affect a small portion of the
database. Moreover periodic full backups impact user
workload and increase storage costs. In this paper we
present a scheme that can be used for both user and
application error recovery starting from the current
state and rewinding the database back in time using the
transaction log. While we provide a consistent view of
the entire database as of a point in time in the past,
the actual prior versions are produced only for data
that is accessed. We make the as of data accessible to
arbitrary point in time queries by integrating with the
database snapshot feature in Microsoft SQL Server.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Lamb:2012:VAD,
author = "Andrew Lamb and Matt Fuller and Ramakrishna
Varadarajan and Nga Tran and Ben Vandiver and Lyric
Doshi and Chuck Bear",
title = "The {Vertica Analytic Database}: {C-Store} 7 years
later",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1790--1801",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper describes the system architecture of the
Vertica Analytic Database (Vertica), a
commercialization of the design of the C-Store research
prototype. Vertica demonstrates a modern commercial
RDBMS system that presents a classical relational
interface while at the same time achieving the high
performance expected from modern ``web scale'' analytic
systems by making appropriate architectural choices.
Vertica is also an instructive lesson in how academic
systems research can be directly commercialized into a
successful product.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Chen:2012:IAP,
author = "Yanpei Chen and Sara Alspaugh and Randy Katz",
title = "Interactive analytical processing in big data systems:
a cross-industry study of {MapReduce} workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1802--1813",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Within the past few years, organizations in diverse
industries have adopted MapReduce-based systems for
large-scale data processing. Along with these new
users, important new workloads have emerged which
feature many small, short, and increasingly interactive
jobs in addition to the large, long-running batch jobs
for which MapReduce was originally designed. As
interactive, large-scale query processing is a strength
of the RDBMS community, it is important that lessons
from that field be carried over and applied where
possible in this new domain. However, these new
workloads have not yet been described in the
literature. We fill this gap with an empirical analysis
of MapReduce traces from six separate business-critical
deployments inside Facebook and at Cloudera customers
in e-commerce, telecommunications, media, and retail.
Our key contribution is a characterization of new
MapReduce workloads which are driven in part by
interactive analysis, and which make heavy use of
query-like programming frameworks on top of MapReduce.
These workloads display diverse behaviors which
invalidate prior assumptions about MapReduce such as
uniform data access, regular diurnal patterns, and
prevalence of large jobs. A secondary contribution is a
first step towards creating a TPC-like data processing
benchmark for MapReduce.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field -- TODO: add once confirmed. Author name
%% ``Sts Prasad'' looks like a mis-cased initialism (possibly ``{STS}
%% Prasad'') -- verify against the published paper before changing.
@Article{Lam:2012:MMS,
author = "Wang Lam and Lu Liu and Sts Prasad and Anand Rajaraman
and Zoheb Vacheri and AnHai Doan",
title = "{Muppet}: {MapReduce}-style processing of fast data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1814--1825",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce has emerged as a popular method to process
big data. In the past few years, however, not just big
data, but fast data has also exploded in volume and
availability. Examples of such data include sensor data
streams, the Twitter Firehose, and Facebook updates.
Numerous applications must process fast data. Can we
provide a MapReduce-style framework so that developers
can quickly write such applications and execute them
over a cluster of machines, to achieve low latency and
high scalability? In this paper we report on our
investigation of this question, as carried out at
Kosmix and WalmartLabs. We describe MapUpdate, a
framework like MapReduce, but specifically developed
for fast data. We describe Muppet, our implementation
of MapUpdate. Throughout the description we highlight
the key challenges, argue why MapReduce is not well
suited to address them, and briefly describe our
current solutions. Finally, we describe our experience
and lessons learned with Muppet, which has been used
extensively at Kosmix and WalmartLabs to power a broad
range of applications in social media and e-commerce.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): accented ``Bu{\u{g}}ra Gedik'' restored (the file's
%% convention elsewhere is LaTeX accent escapes, cf. Micha{\l}
%% {\'S}witakowski). No DOI field -- TODO: add once confirmed.
@Article{Jacques-Silva:2012:BUD,
author = "Gabriela Jacques-Silva and Bu{\u{g}}ra Gedik and Rohit Wagle
and Kun-Lung Wu and Vibhore Kumar",
title = "Building user-defined runtime adaptation routines for
stream processing applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1826--1837",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream processing applications are deployed as
continuous queries that run from the time of their
submission until their cancellation. This deployment
mode limits developers who need their applications to
perform runtime adaptation, such as algorithmic
adjustments, incremental job deployment, and
application-specific failure recovery. Currently,
developers do runtime adaptation by using external
scripts and/or by inserting operators into the stream
processing graph that are unrelated to the data
processing logic. In this paper, we describe a
component called orchestrator that allows users to
write routines for automatically adapting the
application to runtime conditions. Developers build an
orchestrator by registering and handling events as well
as specifying actuations. Events can be generated due
to changes in the system state (e.g., application
component failures), built-in system metrics (e.g.,
throughput of a connection), or custom application
metrics (e.g., quality score). Once the orchestrator
receives an event, users can take adaptation actions by
using the orchestrator actuation APIs. We demonstrate
the use of the orchestrator in IBM's System S in the
context of three different applications, illustrating
application adaptation to changes on the incoming data
distribution, to application failures, and on-demand
dynamic composition.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Jiang:2012:MSP,
author = "Junchen Jiang and Hongji Bao and Edward Y. Chang and
Yuqian Li",
title = "{MOIST}: a scalable and parallel moving object indexer
with school tracking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1838--1849",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Location-Based Service (LBS) is rapidly becoming the
next ubiquitous technology for a wide range of mobile
applications. To support applications that demand
nearest-neighbor and history queries, an LBS spatial
indexer must be able to efficiently update, query,
archive and mine location records, which can be in
contention with each other. In this work, we propose
MOIST, whose baseline is a recursive spatial
partitioning indexer built upon BigTable. To reduce
update and query contention, MOIST groups nearby
objects of similar trajectory into the same school, and
keeps track of only the history of school leaders. This
dynamic clustering scheme can eliminate redundant
updates and hence reduce update latency. To improve
history query processing, MOIST keeps some history data
in memory, while it flushes aged data onto parallel
disks in a locality-preserving way. Through
experimental studies, we show that MOIST can support
highly efficient nearest-neighbor and history queries
and can scale well with an increasing number of users
and update frequency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Ports:2012:SSI,
author = "Dan R. K. Ports and Kevin Grittner",
title = "Serializable snapshot isolation in {PostgreSQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1850--1861",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper describes our experience implementing
PostgreSQL's new serializable isolation level. It is
based on the recently-developed Serializable Snapshot
Isolation (SSI) technique. This is the first
implementation of SSI in a production database release
as well as the first in a database that did not
previously have a lock-based serializable isolation
level. We reflect on our experience and describe how we
overcame some of the resulting challenges, including
the implementation of a new lock manager, a technique
for ensuring memory usage is bounded, and integration
with other PostgreSQL features. We also introduce an
extension to SSI that improves performance for
read-only transactions. We evaluate PostgreSQL's
serializable isolation level using several benchmarks
and show that it achieves performance only slightly
below that of snapshot isolation, and significantly
outperforms the traditional two-phase locking approach
on read-intensive workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field -- TODO: add once confirmed. The abstract
%% spells both ``Info-Sphere'' and ``InfoSphere''; kept verbatim --
%% check the published abstract before normalizing.
@Article{Murthy:2012:EEU,
author = "Karin Murthy and Prasad M. Deshpande and Atreyee Dey
and Ramanujam Halasipuram and Mukesh Mohania and P.
Deepak and Jennifer Reed and Scott Schumacher",
title = "Exploiting evidence from unstructured data to enhance
master data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1862--1873",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Master data management (MDM) integrates data from
multiple structured data sources and builds a
consolidated 360-degree view of business entities such
as customers and products. Today's MDM systems are not
prepared to integrate information from unstructured
data sources, such as news reports, emails, call-center
transcripts, and chat logs. However, those unstructured
data sources may contain valuable information about the
same entities known to MDM from the structured data
sources. Integrating information from unstructured data
into MDM is challenging as textual references to
existing MDM entities are often incomplete and
imprecise and the additional entity information
extracted from text should not impact the
trustworthiness of MDM data. In this paper, we present
an architecture for making MDM text-aware and showcase
its implementation as IBM Info-Sphere MDM Extension for
Unstructured Text Correlation, an add-on to IBM
InfoSphere Master Data Management Standard Edition. We
highlight how MDM benefits from additional evidence
found in documents when doing entity resolution and
relationship discovery. We experimentally demonstrate
the feasibility of integrating information from
unstructured data sources into MDM.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Wu:2012:AOW,
author = "Lili Wu and Roshan Sumbaly and Chris Riccomini and
Gordon Koo and Hyung Jin Kim and Jay Kreps and Sam
Shah",
title = "{Avatara}: {OLAP} for web-scale analytics products",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1874--1877",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multidimensional data generated by members on websites
has seen massive growth in recent years. OLAP is a
well-suited solution for mining and analyzing this
data. Providing insights derived from this analysis has
become crucial for these websites to give members
greater value. For example, LinkedIn, the largest
professional social network, provides its professional
members rich analytics features like ``Who's Viewed My
Profile?'' and ``Who's Viewed This Job?'' The data
behind these features form cubes that must be
efficiently served at scale, and can be neatly sharded
to do so. To serve our growing 160 million member base,
we built a scalable and fast OLAP serving system called
Avatara to solve this many, small cubes problem. At
LinkedIn, Avatara has been powering several analytics
features on the site for the past two years.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Kolb:2012:DED,
author = "Lars Kolb and Andreas Thor and Erhard Rahm",
title = "{Dedoop}: efficient deduplication with {Hadoop}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1878--1881",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate a powerful and easy-to-use tool called
Dedoop (Deduplication with Hadoop) for MapReduce-based
entity resolution (ER) of large datasets. Dedoop
supports a browser-based specification of complex ER
workflows including blocking and matching steps as well
as the optional use of machine learning for the
automatic generation of match classifiers. Specified
workflows are automatically translated into MapReduce
jobs for parallel execution on different Hadoop
clusters. To achieve high performance Dedoop supports
several advanced load balancing strategies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field -- TODO: add once confirmed. Abstract
%% mixes ``Map-Reduce'' and ``MapReduce''; kept verbatim.
@Article{Liu:2012:MBD,
author = "Xiufeng Liu and Christian Thomsen and Torben Bach
Pedersen",
title = "{MapReduce}-based dimensional {ETL} made easy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1882--1885",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper demonstrates ETLMR, a novel dimensional
Extract--Transform--Load (ETL) programming framework
that uses Map-Reduce to achieve scalability. ETLMR has
built-in native support of data warehouse (DW) specific
constructs such as star schemas, snowflake schemas, and
slowly changing dimensions (SCDs). This makes it
possible to build MapReduce-based dimensional ETL flows
very easily. The ETL process can be configured with
only few lines of code. We will demonstrate the
concrete steps in using ETLMR to load data into a
(partly snowflaked) DW schema. This includes
configuration of data sources and targets, dimension
processing schemes, fact processing, and deployment. In
addition, we also present the scalability on large data
sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Xu:2012:CIE,
author = "Huiqi Xu and Zhen Li and Shumin Guo and Keke Chen",
title = "{CloudVista}: interactive and economical visual
cluster analysis for big data in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1886--1889",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analysis of big data has become an important problem
for many business and scientific applications, among
which clustering and visualizing clusters in big data
raise some unique challenges. This demonstration
presents the CloudVista prototype system to address the
problems with big data caused by using existing data
reduction approaches. It promotes a whole-big-data
visualization approach that preserves the details of
clustering structure. The prototype system has several
merits. (1) Its visualization model is naturally
parallel, which guarantees the scalability. (2) The
visual frame structure minimizes the data transferred
between the cloud and the client. (3) The RandGen
algorithm is used to achieve a good balance between
interactivity and batch processing. (4) This approach
is also designed to minimize the financial cost of
interactive exploration in the cloud. The demonstration
will highlight the problems with existing approaches
and show the advantages of the CloudVista approach. The
viewers will have the chance to play with the
CloudVista prototype system and compare the
visualization results generated with different
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Alexandrov:2012:MSE,
author = "Alexander Alexandrov and Kostas Tzoumas and Volker
Markl",
title = "{Myriad}: scalable and expressive data generation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1890--1893",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The current research focus on Big Data systems calls
for a rethinking of data generation methods. The
traditional sequential data generation approach is not
well suited to large-scale systems as generating a
terabyte of data may require days or even weeks
depending on the number of constraints imposed on the
generated model. We demonstrate Myriad, a new data
generation toolkit that enables the specification of
semantically rich data generator programs that can
scale out linearly in a shared-nothing environment.
Data generation programs built on top of Myriad
implement an efficient parallel execution strategy
leveraged by the extensive use of pseudo-random number
generators with random access support.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Wu:2012:DDC,
author = "Eugene Wu and Samuel Madden and Michael Stonebraker",
title = "A demonstration of {DBWipes}: clean as you query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1894--1897",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data analytics becomes mainstream, and the
complexity of the underlying data and computation
grows, it will be increasingly important to provide
tools that help analysts understand the underlying
reasons when they encounter errors in the result. While
data provenance has been a large step in providing
tools to help debug complex workflows, its current form
has limited utility when debugging aggregation
operators that compute a single output from a large
collection of inputs. Traditional provenance will
return the entire input collection, which has very low
precision. In contrast, users are seeking precise
descriptions of the inputs that caused the errors. We
propose a Ranked Provenance System, which identifies
subsets of inputs that influenced the output error,
describes each subset with human readable predicates
and orders them by contribution to the error. In this
demonstration, we will present DBWipes, a novel data
cleaning system that allows users to execute aggregate
queries, and interactively detect, understand, and
clean errors in the query results. Conference attendees
will explore anomalies in campaign donations from the
current US presidential election and in readings from a
54-node sensor deployment.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): repaired garbled brace/quote nesting in the title --
%% ``Big {Data'}' split the closing TeX quote across the brace group;
%% now ``{Big Data}'' (balanced quotes, capitals brace-protected).
%% No DOI field -- TODO: add once confirmed.
@Article{Alsubaiee:2012:AOS,
author = "Sattam Alsubaiee and Yasser Altowim and Hotham
Altwaijry and Alexander Behm and Vinayak Borkar and
Yingyi Bu and Michael Carey and Raman Grover and
Zachary Heilbron and Young-Seok Kim and Chen Li and
Nicola Onose and Pouria Pirzadeh and Rares Vernica and
Jian Wen",
title = "{ASTERIX}: an open source system for ``{Big Data}''
management and analysis (demo)",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1898--1901",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "At UC Irvine, we are building a next generation
parallel database system, called ASTERIX, as our
approach to addressing today's ``Big Data'' management
challenges. ASTERIX aims to combine time-tested
principles from parallel database systems with those of
the Web-scale computing community, such as fault
tolerance for long running jobs. In this demo, we
present a whirlwind tour of ASTERIX, highlighting a few
of its key features. We will demonstrate examples of
our data definition language to model semi-structured
data, and examples of interesting queries using our
declarative query language. In particular, we will show
the capabilities of ASTERIX for answering geo-spatial
queries and fuzzy queries, as well as ASTERIX' data
feed construct for continuously ingesting data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Agarwal:2012:BDI,
author = "Sameer Agarwal and Anand P. Iyer and Aurojit Panda and
Samuel Madden and Barzan Mozafari and Ion Stoica",
title = "Blink and it's done: interactive queries on very large
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1902--1905",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we present BlinkDB, a massively
parallel, sampling-based approximate query processing
framework for running interactive queries on large
volumes of data. The key observation in BlinkDB is that
one can make reasonable decisions in the absence of
perfect answers. BlinkDB extends the Hive/HDFS stack
and can handle the same set of SPJA (selection,
projection, join and aggregate) queries as supported by
these systems. BlinkDB provides real-time answers along
with statistical error guarantees, and can scale to
petabytes of data and thousands of machines in a
fault-tolerant manner. Our experiments using the TPC-H
benchmark and on an anonymized real-world video content
distribution workload from Conviva Inc. show that
BlinkDB can execute a wide range of queries up to 150x
faster than Hive on MapReduce and 10--150x faster than
Shark (Hive on Spark) over tens of terabytes of data
stored across 100 machines, all with an error of
2--10\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Roy:2012:MGD,
author = "Abhishek Roy and Yanlei Diao and Evan Mauceli and
Yiping Shen and Bai-Lin Wu",
title = "Massive genomic data processing and deep analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1906--1909",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today large sequencing centers are producing genomic
data at the rate of 10 terabytes a day and require
complicated processing to transform massive amounts of
noisy raw data into biological information. To address
these needs, we develop a system for end-to-end
processing of genomic data, including alignment of
short read sequences, variation discovery, and deep
analysis. We also employ a range of quality control
mechanisms to improve data quality and parallel
processing techniques for performance. In the demo, we
will use real genomic data to show details of data
transformation through the workflow, the usefulness of
end results (ready for use as testable hypotheses), the
effects of our quality control mechanisms and improved
algorithms, and finally performance improvement.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%% NOTE(review): no DOI field, unlike this journal's earlier entries in
%% this file -- TODO: add the DOI once confirmed.
@Article{Liarou:2012:MDO,
author = "Erietta Liarou and Stratos Idreos and Stefan Manegold
and Martin Kersten",
title = "{MonetDB\slash DataCell}: online analytics in a
streaming column-store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1910--1913",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In DataCell, we design streaming functionalities in a
modern relational database kernel which targets big
data analytics. This includes exploitation of both its
storage/execution engine and its optimizer
infrastructure. We investigate the opportunities and
challenges that arise with such a direction and we show
that it carries significant advantages for modern
applications in need for online analytics such as web
logs, network monitoring and scientific data
management. The major challenge then becomes the
efficient support for specialized stream features,
e.g., multi-query processing and incremental
window-based processing as well as exploiting standard
DBMS functionalities in a streaming environment such as
indexing. This demo presents DataCell, an extension of
the MonetDB open-source column-store for online
analytics. The demo gives users the opportunity to
experience the features of DataCell such as processing
both stream and persistent data and performing window
based processing. The demo provides a visual interface
to monitor the critical system components, e.g., how
query plans transform from typical DBMS query plans to
online query plans, how data flows through the query
plans as the streams evolve, how DataCell maintains
intermediate results in columnar form to avoid repeated
evaluation of the same stream portions, etc. The demo
also provides the ability to interactively set the test
scenarios and various DataCell knobs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2012:SSE,
author = "Xin Cao and Gao Cong and Christian S. Jensen and Jun
Jie Ng and Beng Chin Ooi and Nhan-Tue Phan and Dingming
Wu",
title = "{SWORS}: a system for the efficient retrieval of
relevant spatial web objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1914--1917",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Spatial web objects that possess both a geographical
location and a textual description are gaining in
prevalence. This gives prominence to spatial keyword
queries that exploit both location and textual
arguments. Such queries are used in many web services
such as yellow pages and maps services. We present
SWORS, the Spatial Web Object Retrieval System, that is
capable of efficiently retrieving spatial web objects
that satisfy spatial keyword queries. Specifically,
SWORS supports two types of queries: (a) the
location-aware top-$k$ text retrieval (L$k$T) query
that retrieves $k$ individual spatial web objects
taking into account query location proximity and text
relevancy; (b) the spatial keyword group (SKG) query
that retrieves a group of objects that cover the query
keywords and are nearest to the query location and have
the shortest inter-object distances. SWORS provides
browser-based interfaces for desktop and laptop
computers and provides a client application for mobile
devices. The interfaces and the client enable users to
formulate queries and view the query results on a map.
The server side stores the data and processes the
queries. We use three real-life data sets to
demonstrate the functionality and performance of
SWORS.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Morishima:2012:CCD,
author = "Atsuyuki Morishima and Norihide Shinagawa and Tomomi
Mitsuishi and Hideto Aoki and Shun Fukusumi",
title = "{CyLog\slash Crowd4U}: a declarative platform for
complex data-centric crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1918--1921",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demo presents a principled approach to the
problems of data-centric human/machine computations
with Crowd4U, a crowdsourcing platform equipped with a
suite of tools for rapid development of crowdsourcing
applications. Using the demo, we show that declarative
database abstraction can be used as a powerful tool to
design, implement, and analyze data-centric
crowdsourcing applications. The power of Crowd4U comes
from CyLog, a database abstraction that handles complex
data-centric human/machine computations. CyLog is a
Datalog-like language that incorporates a principled
feedback system for humans at the language level so
that the semantics of the computation not closed in
machines can be defined based on game theory. We
believe that the demo clearly shows that database
abstraction can be a promising basis for designing
complex data-centric applications requiring
human/machine computations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Silva:2012:EDS,
author = "Yasin N. Silva and Spencer Pearson",
title = "Exploiting database similarity joins for metric
spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1922--1925",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Similarity Joins are recognized among the most useful
data processing and analysis operations and are
extensively used in multiple application domains. They
retrieve all data pairs whose distances are smaller
than a predefined threshold $ \epsilon $. Multiple
Similarity Join algorithms and implementation
techniques have been proposed. They range from
out-of-database approaches for only in-memory and
external memory data to techniques that make use of
standard database operators to answer similarity joins.
Recent work has shown that this operation can be
efficiently implemented as a physical database
operator. However, the proposed operator only supports
1D numeric data. This paper presents DBSimJoin, a
physical Similarity Join database operator for datasets
that lie in any metric space. DBSimJoin is a
non-blocking operator that prioritizes the early
generation of results. We implemented the proposed
operator in PostgreSQL, an open source database system.
We show how this operator can be used in multiple
real-world data analysis scenarios with multiple data
types and distance functions. Particularly, we show the
use of DBSimJoin to identify similar images represented
as feature vectors, and similar publications in a
bibliographic database. We also show that DBSimJoin
scales very well when important parameters, e.g., $ \epsilon $,
data size, increase.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gawade:2012:SPI,
author = "Mrunal Gawade and Martin Kersten",
title = "{Stethoscope}: a platform for interactive visual
analysis of query execution plans",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1926--1929",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Searching for the performance bottleneck in an
execution trace is an error prone and time consuming
activity. Existing tools offer some comfort by
providing a visual representation of trace for
analysis. In this paper we present the Stethoscope, an
interactive visual tool to inspect and analyze columnar
database query performance, both online and offline.
Its unique interactive animated interface capitalizes on
the large data-flow graph representation of a query
execution plan, augmented with query execution trace
information. We demonstrate features of Stethoscope for
both online and offline analysis of long running
queries. It helps in understanding where time goes, how
optimizers perform, and how parallel processing on
multi-core systems is exploited.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kotsifakos:2012:HSS,
author = "Alexios Kotsifakos and Panagiotis Papapetrou and
Jaakko Hollm{\'e}n and Dimitrios Gunopulos and Vassilis
Athitsos and George Kollios",
title = "{Hum-a-song}: a subsequence matching with
gaps-range-tolerances query-by-humming system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1930--1933",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present ``Hum-a-song'', a system built for music
retrieval, and particularly for the Query-By-Humming
(QBH) application. According to QBH, the user is able
to hum a part of a song that she recalls and would like
to learn what this song is, or find other songs similar
to it in a large music repository. We present a simple
yet efficient approach that maps the problem to time
series subsequence matching. The query and the database
songs are represented as 2-dimensional time series
conveying information about the pitch and the duration
of the notes. Then, since the query is a short sequence
and we want to find its best match that may start and
end anywhere in the database, subsequence matching
methods are suitable for this task. In this demo, we
present a system that employs and exposes to the user a
variety of state-of-the-art dynamic programming
methods, including a newly proposed efficient method
named SMBGT that is robust to noise and considers all
intrinsic problems in QBH; it allows variable tolerance
levels when matching elements, where tolerances are
defined as functions of the compared sequences, gaps in
both the query and target sequences, and bounds the
matching length and (optionally) the minimum number of
matched elements. Our system is intended to become open
source, which is to the best of our knowledge the first
non-commercial effort trying to solve QBH with a
variety of methods, and that also approaches the
problem from the time series perspective.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kwon:2012:SAM,
author = "YongChul Kwon and Magdalena Balazinska and Bill Howe
and Jerome Rolia",
title = "{SkewTune} in action: mitigating skew in {MapReduce}
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1934--1937",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate SkewTune, a system that automatically
mitigates skew in user-defined MapReduce programs and
is a drop-in replacement for Hadoop. The demonstration
has two parts. First, we demonstrate how SkewTune
mitigates skew in real MapReduce applications at
runtime by running a real application in a public
cloud. Second, through an interactive graphical
interface, we demonstrate the details of the skew
mitigation process using both real and synthetic
workloads that represent various skew configurations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abouzied:2012:PQS,
author = "Azza Abouzied and Joseph M. Hellerstein and Avi
Silberschatz",
title = "Playful query specification with {DataPlay}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1938--1941",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "DataPlay is a query tool that encourages a
trial-and-error approach to query specification.
DataPlay uses a graphical query language to make a
particularly challenging query specification task ---
quantification --- easier. It constrains the relational
data model to enable the presentation of non-answers,
in addition to answers, to aid query interpretation.
Two novel features of DataPlay are suggesting semantic
variations to a query and correcting queries by
example. We introduce DataPlay as a sophisticated query
specification tool and demonstrate its unique
interaction models.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alagiannis:2012:NAA,
author = "Ioannis Alagiannis and Renata Borovica and Miguel
Branco and Stratos Idreos and Anastasia Ailamaki",
title = "{NoDB} in action: adaptive query processing on raw
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1942--1945",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data collections become larger and larger, users
are faced with increasing bottlenecks in their data
analysis. More data means more time to prepare the
data, to load the data into the database and to execute
the desired queries. Many applications already avoid
using traditional database systems, e.g., scientific
data analysis and social networks, due to their
complexity and the increased data-to-query time, i.e.
the time between getting the data and retrieving its
first useful results. For many applications data
collections keep growing fast, even on a daily basis,
and this data deluge will only increase in the future,
where it is expected to have much more data than what
we can move or store, let alone analyze. In this
demonstration, we will showcase a new philosophy for
designing database systems called NoDB. NoDB aims at
minimizing the data-to-query time, most prominently by
removing the need to load data before launching
queries. We will present our prototype implementation,
PostgresRaw, built on top of PostgreSQL, which allows
for efficient query execution over raw data files with
zero initialization overhead. We will visually
demonstrate how PostgresRaw incrementally and
adaptively touches, parses, caches and indexes raw data
files autonomously and exclusively as a side-effect of
user queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wenzel:2012:CPQ,
author = "Florian Wenzel and Markus Endres and Stefan Mandl and
Werner Kie{\ss}ling",
title = "Complex preference queries supporting spatial
applications for user groups",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1946--1949",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Our demo application demonstrates a personalized
location-based web application using Preference SQL
that allows single users as well as groups of users to
find accommodations in Istanbul that satisfy both hard
constraints and user preferences. The application
assists in defining spatial, numerical, and categorical
base preferences and composes complex preference
statements in an intuitive fashion. Unlike existing
location-based services, the application considers
spatial queries as soft instead of hard constraints to
determine the best matches which are finally presented
on a map. The underlying Preference SQL framework is
implemented on top of a database, therefore enabling a
seamless application integration with standard SQL
back-end systems as well as efficient and extensible
preference query processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bakibayev:2012:DFQ,
author = "Nurzhan Bakibayev and Dan Olteanu and Jakub
Z{\'a}vodn{\'y}",
title = "Demonstration of the {FDB} query engine for factorised
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1950--1953",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "FDB is an in-memory query engine for factorised
databases, which are relational databases that use
compact factorised representations at the physical
layer to reduce data redundancy and boost query
performance. We demonstrate FDB using real data sets
from IMDB, DBLP, and the NELL repository of facts
learned from Web pages. The users can inspect
factorisations as well as plans used by FDB to compute
factorised results of select-project-join queries on
factorised databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2012:PRD,
author = "Zichen Xu and Yi-Cheng Tu and Xiaorui Wang",
title = "{PET}: reducing database energy cost via query
optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1954--1957",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Energy conservation is an increasingly important issue in
designing modern database management system (DBMS).
This requires a deep thinking about the tradeoffs
between energy and performance. Despite the significant
amount of efforts at the hardware level to make the
major components consume less energy, we argue for a
revisit of the DBMS query processing mechanism to
identify and harvest the potential of energy saving.
However, the state-of-art architecture of DBMS does not
take energy usage into consideration in its design. A
major challenge in developing an energy-aware DBMS is
to design and implement a cost-based query optimizer
that evaluates query plans by both performance and
energy costs. By following such a strategy, our
previous work revealed the fact that energy-efficient
query plans do not necessarily have the shortest
processing time. This demo proposal introduces PET ---
an energy-aware query optimization framework that is
built as a part of the PostgreSQL kernel. PET, via its
power cost estimation module and plan evaluation model,
enables the database system to run under a
DBA-specified energy/performance tradeoff level. PET
contains a power cost estimator that can accurately
estimate the power cost of query plans at compile time,
and a query evaluation engine that the DBA could
configure key PET parameters towards the desired
tradeoff. The software to be demonstrated will also
include workload engine for producing large quantities
of queries and data sets. Our demonstration will show
how PET functions via a comprehensive set of views from
its graphical user interface named PET Viewer. Through
such interfaces, a user can achieve a good
understanding of the energy-related query optimization
and cost-based plan generation. Users are also allowed
to interact with PET to experience the different
energy/performance tradeoffs by changing PET and
workload parameters at query runtime.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Letelier:2012:SSA,
author = "Andr{\'e}s Letelier and Jorge P{\'e}rez and Reinhard
Pichler and Sebastian Skritek",
title = "{SPAM}: a {SPARQL} analysis and manipulation tool",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1958--1961",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SQL developers are used to having elaborate tools
which help them in writing queries. In contrast, the
creation of tools to assist users in the development of
SPARQL queries is still in its infancy. In this system
demo, we present the SPARQL Analysis and Manipulation
(SPAM) tool, which provides help for the development of
SPARQL queries. The main features of the SPAM tool
comprise an editor with both text and graphical
interface, as well as various functions for the static
and dynamic analysis of SPARQL queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koutris:2012:QDP,
author = "Paraschos Koutris and Prasang Upadhyaya and Magdalena
Balazinska and Bill Howe and Dan Suciu",
title = "{QueryMarket} demonstration: pricing for online data
markets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1962--1965",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Increasingly data is being bought and sold online. To
facilitate such transactions, online data market-places
have emerged to provide a service for sellers to price
views on their data, and buyers to buy such views.
These marketplaces neither support the sale of ad-hoc
queries (that are not one of the specified views), nor
do they support queries that join datasets. We present
QueryMarket, a prototype data marketplace that
automatically extrapolates prices to ad-hoc queries,
including those with joins, from the manually priced
views. We call this capability ``query-based pricing''
and describe how it is superior to existing pricing
methods, and how it provides more flexible pricing for
the sellers. We then show how QueryMarket implements
query-based pricing and how it generates explanations
for the prices it computes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Luo:2012:DSD,
author = "Siqiang Luo and Yifeng Luo and Shuigeng Zhou and Gao
Cong and Jihong Guan",
title = "{DISKs}: a system for distributed spatial group
keyword search on road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1966--1969",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query (e.g., shortest path) on road networks has been
extensively studied. Although most of the existing
query processing approaches are designed for
centralized environments, there is a growing need to
handle queries on road networks in distributed
environments due to the increasing query workload and
the challenge of querying large networks. In this
demonstration, we showcase a distributed system called
{DISKs} (DIstributed Spatial Keyword search) that is
capable of efficiently supporting spatial group keyword
search (S-GKS) on road networks. Given a group of
keywords $X$ and a distance $r$, an S-GKS returns
locations on a road network, such that for each
returned location $p$, there exists a set of nodes (on
the road network), which are located within a network
distance $r$ from $p$ and collectively contains $X$. We
will demonstrate the innovative modules, performance
and interactive user interfaces of DISKs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Endrullis:2012:WEM,
author = "Stefan Endrullis and Andreas Thor and Erhard Rahm",
title = "{WETSUIT}: an efficient mashup tool for searching and
fusing web entities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1970--1973",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate a new powerful mashup tool called
WETSUIT (Web EnTity Search and fUsIon Tool) to search
and integrate web data from diverse sources and
domain-specific entity search engines. WETSUIT supports
adaptive search strategies to query sets of relevant
entities with a minimum of communication overhead.
Mashups can be composed using a set of high-level
operators based on the Java-compatible language Scala.
The operator implementation supports a high degree of
parallel processing, in particular a streaming of
entities between all data transformation operations
facilitating a fast presentation of intermediate
results. WETSUIT has already been applied to solve
challenging integration tasks from different domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khalefa:2012:MBI,
author = "Mohamed E. Khalefa and Ulrike Fischer and Torben Bach
Pedersen and Wolfgang Lehner",
title = "Model-based integration of past \& future in
{TimeTravel}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1974--1977",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate TimeTravel, an efficient DBMS system
for seamless integrated querying of past and
(forecasted) future values of time series, allowing the
user to view past and future values as one joint time
series. This functionality is important for advanced
application domains like energy. The main idea is to
compactly represent time series as models. By using
models, the TimeTravel system answers queries
approximately on past and future data with error
guarantees (absolute error and confidence) one order of
magnitude faster than when accessing the time series
directly. In addition, it efficiently supports exact
historical queries by only accessing relevant portions
of the time series. This is unlike existing approaches,
which access the entire time series to exactly answer
the query. To realize this system, we propose a novel
hierarchical model index structure. As real-world time
series usually exhibits seasonal behavior, models in
this index incorporate seasonality. To construct a
hierarchical model index, the user specifies
seasonality period, error guarantees levels, and a
statistical forecast method. As time proceeds, the
system incrementally updates the index and utilizes it
to answer approximate and exact queries. TimeTravel is
implemented into PostgreSQL, thus achieving complete
user transparency at the query level. In the demo, we
show the easy building of a hierarchical model index
for a real-world time series and the effect of varying
the error guarantees on the speed up of approximate and
exact queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eberius:2012:DEB,
author = "Julian Eberius and Maik Thiele and Katrin Braunschweig
and Wolfgang Lehner",
title = "{DrillBeyond}: enabling business analysts to explore
the {Web of Open Data}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1978--1981",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Following the Open Data trend, governments and public
agencies have started making their data available on
the Web and established platforms such as data.gov or
data.un.org. These Open Data platforms provide a huge
amount of data for various topics such as demographics,
transport, finance or health in various data formats.
One typical usage scenario for this kind of data is
their integration into a database or data warehouse in
order to apply data analytics. However, in today's
business intelligence tools there is an evident lack of
support for so-called situational or ad-hoc data
integration. In this demonstration we will therefore
present DrillBeyond, a novel database and information
retrieval engine which allows users to query a local
database as well as the Web of Open Data in a seamless
and integrated way with standard SQL. The audience will
be able to pose queries to our DrillBeyond system which
will be answered partly from local data in the database
and partly from datasets that originate from the Web of
Data. We will show how such queries are divided into
known and unknown parts and how missing attributes are
mapped to open datasets. We will demonstrate the
integration of the open datasets back into the DBMS in
order to apply its analytical features.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nakashole:2012:DER,
author = "Ndapandula Nakashole and Gerhard Weikum and Fabian
Suchanek",
title = "Discovering and exploring relations on the web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1982--1985",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a demonstration of PATTY, a system for
learning semantic relationships from the Web. PATTY is
a collection of relations learned automatically from
text. It aims to be to patterns what WordNet is to
words. The semantic types of PATTY relations enable
advanced search over subject-predicate-object data.
With the ongoing trends of enriching Web data (both
text and tables) with entity-relationship-oriented
semantic annotations, we believe a demo of the PATTY
system will be of interest to the database community.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thirumuruganathan:2012:MME,
author = "Saravanan Thirumuruganathan and Mahashweta Das and
Shrikant Desai and Sihem Amer-Yahia and Gautam Das and
Cong Yu",
title = "{MapRat}: meaningful explanation, interactive
exploration and geo-visualization of collaborative
ratings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1986--1989",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Collaborative rating sites such as IMDB and Yelp have
become rich resources that users consult to form
judgments about and choose from among competing items.
Most of these sites either provide a plethora of
information for users to interpret all by themselves or
a simple overall aggregate information. Such aggregates
(e.g., average rating over all users who have rated an
item, aggregates along pre-defined dimensions, etc.)
cannot help a user quickly decide the desirability of
an item. In this paper, we build a system MapRat that
allows a user to explore multiple carefully chosen
aggregate analytic details over a set of user
demographics that meaningfully explain the ratings
associated with item(s) of interest. MapRat allows a
user to systematically explore, visualize and
understand user rating patterns of input item(s) so as
to make an informed decision quickly. In the demo,
participants are invited to explore collaborative movie
ratings for popular movies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2012:DSD,
author = "Hyunjung Park and Hector Garcia-Molina and Richard
Pang and Neoklis Polyzotis and Aditya Parameswaran and
Jennifer Widom",
title = "{Deco}: a system for declarative crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1990--1993",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deco is a system that enables declarative
crowdsourcing: answering SQL queries posed over data
gathered from the crowd as well as existing relational
data. Deco implements a novel push-pull hybrid
execution model in order to support a flexible data
model and a precise query semantics, while coping with
the combination of latency, monetary cost, and
uncertainty of crowdsourcing. We demonstrate Deco using
two crowdsourcing platforms: Amazon Mechanical Turk and
an in-house platform, to show how Deco provides a
convenient means of collecting and querying
crowdsourced data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Martens:2012:DAX,
author = "Wim Martens and Matthias Niewerth and Frank Neven and
Thomas Schwentick",
title = "Developing and analyzing {XSDs} through {BonXai}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1994--1997",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "BonXai is a versatile schema specification language
expressively equivalent to XML Schema. It is not
intended as a replacement for XML Schema but it can
serve as an additional, user-friendly front-end. It
offers a simple way and a lightweight syntax to specify
the context of elements based on regular expressions
rather than on types. In this demo we show the
front-end capabilities of BonXai and exemplify its
potential to offer a novel way to view existing XML
Schema Definitions. In particular, we present several
usage scenarios specifically targeted to showcase the
ease of specifying, modifying, and understanding XML
Schema Definitions through BonXai.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmore:2012:IEG,
author = "Aaron J. Elmore and Sudipto Das and Divyakant Agrawal
and Amr {El Abbadi}",
title = "{InfoPuzzle}: exploring group decision making in
mobile peer-to-peer databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "1998--2001",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As Internet-based services and mobile computing
devices, such as smartphones and tablets, become
ubiquitous, society's reliance on them to accomplish
critical and time-sensitive tasks, such as information
dissemination and collaborative decision making, also
increases. Dependence on these media magnifies the
damage caused by their disruption, whether malicious or
natural. For instance, a natural disaster disrupting
cellular and Internet infrastructures impedes
information spread, which in turn leads to chaos, both
among the victims as well as the aid providers.
Decentralized and ad-hoc mechanisms for information
dissemination and decision making are paramount to help
restore order. We demonstrate InfoPuzzle, a mobile
peer-to-peer database that utilizes direct device
communication to enable group decision making, or
consensus, without reliance on centralized
communication services. InfoPuzzle minimizes the
system's resource consumption, to prolong the lifetime
of the power constrained devices by minimizing
communication overhead, computational complexity, and
persistent storage size. Due to user mobility and the
limited range of point-to-point communication, knowing
the exact number of participants is impossible, and
therefore traditional consensus or quorum protocols
cannot be used. We rely on distinct counting
techniques, probabilistic thresholds, and bounded time
based approaches to reach agreement. In this demo, we
will explore various challenges and heuristics in
estimating group participation to aid users in
reconciling consensus without centralized services.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2012:MQG,
author = "Jianqiu Xu and Ralf Hartmut G{\"u}ting",
title = "Manage and query generic moving objects in {SECONDO}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2002--2005",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we introduce a system that is
able to manage moving objects in all real world
environments, e.g., road network, bus network and
indoor. The complete trip of a person is managed by the
system such as Walk, Car, Walk, and Indoor, where the
precise locations of both outdoor and indoor movements
are represented. Trajectories located in several
environments are integrated into the same framework.
The system supports the shortest path searching for
start and end locations being in different
environments, for example, from a room to a bus stop. A
comprehensive and scalable set of moving objects is
generated to simulate human movement in practice.
Optimization methods are developed to efficiently
answer novel queries regarding transportation modes and
mobile environments. Most of these queries are not
supported by existing methods because of the limitation
of data representation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2012:CFH,
author = "Pei Li and Christina Tziviskou and Haidong Wang and
Xin Luna Dong and Xiaoguang Liu and Andrea Maurino and
Divesh Srivastava",
title = "{Chronos}: facilitating history discovery by linking
temporal records",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2006--2009",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many data sets contain temporal records over a long
period of time; each record is associated with a time
stamp and describes some aspects of a real-world entity
at that particular time. From such data, users often
wish to search for entities in a particular period and
understand the history of one entity or all entities in
the data set. A major challenge for enabling such
search and exploration is to identify records that
describe the same real-world entity over a long period
of time; however, linking temporal records is hard
given that the values that describe an entity can
evolve over time (e.g., a person can move from one
affiliation to another). We demonstrate the Chronos
system, which offers users a useful tool for finding
real-world entities over time and understanding history
of entities in the bibliography domain. The core of
Chronos is a temporal record-linkage algorithm, which
is tolerant to value evolution over time. Our algorithm
can obtain an F-measure of over 0.9 in linking author
records and fix errors made by DBLP. We show how
Chronos allows users to explore the history of authors,
and how it helps users understand our linkage results
by comparing our results with those of existing
systems, highlighting differences in the results,
explaining our decisions to users, and answering
``what-if'' questions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koubarakis:2012:TDP,
author = "Manolis Koubarakis and Mihai Datcu and Charalambos
Kontoes and Ugo {Di Giammatteo} and Stefan Manegold and
Eva Klien",
title = "{TELEIOS}: a database-powered virtual earth
observatory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2010--2013",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "TELEIOS is a recent European project that addresses
the need for scalable access to petabytes of Earth
Observation data and the discovery and exploitation of
knowledge that is hidden in them. TELEIOS builds on
scientific database technologies (array databases,
SciQL, data vaults) and Semantic Web technologies
(stRDF and stSPARQL) implemented on top of a state of
the art column store database system (MonetDB). We
demonstrate a first prototype of the TELEIOS Virtual
Earth Observatory (VEO) architecture, using a forest
fire monitoring application as example.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dittrich:2012:EBD,
author = "Jens Dittrich and Jorge-Arnulfo Quian{\'e}-Ruiz",
title = "Efficient big data processing in {Hadoop MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2014--2015",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This tutorial is motivated by the clear need of many
organizations, companies, and researchers to deal with
big data volumes efficiently. Examples include web
analytics applications, scientific applications, and
social networks. A popular data processing engine for
big data is Hadoop MapReduce. Early versions of Hadoop
MapReduce suffered from severe performance problems.
Today, this is becoming history. There are many
techniques that can be used with Hadoop MapReduce jobs
to boost performance by orders of magnitude. In this
tutorial we teach such techniques. First, we will
briefly familiarize the audience with Hadoop MapReduce
and motivate its use for big data processing. Then, we
will focus on different data management techniques,
going from job optimization to physical data
organization like data layouts and indexes. Throughout
this tutorial, we will highlight the similarities and
differences between Hadoop MapReduce and Parallel DBMS.
Furthermore, we will point out unresolved research
problems and open issues.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shim:2012:MAB,
author = "Kyuseok Shim",
title = "{MapReduce} algorithms for big data analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2016--2017",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is a growing trend of applications that should
handle big data. However, analyzing big data is a very
challenging problem today. For such applications, the
MapReduce framework has recently attracted a lot of
attention. Google's MapReduce or its open-source
equivalent Hadoop is a powerful tool for building such
applications. In this tutorial, we will introduce the
MapReduce framework based on Hadoop, discuss how to
design efficient MapReduce algorithms and present the
state-of-the-art in MapReduce algorithms for data
mining, machine learning and similarity joins. The
intended audience of this tutorial is professionals who
plan to design and develop MapReduce algorithms and
researchers who should be aware of the state-of-the-art
in MapReduce algorithms available today for big data
analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Getoor:2012:ERT,
author = "Lise Getoor and Ashwin Machanavajjhala",
title = "Entity resolution: theory, practice \& open
challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2018--2019",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This tutorial brings together perspectives on ER from
a variety of fields, including databases, machine
learning, natural language processing and information
retrieval, to provide, in one setting, a survey of a
large body of work. We discuss both the practical
aspects and theoretical underpinnings of ER. We
describe existing solutions, current challenges, and
open research problems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schindler:2012:CND,
author = "Jiri Schindler",
title = "{I/O} characteristics of {NoSQL} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2020--2021",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The advent of the so-called NoSQL databases has
brought about a new model of using storage systems.
While traditional relational database systems took
advantage of features offered by centrally-managed,
enterprise-class storage arrays, the new generation of
database systems with weaker data consistency models is
content with using and managing locally attached
individual storage devices and providing data
reliability and availability through high-level
software features and protocols. This work aims to
review the architecture of several existing NoSQL DBs
with an emphasis on how they organize and access data
in the shared-nothing locally-attached storage model.
It shows how these systems operate under typical
workloads (new inserts and point and range queries),
what access characteristics they exhibit to storage
systems. Finally, it examines how several recently
developed key/value stores, schema-free document
storage systems, and extensible column stores organize
data on local filesystems on top of directly-attached
disks and what system features they must (re)implement
in order to provide the expected data reliability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2012:MKI,
author = "Yizhou Sun and Jiawei Han and Xifeng Yan and Philip S.
Yu",
title = "Mining knowledge from interconnected data: a
heterogeneous information network analysis approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2022--2023",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most objects and data in the real world are
interconnected, forming complex, heterogeneous but
often semi-structured information networks. However,
most people consider a database merely as a data
repository that supports data storage and retrieval
rather than one or a set of heterogeneous information
networks that contain rich, inter-related, multi-typed
data and information. Most network science researchers
only study homogeneous networks, without distinguishing
the different types of objects and links in the
networks. In this tutorial, we view database and other
interconnected data as heterogeneous information
networks, and study how to leverage the rich semantic
meaning of types of objects and links in the networks.
We systematically introduce the technologies that can
effectively and efficiently mine useful knowledge from
such information networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Prakash:2012:UMC,
author = "B. Aditya Prakash and Christos Faloutsos",
title = "Understanding and managing cascades on large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2024--2025",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "How do contagions spread in population networks? Which
group should we market to, for maximizing product
penetration? Will a given YouTube video go viral? Who
are the best people to vaccinate? What happens when two
products compete? The objective of this tutorial is to
provide an intuitive and concise overview of most
important theoretical results and algorithms to help us
understand and manipulate such propagation-style
processes on large networks. The tutorial contains
three parts: (a) Theoretical results on the behavior of
fundamental models; (b) Scalable Algorithms for
changing the behavior of these processes e.g., for
immunization, marketing etc.; and (c) Empirical Studies
of diffusion on blogs and on-line websites like
Twitter. The problems we focus on are central in
surprisingly diverse areas: from computer science and
engineering, epidemiology and public health, product
marketing to information dissemination. Our emphasis is
on intuition behind each topic, and guidelines for the
practitioner.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dogac:2012:IES,
author = "Asuman Dogac",
title = "Interoperability in {eHealth} systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2026--2027",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Interoperability in eHealth systems is important for
delivering quality healthcare and reducing healthcare
costs. Some of the important use cases include
coordinating the care of chronic patients by enabling
the co-operation of many different eHealth systems such
as Electronic Health Record Systems (EHRs), Personal
Health Record Systems (PHRs) and wireless medical
sensor devices; enabling secondary use of EHRs for
clinical research; being able to share life long EHRs
among different healthcare providers. Although
achieving eHealth interoperability is quite a challenge
both because there are competing standards and clinical
information itself is very complex, there have been a
number of successful industry initiatives such as
Integrating the Healthcare Enterprise (IHE) Profiles,
as well as large scale deployments such as the National
Health Information System of Turkey and the epSOS
initiative for sharing Electronic Health Records and
ePrescriptions in Europe. This article briefly
describes the subjects discussed in the VLDB 2012
tutorial to provide an overview of the issues in
eHealth interoperability describing the key
technologies and standards, identifying important use
cases and the associated research challenges and also
describing some of the large scale deployments. The aim
is to foster further interest in this area.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Agrawal:2012:SPP,
author = "Divyakant Agrawal and Amr {El Abbadi} and Shiyuan
Wang",
title = "Secure and privacy-preserving data services in the
cloud: a data centric view",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2028--2029",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cloud computing becomes a successful paradigm for data
computing and storage. Increasing concerns about data
security and privacy in the cloud, however, have
emerged. Ensuring security and privacy for data
management and query processing in the cloud is
critical for better and broader uses of the cloud. This
tutorial covers some common cloud security and privacy
threats and the relevant research, while focusing on
the works that protect data confidentiality and query
access privacy for sensitive data being stored and
queried in the cloud. We provide a comprehensive study
of state-of-the-art schemes and techniques for
protecting data confidentiality and access privacy,
which make different tradeoffs in the multidimensional
space of security, privacy, functionality and
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guha:2012:GSS,
author = "Sudipto Guha and Andrew McGregor",
title = "Graph synopses, sketches, and streams: a survey",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2030--2031",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive graphs arise in any application where there is
data about both basic entities and the relationships
between these entities, e.g., web-pages and hyperlinks;
neurons and synapses; papers and citations; IP
addresses and network flows; people and their
friendships. Graphs have also become the de facto
standard for representing many types of highly
structured data. However, the sheer size of many of
these graphs renders classical algorithms inapplicable
when it comes to analyzing such graphs. In addition,
these existing algorithms are typically ill-suited to
processing distributed or stream data. Various
platforms have been developed for processing large data
sets. At the same time, there is the need to develop
new algorithmic ideas and paradigms. In the case of
graph processing, a lot of recent work has focused on
understanding the important algorithmic issues. A
central aspect of this is the question of how to
construct and leverage small-space synopses in graph
processing. The goal of this tutorial is to survey
recent work on this question and highlight interesting
directions for future research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Labrinidis:2012:COB,
author = "Alexandros Labrinidis and H. V. Jagadish",
title = "Challenges and opportunities with big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2032--2033",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The promise of data-driven decision-making is now
being recognized broadly, and there is growing
enthusiasm for the notion of ``Big Data,'' including
the recent announcement from the White House about new
funding initiatives across different agencies, that
target research for Big Data. While the promise of Big
Data is real --- for example, it is estimated that
Google alone contributed 54 billion dollars to the US
economy in 2009 --- there is no clear consensus on what
is Big Data. In fact, there have been many
controversial statements about Big Data, such as ``Size
is the only thing that matters.'' In this panel we will
try to explore the controversies and debunk the myths
surrounding Big Data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{ElAbbadi:2012:PDS,
author = "Amr {El Abbadi} and Mohamed F. Mokbel",
title = "Panel discussion on social networks and mobility in
the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "5",
number = "12",
pages = "2034--2035",
month = aug,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 6 16:43:21 MST 2012",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social networks, mobility and the cloud represent
special and unique opportunities for synergy among
several existing and emerging communities that are now
often evolving in isolated silos. All three areas hold
much promise for the future of computing, and represent
significant challenges for large scale data management.
As these three areas evolve, their direct influence on
significant decisions on each other becomes evident and
critical. This panel will bring together a set of
renowned researchers who will explore and discuss the
synergy and tensions among critical and often
intertwined research and application issues that arise
in the context of social networks and mobility in a
cloud infrastructure setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bouros:2012:STS,
author = "Panagiotis Bouros and Shen Ge and Nikos Mamoulis",
title = "Spatio-textual similarity joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "1",
pages = "1--12",
month = nov,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 22 12:18:56 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a collection of objects that carry both spatial
and textual information, a spatio-textual similarity
join retrieves the pairs of objects that are spatially
close and textually similar. As an example, consider a
social network with spatially and textually tagged
persons (i.e., their locations and profiles). A useful
task (for friendship recommendation) would be to find
pairs of persons that are spatially close and their
profiles have a large overlap (i.e., they have common
interests). Another application is data de-duplication
(e.g., finding photographs which are spatially close to
each other and have high overlap in their descriptive tags).
Despite the importance of this operation, there is very
little previous work that studies its efficient
evaluation and in fact under a different definition;
only the best match for each object is identified. In
this paper, we combine ideas from state-of-the-art
spatial distance join and set similarity join methods
and propose efficient algorithms that take into account
both spatial and textual constraints. Besides, we
propose a batch processing technique which boosts the
performance of our approaches. An experimental
evaluation using real and synthetic datasets shows that
our optimized techniques are orders of magnitude faster
than base-line solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Drosou:2012:DDR,
author = "Marina Drosou and Evaggelia Pitoura",
title = "{DisC} diversity: result diversification based on
dissimilarity and coverage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "1",
pages = "13--24",
month = nov,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 22 12:18:56 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, result diversification has attracted a lot
of attention as a means to improve the quality of
results retrieved by user queries. In this paper, we
propose a new, intuitive definition of diversity called
DisC diversity. A DisC diverse subset of a query result
contains objects such that each object in the result is
represented by a similar object in the diverse subset
and the objects in the diverse subset are dissimilar to
each other. We show that locating a minimum DisC
diverse subset is an NP-hard problem and provide
heuristics for its approximation. We also propose
adapting DisC diverse subsets to a different degree of
diversification. We call this operation zooming. We
present efficient implementations of our algorithms
based on the M-tree, a spatial index structure, and
experimentally evaluate their performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2012:DPF,
author = "Chen Zeng and Jeffrey F. Naughton and Jin-Yi Cai",
title = "On differentially private frequent itemset mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "1",
pages = "25--36",
month = nov,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 22 12:18:56 MDT 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider differentially private frequent itemset
mining. We begin by exploring the theoretical
difficulty of simultaneously providing good utility and
good privacy in this task. While our analysis proves
that in general this is very difficult, it leaves a
glimmer of hope in that our proof of difficulty relies
on the existence of long transactions (that is,
transactions containing many items). Accordingly, we
investigate an approach that begins by truncating long
transactions, trading off errors introduced by the
truncation with those introduced by the noise added to
guarantee privacy. Experimental results over standard
benchmark databases show that truncating is indeed
effective. Our algorithm solves the ``classical''
frequent itemset mining problem, in which the goal is
to find all itemsets whose support exceeds a threshold.
Related work has proposed differentially private
algorithms for the top-$k$ itemset mining problem
(``find the $k$ most frequent itemsets''.) An
experimental comparison with those algorithms show that
our algorithm achieves better $F$-score unless $k$ is
small.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 37--48. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Dong:2012:LMS,
author = "Xin Luna Dong and Barna Saha and Divesh Srivastava",
title = "Less is more: selecting sources wisely for
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "37--48",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We are often thrilled by the abundance of information
surrounding us and wish to integrate data from as many
sources as possible. However, understanding, analyzing,
and using these data are often hard. Too much data can
introduce a huge integration cost, such as expenses for
purchasing data and resources for integration and
cleaning. Furthermore, including low-quality data can
even deteriorate the quality of integration results
instead of bringing the desired quality gain. Thus,
``the more the better'' does not always hold for data
integration and often ``less is more''. In this paper,
we study how to select a subset of sources before
integration such that we can balance the quality of
integrated data and integration cost. Inspired by the
Marginalism principle in economic theory, we wish to
integrate a new source only if its marginal gain, often
a function of improved integration quality, is higher
than the marginal cost, associated with data-purchase
expense and integration resources. As a first step
towards this goal, we focus on data fusion tasks, where
the goal is to resolve conflicts from different
sources. We propose a randomized solution for selecting
sources for fusion and show empirically its
effectiveness and scalability on both real-world data
and synthetic data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 49--60. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Zhou:2012:DTA,
author = "Wenchao Zhou and Suyog Mapara and Yiqing Ren and Yang
Li and Andreas Haeberlen and Zachary Ives and Boon Thau
Loo and Micah Sherr",
title = "Distributed time-aware provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "49--60",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to reason about changes in a distributed
system's state enables network administrators to better
diagnose protocol misconfigurations, detect intrusions,
and pinpoint performance bottlenecks. We propose a
novel provenance model called Distributed Time-aware
Provenance (DTaP) that aids forensics and debugging in
distributed systems by explicitly representing time,
distributed state, and state changes. Using a
distributed Datalog abstraction for modeling
distributed protocols, we prove that the DTaP model
provides a sound and complete representation that
correctly captures dependencies among events in a
distributed system. We additionally introduce DistTape,
an implementation of the DTaP model that uses novel
distributed storage structures, query processing, and
cost-based optimization techniques to efficiently query
time-aware provenance in a distributed setting. Using
two example systems (declarative network routing and
Hadoop MapReduce), we demonstrate that DistTape can
efficiently maintain and query time-aware provenance at
low communication and computation cost.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 61--72. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Calvanese:2012:QPU,
author = "Diego Calvanese and Giuseppe {De Giacomo} and Maurizio
Lenzerini and Moshe Y. Vardi",
title = "Query processing under {GLAV} mappings for relational
and graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "61--72",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Schema mappings establish a correspondence between
data stored in two databases, called source and target
respectively. Query processing under schema mappings
has been investigated extensively in the two cases
where each target atom is mapped to a query over the
source (called GAV, global-as-view), and where each
source atom is mapped to a query over the target
(called LAV, local-as-view). The general case, called
GLAV, in which queries over the source are mapped to
queries over the target, has attracted a lot of
attention recently, especially for data exchange.
However, query processing for GLAV mappings has been
considered only for the basic service of query
answering, and mainly in the context of conjunctive
queries (CQs) in relational databases. In this paper we
study query processing for GLAV mappings in a wider
sense, considering not only query answering, but also
query rewriting, perfectness (the property of a
rewriting to compute exactly the certain answers), and
query containment relative to a mapping. We deal both
with the relational case, and with graph databases,
where the basic querying mechanism is that of regular
path queries. Query answering in GLAV can be smoothly
reduced to a combination of the LAV and GAV cases, and
for CQs this reduction can be exploited also for the
remaining query processing tasks. In contrast, as we
show, GLAV query processing for graph databases is
non-trivial and requires new insights and techniques.
We obtain upper bounds for answering, rewriting, and
perfectness, and show decidability of relative
containment.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 73--84. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Mouratidis:2012:CIR,
author = "Kyriakos Mouratidis and HweeHwa Pang",
title = "Computing immutable regions for subspace top-$k$
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "73--84",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a high-dimensional dataset, a top-$k$ query can
be used to shortlist the $k$ tuples that best match the
user's preferences. Typically, these preferences regard
a subset of the available dimensions (i.e., attributes)
whose relative significance is expressed by
user-specified weights. Along with the query result, we
propose to compute for each involved dimension the
maximal deviation to the corresponding weight for which
the query result remains valid. The derived weight
ranges, called immutable regions, are useful for
performing sensitivity analysis, for fine-tuning the
query weights, etc. In this paper, we focus on top-$k$
queries with linear preference functions over the
queried dimensions. We codify the conditions under
which changes in a dimension's weight invalidate the
query result, and develop algorithms to compute the
immutable regions. In general, this entails the
examination of numerous non-result tuples. To reduce
processing time, we introduce a pruning technique and a
thresholding mechanism that allow the immutable regions
to be determined correctly after examining only a small
number of non-result tuples. We demonstrate empirically
that the two techniques combine well to form a robust
and highly resource-efficient algorithm. We verify the
generality of our findings using real high-dimensional
data from different domains (documents, images, etc)
and with different characteristics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 85--96. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned. Abstract text is quoted verbatim
%%% from the source (including its original grammar); do not "correct" it.
@Article{Zhao:2012:LSC,
author = "Feng Zhao and Anthony K. H. Tung",
title = "Large scale cohesive subgraphs discovery for social
network visual analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "85--96",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs are widely used in large scale social network
analysis nowadays. Not only analysts need to focus on
cohesive subgraphs to study patterns among social
actors, but also normal users are interested in
discovering what happening in their neighborhood.
However, effectively storing large scale social network
and efficiently identifying cohesive subgraphs is
challenging. In this work we introduce a novel subgraph
concept to capture the cohesion in social interactions,
and propose an I/O efficient approach to discover
cohesive subgraphs. Besides, we propose an analytic
system which allows users to perform intuitive, visual
browsing on large scale social networks. Our system
stores the network as a social graph in the graph
database, retrieves a local cohesive subgraph based on
the input keywords, and then hierarchically visualizes
the subgraph out on orbital layout, in which more
important social actors are located in the center. By
summarizing textual interactions between social actors
as tag cloud, we provide a way to quickly locate active
social communities and their interactions in a unified
view.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 97--108. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Li:2012:TFD,
author = "Xian Li and Xin Luna Dong and Kenneth Lyons and Weiyi
Meng and Divesh Srivastava",
title = "Truth finding on the {Deep Web}: is the problem
solved?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "97--108",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The amount of useful information available on the Web
has been growing at a dramatic pace in recent years and
people rely more and more on the Web to fulfill their
information needs. In this paper, we study truthfulness
of Deep Web data in two domains where we believed data
are fairly clean and data quality is important to
people's lives: Stock and Flight. To our surprise, we
observed a large amount of inconsistency on data from
different sources and also some sources with quite low
accuracy. We further applied on these two data sets
state-of-the-art data fusion methods that aim at
resolving conflicts and finding the truth, analyzed
their strengths and limitations, and suggested
promising research directions. We wish our study can
increase awareness of the seriousness of conflicting
data on the Web and in turn inspire more research in
our community to tackle this problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 109--120. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Marcus:2012:CC,
author = "Adam Marcus and David Karger and Samuel Madden and
Robert Miller and Sewoong Oh",
title = "Counting with the crowd",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "109--120",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we address the problem of selectivity
estimation in a crowdsourced database. Specifically, we
develop several techniques for using workers on a
crowdsourcing platform like Amazon's Mechanical Turk to
estimate the fraction of items in a dataset (e.g., a
collection of photos) that satisfy some property or
predicate (e.g., photos of trees). We do this without
explicitly iterating through every item in the dataset.
This is important in crowd-sourced query optimization
to support predicate ordering and in query evaluation,
when performing a GROUP BY operation with a COUNT or
AVG aggregate. We compare sampling item labels, a
traditional approach, to showing workers a collection
of items and asking them to estimate how many satisfy
some predicate. Additionally, we develop techniques to
eliminate spammers and colluding attackers trying to
skew selectivity estimates when using this count
estimation approach. We find that for images, counting
can be much more effective than sampled labeling,
reducing the amount of work necessary to arrive at an
estimate that is within 1\% of the true fraction by up
to an order of magnitude, with lower worker latency. We
also find that sampled labeling outperforms count
estimation on a text processing task, presumably
because people are better at quickly processing large
batches of images than they are at reading strings of
text. Our spammer detection technique, which is
applicable to both the label- and count-based
approaches, can improve accuracy by up to two orders of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 121--132. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Zou:2012:CDA,
author = "Tao Zou and Ronan {Le Bras} and Marcos {Vaz Salles}
and Alan Demers and Johannes Gehrke",
title = "{ClouDiA}: a deployment advisor for public clouds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "121--132",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An increasing number of distributed data-driven
applications are moving into shared public clouds. By
sharing resources and operating at scale, public clouds
promise higher utilization and lower costs than private
clusters. To achieve high utilization, however, cloud
providers inevitably allocate virtual machine instances
noncontiguously, i.e., instances of a given application
may end up in physically distant machines in the cloud.
This allocation strategy can lead to large differences
in average latency between instances. For a large class
of applications, this difference can result in
significant performance degradation, unless care is
taken in how application components are mapped to
instances. In this paper, we propose ClouDiA, a general
deployment advisor that selects application node
deployments minimizing either (i) the largest latency
between application nodes, or (ii) the longest critical
path among all application nodes. ClouDiA employs
mixed-integer programming and constraint programming
techniques to efficiently search the space of possible
mappings of application nodes to instances. Through
experiments with synthetic and real applications in
Amazon EC2, we show that our techniques yield a 15\% to
55\% reduction in time-to-solution or service response
time, without any need for modifying application
code.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 133--144. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned. Abstract text is quoted verbatim
%%% from the source; leave its wording as published.
@Article{Lee:2012:DCS,
author = "Jinsoo Lee and Wook-Shin Han and Romans Kasperovics
and Jeong-Hoon Lee",
title = "An in-depth comparison of subgraph isomorphism
algorithms in graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "133--144",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding subgraph isomorphisms is an important problem
in many applications which deal with data modeled as
graphs. While this problem is NP-hard, in recent years,
many algorithms have been proposed to solve it in a
reasonable time for real datasets using different join
orders, pruning rules, and auxiliary neighborhood
information. However, since they have not been
empirically compared one another in most research work,
it is not clear whether the later work outperforms the
earlier work. Another problem is that reported
comparisons were often done using the original authors'
binaries which were written in different programming
environments. In this paper, we address these serious
problems by re-implementing five state-of-the-art
subgraph isomorphism algorithms in a common code base
and by comparing them using many real-world datasets
and their query loads. Through our in-depth analysis of
experimental results, we report surprising empirical
findings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(2), Dec. 2012, pp. 145--156. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Ren:2012:LLM,
author = "Kun Ren and Alexander Thomson and Daniel J. Abadi",
title = "Lightweight locking for main memory database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "2",
pages = "145--156",
month = dec,
year = "2012",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:14 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Locking is widely used as a concurrency control
mechanism in database systems. As more OLTP databases
are stored mostly or entirely in memory, transactional
throughput is less and less limited by disk IO, and
lock managers increasingly become performance
bottlenecks. In this paper, we introduce very
lightweight locking (VLL), an alternative approach to
pessimistic concurrency control for main-memory
database systems that avoids almost all overhead
associated with traditional lock manager operations. We
also propose a protocol called selective contention
analysis (SCA), which enables systems implementing VLL
to achieve high transactional throughput under high
contention workloads. We implement these protocols both
in a traditional single-machine multi-core database
server setting and in a distributed database where data
is partitioned across many commodity machines in a
shared-nothing cluster. Our experiments show that VLL
dramatically reduces locking overhead and thereby
increases transactional throughput in both settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 157--168. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Zhang:2013:LPP,
author = "Ye Zhang and Wai-Kit Wong and S. M. Yiu and Nikos
Mamoulis and David W. Cheung",
title = "Lightweight privacy-preserving peer-to-peer data
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "157--168",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Peer Data Management Systems (PDMS) are an attractive
solution for managing distributed heterogeneous
information. When a peer (client) requests data from
another peer (server) with a different schema,
translations of the query and its answer are done by a
sequence of intermediate peers (translators). There are
two privacy issues in this P2P data integration
process: (i) answer privacy: no unauthorized parties
(including the translators) should learn the query
result; (ii) mapping privacy: the schema and the value
mappings used by the translators to perform the
translation should not be revealed to other peers.
Elmeleegy and Ouzzani proposed the PPP protocol that is
the first to support privacy-preserving querying in
PDMS. However, PPP suffers from several shortcomings.
First, PPP does not satisfy the requirement of answer
privacy, because it is based on commutative encryption;
we show that this issue can be fixed by adopting
another cryptographic technique called oblivious
transfer. Second, PPP adopts a weaker notion for
mapping privacy, which allows the client peer to
observe certain mappings done by translators. In this
paper, we develop a lightweight protocol, which
satisfies mapping privacy and extend it to a more
complex one that facilitates parallel translation by
peers. Furthermore, we consider a stronger adversary
model where there may be collusions among peers and
propose an efficient protocol that guards against
collusions. We conduct an experimental study on the
performance of the proposed protocols using both real
and synthetic data. The results show that the proposed
protocols not only achieve a better privacy guarantee
than PPP, but they are also more efficient.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 169--180. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Li:2013:MEM,
author = "Yang Li and Pegah Kamousi and Fangqiu Han and Shengqi
Yang and Xifeng Yan and Subhash Suri",
title = "Memory efficient minimum substring partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "169--180",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massively parallel DNA sequencing technologies are
revolutionizing genomics research. Billions of short
reads generated at low costs can be assembled for
reconstructing the whole genomes. Unfortunately, the
large memory footprint of the existing de novo assembly
algorithms makes it challenging to get the assembly
done for higher eukaryotes like mammals. In this work,
we investigate the memory issue of constructing de
Bruijn graph, a core task in leading assembly
algorithms, which often consumes several hundreds of
gigabytes memory for large genomes. We propose a
disk-based partition method, called Minimum Substring
Partitioning (MSP), to complete the task using less
than 10 gigabytes memory, without runtime slowdown. MSP
breaks the short reads into multiple small disjoint
partitions so that each partition can be loaded into
memory, processed individually and later merged with
others to form a de Bruijn graph. By leveraging the
overlaps among the $k$-mers (substring of length k),
MSP achieves astonishing compression ratio: The total
size of partitions is reduced from $ \Theta (k n) $ to
$ \Theta (n) $, where $n$ is the size of the short read
database, and $k$ is the length of a $k$-mer.
Experimental results show that our method can build de
Bruijn graphs using a commodity computer for any
large-volume sequence dataset.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 181--192. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Khan:2013:NFG,
author = "Arijit Khan and Yinghui Wu and Charu C. Aggarwal and
Xifeng Yan",
title = "{NeMa}: fast graph search with label similarity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "181--192",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is increasingly common to find real-life data
represented as networks of labeled, heterogeneous
entities. To query these networks, one often needs to
identify the matches of a given query graph in a
(typically large) network modeled as a target graph.
Due to noise and the lack of fixed schema in the target
graph, the query graph can substantially differ from
its matches in the target graph in both structure and
node labels, thus bringing challenges to the graph
querying tasks. In this paper, we propose NeMa (Network
Match), a neighborhood-based subgraph matching
technique for querying real-life networks. (1) To
measure the quality of the match, we propose a novel
subgraph matching cost metric that aggregates the costs
of matching individual nodes, and unifies both
structure and node label similarities. (2) Based on the
metric, we formulate the minimum cost subgraph matching
problem. Given a query graph and a target graph, the
problem is to identify the (top-$k$) matches of the
query graph with minimum costs in the target graph. We
show that the problem is NP-hard, and also hard to
approximate. (3) We propose a heuristic algorithm for
solving the problem based on an inference model. In
addition, we propose optimization techniques to improve
the efficiency of our method. (4) We empirically verify
that NeMa is both effective and efficient compared to
the keyword search and various state-of-the-art graph
querying techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 193--204. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Lin:2013:PPS,
author = "Xika Lin and Abhishek Mukherji and Elke A.
Rundensteiner and Carolina Ruiz and Matthew O. Ward",
title = "{PARAS}: a parameter space framework for online
association mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "193--204",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Association rule mining is known to be computationally
intensive, yet real-time decision-making applications
are increasingly intolerant to delays. In this paper,
we introduce the parameter space model, called PARAS.
PARAS enables efficient rule mining by compactly
maintaining the final rulesets. The PARAS model is
based on the notion of stable region abstractions that
form the coarse granularity ruleset space. Based on new
insights on the redundancy relationships among rules,
PARAS establishes a surprisingly compact representation
of complex redundancy relationships while enabling
efficient redundancy resolution at query-time. Besides
the classical rule mining requests, the PARAS model
supports three novel classes of exploratory queries.
Using the proposed PSpace index, these exploratory
query classes can all be answered with near real-time
responsiveness. Our experimental evaluation using
several benchmark datasets demonstrates that PARAS
achieves 2 to 5 orders of magnitude improvement over
state-of-the-art approaches in online association rule
mining.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 205--216. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Yan:2013:ASF,
author = "Zhepeng Yan and Nan Zheng and Zachary G. Ives and
Partha Pratim Talukdar and Cong Yu",
title = "Actively soliciting feedback for query answers in
keyword search-based data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "205--216",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of scaling up data integration, such that
new sources can be quickly utilized as they are
discovered, remains elusive: global schemas for
integrated data are difficult to develop and expand,
and schema and record matching techniques are limited
by the fact that data and metadata are often
under-specified and must be disambiguated by data
experts. One promising approach is to avoid using a
global schema, and instead to develop keyword
search-based data integration--where the system lazily
discovers associations enabling it to join together
matches to keywords, and return ranked results. The
user is expected to understand the data domain and
provide feedback about answers' quality. The system
generalizes such feedback to learn how to correctly
integrate data. A major open challenge is that under
this model, the user only sees and offers feedback on a
few ``top-$k$'' results: this result set must be
carefully selected to include answers of high relevance
and answers that are highly informative when feedback
is given on them. Existing systems merely focus on
predicting relevance, by composing the scores of
various schema and record matching algorithms. In this
paper we show how to predict the uncertainty associated
with a query result's score, as well as how informative
feedback is on a given result. We build upon these
foundations to develop an active learning approach to
keyword search-based data integration, and we validate
the effectiveness of our solution over real data from
several very different domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(3), Jan. 2013, pp. 217--228. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Chen:2013:SKQ,
author = "Lisi Chen and Gao Cong and Christian S. Jensen and
Dingming Wu",
title = "Spatial keyword query processing: an experimental
evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "3",
pages = "217--228",
month = jan,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:18 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Geo-textual indices play an important role in spatial
keyword querying. The existing geo-textual indices have
not been compared systematically under the same
experimental framework. This makes it difficult to
determine which indexing technique best supports
specific functionality. We provide an all-around survey
of 12 state-of-the-art geo-textual indices. We propose
a benchmark that enables the comparison of the spatial
keyword query performance. We also report on the
findings obtained when applying the benchmark to the
indices, thus uncovering new insights that may guide
index selection as well as further research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% PVLDB 6(4), Feb. 2013, pp. 229--240. NOTE(review): no DOI field, although
%%% earlier entries in this file record one -- confirm via the ACM Digital
%%% Library / Crossref and add if assigned.
@Article{Eftekhar:2013:PRT,
author = "Milad Eftekhar and Nick Koudas",
title = "Partitioning and ranking tagged data sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "4",
pages = "229--240",
month = feb,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:22 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Online types of expression in the form of social
networks, micro-blogging, blogs and rich content
sharing platforms have proliferated in the last few
years. Such proliferation contributed to the vast
explosion in online data sharing we are experiencing
today. One unique aspect of online data sharing is tags
manually inserted by content generators to facilitate
content description and discovery (e.g., hashtags in
tweets). In this paper we focus on these tags and we
study and propose algorithms that make use of tags in
order to automatically organize and categorize this
vast collection of socially contributed and tagged
information. In particular, we take a holistic approach
in organizing such tags and we propose algorithms to
partition as well as rank this information collection.
Our partitioning algorithms aim to segment the entire
collection of tags (and the associated content) into a
specified number of partitions for specific problem
constraints. In contrast our ranking algorithms aim to
identify few partitions fast, for suitably defined
ranking functions. We present a detailed experimental
study utilizing the full twitter firehose (set of all
tweets in the Twitter service) that attests to the
practical utility and effectiveness of our overall
approach. We also present a detailed qualitative study
of our results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Badia:2013:EIG,
author = "Antonio Badia and Bin Cao",
title = "Efficient implementation of generalized quantification
in relational query languages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "4",
pages = "241--252",
month = feb,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:22 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present research aimed at improving our
understanding of the use and implementation of
quantification in relational query languages in general
and SQL in particular. In order to make our results as
general as possible, we use the framework of
Generalized Quantification. Generalized Quantifiers
(GQs) are high-level, declarative logical operators
that in the past have been studied from a theoretical
perspective. In this paper we focus on their practical
use, showing how to incorporate a dynamic set of GQs in
relational query languages, how to implement them
efficiently and use them in the context of SQL. We
present experimental evidence of the performance of the
approach, showing that it improves over traditional
(relational) approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2013:DWD,
author = "Rui Liu and Ashraf Aboulnaga and Kenneth Salem",
title = "{DAX}: a widely distributed multitenant storage
service for {DBMS} hosting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "4",
pages = "253--264",
month = feb,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:22 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many applications hosted on the cloud have
sophisticated data management needs that are best
served by a SQL-based relational DBMS. It is not
difficult to run a DBMS in the cloud, and in many cases
one DBMS instance is enough to support an application's
workload. However, a DBMS running in the cloud (or even
on a local server) still needs a way to persistently
store its data and protect it against failures. One way
to achieve this is to provide a scalable and reliable
storage service that the DBMS can access over a
network. This paper describes such a service, which we
call DAX. DAX relies on multi-master replication and
Dynamo-style flexible consistency, which enables it to
run in multiple data centers and hence be disaster
tolerant. Flexible consistency allows DAX to control
the consistency level of each read or write operation,
choosing between strong consistency at the cost of high
latency or weak consistency with low latency. DAX makes
this choice for each read or write operation by
applying protocols that we designed based on the
storage tier usage characteristics of database systems.
With these protocols, DAX provides a storage service
that can host multiple DBMS tenants, scaling with the
number of tenants and the required storage capacity and
bandwidth. DAX also provides high availability and
disaster tolerance for the DBMS storage tier.
Experiments using the TPC-C benchmark show that DAX
provides up to a factor of 4 performance improvement
over baseline solutions that do not exploit flexible
consistency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2013:DGE,
author = "Kai Zeng and Jiacheng Yang and Haixun Wang and Bin
Shao and Zhongyuan Wang",
title = "A distributed graph engine for web scale {RDF} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "4",
pages = "265--276",
month = feb,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:22 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Much work has been devoted to supporting RDF data. But
state-of-the-art systems and methods still cannot
handle web scale RDF data effectively. Furthermore,
many useful and general purpose graph-based operations
(e.g., random walk, reachability, community discovery)
on RDF data are not supported, as most existing systems
store and index data in particular ways (e.g., as
relational tables or as a bitmap matrix) to maximize
one particular operation on RDF data: SPARQL query
                 processing. In this paper, we introduce Trinity.RDF, a
distributed, memory-based graph engine for web scale
RDF data. Instead of managing the RDF data in triple
stores or as bitmap matrices, we store RDF data in its
native graph form. It achieves much better (sometimes
orders of magnitude better) performance for SPARQL
queries than the state-of-the-art approaches.
Furthermore, since the data is stored in its native
graph form, the system can support other operations
(e.g., random walks, reachability) on RDF graphs as
well. We conduct comprehensive experimental studies on
real life, web scale RDF data to demonstrate the
effectiveness of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarma:2013:ULB,
author = "Anish Das Sarma and Foto N. Afrati and Semih Salihoglu
and Jeffrey D. Ullman",
title = "Upper and lower bounds on the cost of a map-reduce
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "4",
pages = "277--288",
month = feb,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:22 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we study the tradeoff between
parallelism and communication cost in a map-reduce
computation. For any problem that is not
``embarrassingly parallel,'' the finer we partition the
work of the reducers so that more parallelism can be
extracted, the greater will be the total communication
between mappers and reducers. We introduce a model of
problems that can be solved in a single round of
map-reduce computation. This model enables a generic
recipe for discovering lower bounds on communication
cost as a function of the maximum number of inputs that
can be assigned to one reducer. We use the model to
analyze the tradeoff for three problems: finding pairs
of strings at Hamming distance d, finding triangles and
other patterns in a larger graph, and matrix
multiplication. For finding strings of Hamming distance
1, we have upper and lower bounds that match exactly.
For triangles and many other graphs, we have upper and
lower bounds that are the same to within a constant
factor. For the problem of matrix multiplication, we
have matching upper and lower bounds for one-round
map-reduce algorithms. We are also able to explore
two-round map-reduce algorithms for matrix
multiplication and show that these never have more
communication, for a given reducer size, than the best
one-round algorithm, and often have significantly
less.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tu:2013:PAQ,
author = "Stephen Tu and M. Frans Kaashoek and Samuel Madden and
Nickolai Zeldovich",
title = "Processing analytical queries over encrypted data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "5",
pages = "289--300",
month = mar,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:27 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MONOMI is a system for securely executing analytical
workloads over sensitive data on an untrusted database
server. MONOMI works by encrypting the entire database
and running queries over the encrypted data. MONOMI
introduces split client/server query execution, which
can execute arbitrarily complex queries over encrypted
data, as well as several techniques that improve
performance for such workloads, including per-row
precomputation, space-efficient encryption, grouped
homomorphic addition, and pre-filtering. Since these
optimizations are good for some queries but not others,
MONOMI introduces a designer for choosing an efficient
physical design at the server for a given workload, and
a planner to choose an efficient execution plan for a
given query at runtime. A prototype of MONOMI running
on top of Postgres can execute most of the queries from
the TPC-H benchmark with a median overhead of only $
1.24 \times $ (ranging from $ 1.03 \times $ to $ 2.33
\times $) compared to an un-encrypted Postgres database
where a compromised server would reveal all data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kellaris:2013:PDP,
author = "Georgios Kellaris and Stavros Papadopoulos",
title = "Practical differential privacy via grouping and
smoothing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "5",
pages = "301--312",
month = mar,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:27 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We address one-time publishing of non-overlapping
counts with $ \epsilon $-differential privacy. These
statistics are useful in a wide and important range of
applications, including transactional, traffic and
medical data analysis. Prior work on the topic
publishes such statistics with prohibitively low
utility in several practical scenarios. Towards this
end, we present GS, a method that pre-processes the
counts by elaborately grouping and smoothing them via
averaging. This step acts as a form of preliminary
perturbation that diminishes sensitivity, and enables
GS to achieve $ \epsilon $-differential privacy through
low Laplace noise injection. The grouping strategy is
dictated by a sampling mechanism, which minimizes the
smoothing perturbation. We demonstrate the superiority
of GS over its competitors, and confirm its
practicality, via extensive experiments on real
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaushik:2013:SSD,
author = "Raghav Kaushik and Yupeng Fu and Ravishankar
Ramamurthy",
title = "On scaling up sensitive data auditing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "5",
pages = "313--324",
month = mar,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:27 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper studies the following problem: given (1) a
query and (2) a set of sensitive records, find the
subset of records ``accessed'' by the query. The notion
of a query accessing a single record is adopted from
prior work. There are several scenarios where the
number of sensitive records is large (in the millions).
The novel challenge addressed in this work is to
develop a general-purpose solution for complex SQL that
scales in the number of sensitive records. We propose
                 efficient techniques that improve upon straightforward
alternatives by orders of magnitude. Our empirical
evaluation over the TPC-H benchmark data illustrates
the benefits of our techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sathiamoorthy:2013:XEN,
author = "Maheswaran Sathiamoorthy and Megasthenis Asteris and
Dimitris Papailiopoulos and Alexandros G. Dimakis and
Ramkumar Vadali and Scott Chen and Dhruba Borthakur",
title = "{XORing} elephants: novel erasure codes for big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "5",
pages = "325--336",
month = mar,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:27 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed storage systems for large clusters
typically use replication to provide reliability.
Recently, erasure codes have been used to reduce the
large storage overhead of three-replicated systems.
Reed--Solomon codes are the standard design choice and
their high repair cost is often considered an
unavoidable price to pay for high storage efficiency
and high reliability. This paper shows how to overcome
this limitation. We present a novel family of erasure
codes that are efficiently repairable and offer higher
reliability compared to Reed--Solomon codes. We show
analytically that our codes are optimal on a recently
identified tradeoff between locality and minimum
distance. We implement our new codes in Hadoop HDFS and
compare to a currently deployed HDFS module that uses
Reed--Solomon codes. Our modified HDFS implementation
shows a reduction of approximately $ 2 \times $ on the
repair disk I/O and repair network traffic. The
disadvantage of the new coding scheme is that it
requires 14\% more storage compared to Reed--Solomon
codes, an overhead shown to be information
theoretically optimal to obtain locality. Because the
new codes repair failures faster, this provides higher
reliability, which is orders of magnitude higher
compared to replication.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rendle:2013:SFM,
author = "Steffen Rendle",
title = "Scaling factorization machines to relational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "5",
pages = "337--348",
month = mar,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:27 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The most common approach in predictive modeling is to
describe cases with feature vectors (aka design
matrix). Many machine learning methods such as linear
regression or support vector machines rely on this
representation. However, when the underlying data has
strong relational patterns, especially relations with
high cardinality, the design matrix can get very large
which can make learning and prediction slow or even
infeasible. This work solves this issue by making use
of repeating patterns in the design matrix which stem
from the underlying relational structure of the data.
It is shown how coordinate descent learning and
Bayesian Markov Chain Monte Carlo inference can be
scaled for linear regression and factorization machine
models. Empirically, it is shown on two large scale and
very competitive datasets (Netflix prize, KDDCup 2012),
that (1) standard learning algorithms based on the
design matrix representation cannot scale to relational
predictor variables, (2) the proposed new algorithms
scale and (3) the predictive quality of the proposed
generic feature-based approach is as good as the best
specialized models that have been tailored to the
respective tasks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Whang:2013:QSC,
author = "Steven Euijong Whang and Peter Lofgren and Hector
Garcia-Molina",
title = "Question selection for crowd entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "349--360",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of enhancing Entity Resolution
(ER) with the help of crowdsourcing. ER is the problem
of clustering records that refer to the same real-world
entity and can be an extremely difficult process for
computer algorithms alone. For example, figuring out
which images refer to the same person can be a hard
task for computers, but an easy one for humans. We
study the problem of resolving records with
crowdsourcing where we ask questions to humans in order
to guide ER into producing accurate results. Since
human work is costly, our goal is to ask as few
questions as possible. We propose a probabilistic
framework for ER that can be used to estimate how much
ER accuracy we obtain by asking each question and
select the best question with the highest expected
accuracy. Computing the expected accuracy is \#P-hard,
so we propose approximation techniques for efficient
computation. We evaluate our best question algorithms
on real and synthetic datasets and demonstrate how we
can obtain high ER accuracy while significantly
reducing the number of questions asked to humans.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jindal:2013:CKB,
author = "Alekh Jindal and Endre Palatinus and Vladimir Pavlov
and Jens Dittrich",
title = "A comparison of knives for bread slicing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "361--372",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Vertical partitioning is a crucial step in physical
database design in row-oriented databases. A number of
vertical partitioning algorithms have been proposed
over the last three decades for a variety of niche
scenarios. In principle, the underlying problem remains
the same: decompose a table into one or more vertical
partitions. However, it is not clear how good different
vertical partitioning algorithms are in comparison to
each other. In fact, it is not even clear how to
experimentally compare different vertical partitioning
algorithms. In this paper, we present an exhaustive
experimental study of several vertical partitioning
algorithms. We categorize vertical partitioning
algorithms along three dimensions. We survey six
vertical partitioning algorithms and discuss their pros
and cons. We identify the major differences in the
use-case settings for different algorithms and describe
how to make an apples-to-apples comparison of different
vertical partitioning algorithms under the same
setting. We propose four metrics to compare vertical
partitioning algorithms. We show experimental results
from the TPC-H and SSB benchmark and present four key
lessons learned: (1) we can do four orders of magnitude
less computation and still find the optimal layouts,
(2) the benefits of vertical partitioning depend
strongly on the database buffer size, (3) HillClimb is
the best vertical partitioning algorithm, and (4)
vertical partitioning for TPC-H-like benchmarks can
improve over column layout by only up to 5\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiao:2013:EET,
author = "Chuan Xiao and Jianbin Qin and Wei Wang and Yoshiharu
Ishikawa and Koji Tsuda and Kunihiko Sadakane",
title = "Efficient error-tolerant query autocompletion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "373--384",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query autocompletion is an important feature saving
users many keystrokes from typing the entire query. In
this paper we study the problem of query autocompletion
that tolerates errors in users' input using edit
distance constraints. Previous approaches index data
strings in a trie, and continuously maintain all the
prefixes of data strings whose edit distance from the
query are within the threshold. The major inherent
problem is that the number of such prefixes is huge for
the first few characters of the query and is
exponential in the alphabet size. This results in slow
query response even if the entire query approximately
matches only few prefixes. In this paper, we propose a
novel neighborhood generation-based algorithm,
IncNGTrie, which can achieve up to two orders of
magnitude speedup over existing methods for the
error-tolerant query autocompletion problem. Our
proposed algorithm only maintains a small set of active
nodes, thus saving both space and time to process the
query. We also study efficient duplicate removal which
is a core problem in fetching query answers. In
addition, we propose optimization techniques to reduce
our index size, as well as discussions on several
extensions to our method. The efficiency of our method
is demonstrated against existing methods through
extensive experiments on real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shraer:2013:TKP,
author = "Alexander Shraer and Maxim Gurevich and Marcus
Fontoura and Vanja Josifovski",
title = "Top-$k$ publish-subscribe for social annotation of
news",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "385--396",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social content, such as Twitter updates, often have
the quickest first-hand reports of news events, as well
as numerous commentaries that are indicative of public
view of such events. As such, social updates provide a
good complement to professionally written news
articles. In this paper we consider the problem of
automatically annotating news stories with social
updates (tweets), at a news website serving high volume
of pageviews. The high rate of both the pageviews
(millions to billions a day) and of the incoming tweets
(more than 100 millions a day) make real-time indexing
of tweets ineffective, as this requires an index that
is both queried and updated extremely frequently. The
rate of tweet updates makes caching techniques almost
unusable since the cache would become stale very
quickly. We propose a novel architecture where each
story is treated as a subscription for tweets relevant
to the story's content, and new algorithms that
efficiently match tweets to stories, proactively
maintaining the top-$k$ tweets for each story. Such
top-$k$ pub-sub consumes only a small fraction of the
resource cost of alternative solutions, and can be
applicable to other large scale content-based
publish-subscribe problems. We demonstrate the
                 effectiveness of our approach on real-world data: a
corpus of news stories from Yahoo! News and a log of
Twitter updates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kolaitis:2013:EQI,
author = "Phokion G. Kolaitis and Enela Pema and Wang-Chiew
Tan",
title = "Efficient querying of inconsistent databases with
binary integer programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "397--408",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An inconsistent database is a database that violates
one or more integrity constraints. A typical approach
for answering a query over an inconsistent database is
to first clean the inconsistent database by
transforming it to a consistent one and then apply the
query to the consistent database. An alternative and
more principled approach, known as consistent query
answering, derives the answers to a query over an
inconsistent database without changing the database,
but by taking into account all possible repairs of the
database. In this paper, we study the problem of
consistent query answering over inconsistent databases
                 for the class of conjunctive queries under primary key
constraints. We develop a system, called EQUIP, that
represents a fundamental departure from existing
approaches for computing the consistent answers to
queries in this class. At the heart of EQUIP is a
technique, based on Binary Integer Programming (BIP),
that repeatedly searches for repairs to eliminate
candidate consistent answers until no further such
candidates can be eliminated. We establish rigorously
the correctness of the algorithms behind EQUIP and
carry out an extensive experimental investigation that
validates the effectiveness of our approach.
Specifically, EQUIP exhibits good and stable
performance on conjunctive queries under primary key
constraints, it significantly outperforms existing
systems for computing the consistent answers of such
queries in the case in which the consistent answers are
not first-order rewritable, and it scales well.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gionis:2013:PSN,
author = "Aristides Gionis and Flavio Junqueira and Vincent
Leroy and Marco Serafini and Ingmar Weber",
title = "Piggybacking on social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "409--420",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The popularity of social-networking sites has
                 increased rapidly over the last decade. A basic
                 functionality of social-networking sites is to
present users with streams of events shared by their
friends. At a systems level, materialized per-user
views are a common way to assemble and deliver such
event streams on-line and with low latency. Access to
the data stores, which keep the user views, is a major
bottleneck of social-networking systems. We propose to
improve the throughput of these systems by using social
piggybacking, which consists of processing the requests
of two friends by querying and updating the view of a
third common friend. By using one such hub view, the
system can serve requests of the first friend without
querying or updating the view of the second. We show
that, given a social graph, social piggybacking can
minimize the overall number of requests, but computing
the optimal set of hubs is an NP-hard problem. We
propose an $ O(\log n) $ approximation algorithm and a
heuristic to solve the problem, and evaluate them using
the full Twitter and Flickr social graphs, which have
up to billions of edges. Compared to existing
approaches, using social piggybacking results in
similar throughput in systems with few servers, but
enables substantial throughput improvements as the size
of the system grows, reaching up to a 2-factor
increase. We also evaluate our algorithms on a real
social networking system prototype and we show that the
actual increase in throughput corresponds nicely to the
gain anticipated by our cost function.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Adelfio:2013:SET,
author = "Marco D. Adelfio and Hanan Samet",
title = "Schema extraction for tabular data on the {Web}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "421--432",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tabular data is an abundant source of information on
the Web, but remains mostly isolated from the latter's
interconnections since tables lack links and
computer-accessible descriptions of their structure. In
other words, the schemas of these tables --- attribute
names, values, data types, etc. --- are not explicitly
stored as table metadata. Consequently, the structure
that these tables contain is not accessible to the
crawlers that power search engines and thus not
accessible to user search queries. We address this lack
of structure with a new method for leveraging the
principles of table construction in order to extract
table schemas. Discovering the schema by which a table
is constructed is achieved by harnessing the
similarities and differences of nearby table rows
through the use of a novel set of features and a
feature processing scheme. The schemas of these data
tables are determined using a classification technique
based on conditional random fields in combination with
a novel feature encoding method called logarithmic
binning, which is specifically designed for the data
table extraction task. Our method provides considerable
improvement over the well-known WebTables schema
extraction method. In contrast with previous work that
focuses on extracting individual relations, our method
excels at correctly interpreting full tables, thereby
being capable of handling general tables such as those
found in spreadsheets, instead of being restricted to
HTML tables as is the case with the WebTables method.
We also extract additional schema characteristics, such
as row groupings, which are important for supporting
information retrieval tasks on tabular data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sariyuce:2013:SAK,
author = "Ahmet Erdem Sar{\'\i}y{\"u}ce and Bugra Gedik and
Gabriela Jacques-Silva and Kun-Lung Wu and {\"U}mit V.
{\c{C}}ataly{\"u}rek",
title = "Streaming algorithms for $k$-core decomposition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "433--444",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A $k$-core of a graph is a maximal connected subgraph
in which every vertex is connected to at least $k$
vertices in the subgraph. $k$-core decomposition is
often used in large-scale network analysis, such as
community detection, protein function prediction,
visualization, and solving NP-Hard problems on real
networks efficiently, like maximal clique finding. In
many real-world applications, networks change over
time. As a result, it is essential to develop efficient
incremental algorithms for streaming graph data. In
this paper, we propose the first incremental $k$-core
decomposition algorithms for streaming graph data.
These algorithms locate a small subgraph that is
guaranteed to contain the list of vertices whose
maximum $k$-core values have to be updated, and
efficiently process this subgraph to update the
$k$-core decomposition. Our results show a significant
reduction in run-time compared to non-incremental
alternatives. We show the efficiency of our algorithms
on different types of real and synthetic graphs, at
different scales. For a graph of 16 million vertices,
we observe speedups reaching a million times, relative
to the non-incremental algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassanzadeh:2013:DLP,
author = "Oktie Hassanzadeh and Ken Q. Pu and Soheil Hassas
Yeganeh and Ren{\'e}e J. Miller and Lucian Popa and
Mauricio A. Hern{\'a}ndez and Howard Ho",
title = "Discovering linkage points over {Web} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "445--456",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A basic step in integration is the identification of
linkage points, i.e., finding attributes that are
shared (or related) between data sources, and that can
be used to match records or entities across sources.
This is usually performed using a match operator, that
associates attributes of one database to another.
However, the massive growth in the amount and variety
of unstructured and semi-structured data on the Web has
created new challenges for this task. Such data sources
often do not have a fixed pre-defined schema and
contain large numbers of diverse attributes.
Furthermore, the end goal is not schema alignment as
these schemas may be too heterogeneous (and dynamic) to
meaningfully align. Rather, the goal is to align any
overlapping data shared by these sources. We will show
that even attributes with different meanings (that
would not qualify as schema matches) can sometimes be
useful in aligning data. The solution we propose in
this paper replaces the basic schema-matching step with
a more complex instance-based schema analysis and
linkage discovery. We present a framework consisting of
a library of efficient lexical analyzers and similarity
functions, and a set of search algorithms for effective
and efficient identification of linkage points over Web
data. We experimentally evaluate the effectiveness of
our proposed algorithms in real-world integration
scenarios in several domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fu:2013:LIS,
author = "Ada Wai-Chee Fu and Huanhuan Wu and James Cheng and
Raymond Chi-Wing Wong",
title = "{IS-Label}: an independent-set based labeling scheme
for point-to-point distance querying",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "457--468",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of computing shortest path or
distance between two query vertices in a graph, which
has numerous important applications. Quite a number of
indexes have been proposed to answer such distance
queries. However, all of these indexes can only process
graphs of size barely up to 1 million vertices, which
is rather small in view of many of the fast-growing
real-world graphs today such as social networks and Web
graphs. We propose an efficient index, which is a novel
labeling scheme based on the independent set of a
graph. We show that our method can handle graphs of
size orders of magnitude larger than existing
indexes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tran:2013:SUD,
author = "Thanh T. L. Tran and Yanlei Diao and Charles Sutton
and Anna Liu",
title = "Supporting user-defined functions on uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "469--480",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Uncertain data management has become crucial in many
sensing and scientific applications. As user-defined
functions (UDFs) become widely used in these
applications, an important task is to capture result
uncertainty for queries that evaluate UDFs on uncertain
data. In this work, we provide a general framework for
supporting UDFs on uncertain data. Specifically, we
propose a learning approach based on Gaussian processes
(GPs) to compute approximate output distributions of a
UDF when evaluated on uncertain input, with guaranteed
error bounds. We also devise an online algorithm to
compute such output distributions, which employs a
suite of optimizations to improve accuracy and
performance. Our evaluation using both real-world and
synthetic functions shows that our proposed GP approach
can outperform the state-of-the-art sampling approach
with up to two orders of magnitude improvement for a
variety of UDFs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2013:IAA,
author = "Fanwei Zhu and Yuan Fang and Kevin Chen-Chuan Chang
and Jing Ying",
title = "Incremental and accuracy-aware {Personalized PageRank}
through scheduled approximation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "6",
pages = "481--492",
month = apr,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:32 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As Personalized PageRank has been widely leveraged for
ranking on a graph, the efficient computation of
Personalized PageRank Vector (PPV) becomes a prominent
issue. In this paper, we propose FastPPV, an
approximate PPV computation algorithm that is
incremental and accuracy-aware. Our approach hinges on
a novel paradigm of scheduled approximation: the
computation is partitioned and scheduled for processing
in an ``organized'' way, such that we can gradually
improve our PPV estimation in an incremental manner,
and quantify the accuracy of our approximation at query
time. Guided by this principle, we develop an efficient
hub based realization, where we adopt the metric of
hub-length to partition and schedule random walk tours
so that the approximation error reduces exponentially
over iterations. Furthermore, as tours are segmented by
hubs, the shared substructures between different tours
(around the same hub) can be reused to speed up query
processing both within and across iterations. Finally,
we evaluate FastPPV over two real-world graphs, and
show that it not only significantly outperforms two
state-of-the-art baselines in both online and offline
phases, but also scales well on larger graphs. In
particular, we are able to achieve near-constant time
online query processing irrespective of graph size.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2013:ESB,
author = "Weiguo Zheng and Lei Zou and Yansong Feng and Lei Chen
and Dongyan Zhao",
title = "Efficient {SimRank}-based similarity join over large
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "7",
pages = "493--504",
month = may,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:37 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs have been widely used to model complex data in
many real-world applications. Answering vertex join
queries over large graphs is meaningful and
interesting, which can benefit friend recommendation in
social networks and link prediction, etc. In this
paper, we adopt ``SimRank'' to evaluate the similarity
of two vertices in a large graph because of its
generality. Note that ``SimRank'' is purely structure
dependent and it does not rely on the domain knowledge.
Specifically, we define a SimRank-based join (SRJ)
query to find all the vertex pairs satisfying the
threshold in a data graph $G$. In order to reduce the
search space, we propose an estimated shortest-path
distance based upper bound for SimRank scores to prune
unpromising vertex pairs. In the verification, we
propose a novel index, called $h$-go cover, to
efficiently compute the SimRank score of a single
vertex pair. Given a graph $G$, we only materialize the
SimRank scores of a small proportion of vertex pairs
(called $h$-go covers), based on which, the SimRank
score of any vertex pair can be computed easily. In
order to handle large graphs, we extend our technique
to the partition-based framework. Thorough theoretical
analysis and extensive experiments over both real and
synthetic datasets confirm the efficiency and
effectiveness of our solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2013:PST,
author = "Guimei Liu and Andre Suchitra and Limsoon Wong",
title = "A performance study of three disk-based structures for
indexing and querying frequent itemsets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "7",
pages = "505--516",
month = may,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:37 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Frequent itemset mining is an important problem in the
data mining area. Extensive efforts have been devoted
to developing efficient algorithms for mining frequent
itemsets. However, not much attention is paid on
managing the large collection of frequent itemsets
produced by these algorithms for subsequent analysis
and for user exploration. In this paper, we study three
structures for indexing and querying frequent itemsets:
inverted files, signature files and CFP-tree. The first
two structures have been widely used for indexing
general set-valued data. We make some modifications to
make them more suitable for indexing frequent itemsets.
The CFP-tree structure is specially designed for
storing frequent itemsets. We add a pruning technique
based on length-2 frequent itemsets to make it more
efficient for processing superset queries. We study the
performance of the three structures in supporting five
types of containment queries: exact match,
subset/superset search and immediate subset/superset
search. Our results show that no structure can
outperform other structures for all the five types of
queries on all the datasets. CFP-tree shows better
overall performance than the other two structures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2013:TFC,
author = "Pingpeng Yuan and Pu Liu and Buwen Wu and Hai Jin and
Wenya Zhang and Ling Liu",
title = "{TripleBit}: a fast and compact system for large scale
{RDF} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "7",
pages = "517--528",
month = may,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:37 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The volume of RDF data continues to grow over the past
decade and many known RDF datasets have billions of
triples. A grand challenge of managing this huge RDF
data is how to access this big RDF data efficiently. A
popular approach to addressing the problem is to build
a full set of permutations of $ (S, P, O) $ indexes.
Although this approach has shown to accelerate joins by
orders of magnitude, the large space overhead limits
the scalability of this approach and makes it
heavyweight. In this paper, we present TripleBit, a
fast and compact system for storing and accessing RDF
data. The design of TripleBit has three salient
features. First, the compact design of TripleBit
reduces both the size of stored RDF data and the size
of its indexes. Second, TripleBit introduces two
auxiliary index structures, ID-Chunk bit matrix and
ID-Predicate bit matrix, to minimize the cost of index
selection during query evaluation. Third, its query
processor dynamically generates an optimal execution
ordering for join queries, leading to fast query
execution and effective reduction on the size of
intermediate results. Our experiments show that
TripleBit outperforms RDF-3X, MonetDB, BitMat on LUBM,
UniProt and BTC 2012 benchmark queries and it offers
orders of magnitude performance improvement for some
complex join queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bajaj:2013:CSE,
author = "Sumeet Bajaj and Radu Sion",
title = "{CorrectDB}: {SQL} engine with practical query
authentication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "7",
pages = "529--540",
month = may,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:37 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Clients of outsourced databases need Query
Authentication (QA) guaranteeing the integrity
(correctness and completeness), and authenticity of the
query results returned by potentially compromised
providers. Existing results provide QA assurances for a
limited class of queries by deploying several software
cryptographic constructs. Here, we show that, to
achieve QA, however, it is significantly cheaper and
more practical to deploy server-hosted, tamper-proof
co-processors, despite their higher acquisition costs.
Further, this provides the ability to handle arbitrary
queries. To reach this insight, we extensively survey
existing QA work and identify interdependencies and
efficiency relationships. We then introduce CorrectDB,
a new DBMS with full QA assurances, leveraging
server-hosted, tamper-proof, trusted hardware in close
proximity to the outsourced data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2013:HSM,
author = "Xin Liu and Kenneth Salem",
title = "Hybrid storage management for database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "8",
pages = "541--552",
month = jun,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:42 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The use of flash-based solid state drives (SSDs) in
storage systems is growing. Adding SSDs to a storage
system not only raises the question of how to manage
the SSDs, but also raises the question of whether
current buffer pool algorithms will still work
effectively. We are interested in the use of hybrid
storage systems, consisting of SSDs and hard disk
drives (HDDs), for database management. We present
cost-aware replacement algorithms, which are aware of
the difference in performance between SSDs and HDDs,
for both the DBMS buffer pool and the SSDs. In hybrid
storage systems, the physical access pattern to the
SSDs depends on the management of the DBMS buffer pool.
We studied the impact of buffer pool caching policies
on SSD access patterns. Based on these studies, we
designed a cost-adjusted caching policy to effectively
manage the SSD. We implemented these algorithms in
MySQL's InnoDB storage engine and used the TPC-C
workload to demonstrate that these cost-aware
algorithms outperform previous algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2013:SEO,
author = "Eugene Wu and Samuel Madden",
title = "{Scorpion}: explaining away outliers in aggregate
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "8",
pages = "553--564",
month = jun,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:42 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database users commonly explore large data sets by
running aggregate queries that project the data down to
a smaller number of points and dimensions, and
visualizing the results. Often, such visualizations
will reveal outliers that correspond to errors or
surprising features of the input data set.
Unfortunately, databases and visualization systems do
not provide a way to work backwards from an outlier
point to the common properties of the (possibly many)
unaggregated input tuples that correspond to that
outlier. We propose Scorpion, a system that takes a set
of user-specified outlier points in an aggregate query
result as input and finds predicates that explain the
outliers in terms of properties of the input tuples
that are used to compute the selected outlier results.
Specifically, this explanation identifies predicates
that, when applied to the input data, cause the
outliers to disappear from the output. To find such
predicates, we develop a notion of influence of a
predicate on a given output, and design several
algorithms that efficiently search for maximum
influence predicates over the input data. We show that
these algorithms can quickly find outliers in two real
data sets (from a sensor deployment and a campaign
finance data set), and run orders of magnitude faster
than a naive search algorithm while providing
comparable quality on a synthetic data set.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2013:RTQ,
author = "Rajeev Gupta and Krithi Ramamritham and Mukesh
Mohania",
title = "Ratio threshold queries over distributed data
sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "8",
pages = "565--576",
month = jun,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:42 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Continuous aggregation queries over dynamic data are
used for real time decision making and timely business
intelligence. In this paper we consider queries where a
client wants to be notified if the ratio of two
aggregates over distributed data crosses a specified
threshold. Consider these scenarios: a mechanism
designed to defend against distributed denial of
service attacks may be triggered when the fraction of
packets arriving to a subnet is more than 5\% of the
total packets; or a distributed store chain withdraws
its discount on luxury goods when sales of luxury goods
constitute more than 20\% of the overall sales. The
challenge in executing such ratio threshold queries
(RTQs) lies in incurring the minimal amount of
communication necessary for propagation of updates from
data sources to the aggregator node where the client
query is executed. We address this challenge by
proposing schemes for converting the client ratio
threshold condition into conditions on individual
distributed data sources. Whenever the condition
associated with a source is violated, the source pushes
its data values to the aggregator, which in turn pulls
data values from other sources to determine whether the
client threshold condition is indeed violated. We
present algorithms to minimize the number of source
condition violations (i.e., the number of pushes) while
ensuring that no violation of the client threshold
condition is missed. Further, in case of a source
condition violation, we propose efficient selective
pulling algorithms for intelligently choosing
additional sources whose data should be pulled by the
aggregator. Using performance evaluation on synthetic
and real traces of data updates we show that our
algorithms result in up to an order of magnitude less
number of messages compared to existing approaches in
the literature.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deng:2013:CQR,
author = "Ting Deng and Wenfei Fan",
title = "On the complexity of query result diversification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "8",
pages = "577--588",
month = jun,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:42 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query result diversification is a bi-criteria
optimization problem for ranking query results. Given a
database $D$, a query $Q$ and a positive integer $k$,
it is to find a set of $k$ tuples from $ Q(D) $ such
that the tuples are as relevant as possible to the
query, and at the same time, as diverse as possible to
each other. Subsets of $ Q(D) $ are ranked by an
objective function defined in terms of relevance and
diversity. Query result diversification has found a
variety of applications in databases, information
retrieval and operations research. This paper studies
the complexity of result diversification for relational
queries. We identify three problems in connection with
query result diversification, to determine whether
there exists a set of $k$ tuples that is ranked above a
bound with respect to relevance and diversity, to
assess the rank of a given $k$-element set, and to
count how many $k$-element sets are ranked above a
given bound. We study these problems for a variety of
query languages and for three objective functions. We
establish the upper and lower bounds of these problems,
all matching, for both combined complexity and data
complexity. We also investigate several special
settings of these problems, identifying tractable
cases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dutta:2013:SQF,
author = "Sourav Dutta and Ankur Narang and Suman K. Bera",
title = "Streaming quotient filter: a near optimal approximate
duplicate detection approach for data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "8",
pages = "589--600",
month = jun,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:42 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The unparalleled growth and popularity of the Internet
coupled with the advent of diverse modern applications
such as search engines, on-line transactions, climate
warning systems, etc., has catered to an unprecedented
expanse in the volume of data stored world-wide.
Efficient storage, management, and processing of such
massively exponential amount of data has emerged as a
central theme of research in this direction. Detection
and removal of redundancies and duplicates in real-time
from such multi-trillion record-set to bolster resource
and compute efficiency constitutes a challenging area
of study. The infeasibility of storing the entire data
from potentially unbounded data streams, with the need
for precise elimination of duplicates calls for
intelligent approximate duplicate detection algorithms.
The literature hosts numerous works based on the
well-known probabilistic bitmap structure, Bloom Filter
and its variants. In this paper we propose a novel data
structure, Streaming Quotient Filter, (SQF) for
efficient detection and removal of duplicates in data
streams. SQF intelligently stores the signatures of
elements arriving on a data stream, and along with an
eviction policy provides near zero false positive and
false negative rates. We show that the near optimal
performance of SQF is achieved with a very low memory
requirement, making it ideal for real-time
memory-efficient de-duplication applications having an
extremely low false positive and false negative
tolerance rates. We present detailed theoretical
analysis of the working of SQF, providing a guarantee
on its performance. Empirically, we compare SQF to
alternate methods and show that the proposed method is
superior in terms of memory and accuracy compared to
the existing solutions. We also discuss Dynamic SQF for
evolving streams and the parallel implementation of
SQF.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Korn:2013:RSP,
author = "Flip Korn and Barna Saha and Divesh Srivastava and
Shanshan Ying",
title = "On repairing structural problems in semi-structured
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "601--612",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Semi-structured data such as XML are popular for data
interchange and storage. However, many XML documents
have improper nesting where open- and close-tags are
unmatched. Since some semi-structured data (e.g.,
Latex) have a flexible grammar and since many XML
documents lack an accompanying DTD or XSD, we focus on
computing a syntactic repair via the edit distance. To
solve this problem, we propose a dynamic programming
algorithm which takes cubic time. While this algorithm
is not scalable, well-formed substrings of the data can
be pruned to enable faster computation. Unfortunately,
there are still cases where the dynamic program could
be very expensive; hence, we give branch-and-bound
algorithms based on various combinations of two
heuristics, called MinCost and MaxBenefit, that trade
off between accuracy and efficiency. Finally, we
experimentally demonstrate the performance of these
algorithms on real data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Manshadi:2013:DAL,
author = "Faraz Makari Manshadi and Baruch Awerbuch and Rainer
Gemulla and Rohit Khandekar and Juli{\'a}n Mestre and
Mauro Sozio",
title = "A distributed algorithm for large-scale generalized
matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "613--624",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Generalized matching problems arise in a number of
applications, including computational advertising,
recommender systems, and trade markets. Consider, for
example, the problem of recommending multimedia items
(e.g., DVDs) to users such that (1) users are
recommended items that they are likely to be interested
in, (2) every user gets neither too few nor too many
recommendations, and (3) only items available in stock
are recommended to users. State-of-the-art matching
algorithms fail at coping with large real-world
instances, which may involve millions of users and
items. We propose the first distributed algorithm for
computing near-optimal solutions to large-scale
generalized matching problems like the one above. Our
algorithm is designed to run on a small cluster of
commodity nodes (or in a MapReduce environment), has
strong approximation guarantees, and requires only a
poly-logarithmic number of passes over the input. In
particular, we propose a novel distributed algorithm to
approximately solve mixed packing-covering linear
programs, which include but are not limited to
generalized matching problems. Experiments on
real-world and synthetic data suggest that a practical
variant of our algorithm scales to very large problem
sizes and can be orders of magnitude faster than
alternative approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Geerts:2013:LDC,
author = "Floris Geerts and Giansalvatore Mecca and Paolo
Papotti and Donatello Santoro",
title = "The {LLUNATIC} data-cleaning framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "625--636",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-cleaning (or data-repairing) is considered a
crucial problem in many database-related tasks. It
consists in making a database consistent with respect
to a set of given constraints. In recent years,
repairing methods have been proposed for several
classes of constraints. However, these methods rely on
ad hoc decisions and tend to hard-code the strategy to
repair conflicting values. As a consequence, there is
currently no general algorithm to solve database
repairing problems that involve different kinds of
constraints and different strategies to select
preferred values. In this paper we develop a uniform
framework to solve this problem. We propose a new
semantics for repairs, and a chase-based algorithm to
compute minimal solutions. We implemented the framework
in a DBMS-based prototype, and we report experimental
results that confirm its good scalability and superior
quality in computing repairs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Psaroudakis:2013:SDW,
author = "Iraklis Psaroudakis and Manos Athanassoulis and
Anastasia Ailamaki",
title = "Sharing data and work across concurrent analytical
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "637--648",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's data deluge enables organizations to collect
massive data, and analyze it with an ever-increasing
number of concurrent queries. Traditional data
warehouses (DW) face a challenging problem in executing
this task, due to their query-centric model: each query
is optimized and executed independently. This model
results in high contention for resources. Thus, modern
DW depart from the query-centric model to execution
models involving sharing of common data and work. Our
goal is to show when and how a DW should employ
sharing. We evaluate experimentally two sharing
methodologies, based on their original prototype
systems, that exploit work sharing opportunities among
concurrent queries at run-time: Simultaneous Pipelining
(SP), which shares intermediate results of common
sub-plans, and Global Query Plans (GQP), which build
and evaluate a single query plan with shared operators.
First, after a short review of sharing methodologies,
we show that SP and GQP are orthogonal techniques. SP
can be applied to shared operators of a GQP, reducing
response times by 20\%--48\% in workloads with numerous
common sub-plans. Second, we corroborate previous
results on the negative impact of SP on performance for
cases of low concurrency. We attribute this behavior to
a bottleneck caused by the push-based communication
model of SP. We show that pull-based communication for
SP eliminates the overhead of sharing altogether for
low concurrency, and scales better on multi-core
machines than push-based SP, further reducing response
times by 82\%--86\% for high concurrency. Third, we
perform an experimental analysis of SP, GQP and their
combination, and show when each one is beneficial. We
identify a trade-off between low and high concurrency.
In the former case, traditional query-centric operators
with SP perform better, while in the latter case, GQP
with shared operators enhanced by SP give the best
results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shang:2013:SOA,
author = "Haichuan Shang and Masaru Kitsuregawa",
title = "Skyline operator on anti-correlated distributions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "649--660",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding the skyline in a multi-dimensional space is
relevant to a wide range of applications. The skyline
operator over a set of $d$-dimensional points selects
the points that are not dominated by any other point on
all dimensions. Therefore, it provides a minimal set of
candidates for the users to make their personal
trade-off among all optimal solutions. The existing
algorithms establish both the worst case complexity by
discarding distributions and the average case
complexity by assuming dimensional independence.
However, the data in the real world is more likely to
be anti-correlated. The cardinality and complexity
analysis on dimensionally independent data is
meaningless when dealing with anti-correlated data.
Furthermore, the performance of the existing algorithms
becomes impractical on anti-correlated data. In this
paper, we establish a cardinality model for
anti-correlated distributions. We propose an accurate
polynomial estimation for the expected value of the
skyline cardinality. Because the high skyline
cardinality downgrades the performance of most existing
algorithms on anti-correlated data, we further develop
a determination and elimination framework which extends
the well-adopted elimination strategy. It achieves
remarkable effectiveness and efficiency. The
comprehensive experiments on both real datasets and
benchmark synthetic datasets demonstrate that our
approach significantly outperforms the state-of-the-art
algorithms under a wide range of settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mahmoud:2013:LLM,
author = "Hatem Mahmoud and Faisal Nawab and Alexander Pucher
and Divyakant Agrawal and Amr {El Abbadi}",
title = "Low-latency multi-datacenter databases using
replicated commit",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "661--672",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Web service providers have been using NoSQL datastores
to provide scalability and availability for globally
distributed data at the cost of sacrificing
transactional guarantees. Recently, major web service
providers like Google have moved towards building
storage systems that provide ACID transactional
guarantees for globally distributed data. For example,
the newly published system, Spanner, uses Two-Phase
Commit and Two-Phase Locking to provide atomicity and
isolation for globally distributed data, running on top
of Paxos to provide fault-tolerant log replication. We
show in this paper that it is possible to provide the
same ACID transactional guarantees for multi-datacenter
databases with fewer cross-datacenter communication
trips, compared to replicated logging. Instead of
replicating the transactional log, we replicate the
commit operation itself, by running Two-Phase Commit
multiple times in different datacenters and using Paxos
to reach consensus among datacenters as to whether the
transaction should commit. Doing so not only replaces
several inter-datacenter communication trips with
intra-datacenter communication trips, but also allows
us to integrate atomic commitment and isolation
protocols with consistent replication protocols to
further reduce the number of cross-datacenter
communication trips needed for consistent replication;
for example, by eliminating the need for an election
phase in Paxos. We analyze our approach in terms of
communication trips to compare it against the log
replication approach, then we conduct an extensive
experimental study to compare the performance and
scalability of both approaches under various
multi-datacenter setups.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chi:2013:DBQ,
author = "Yun Chi and Hakan Hac{\'\i}g{\"u}m{\"u}s and Wang-Pin
Hsiung and Jeffrey F. Naughton",
title = "Distribution-based query scheduling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "673--684",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query scheduling, a fundamental problem in database
management systems, has recently received a renewed
attention, perhaps in part due to the rise of the
``database as a service'' (DaaS) model for database
deployment. While there has been a great deal of work
investigating different scheduling algorithms, there
has been comparatively little work investigating what
the scheduling algorithms can or should know about the
queries to be scheduled. In this work, we investigate
the efficacy of using histograms describing the
distribution of likely query execution times as input
to the query scheduler. We propose a novel
distribution-based scheduling algorithm, Shepherd, and
show that Shepherd substantially outperforms
state-of-the-art point-based methods through extensive
experimentation with both synthetic and TPC
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2013:MQT,
author = "Wenfei Fan and Floris Geerts and Frank Neven",
title = "Making queries tractable on big data with
preprocessing: through the eyes of complexity theory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "685--696",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A query class is traditionally considered tractable if
there exists a polynomial-time (PTIME) algorithm to
answer its queries. When it comes to big data, however,
PTIME algorithms often become infeasible in practice. A
traditional and effective approach to coping with this
is to preprocess data off-line, so that queries in the
class can be subsequently evaluated on the data
efficiently. This paper aims to provide a formal
foundation for this approach in terms of computational
complexity. (1) We propose a set of $ \Pi $-tractable
queries, denoted by $ \Pi T Q^0 $, to characterize
classes of queries that can be answered in parallel
poly-logarithmic time (NC) after PTIME preprocessing.
(2) We show that several natural query classes are $
\Pi $-tractable and are feasible on big data. (3) We
also study a set $ \Pi T Q $ of query classes that can
be effectively converted to $ \Pi $-tractable queries
by refactorizing its data and queries for
preprocessing. We introduce a form of NC reductions to
characterize such conversions. (4) We show that a
natural query class is complete for $ \Pi T Q $. (5) We
also show that $ \Pi T Q^0 \subset P $ unless $ P =
{\rm NC} $, i.e., the set $ \Pi T Q^0 $ of all $ \Pi
$-tractable queries is properly contained in the set
$P$ of all PTIME queries. Nonetheless, $ \Pi T Q = P $,
i.e., all PTIME query classes can be made $ \Pi
$-tractable via proper refactorizations. This work is a
step towards understanding the tractability of queries
in the context of big data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaplan:2013:APQ,
author = "Haim Kaplan and Ilia Lotosh and Tova Milo and Slava
Novgorodov",
title = "Answering planning queries with the crowd",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "697--708",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent research has shown that crowd sourcing can be
used effectively to solve problems that are difficult
for computers, e.g., optical character recognition and
identification of the structural configuration of
natural proteins. In this paper we propose to use the
power of the crowd to address yet another difficult
problem that frequently occurs in a daily life ---
answering planning queries whose output is a sequence
of objects/actions, when the goal, i.e., the notion of
``best output'', is hard to formalize. For example,
planning the sequence of places/attractions to visit in
the course of a vacation, where the goal is to enjoy
the resulting vacation the most, or planning the
sequence of courses to take in an academic schedule
planning, where the goal is to obtain solid knowledge
of a given subject domain. Such goals may be easily
understandable by humans, but hard or even impossible
to formalize for a computer. We present a novel
algorithm for efficiently harnessing the crowd to
assist in answering such planning queries. The
algorithm builds the desired plans incrementally,
choosing at each step the ``best'' questions so that the
overall number of questions that need to be asked is
minimized. We prove the algorithm to be optimal within
its class and demonstrate experimentally its
effectiveness and efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Heimel:2013:HOP,
author = "Max Heimel and Michael Saecker and Holger Pirk and
Stefan Manegold and Volker Markl",
title = "Hardware-oblivious parallelism for in-memory
column-stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "709--720",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The multi-core architectures of today's computer
systems make parallelism a necessity for performance
critical applications. Writing such applications in a
generic, hardware-oblivious manner is a challenging
problem: Current database systems thus rely on
labor-intensive and error-prone manual tuning to
exploit the full potential of modern parallel hardware
architectures like multi-core CPUs and graphics cards.
We propose an alternative design for a parallel
database engine, based on a single set of
hardware-oblivious operators, which are compiled down
to the actual hardware at runtime. This design reduces
the development overhead for parallel database engines,
while achieving competitive performance to hand-tuned
systems. We provide a proof-of-concept for this design
by integrating operators written using the parallel
programming framework OpenCL into the open-source
database MonetDB. Following this approach, we achieve
efficient, yet highly portable parallel code without
the need for optimization by hand. We evaluated our
implementation against MonetDB using TPC-H derived
queries and observed a performance that rivals that of
MonetDB's query execution on the CPU and surpasses it
on the GPU. In addition, we show that the same set of
operators runs nearly unchanged on a GPU, demonstrating
the feasibility of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thonangi:2013:PDR,
author = "Risi Thonangi and Jun Yang",
title = "Permuting data on random-access block storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "721--732",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Permutation is a fundamental operator for array data,
with applications in, for example, changing matrix
layouts and reorganizing data cubes. We consider the
problem of permuting large quantities of data stored on
secondary storage that supports fast random block
accesses, such as solid state drives and distributed
key--value stores. Faster random accesses open up
interesting new opportunities for permutation. While
external merge sort has often been used for
permutation, it is an overkill that fails to exploit
the property of permutation fully and carries
unnecessary overhead in storing and comparing keys. We
propose faster algorithms with lower memory
requirements for a large, useful class of permutations.
We also tackle practical challenges that traditional
permutation algorithms have not dealt with, such as
exploiting random block accesses more aggressively,
considering the cost asymmetry between reads and
writes, and handling arbitrary data dimension sizes (as
opposed to perfect powers often assumed by previous
work). As a result, our algorithms are faster and more
broadly applicable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Stoica:2013:IFW,
author = "Radu Stoica and Anastasia Ailamaki",
title = "Improving flash write performance by using update
frequency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "733--744",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Solid-state drives (SSDs) are quickly becoming the
default storage medium as the cost of NAND flash memory
continues to drop. However, flash memory introduces new
challenges, as data cannot be efficiently updated
in-place. To overcome the technology's limitations,
SSDs incorporate a software Flash Translation Layer
(FTL) that implements out-of-place updates, typically
by storing data in a log-structured fashion. Despite a
large number of existing FTL algorithms, SSD
performance, predictability, and lifetime remain an
issue, especially for the write-intensive workloads
specific to database applications. In this paper, we
show how to design FTLs that are more efficient by
using the I/O write skew to guide data placement on
flash memory. We model the relationship between data
placement and write performance for basic I/O write
patterns and detail the most important concepts of
writing to flash memory: (i) the trade-off between the
extra capacity available and write overhead, (ii) the
benefit of adapting data placement to write skew, (iii)
the impact of the cleaning policy, and (iv) how to
estimate the best achievable write performance for a
given I/O workload. Based on the findings of the
theoretical model, we propose a new principled data
placement algorithm that can be incorporated into
existing FTLs. We show the benefits of our data
placement algorithm when running micro-benchmarks and
real database I/O traces: our data placement algorithm
reduces write overhead by 20\%--75\% when compared to
state-of-the-art techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2013:EID,
author = "Lu Li and Chee-Yong Chan",
title = "Efficient indexing for diverse query results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "745--756",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper examines the problem of computing diverse
query results which is useful for browsing search
results in online shopping applications. The search
results are diversified wrt a sequence of output
attributes (termed $d$-order) where an attribute that
appears earlier in the $d$-order has higher priority
for diversification. We present a new indexing
technique, $D$-Index, to efficiently compute diverse
query results for queries with static or dynamic
$d$-orders. Our performance evaluation demonstrates
that our $D$-Index outperforms the state-of-the-art
techniques developed for queries with static or dynamic
$d$-orders.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2013:RUS,
author = "Chen Jason Zhang and Lei Chen and H. V. Jagadish and
Chen Caleb Cao",
title = "Reducing uncertainty of schema matching via
crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "757--768",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Schema matching is a central challenge for data
integration systems. Automated tools are often
uncertain about schema matchings they suggest, and this
uncertainty is inherent since it arises from the
inability of the schema to fully capture the semantics
of the represented data. Human common sense can often
help. Inspired by the popularity and the success of
easily accessible crowdsourcing platforms, we explore
the use of crowdsourcing to reduce the uncertainty of
schema matching. Since it is typical to ask simple
questions on crowdsourcing platforms, we assume that
each question, namely Correspondence Correctness
Question (CCQ), is to ask the crowd to decide whether a
given correspondence should exist in the correct
matching. We propose frameworks and efficient
algorithms to dynamically manage the CCQs, in order to
maximize the uncertainty reduction within a limited
budget of questions. We develop two novel approaches,
namely ``Single CCQ'' and ``Multiple CCQ'', which
adaptively select, publish and manage the questions. We
verified the value of our solutions with simulation and
real implementation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2013:TCI,
author = "Bin Yang and Chenjuan Guo and Christian S. Jensen",
title = "Travel cost inference from sparse, spatio-temporally
correlated time series using {Markov} models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "9",
pages = "769--780",
month = jul,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:46 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The monitoring of a system can yield a set of
measurements that can be modeled as a collection of
time series. These time series are often sparse, due to
missing measurements, and spatiotemporally correlated,
meaning that spatially close time series exhibit
temporal correlation. The analysis of such time series
offers insight into the underlying system and enables
prediction of system behavior. While the techniques
presented in the paper apply more generally, we
consider the case of transportation systems and aim to
predict travel cost from GPS tracking data from probe
vehicles. Specifically, each road segment has an
associated travel-cost time series, which is derived
from GPS data. We use spatio-temporal hidden Markov
models (STHMM) to model correlations among different
traffic time series. We provide algorithms that are
able to learn the parameters of an STHMM while
contending with the sparsity, spatio-temporal
correlation, and heterogeneity of the time series.
Using the resulting STHMM, near future travel costs in
the transportation network, e.g., travel time or
greenhouse gas emissions, can be inferred, enabling a
variety of routing services, e.g., eco-routing.
Empirical studies with a substantial GPS data set offer
insight into the design properties of the proposed
framework and algorithms, demonstrating the
effectiveness and efficiency of travel cost
inferencing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2013:QOC,
author = "Hyunjung Park and Jennifer Widom",
title = "Query optimization over crowdsourced data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "781--792",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deco is a comprehensive system for answering
declarative queries posed over stored relational data
together with data obtained on-demand from the crowd.
In this paper we describe Deco's cost-based query
optimizer, building on Deco's data model, query
language, and query execution engine presented earlier.
Deco's objective in query optimization is to find the
best query plan to answer a query, in terms of
estimated monetary cost. Deco's query semantics and
plan execution strategies require several fundamental
changes to traditional query optimization. Novel
techniques incorporated into Deco's query optimizer
include a cost model distinguishing between ``free''
existing data versus paid new data, a cardinality
estimation algorithm coping with changes to the
database state during query execution, and a plan
enumeration algorithm maximizing reuse of common
subplans in a setting that makes reuse challenging. We
experimentally evaluate Deco's query optimizer,
focusing on the accuracy of cost estimation and the
efficiency of plan enumeration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2013:DAD,
author = "Yang Wang and Peng Wang and Jian Pei and Wei Wang and
Sheng Huang",
title = "A data-adaptive and dynamic segmentation index for
whole matching on time series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "793--804",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Similarity search on time series is an essential
operation in many applications. In the state-of-the-art
methods, such as the R-tree based methods, SAX and
iSAX, time series are by default divided into
equi-length segments globally, that is, all time series
are segmented in the same way. Those methods then focus
on how to approximate or symbolize the segments and
construct indexes. In this paper, we make an important
observation: global segmentation of all time series may
incur unnecessary cost in space and time for indexing
time series. We develop DSTree, a data adaptive and
dynamic segmentation index on time series. In addition
to savings in space and time, our new index can provide
tight upper and lower bounds on distances between time
series. An extensive empirical study shows that our new
index DSTree supports time series similarity search
effectively and efficiently.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bronzi:2013:EIP,
author = "Mirko Bronzi and Valter Crescenzi and Paolo Merialdo
and Paolo Papotti",
title = "Extraction and integration of partially overlapping
web sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "805--816",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present an unsupervised approach for harvesting the
data exposed by a set of structured and partially
overlapping data-intensive web sources. Our proposal
comes within a formal framework tackling two problems:
the data extraction problem, to generate extraction
rules based on the input websites, and the data
integration problem, to integrate the extracted data in
a unified schema. We introduce an original algorithm,
WEIR, to solve the stated problems and formally prove
its correctness. WEIR leverages the overlapping data
among sources to make better decisions both in the data
extraction (by pruning rules that do not lead to
redundant information) and in the data integration (by
reflecting local properties of a source over the
mediated schema). Along the way, we characterize the
amount of redundancy needed by our algorithm to produce
a solution, and present experimental results to show
the benefits of our approach with respect to existing
solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2013:YYP,
author = "Yuan Yuan and Rubao Lee and Xiaodong Zhang",
title = "The {Yin} and {Yang} of processing data warehousing
queries on {GPU} devices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "817--828",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database community has made significant research
efforts to optimize query processing on GPUs in the
past few years. However, we can hardly find that GPUs
have been truly adopted in major warehousing production
systems. Preparing to merge GPUs to the warehousing
systems, we have identified and addressed several
critical issues in a three-dimensional study of
warehousing queries on GPUs by varying query
characteristics, software techniques, and GPU hardware
configurations. We also propose an analytical model to
understand and predict the query performance on GPUs.
Based on our study, we present our performance insights
for warehousing query execution on GPUs. The objective
of our work is to provide a comprehensive guidance for
GPU architects, software system designers, and database
practitioners to narrow the speed gap between the GPU
kernel execution (the fast mode) and data transfer to
prepare GPU execution (the slow mode) for high
performance in processing data warehousing queries. The
GPU query engine developed in this work is open source
to the public.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2013:MIG,
author = "Dayu Yuan and Prasenjit Mitra and C. Lee Giles",
title = "Mining and indexing graphs for supergraph search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "829--840",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study supergraph search (SPS), that is, given a
query graph $q$ and a graph database $G$ that contains
a collection of graphs, return graphs that have $q$ as
a supergraph from $G$. SPS has broad applications in
bioinformatics, cheminformatics and other scientific
and commercial fields. Determining whether a graph is a
subgraph (or supergraph) of another is an NP-complete
problem. Hence, it is intractable to compute SPS for
large graph databases. Two separate indexing methods, a
``filter + verify''-based method and a
``prefix-sharing''-based method, have been studied to
efficiently compute SPS. To implement the above two
methods, subgraph patterns are mined from the graph
database to build an index. Those subgraphs are mined
to optimize either the filtering gain or the
prefix-sharing gain. However, no single subgraph-mining
algorithm considers both gains. This work is the first
one to mine subgraphs to optimize both the filtering
gain and the prefix-sharing gain while processing SPS
queries. First, we show that the subgraph-mining
problem is NP-hard. Then, we propose two
polynomial-time algorithms to solve the problem with an
approximation ratio of $ 1 - 1 / e $ and $ 1 / 4 $
respectively. In addition, we construct a lattice-like
index, LW-index, to organize the selected subgraph
patterns for fast index-lookup. Our experiments show
that our approach improves the query processing time
for SPS queries by a factor of 3 to 10.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2013:ERM,
author = "Jianmin Wang and Shaoxu Song and Xiaochen Zhu and
Xuemin Lin",
title = "Efficient recovery of missing events",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "841--852",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "For various entering and transmission issues raised by
human or system, missing events often occur in event
data, which record execution logs of business
processes. Without recovering these missing events,
applications such as provenance analysis or complex
event processing built upon event data are not
reliable. Following the minimum change discipline in
improving data quality, it is also rational to find a
recovery that minimally differs from the original data.
Existing recovery approaches fall short of efficiency
owing to enumerating and searching over all the
possible sequences of events. In this paper, we study
the efficient techniques for recovering missing events.
According to our theoretical results, the recovery
problem is proved to be NP-hard. Nevertheless, we are
able to concisely represent the space of event
sequences in a branching framework. Advanced indexing
and pruning techniques are developed to further improve
the recovery efficiency. Our proposed efficient
techniques make it possible to find top-$k$ recoveries.
The experimental results demonstrate that our minimum
recovery approach achieves high accuracy, and
significantly outperforms the state-of-the-art
technique for up to 5 orders of magnitudes improvement
in time performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2013:HAA,
author = "Kai Ren and YongChul Kwon and Magdalena Balazinska and
Bill Howe",
title = "{Hadoop}'s adolescence: an analysis of {Hadoop} usage
in scientific workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "853--864",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We analyze Hadoop workloads from three different
research clusters from a user-centric perspective. The
goal is to better understand data scientists' use of
the system and how well the use of the system matches
its design. Our analysis suggests that Hadoop usage is
still in its adolescence. We see underuse of Hadoop
features, extensions, and tools. We see significant
diversity in resource usage and application styles,
including some interactive and iterative workloads,
motivating new tools in the ecosystem. We also observe
significant opportunities for optimizations of these
workloads. We find that job customization and
configuration are used in a narrow scope, suggesting
the future pursuit of automatic tuning systems.
Overall, we present the first user-centered measurement
study of Hadoop and find significant opportunities for
improving its efficient use for data scientists.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mansour:2013:RSE,
author = "Essam Mansour and Ahmed El-Roby and Panos Kalnis and
Aron Ahmadia and Ashraf Aboulnaga",
title = "{RACE}: a scalable and elastic parallel system for
discovering repeats in very long sequences",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "865--876",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A wide range of applications, including
bioinformatics, time series, and log analysis, depend
on the identification of repetitions in very long
sequences. The problem of finding maximal pairs
subsumes most important types of repetition-finding
tasks. Existing solutions require both the input
sequence and its index (typically an order of magnitude
larger than the input) to fit in memory. Moreover, they
are serial algorithms with long execution time.
Therefore, they are limited to small datasets, despite
the fact that modern applications demand orders of
magnitude longer sequences. In this paper we present
RACE, a parallel system for finding maximal pairs in
very long sequences. RACE supports parallel execution
on stand-alone multicore systems, in addition to
scaling to thousands of nodes on clusters or
supercomputers. RACE does not require the input or the
index to fit in memory; therefore, it supports very
long sequences with limited memory. Moreover, it uses a
novel array representation that allows for
cache-efficient implementation. RACE is particularly
suitable for the cloud (e.g., Amazon EC2) because,
based on availability, it can scale elastically to more
or fewer machines during its execution. Since scaling
out introduces overheads, mainly due to load imbalance,
we propose a cost model to estimate the expected
speedup, based on statistics gathered through sampling.
The model allows the user to select the appropriate
combination of cloud resources based on the provider's
prices and the required deadline. We conducted
extensive experimental evaluation with large real
datasets and large computing infrastructures. In
contrast to existing methods, RACE can handle the
entire human genome on a typical desktop computer with
16GB RAM. Moreover, for a problem that takes 10 hours
of serial execution, RACE finishes in 28 seconds using
2,048 nodes on an IBM BlueGene/P supercomputer.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Levandoski:2013:LCS,
author = "Justin Levandoski and David Lomet and Sudipta
Sengupta",
title = "{LLAMA}: a cache\slash storage subsystem for modern
hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "877--888",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "LLAMA is a subsystem designed for new hardware
environments that supports an API for page-oriented
access methods, providing both cache and storage
management. Caching (CL) and storage (SL) layers use a
common mapping table that separates a page's logical
and physical location. CL supports data updates and
management updates (e.g., for index re-organization)
via latch-free compare-and-swap atomic state changes on
its mapping table. SL uses the same mapping table to
cope with page location changes produced by log
structuring on every page flush. To demonstrate LLAMA's
suitability, we tailored our latch-free Bw-tree
implementation to use LLAMA. The Bw-tree is a B-tree
style index. Layered on LLAMA, it has higher
performance and scalability using real workloads
compared with BerkeleyDB's B-tree, which is known for
good performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2013:RCP,
author = "Jiong He and Mian Lu and Bingsheng He",
title = "Revisiting co-processing for hash joins on the coupled
{CPU--GPU} architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "889--900",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query co-processing on graphics processors (GPUs) has
become an effective means to improve the performance of
main memory databases. However, the relatively low
bandwidth and high latency of the PCI-e bus are usually
bottleneck issues for co-processing. Recently, coupled
CPU-GPU architectures have received a lot of attention,
e.g. AMD APUs with the CPU and the GPU integrated into
a single chip. That opens up new opportunities for
optimizing query co-processing. In this paper, we
experimentally revisit hash joins, one of the most
important join algorithms for main memory databases, on
a coupled CPU-GPU architecture. Particularly, we study
the fine-grained co-processing mechanisms on hash joins
with and without partitioning. The co-processing
outlines an interesting design space. We extend
existing cost models to automatically guide decisions
on the design space. Our experimental results on a
recent AMD APU show that (1) the coupled architecture
enables fine-grained co-processing and cache reuses,
which are inefficient on discrete CPU-GPU
architectures; (2) the cost model can automatically
guide the design and tuning knobs in the design space;
(3) fine-grained co-processing achieves up to 53\%,
35\% and 28\% performance improvement over CPU-only,
GPU-only and conventional CPU-GPU co-processing,
respectively. We believe that the insights and
implications from this study are initial yet important
for further research on query co-processing on coupled
CPU-GPU architectures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiao:2013:TKN,
author = "Miao Qiao and Lu Qin and Hong Cheng and Jeffrey Xu Yu
and Wentao Tian",
title = "Top-$k$ nearest keyword search on large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "901--912",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is quite common for networks emerging nowadays to
have labels or textual contents on the nodes. On such
networks, we study the problem of top-$k$ nearest
keyword ($k$-NK) search. In a network $G$ modeled as an
undirected graph, each node is attached with zero or
more keywords, and each edge is assigned with a weight
measuring its length. Given a query node $q$ in $G$ and
a keyword $ \lambda $, a $k$-NK query seeks $k$ nodes
which contain $ \lambda $ and are nearest to $q$.
$k$-NK is not only useful as a stand-alone query but
also as a building block for tackling complex graph
pattern matching problems. The key to an accurate
$k$-NK result is a precise shortest distance estimation
in a graph. Based on the latest distance oracle
technique, we build a shortest path tree for a distance
oracle and use the tree distance as a more accurate
estimation. With such representation, the original
$k$-NK query on a graph can be reduced to answering the
query on a set of trees and then assembling the results
obtained from the trees. We propose two efficient
algorithms to report the exact $k$-NK result on a tree.
One is query time optimized for a scenario when a small
number of result nodes are of interest to users. The
other handles $k$-NK queries for an arbitrarily large
$k$ efficiently. In obtaining a $k$-NK result on a
graph from that on trees, a global storage technique is
proposed to further reduce the index size and the query
time. Extensive experimental results conform with our
theoretical findings, and demonstrate the effectiveness
and efficiency of our $k$-NK algorithms on large real
graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Armenatzoglou:2013:GFG,
author = "Nikos Armenatzoglou and Stavros Papadopoulos and
Dimitris Papadias",
title = "A general framework for geo-social query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "913--924",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The proliferation of GPS-enabled mobile devices and
the popularity of social networking have recently led
to the rapid growth of Geo-Social Networks (GeoSNs).
GeoSNs have created a fertile ground for novel
location-based social interactions and advertising.
These can be facilitated by GeoSN queries, which
extract useful information combining both the social
relationships and the current location of the users.
This paper constitutes the first systematic work on
GeoSN query processing. We propose a general framework
that offers flexible data management and algorithmic
design. Our architecture segregates the social,
geographical and query processing modules. Each GeoSN
query is processed via a transparent combination of
primitive queries issued to the social and geographical
modules. We demonstrate the power of our framework by
introducing several ``basic'' and ``advanced'' query
types, and devising various solutions for each type.
Finally, we perform an exhaustive experimental
evaluation with real and synthetic datasets, based on
realistic implementations with both commercial software
(such as MongoDB) and state-of-the-art research
methods. Our results confirm the viability of our
framework in typical large-scale GeoSNs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2013:TPQ,
author = "Wentao Wu and Yun Chi and Hakan Hac{\'\i}g{\"u}m{\"u}s
and Jeffrey F. Naughton",
title = "Towards predicting query execution time for concurrent
and dynamic database workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "925--936",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Predicting query execution time is crucial for many
database management tasks including admission control,
query scheduling, and progress monitoring. While a
number of recent papers have explored this problem, the
bulk of the existing work either considers prediction
for a single query, or prediction for a static workload
of concurrent queries, where by ``static'' we mean that
the queries to be run are fixed and known. In this
paper, we consider the more general problem of dynamic
concurrent workloads. Unlike most previous work on
query execution time prediction, our proposed framework
is based on analytic modeling rather than machine
learning. We first use the optimizer's cost model to
estimate the I/O and CPU requirements for each pipeline
of each query in isolation, and then use a combination
queueing model and buffer pool model that merges the
I/O and CPU requests from concurrent queries to predict
running times. We compare the proposed approach with a
machine-learning based approach that is a variant of
previous work. Our experiments show that our
analytic-model based approach can lead to competitive
and often better prediction accuracy than its
machine-learning based counterpart.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Garofalakis:2013:SBG,
author = "Minos Garofalakis and Daniel Keren and Vasilis
Samoladas",
title = "Sketch-based geometric monitoring of distributed
stream queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "937--948",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Emerging large-scale monitoring applications rely on
continuous tracking of complex data-analysis queries
over collections of massive, physically-distributed
data streams. Thus, in addition to the space- and
time-efficiency requirements of conventional stream
processing (at each remote monitor site), effective
solutions also need to guarantee communication
efficiency (over the underlying communication network).
The complexity of the monitored query adds to the
difficulty of the problem --- this is especially true
for nonlinear queries (e.g., joins), where no obvious
solutions exist for distributing the monitor condition
across sites. The recently proposed geometric method
offers a generic methodology for splitting an arbitrary
(non-linear) global threshold-monitoring task into a
collection of local site constraints; still, the
approach relies on maintaining the complete stream(s)
at each site, thus raising serious efficiency concerns
for massive data streams. In this paper, we propose
novel algorithms for efficiently tracking a broad class
of complex aggregate queries in such
distributed-streams settings. Our tracking schemes rely
on a novel combination of the geometric method with
compact sketch summaries of local data streams, and
maintain approximate answers with provable error
guarantees, while optimizing space and processing costs
at each remote site and communication cost across the
network. One of our key technical insights for the
effective use of the geometric method lies in
exploiting a much lower-dimensional space for
monitoring the sketch-based estimation query. Due to
the complex, highly nonlinear nature of these
estimates, efficiently monitoring the local geometric
constraints poses challenging algorithmic issues for
which we propose novel solutions. Experimental results
on real-life data streams verify the effectiveness of
our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Long:2013:DPT,
author = "Cheng Long and Raymond Chi-Wing Wong and H. V.
Jagadish",
title = "Direction-preserving trajectory simplification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "10",
pages = "949--960",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:50 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Trajectories of moving objects are collected in many
applications. Raw trajectory data is typically very
large, and has to be simplified before use. In this
paper, we introduce the notion of direction-preserving
trajectory simplification, and show both analytically
and empirically that it can support a broader range of
applications than traditional position-preserving
trajectory simplification. We present a polynomial-time
algorithm for optimal direction-preserving
simplification, and another approximate algorithm with
a quality guarantee. Extensive experimental evaluation
with real trajectory data shows the benefit of the new
techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bruno:2013:CCS,
author = "Nicolas Bruno and Sapna Jain and Jingren Zhou",
title = "Continuous cloud-scale query optimization and
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "961--972",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive data analysis in cloud-scale data centers
plays a crucial role in making critical business
decisions. High-level scripting languages free
developers from understanding various system
trade-offs, but introduce new challenges for query
optimization. One key optimization challenge is missing
accurate data statistics, typically due to massive data
volumes and their distributed nature, complex
computation logic, and frequent usage of user-defined
functions. In this paper we propose novel techniques to
adapt query processing in the Scope system, the
cloud-scale computation environment in Microsoft Online
Services. We continuously monitor query execution,
collect actual runtime statistics, and adapt parallel
execution plans as the query executes. We discuss
similarities and differences between our approach and
alternatives proposed in the context of traditional
centralized systems. Experiments on large-scale Scope
production clusters show that the proposed techniques
systematically solve the challenge of
missing/inaccurate data statistics, detect and resolve
partition skew and plan structure, and improve query
latency by a few folds for real workloads. Although we
focus on optimizing high-level languages, the same
ideas are also applicable for MapReduce systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cherniak:2013:OSB,
author = "Andrii Cherniak and Huma Zaidi and Vladimir
Zadorozhny",
title = "Optimization strategies for {A\slash B} testing on
{HADOOP}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "973--984",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this work, we present a set of techniques that
considerably improve the performance of executing
concurrent MapReduce jobs. Our proposed solution relies
on proper resource allocation for concurrent Hive jobs
based on data dependency, inter-query optimization and
modeling of Hadoop cluster load. To the best of our
knowledge, this is the first work towards
Hive/MapReduce job optimization which takes Hadoop
cluster load into consideration. We perform an
experimental study that demonstrates 233\% reduction in
execution time for concurrent vs sequential execution
schema. We report up to 40\% extra reduction in
execution time for concurrent job execution after
resource usage optimization. The results reported in
this paper were obtained in a pilot project to assess
the feasibility of migrating A/B testing from Teradata
+ SAS analytics infrastructure to Hadoop. This work was
performed on eBay production Hadoop cluster.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmeleegy:2013:POS,
author = "Khaled Elmeleegy",
title = "{Piranha}: optimizing short jobs in {Hadoop}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "985--996",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cluster computing has emerged as a key parallel
processing platform for large scale data. All major
internet companies use it as their major central
processing platform. One of cluster computing's most
popular examples is MapReduce and its open source
implementation Hadoop. These systems were originally
designed for batch and massive-scale computations.
Interestingly, over time their production workloads
have evolved into a mix of a small fraction of large
and long-running jobs and a much bigger fraction of
short jobs. This came about because these systems end
up being used as data warehouses, which store most of
the data sets and attract ad hoc, short, data-mining
queries. Moreover, the availability of higher level
query languages that operate on top of these cluster
systems proliferated these ad hoc queries. Since
existing systems were not designed for short,
latency-sensitive jobs, short interactive jobs suffer
from poor response times. In this paper, we present
Piranha--a system for optimizing short jobs on Hadoop
without affecting the larger jobs. It runs on existing
unmodified Hadoop clusters facilitating its adoption.
Piranha exploits characteristics of short jobs learned
from production workloads at Yahoo! clusters to reduce
the latency of such jobs. To demonstrate Piranha's
effectiveness, we evaluated its performance using three
realistic short queries. Piranha was able to reduce the
queries' response times by up to 71\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sadoghi:2013:MUD,
author = "Mohammad Sadoghi and Kenneth A. Ross and Mustafa Canim
and Bishwaranjan Bhattacharjee",
title = "Making updates disk-{I/O} friendly using {SSDs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "997--1008",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multiversion databases store both current and
historical data. Rows are typically annotated with
timestamps representing the period when the row is/was
valid. We develop novel techniques for reducing index
maintenance in multiversion databases, so that indexes
can be used effectively for analytical queries over
current data without being a heavy burden on
transaction throughput. To achieve this end, we
re-design persistent index data structures in the
storage hierarchy to employ an extra level of
indirection. The indirection level is stored on solid
state disks that can support very fast random I/Os, so
that traversing the extra level of indirection incurs a
relatively small overhead. The extra level of
indirection dramatically reduces the number of magnetic
disk I/Os that are needed for index updates, and
localizes maintenance to indexes on updated attributes.
Further, we batch insertions within the indirection
layer in order to reduce physical disk I/Os for
indexing new records. By reducing the index maintenance
overhead on transactions, we enable operational data
stores to create more indexes to support queries. We
have developed a prototype of our indirection proposal
by extending the widely used Generalized Search Tree
(GiST) open-source project, which is also employed in
PostgreSQL. Our working implementation demonstrates
that we can significantly reduce index maintenance
and/or query processing cost, by a factor of 3. For
insertions of new records, our novel batching technique
can save up to 90\% of the insertion time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aji:2013:HGH,
author = "Ablimit Aji and Fusheng Wang and Hoang Vo and Rubao
Lee and Qiaoling Liu and Xiaodong Zhang and Joel
Saltz",
title = "{Hadoop GIS}: a high performance spatial data
warehousing system over {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1009--1020",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Support of high performance queries on large volumes
of spatial data becomes increasingly important in many
application domains, including geospatial problems in
numerous fields, location based services, and emerging
scientific applications that are increasingly data- and
compute-intensive. The emergence of massive scale
spatial data is due to the proliferation of cost
effective and ubiquitous positioning technologies,
development of high resolution imaging technologies,
and contribution from a large number of community
users. There are two major challenges for managing and
querying massive spatial data to support spatial
queries: the explosion of spatial data, and the high
computational complexity of spatial queries. In this
paper, we present Hadoop-GIS --- a scalable and high
performance spatial data warehousing system for running
large scale spatial queries on Hadoop. Hadoop-GIS
supports multiple types of spatial queries on MapReduce
through spatial partitioning, customizable spatial
query engine RESQUE, implicit parallel spatial query
execution on MapReduce, and effective methods for
amending query results through handling boundary
objects. Hadoop-GIS utilizes global partition indexing
and customizable on demand local spatial indexing to
achieve efficient query processing. Hadoop-GIS is
integrated into Hive to support declarative spatial
queries with an integrated architecture. Our
experiments have demonstrated the high efficiency of
Hadoop-GIS on query response and high scalability to
run on commodity clusters. Our comparative experiments
have showed that performance of Hadoop-GIS is on par
with parallel SDBMS and outperforms SDBMS for
compute-intensive queries. Hadoop-GIS is available as a
set of library for processing spatial queries, and as
an integrated software package in Hive.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bamba:2013:SCO,
author = "Bhuvan Bamba and Siva Ravada and Ying Hu and Richard
Anderson",
title = "Statistics collection in {Oracle Spatial and Graph}:
fast histogram construction for complex geometry
objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1021--1032",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Oracle Spatial and Graph is a geographic information
system (GIS) which provides users the ability to store
spatial data alongside conventional data in Oracle. As
a result of the coexistence of spatial and other data,
we observe a trend towards users performing
increasingly complex queries which involve spatial as
well as non-spatial predicates. Accurate selectivity
values, especially for queries with multiple predicates
requiring joins among numerous tables, are essential
for the database optimizer to determine a good
execution plan. For queries involving spatial
predicates, this requires that reasonably accurate
statistics collection has been performed on the spatial
data. For extensible data cartridges such as Oracle
Spatial and Graph, the optimizer expects to receive
accurate predicate selectivity and cost values from
functions implemented within the data cartridge.
Although statistics collection for spatial data has
been researched in academia for a few years; to the
best of our knowledge, this is the first work to
present spatial statistics collection implementation
details for a commercial GIS database. In this paper,
we describe our experiences with implementation of
statistics collection methods for complex geometry
objects within Oracle Spatial and Graph. Firstly, we
exemplify issues with previous partitioning-based
algorithms in presence of complex geometry objects and
suggest enhancements which resolve the issues.
Secondly, we propose a main memory implementation which
not only speeds up the disk-based partitioning
algorithms but also utilizes existing R-tree indexes to
provide surprisingly accurate selectivity estimates.
Last but not the least, we provide extensive
experimental results and an example study which
displays the efficacy of our approach on Oracle query
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akidau:2013:MFT,
author = "Tyler Akidau and Alex Balikov and Kaya Bekiroglu and
Slava Chernyak and Josh Haberman and Reuven Lax and Sam
McVeety and Daniel Mills and Paul Nordstrom and Sam
Whittle",
title = "{MillWheel}: fault-tolerant stream processing at
{Internet} scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1033--1044",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MillWheel is a framework for building low-latency
data-processing applications that is widely used at
Google. Users specify a directed computation graph and
application code for individual nodes, and the system
manages persistent state and the continuous flow of
records, all within the envelope of the framework's
fault-tolerance guarantees. This paper describes
MillWheel's programming model as well as its
implementation. The case study of a continuous anomaly
detector in use at Google serves to motivate how many
of MillWheel's features are used. MillWheel's
programming model provides a notion of logical time,
making it simple to write time-based aggregations.
MillWheel was designed from the outset with fault
tolerance and scalability in mind. In practice, we find
that MillWheel's unique combination of scalability,
fault tolerance, and a versatile programming model
lends itself to a wide variety of problems at Google.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rae:2013:OAS,
author = "Ian Rae and Eric Rollins and Jeff Shute and Sukhdeep
Sodhi and Radek Vingralek",
title = "Online, asynchronous schema change in {F1}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1045--1056",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce a protocol for schema evolution in a
globally distributed database management system with
shared data, stateless servers, and no global
membership. Our protocol is asynchronous--it allows
different servers in the database system to transition
to a new schema at different times--and online--all
servers can access and update all data during a schema
change. We provide a formal model for determining the
correctness of schema changes under these conditions,
and we demonstrate that many common schema changes can
cause anomalies and database corruption. We avoid these
problems by replacing corruption-causing schema changes
with a sequence of schema changes that is guaranteed to
avoid corrupting the database so long as all servers
are no more than one schema version behind at any time.
Finally, we discuss a practical implementation of our
protocol in F1, the database management system that
stores data for Google AdWords.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abraham:2013:SDD,
author = "Lior Abraham and John Allen and Oleksandr Barykin and
Vinayak Borkar and Bhuwan Chopra and Ciprian Gerea and
Daniel Merl and Josh Metzler and David Reiss and Subbu
Subramanian and Janet L. Wiener and Okay Zed",
title = "{Scuba}: diving into data at {Facebook}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1057--1067",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Facebook takes performance monitoring seriously.
Performance issues can impact over one billion users so
we track thousands of servers, hundreds of PB of daily
network traffic, hundreds of daily code changes, and
many other metrics. We require latencies of under a
minute from events occurring (a client request on a
phone, a bug report filed, a code change checked in) to
graphs showing those events on developers' monitors.
Scuba is the data management system Facebook uses for
most real-time analysis. Scuba is a fast, scalable,
distributed, in-memory database built at Facebook. It
currently ingests millions of rows (events) per second
and expires data at the same rate. Scuba stores data
completely in memory on hundreds of servers each with
144 GB RAM. To process each query, Scuba aggregates
data from all servers. Scuba processes almost a million
queries per day. Scuba is used extensively for
interactive, ad hoc, analysis queries that run in under
a second over live data. In addition, Scuba is the
workhorse behind Facebook's code regression analysis,
bug report monitoring, ads revenue monitoring, and
performance debugging.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shute:2013:FDS,
author = "Jeff Shute and Radek Vingralek and Bart Samwel and Ben
Handy and Chad Whipkey and Eric Rollins and Mircea
Oancea and Kyle Littlefield and David Menestrina and
Stephan Ellner and John Cieslewicz and Ian Rae and
Traian Stancescu and Himani Apte",
title = "{F1}: a distributed {SQL} database that scales",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1068--1079",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "F1 is a distributed relational database system built
at Google to support the AdWords business. F1 is a
hybrid database that combines high availability, the
scalability of NoSQL systems like Bigtable, and the
consistency and usability of traditional SQL databases.
F1 is built on Spanner, which provides synchronous
cross-datacenter replication and strong consistency.
Synchronous replication implies higher commit latency,
but we mitigate that latency by using a hierarchical
schema model with structured data types and through
smart application design. F1 also includes a fully
functional distributed SQL query engine and automatic
change tracking and publishing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Raman:2013:DBA,
author = "Vijayshankar Raman and Gopi Attaluri and Ronald Barber
and Naresh Chainani and David Kalmuk and Vincent
KulandaiSamy and Jens Leenstra and Sam Lightstone and
Shaorong Liu and Guy M. Lohman and Tim Malkemus and
Rene Mueller and Ippokratis Pandis and Berni Schiefer
and David Sharpe and Richard Sidle and Adam Storm and
Liping Zhang",
title = "{DB2} with {BLU} acceleration: so much more than just
a column store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1080--1091",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "DB2 with BLU Acceleration deeply integrates innovative
new techniques for defining and processing
column-organized tables that speed read-mostly Business
Intelligence queries by 10 to 50 times and improve
compression by 3 to 10 times, compared to traditional
row-organized tables, without the complexity of
defining indexes or materialized views on those tables.
But DB2 BLU is much more than just a column store.
Exploiting frequency-based dictionary compression and
main-memory query processing technology from the Blink
project at IBM Research --- Almaden, DB2 BLU performs
most SQL operations --- predicate application (even
range predicates and IN-lists), joins, and grouping ---
on the compressed values, which can be packed
bit-aligned so densely that multiple values fit in a
register and can be processed simultaneously via SIMD
(single-instruction, multiple-data) instructions.
Designed and built from the ground up to exploit modern
multi-core processors, DB2 BLU's hardware-conscious
algorithms are carefully engineered to maximize
parallelism by using novel data structures that need
little latching, and to minimize data-cache and
instruction-cache misses. Though DB2 BLU is optimized
for in-memory processing, database size is not limited
by the size of main memory. Fine-grained synopses, late
materialization, and a new probabilistic buffer pool
protocol for scans minimize disk I/Os, while aggressive
prefetching reduces I/O stalls. Full integration with
DB2 ensures that DB2 with BLU Acceleration benefits
from the full functionality and robust utilities of a
mature product, while still enjoying order-of-magnitude
performance gains from revolutionary technology without
even having to change the SQL, and can mix
column-organized and row-organized tables in the same
tablespace and even within the same query.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ovsiannikov:2013:QFS,
author = "Michael Ovsiannikov and Silvius Rus and Damian Reeves
and Paul Sutter and Sriram Rao and Jim Kelly",
title = "The {Quantcast File System}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1092--1101",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Quantcast File System (QFS) is an efficient
alternative to the Hadoop Distributed File System
(HDFS). QFS is written in C++, is plugin compatible
with Hadoop MapReduce, and offers several efficiency
improvements relative to HDFS: 50\% disk space savings
through erasure coding instead of replication, a
resulting doubling of write throughput, a faster name
node, support for faster sorting and logging through a
concurrent append feature, a native command line client
much faster than hadoop fs, and global
feedback-directed I/O device management. As QFS works
out of the box with Hadoop, migrating data from HDFS to
QFS involves simply executing hadoop distcp. QFS is
being developed fully open source and is available
under an Apache license from
https://github.com/quantcast/qfs. Multi-petabyte QFS
instances have been in heavy production use since
2011.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bellamkonda:2013:ABD,
author = "Srikanth Bellamkonda and Hua-Gang Li and Unmesh Jagtap
and Yali Zhu and Vince Liang and Thierry Cruanes",
title = "Adaptive and big data scale parallel execution in
{Oracle}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1102--1113",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper showcases some of the newly introduced
parallel execution methods in Oracle RDBMS. These
methods provide highly scalable and adaptive evaluation
for the most commonly used SQL operations --- joins,
group-by, rollup/cube, grouping sets, and window
functions. The novelty of these techniques is their use
of multi-stage parallelization models, accommodation of
optimizer mistakes, and the runtime parallelization and
data distribution decisions. These parallel plans adapt
based on the statistics gathered on the real data at
query execution time. We realized enormous performance
gains from these adaptive parallelization techniques.
The paper also discusses our approach to parallelize
queries with operations that are inherently serial. We
believe all these techniques will make their way into
big data analytics and other massively parallel
database systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bellare:2013:WSM,
author = "Kedar Bellare and Carlo Curino and Ashwin
Machanavajjhala and Peter Mika and Mandar Rahurkar and
Aamod Sane",
title = "{WOO}: a scalable and multi-tenant platform for
continuous knowledge base synthesis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1114--1125",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Search, exploration and social experience on the Web
has recently undergone tremendous changes with search
engines, web portals and social networks offering a
different perspective on information discovery and
consumption. This new perspective is aimed at capturing
user intents, and providing richer and highly connected
experiences. The new battleground revolves around
technologies for the ingestion, disambiguation and
enrichment of entities from a variety of structured and
unstructured data sources --- we refer to this process
as knowledge base synthesis. This paper presents the
design, implementation and production deployment of the
Web Of Objects (WOO) system, a Hadoop-based platform
tackling such challenges. WOO has been designed and
implemented to enable various products in Yahoo! to
synthesize knowledge bases (KBs) of entities relevant
to their domains. Currently, the implementation of WOO
we describe is used by various Yahoo! properties such
as IntoNow, Yahoo! Local, Yahoo! Events and Yahoo!
Search. This paper highlights: (i) challenges that
arise in designing, building and operating a platform
that handles multi-domain, multi-version, and
multi-tenant disambiguation of web-scale knowledge
bases (hundreds of millions of entities), (ii) the
architecture and technical solutions we devised, and
(iii) an evaluation on real-world production
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gattani:2013:EEL,
author = "Abhishek Gattani and Digvijay S. Lamba and Nikesh
Garera and Mitul Tiwari and Xiaoyong Chai and Sanjib
Das and Sri Subramaniam and Anand Rajaraman and Venky
Harinarayan and AnHai Doan",
title = "Entity extraction, linking, classification, and
tagging for social media: a {Wikipedia}-based
approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1126--1137",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many applications that process social data, such as
tweets, must extract entities from tweets (e.g.,
``Obama'' and ``Hawaii'' in ``Obama went to Hawaii''),
link them to entities in a knowledge base (e.g.,
Wikipedia), classify tweets into a set of predefined
topics, and assign descriptive tags to tweets. Few
solutions exist today to solve these problems for
social data, and they are limited in important ways.
Further, even though several industrial systems such as
OpenCalais have been deployed to solve these problems
for text data, little if any has been published about
them, and it is unclear if any of the systems has been
tailored for social media. In this paper we describe in
depth an end-to-end industrial system that solves these
problems for social data. The system has been developed
and used heavily in the past three years, first at
Kosmix, a startup, and later at WalmartLabs. We show
how our system uses a Wikipedia-based global
``real-time'' knowledge base that is well suited for
social data, how we interleave the tasks in a
synergistic fashion, how we generate and use contexts
and social signals to improve task accuracy, and how we
scale the system to the entire Twitter firehose. We
describe experiments that show that our system
outperforms current approaches. Finally we describe
applications of the system at Kosmix and WalmartLabs,
and lessons learned.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmeleegy:2013:OTD,
author = "Hazem Elmeleegy and Yinan Li and Yan Qi and Peter
Wilmot and Mingxi Wu and Santanu Kolay and Ali Dasdan
and Songting Chen",
title = "Overview of turn data management platform for digital
advertising",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1138--1149",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper gives an overview of Turn Data Management
Platform (DMP). We explain the purpose of this type of
platforms, and show how it is positioned in the current
digital advertising ecosystem. We also provide a
detailed description of the key components in Turn DMP.
These components cover the functions of (1) data
ingestion and integration, (2) data warehousing and
analytics, and (3) real-time data activation. For all
components, we discuss the main technical and research
challenges, as well as the alternative design choices.
One of the main goals of this paper is to highlight the
central role that data management is playing in shaping
this fast growing multi-billion dollars industry.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Curtiss:2013:USS,
author = "Michael Curtiss and Iain Becker and Tudor Bosman and
Sergey Doroshenko and Lucian Grijincu and Tom Jackson
and Sandhya Kunnatur and Soren Lassen and Philip Pronin
and Sriram Sankar and Guanghao Shen and Gintaras Woss
and Chao Yang and Ning Zhang",
title = "{Unicorn}: a system for searching the social graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1150--1161",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Unicorn is an online, in-memory social graph-aware
indexing system designed to search trillions of edges
between tens of billions of users and entities on
thousands of commodity servers. Unicorn is based on
standard concepts in information retrieval, but it
includes features to promote results with good social
proximity. It also supports queries that require
multiple round-trips to leaves in order to retrieve
objects that are more than one edge away from source
nodes. Unicorn is designed to answer billions of
queries per day at latencies in the hundreds of
milliseconds, and it serves as an infrastructural
building block for Facebook's Graph Search product. In
this paper, we describe the data model and query
language supported by Unicorn. We also describe its
evolution as it became the primary backend for
Facebook's search offerings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ramazzina:2013:NSC,
author = "Sergio Ramazzina and Chiara L. Ballari and Daniela
Somenzi",
title = "A new service for customer care based on the
{Trentorise} bigdata platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1162--1163",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we give an overview of a platform
implemented in collaboration with the University of
Trento to deliver an innovative family of customer care
services.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Antonelli:2013:EDM,
author = "Fabrizio Antonelli and Antonino Casella and Cristiana
Chitic and Roberto Larcher and Giovanni Torrisi",
title = "Exploiting the diversity, mass and speed of
territorial data by {TELCO Operator} for better user
services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1164--1165",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bedini:2013:TBD,
author = "Ivan Bedini and Benedikt Elser and Yannis Velegrakis",
title = "The {Trento} big data platform for public
administration and large companies: use cases and
opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1166--1167",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tran:2013:DQO,
author = "Nga Tran and Sreenath Bodagala and Jaimin Dave",
title = "Designing query optimizers for big data problems of
the future",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1168--1169",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Vertica SQL Query Optimizer was written from the
ground up for the Vertica Analytic Database. Its
design, and the tradeoffs we encountered during
implementation, support the case that the full power of
novel database systems can be realized only with a
custom Query Optimizer, carefully crafted exclusively
for the system in which it operates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Franceschini:2013:HMV,
author = "Monica Franceschini",
title = "How to maximize the value of big data with the open
source {SpagoBI} suite through a comprehensive
approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1170--1171",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper describes the approach adopted by SpagoBI
suite (\path=www.spagobi.org=) to manage large volumes
of heterogeneous structured and unstructured data, to
perform real-time Business Intelligence on Big Data
streaming and to give meaning to data through the
semantic analysis. SpagoBI supplies meaningful data
insights through the main concept of persistable and
schedulable datasets, and using tools such as
self-service BI, ad-hoc reporting, interactive
dashboards and explorative analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chang:2013:CAC,
author = "Edward Y. Chang",
title = "Context-aware computing: opportunities and open
issues",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1172--1173",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A 2011 Gartner report [3] describes context-aware
computing as a game-changing opportunity for
enterprises to improve both productivity and profits.
Context-aware computing is about making applications
and content more relevant to a user's context, e.g.,
when and where the user is, thereby improving user
experience. For instance, a coupon delivered to a user
at a wrong time or at a wrong location is considered a
nuisance. On the contrary, receiving a timely, usable
coupon before purchasing a merchandise is a treat.
Context-aware computing is not a new concept, but the
ongoing mobile revolution makes it both necessary and
feasible. o Necessary because the mobile phone display
is small and information must be delivered with much
higher relevance and precision to meet user needs. o
Feasible because small, light-weight mobile devices
allow users to almost always carry them around, and
much can be learned via a phone about its user's habits
and states. Context-aware computing involves first
acquiring context and then taking context-dependent
actions. For instance, a phone can sense a user's
location and turn off its GPS unit to conserve power
when the user enters a building, or it can collect EKG
signals of a user and trigger an alert if the user's
heart beats irregularly. Similarly, a restaurant can
send a coupon to a user when that user is queued up in
front of a nearby restaurant. The useful context can be
divided into three categories: information on the user
(knowledge of habits, emotional state, biophysiological
conditions), the user's environment (time, location,
co-location of others, social interaction), and the
user's tasks (transportation mode, engaged tasks,
general goals) [4]. Context-aware computing can be
applied to benefit applications in many areas including
but not limited to information retrieval, facility
management, productivity enhancement, in addition to
the aforementioned three examples representing power
management, health care, and commerce, respectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassanzadeh:2013:NGD,
author = "Oktie Hassanzadeh and Anastasios Kementsietsidis and
Benny Kimelfeld and Rajasekar Krishnamurthy and Fatma
{\"O}zcan and Ippokratis Pandis",
title = "Next generation data analytics at {IBM} research",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1174--1175",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brunato:2013:LIO,
author = "Mauro Brunato and Roberto Battiti",
title = "Learning and intelligent optimization {(LION)}: one
ring to rule them all",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1176--1177",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Almost by definition, optimization is a source of a
tremendous power for automatically improving processes,
decisions, products and services. But its potential is
still largely unexploited in most real-world contexts.
One of the main reasons blocking its widespread
adoption is that standard optimization assumes the
existence of a function $ f(x) $ to be minimized, while
in most real-world business contexts this function does
not exist or is extremely difficult and costly to build
by hand. Machine learning (ML) comes to the rescue: the
function (the model) can be built by machine learning
starting from abundant data. By Learning and
Intelligent Optimization (LION) we mean this
combination of learning from data and optimization
which can be applied to complex, dynamic, stochastic
contexts. This combination dramatically increases the
automation level and puts more power directly in the
hands of decision makers without resorting to
intermediate layers of data scientists (LION has a huge
potential for a self-service usage). Reaching this goal
is a huge challenge and it will require research at the
boundary between two areas, machine learning and
optimization, which have been traditionally
separated.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lomet:2013:MSS,
author = "David Lomet",
title = "{Microsoft SQL} server's integrated database approach
for modern applications and hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1178--1179",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, there has been much renewed interest in
re-architecting database systems to exploit new
hardware. While some efforts have suggested that one
needs specialized engines (``one size does not fit
all''), the approach pursued by Microsoft's SQL Server
has been to integrate multiple elements into a common
architecture. This brings customers what they want by
reducing data impedance mismatches between database
systems that they are using for multiple purposes. This
integration is, of course, more easily said than done.
But this is, in fact, precisely what the SQL Server
team has done.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hacigumus:2013:OMS,
author = "Hakan Hac{\'\i}g{\"u}m{\"u}s and Jagan
Sankaranarayanan and Junichi Tatemura and Jeff LeFevre
and Neoklis Polyzotis",
title = "{Odyssey}: a multistore system for evolutionary
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1180--1181",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bouquet:2013:GEN,
author = "Paolo Bouquet and Andrea Molinari",
title = "A global {Entity Name System (ENS)} for data
ecosystems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1182--1183",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "After decades of schema-centric research on data
management and integration, the evolution of data on
the web and the adoption of resource-based models seem
to have shifted the focus towards an entity-centric
approach. Our thesis is that the missing element to
achieve the full potential of this approach is the
development of what we call an Entity Name System
(ENS), namely a system which provides a collection of
general services for managing the lifecycle of globally
unique identifiers in an open and decentralized
environment. The claim is that this system can indeed
play the coordination role that the DNS played for the
document-centric development of the current web.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sikka:2013:SHE,
author = "Vishal Sikka and Franz F{\"a}rber and Anil Goel and
Wolfgang Lehner",
title = "{SAP HANA}: the evolution from a modern main-memory
data platform to an enterprise application platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1184--1185",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SAP HANA is a pioneering, and one of the best
performing, data platform designed from the grounds up
to heavily exploit modern hardware capabilities,
including SIMD, and large memory and CPU footprints. As
a comprehensive data management solution, SAP HANA
supports the complete data life cycle encompassing
modeling, provisioning, and consumption. This extended
abstract outlines the vision and planned next step of
the SAP HANA evolution growing from a core data
platform into an innovative enterprise application
platform as the foundation for current as well as novel
business applications in both on-premise and on-demand
scenarios. We argue that only a holistic system design
rigorously applying co-design at different levels may
yield a highly optimized and sustainable platform for
modern enterprise applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nambiar:2013:KTR,
author = "Raghunath Nambiar and Meikel Poess",
title = "Keeping the {TPC} relevant!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1186--1187",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Transaction Processing Performance Council (TPC)
is a nonprofit organization founded in 1988 to define
transaction processing and database benchmarks. Since
then, the TPC has played a crucial role in providing
the industry with relevant standards for total system
performance, price-performance, and energy-efficiency
comparisons. TPC benchmarks are widely used by database
researchers and academia. Historically known for
database-centric standards, the TPC has developed a
benchmark for virtualization and is currently
developing a multisource data integration benchmark.
The technology landscape is changing at a rapid pace,
challenging industry experts and researchers to develop
innovative techniques for evaluating, measuring, and
characterizing the performance of modern application
systems. The Technology Conference series on
Performance Evaluation and Benchmarking (TPCTC),
introduced in 2009, and the new TPC-Express initiatives
are steps taken by the TPC to be relevant in the coming
years and beyond.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2013:BDI,
author = "Xin Luna Dong and Divesh Srivastava",
title = "Big data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1188--1189",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Big Data era is upon us: data is being generated,
collected and analyzed at an unprecedented scale, and
data-driven decision making is sweeping through
society. Since the value of data explodes when it can
be linked and fused with other data, addressing the big
data integration (BDI) challenge is critical to
realizing the promise of Big Data. BDI differs from
traditional data integration in many dimensions: (i)
the number of data sources, even for a single domain,
has grown to be in the tens of thousands, (ii) many of
the data sources are very dynamic, as a huge amount of
newly collected data are continuously made available,
(iii) the data sources are extremely heterogeneous in
their structure, with considerable variety even for
substantially similar entities, and (iv) the data
sources are of widely differing qualities, with
significant differences in the coverage, accuracy and
timeliness of data provided. This tutorial explores the
progress that has been made by the data integration
community on the topics of schema mapping, record
linkage and data fusion in addressing these novel
challenges faced by big data integration, and
identifies a range of open problems for the
community.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Viglas:2013:JTC,
author = "Stratis D. Viglas",
title = "Just-in-time compilation for {SQL} query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1190--1191",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Just-in-time compilation of SQL queries into native
code has recently emerged as a viable alternative to
interpretation-based query processing. We present the
salient results of research in this fresh area,
addressing all aspects of the query processing stack.
Throughout the discussion we draw analogies to the
general code generation techniques used in contemporary
compiler technology. At the same time we describe the
open research problems of the area.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ailamaki:2013:TST,
author = "Anastasia Ailamaki and Ryan Johnson and Ippokratis
Pandis and P{\'\i}nar T{\"o}z{\"u}n",
title = "Toward scalable transaction processing: evolution of
{Shore-MT}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1192--1193",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Designing scalable transaction processing systems on
modern multicore hardware has been a challenge for
almost a decade. The typical characteristics of
transaction processing workloads lead to a high degree
of unbounded communication on multicores for
conventional system designs. In this tutorial, we
initially present a systematic way of eliminating
scalability bottlenecks of a transaction processing
system, which is based on minimizing the unbounded
communication. Then, we show several techniques that
apply the presented methodology to minimize logging,
locking, latching etc. related bottlenecks of
transaction processing systems. In parallel, we
demonstrate the internals of the Shore-MT storage
manager and how they have evolved over the years in
terms of scalability on multicore hardware through such
techniques. We also teach how to use Shore-MT with the
various design options it offers through its
application layer Shore-Kits and Metadata Frontend.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmore:2013:TDV,
author = "Aaron J. Elmore and Carlo Curino and Divyakant Agrawal
and Amr {El Abbadi}",
title = "Towards database virtualization for database as a
service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1194--1195",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Advances in operating system and storage-level
virtualization technologies have enabled the effective
consolidation of heterogeneous applications in a shared
cloud infrastructure. Novel research challenges arising
from this new shared environment include load
balancing, workload estimation, resource isolation,
machine replication, live migration, and an emergent
need of automation to handle large scale operations
with minimal manual intervention. Given that databases
are at the core of most applications that are deployed
in the cloud, database management systems (DBMSs)
represent a very important technology component that
needs to be virtualized in order to realize the
benefits of virtualization from autonomic management of
data-intensive applications in large scale
data-centers. The goal of this tutorial is to survey
the techniques used in providing elasticity in virtual
machine systems, shared storage systems, and survey
database research on multitenant architectures and
elasticity primitives. This foundation of core Database
as a Service advances, together with a primer of
important related topics in OS and storage-level
virtualization, are central for anyone that wants to
operate in this area of research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mokbel:2013:MSN,
author = "Mohamed F. Mokbel and Mohamed Sarwat",
title = "Mobility and social networking: a data management
perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "11",
pages = "1196--1197",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:56:54 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This tutorial presents the state-of-the-art research
that lies at the intersection of two hot topics in the
data management community: (1) social networking and
(2) mobility. In this tutorial, we give an overview of
existing research work, systems, and applications
related to both social networking and mobility. In
addition, we introduce several resources (i.e.,
datasets, software tools) as well as a list of
promising research directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xue:2013:DSD,
author = "Andy Yuan Xue and Rui Zhang and Yu Zheng and Xing Xie
and Jianhui Yu and Yong Tang",
title = "{DesTeller}: a system for destination prediction based
on trajectories with privacy protection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1198--1201",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Destination prediction is an essential task for a
number of emerging location based applications such as
recommending sightseeing places and sending targeted
advertisements. A common approach to destination
prediction is to derive the probability of a location
being the destination based on historical trajectories.
However, existing techniques suffer from the ``data
sparsity problem'', i.e., the number of available
historical trajectories is far from sufficient to cover
all possible trajectories. This problem considerably
limits the amount of query trajectories whose predicted
destinations can be inferred. In this demonstration, we
showcase a system named ``DesTeller'' that is
interactive, user-friendly, publicly accessible, and
capable of answering real-time queries. The underlying
algorithm Sub-Trajectory Synthesis (SubSyn)
successfully addressed the data sparsity problem and is
able to predict destinations for almost every query
submitted by travellers. We also consider the privacy
protection issue in case an adversary uses SubSyn
algorithm to derive sensitive location information of
users.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2013:SPS,
author = "Zhe Chen and Michael Cafarella and Jun Chen and Daniel
Prevo and Junfeng Zhuang",
title = "{Senbazuru}: a prototype spreadsheet database
management system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1202--1205",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Spreadsheets have become a critical data management
tool, but they lack explicit relational metadata,
making it difficult to join or integrate data across
multiple spreadsheets. Because spreadsheet data are
widely available on a huge range of topics, a tool that
allows easy spreadsheet integration would be hugely
beneficial for a variety of users. We demonstrate that
Senbazuru, a prototype spreadsheet database management
system (SSDBMS), is able to extract relational
information from spreadsheets. By doing so, it opens up
opportunities for integration among spreadsheets and
with other relational sources. Senbazuru allows users
to search for relevant spreadsheets in a large corpus,
probabilistically constructs a relational version of
the data, and offers several relational operations over
the resulting extracted data (including joins to other
spreadsheet data). Our demonstration is available on
two clients: a JavaScript-rich Web site and a touch
interface on the iPad. During the demo, Senbazuru will
allow VLDB participants to search spreadsheets, extract
relational data from them, and apply relational
operators such as select and join.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Smits:2013:RFQ,
author = "Gr{\'e}gory Smits and Olivier Pivert and Thomas
Girault",
title = "{ReqFlex}: fuzzy queries for everyone",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1206--1209",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration we present a complete
fuzzy-set-based approach to preference queries that
tackles the two main questions raised by the
introduction of flexibility and personalization when
querying relational databases: (i) how to efficiently
execute preference queries? and, (ii) how to help users
define preferences and queries? As an answer to the
first question, we propose PostgreSQL\_f, a module
implemented on top of PostgreSQL to handle fuzzy
queries. To answer the second question, we propose
ReqFlex an intuitive user interface to the definition
of preferences and the construction of fuzzy queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaufmann:2013:CIT,
author = "Martin Kaufmann and Panagiotis Vagenas and Peter M.
Fischer and Donald Kossmann and Franz F{\"a}rber",
title = "Comprehensive and interactive temporal query
processing with {SAP HANA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1210--1213",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demo, we present a prototype of a main memory
database system which provides a wide range of temporal
operators featuring predictable and interactive
response times. Much of real-life data is temporal in
nature, and there is an increasing application demand
for temporal models and operations in databases.
Nevertheless, SQL:2011 has only recently overcome a
decade-long standstill on standardizing temporal
features. As a result, few database systems provide any
temporal support, and even those only have limited
expressiveness and poor performance. Our prototype
combines an in-memory column store and a novel, generic
temporal index structure named Timeline Index. As we
will show on a workload based on real customer use
cases, it achieves predictable and interactive query
performance for a wide range of temporal query types
and data sizes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Grust:2013:FDT,
author = "Torsten Grust and Nils Schweinsberg and Alexander
Ulrich",
title = "Functions are data too: defunctionalization for
{PL\slash SQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1214--1217",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate a full-fledged implementation of
first-class functions for the widely used PL/SQL
database programming language. Functions are treated as
regular data items that may be (1) constructed at query
runtime, (2) stored in and retrieved from tables, (3)
assigned to variables, and (4) passed to and from other
(higher-order) functions. The resulting PL/SQL dialect
concisely and elegantly expresses a wide range of new
query idioms which would be cumbersome to formulate if
functions remained second-class citizens. We include a
diverse set of application scenarios that make these
advantages tangible. First-class PL/SQL functions
require featherweight syntactic extensions only and
come with a non-invasive implementation --- the
defunctionalization transformation --- that can entirely
be built on top of existing relational DBMS
infrastructure. An interactive demonstrator helps users
to experiment with the ``function as data'' paradigm
and to earn a solid intuition of its inner workings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ebaid:2013:NGD,
author = "Amr Ebaid and Ahmed Elmagarmid and Ihab F. Ilyas and
Mourad Ouzzani and Jorge-Arnulfo Quian{\'e}-Ruiz and Nan
Tang and Si Yin",
title = "{NADEEF}: a generalized data cleaning system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1218--1221",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present NADEEF, an extensible, generic and
easy-to-deploy data cleaning system. NADEEF
distinguishes between a programming interface and a
core to achieve generality and extensibility. The
programming interface allows users to specify data
quality rules by writing code that implements
predefined classes. These classes uniformly define what
is wrong with the data and (possibly) how to fix it. We
will demonstrate the following features provided by
NADEEF. (1) Heterogeneity: The programming interface
can be used to express many types of data quality rules
beyond the well known CFDs (FDs), MDs and ETL rules.
(2) Interdependency: The core algorithms can interleave
multiple types of rules to detect and repair data
errors. (3) Deployment and extensibility: Users can
easily customize NADEEF by defining new types of rules,
or by extending the core. (4) Metadata management and
data custodians: We show a live data quality dashboard
to effectively involve users in the data cleaning
process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bergamaschi:2013:QKS,
author = "Sonia Bergamaschi and Francesco Guerra and Matteo
Interlandi and Raquel Trillo-Lado and Yannis
Velegrakis",
title = "{QUEST}: a keyword search system for relational data
based on semantic and machine learning techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1222--1225",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We showcase QUEST (QUEry generator for STructured
sources), a search engine for relational databases that
combines semantic and machine learning techniques for
transforming keyword queries into meaningful SQL
queries. The search engine relies on two approaches:
the forward, providing mappings of keywords into
database terms (names of tables and attributes, and
domains of attributes), and the backward, computing the
paths joining the data structures identified in the
forward step. The results provided by the two
approaches are combined within a probabilistic
framework based on the Dempster-Shafer Theory. We
demonstrate QUEST capabilities, and we show how, thanks
to the flexibility obtained by the probabilistic
combination of different techniques, QUEST is able to
compute high quality results even with few training
data and/or with hidden data sources such as those
found in the Deep Web.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bogh:2013:GNA,
author = "Kenneth S. B{\o}gh and Anders Skovsgaard and Christian
S. Jensen",
title = "{GroupFinder}: a new approach to top-$k$
point-of-interest group retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1226--1229",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The notion of point-of-interest (PoI) has existed
since paper road maps began to include markings of
useful places such as gas stations, hotels, and tourist
attractions. With the introduction of geopositioned
mobile devices such as smartphones and mapping services
such as Google Maps, the retrieval of PoIs relevant to
a user's intent has become a problem of automated
spatio-textual information retrieval. Over the last
several years, substantial research has gone into the
invention of functionality and efficient
implementations for retrieving nearby PoIs. However,
with a couple of exceptions existing proposals retrieve
results at single-PoI granularity. We assume that a
mobile device user issues queries consisting of
keywords and an automatically supplied geo-position,
and we target the common case where the user wishes to
find nearby groups of PoIs that are relevant to the
keywords. Such groups are relevant to users who wish to
conveniently explore several options before making a
decision such as to purchase a specific product.
Specifically, we demonstrate a practical proposal for
finding top-$k$ PoI groups in response to a query. We
show how problem parameter settings can be mapped to
options that are meaningful to users. Further, although
this kind of functionality is prone to combinatorial
explosion, we will demonstrate that the functionality
can be supported efficiently in practical settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eldawy:2013:DSE,
author = "Ahmed Eldawy and Mohamed F. Mokbel",
title = "A demonstration of {SpatialHadoop}: an efficient
{MapReduce} framework for spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1230--1233",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demo presents SpatialHadoop as the first
full-fledged MapReduce framework with native support
for spatial data. SpatialHadoop is a comprehensive
extension to Hadoop that pushes spatial data inside the
core functionality of Hadoop. SpatialHadoop runs
existing Hadoop programs as is, yet, it achieves
order(s) of magnitude better performance than Hadoop
when dealing with spatial data. SpatialHadoop employs a
simple spatial high level language, a two-level spatial
index structure, basic spatial components built inside
the MapReduce layer, and three basic spatial
operations: range queries, $k$-NN queries, and spatial
join. Other spatial operations can be similarly
deployed in SpatialHadoop. We demonstrate a real system
prototype of SpatialHadoop running on an Amazon EC2
cluster against two sets of real spatial data obtained
from Tiger Files and OpenStreetMap with sizes 60GB and
300GB, respectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abbasoglu:2013:APC,
author = "Mehmet Ali Abbasoglu and Bugra Gedik and Hakan
Ferhatosmanoglu",
title = "Aggregate profile clustering for telco analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1234--1237",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many telco analytics require maintaining call profiles
based on recent customer call patterns. Such call
profiles are typically organized as aggregations
computed at different time scales over the recent
customer interactions. Customer call profiles are key
inputs for analytics targeted at improving operations,
marketing, and sales of telco providers. Many of these
analytics require clustering customer call profiles, so
that customers with similar calling patterns can be
modeled as a group. Example applications include
optimizing tariffs, customer segmentation, and usage
forecasting. In this demo, we present our system for
scalable aggregate profile clustering in a streaming
setting. We focus on managing anonymized segments of
customers for tariff optimization. Due to the large
number of customers, maintaining profile clusters have
high processing and memory resource requirements. In
order to tackle this problem, we apply distributed
stream processing. However, in the presence of
distributed state, it is a major challenge to partition
the profiles over machines (nodes) such that memory and
computation balance is maintained, while keeping the
clustering accuracy high. Furthermore, to adapt to
potentially changing customer calling patterns, the
partitioning of profiles to machines should be
continuously revised, yet one should minimize the
migration of profiles so as not to disturb the online
processing of updates. We provide a re-partitioning
technique that achieves all these goals. We keep
micro-cluster summaries at each node, collect these
summaries at a centralized node, and use a greedy
algorithm with novel affinity heuristics to revise the
partitioning. We present a demo that showcases our
Storm and Hbase based implementation of the proposed
solution in the context of a customer segmentation
application.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2013:RRO,
author = "Luying Chen and Stefano Ortona and Giorgio Orsi and
Michael Benedikt",
title = "{ROSeAnn}: reconciling opinions of semantic
annotators",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1238--1241",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Named entity extractors can be used to enrich both
text and Web documents with semantic annotations. While
originally focused on a few standard entity types, the
ecosystem of annotators is becoming increasingly
diverse, with recognition capabilities ranging from
generic to specialised entity types. Both the overlap
and the diversity in annotator vocabularies motivate
the need for managing and integrating semantic
annotations: allowing users to see the results of
multiple annotations and to merge them into a unified
solution. We demonstrate ROSEANN, a system for the
management of semantic annotations. ROSEANN provides
users with a unified view over the opinion of multiple
independent annotators both on text and Web documents.
It allows users to understand and reconcile conflicts
between annotations via ontology-aware aggregation.
ROSEANN incorporates both supervised aggregation,
appropriate when representative training data is
available, and an unsupervised method based on the
notion of weighted-repair.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarwat:2013:RAR,
author = "Mohamed Sarwat and James Avery and Mohamed F. Mokbel",
title = "{RecDB} in action: recommendation made easy in
relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1242--1245",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we demonstrate RecDB; a full-fledged
database system that provides personalized
recommendation to users. We implemented RecDB using an
existing open source database system PostgreSQL, and we
demonstrate the effectiveness of RecDB using two
existing recommendation applications (1) Restaurant
Recommendation, (2) Movie Recommendation. To make the
demo even more interactive, we showcase a novel
application that recommends research papers presented
at VLDB 2013 to the conference attendees based on their
publication history in DBLP.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Drosou:2013:PTE,
author = "Marina Drosou and Evaggelia Pitoura",
title = "{POIKILO}: a tool for evaluating the results of
diversification models and algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1246--1249",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Search result diversification has attracted
considerable attention as a means of improving the
quality of results retrieved by user queries. In this
demonstration, we present Poikilo, a tool to assist
users in locating and evaluating diverse results. We
provide implementations of a wide suite of models and
algorithms to compute and compare diverse results.
Users can tune various diversification parameters,
combine diversity with relevance and also see how
diverse results change over time in the case of
streaming data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amsterdamer:2013:CMA,
author = "Yael Amsterdamer and Yael Grossman and Tova Milo and
Pierre Senellart",
title = "{CrowdMiner}: mining association rules from the
crowd",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1250--1253",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demo presents CrowdMiner, a system enabling the
mining of interesting data patterns from the crowd.
While traditional data mining techniques have been used
extensively for finding patterns in classic databases,
they are not always suitable for the crowd, mainly
because humans tend to remember only simple trends and
summaries rather than exact details. To address this,
CrowdMiner employs a novel crowd-mining algorithm,
designed specifically for this context. The algorithm
iteratively chooses appropriate questions to ask the
crowd, while aiming to maximize the knowledge gain at
each step. We demonstrate CrowdMiner through a
Well-Being portal, constructed interactively by mining
the crowd, and in particular the conference
participants, for common health related practices and
trends.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2013:TTR,
author = "Chen Chen and Hongzhi Yin and Junjie Yao and Bin Cui",
title = "{TeRec}: a temporal recommender system over tweet
stream",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1254--1257",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As social media further integrates into our daily
lives, people are increasingly immersed in real-time
social streams via services such as Twitter and Weibo.
One important observation in these online social
platforms is that users' interests and the popularity
of topics shift very fast, which poses great challenges
on existing recommender systems to provide the right
topics at the right time. In this paper, we extend the
online ranking technique and propose a temporal
recommender system --- TeRec. In TeRec, when posting
tweets, users can get recommendations of topics
(hashtags) according to their real-time interests, they
can also generate fast feedbacks according to the
recommendations. TeRec provides the browser-based
client interface which enables the users to access the
real time topic recommendations, and the server side
processes and stores the real-time stream data. The
experimental study demonstrates the superiority of
TeRec in terms of temporal recommendation accuracy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shkapsky:2013:GQN,
author = "Alexander Shkapsky and Kai Zeng and Carlo Zaniolo",
title = "Graph queries in a next-generation {Datalog} system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1258--1261",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent theoretical advances have enabled the use of
special monotonic aggregates in recursion. These
special aggregates make possible the concise expression
and efficient implementation of a rich new set of
advanced applications. Among these applications, graph
queries are particularly important because of their
pervasiveness in data intensive application areas. In
this demonstration, we present our Deductive
Application Language (DeAL) System, the first of a new
generation of Deductive Database Systems that support
applications that could not be expressed using regular
stratification, or could be expressed using
XY-stratification (also supported in DeAL) but suffer
from inefficient execution. Using example queries, we
will (i) show how complex graph queries can be
concisely expressed using DeAL and (ii) illustrate the
formal semantics and efficient implementation of these
powerful new monotonic constructs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hendawi:2013:IFS,
author = "Abdeltawab M. Hendawi and Jie Bao and Mohamed F.
Mokbel",
title = "{iRoad}: a framework for scalable predictive query
processing on road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1262--1265",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demo presents the iRoad framework for evaluating
predictive queries on moving objects for road networks.
The main promise of the iRoad system is to support a
variety of common predictive queries including
predictive point query, predictive range query,
predictive KNN query, and predictive aggregate query.
The iRoad framework is equipped with a novel data
structure, named reachability tree, employed to
determine the reachable nodes for a moving object
                 within a specified future time $ T $. In fact, the
reachability tree prunes the space around each object
in order to significantly reduce the computation time.
So, iRoad is able to scale up to handle real road
networks with millions of nodes, and it can process
heavy workloads on large numbers of moving objects.
During the demo, audience will be able to interact with
iRoad through a well designed Graphical User Interface
to issue different types of predictive queries on a
real road network, to obtain the predictive heatmap of
the area of interest, to follow the creation and the
dynamic update of the reachability tree around a
specific moving object, and finally to examine the
system efficiency and scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nagendra:2013:SFS,
author = "Mithila Nagendra and K. Sel{\c{c}}uk Candan",
title = "{SkySuite}: a framework of skyline-join operators for
static and stream environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1266--1269",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficient processing of skyline queries has been an
area of growing interest over both static and stream
environments. Most existing static and streaming
techniques assume that the skyline query is applied to
a single data source. Unfortunately, this is not true
in many applications in which, due to the complexity of
the schema, the skyline query may involve attributes
belonging to multiple data sources. Recently, in the
context of static environments, various hybrid
skyline-join algorithms have been proposed. However,
these algorithms suffer from several drawbacks: they
often need to scan the data sources exhaustively in
order to obtain the set of skyline-join results;
moreover, the pruning techniques employed to eliminate
the tuples are largely based on expensive pairwise
tuple-to-tuple comparisons. On the other hand, most
existing streaming methods focus on single stream
skyline analysis, thus rendering these techniques
unsuitable for applications that require a real-time
``join'' operation to be carried out before the skyline
query can be answered. Based on these observations, we
introduce and propose to demonstrate SkySuite: a
framework of skyline-join operators that can be
leveraged to efficiently process skyline-join queries
over both static and stream environments. Among others,
SkySuite includes (1) a novel Skyline-Sensitive Join
(SSJ) operator that effectively processes skyline-join
queries in static environments, and (2) a Layered
Skyline-window-Join (LSJ) operator that incrementally
maintains skyline-join results over stream
environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhong:2013:PGP,
author = "Jianlong Zhong and Bingsheng He",
title = "Parallel graph processing on graphics processors made
easy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1270--1273",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper demonstrates Medusa, a programming
framework for parallel graph processing on graphics
processors (GPUs). Medusa enables developers to
leverage the massive parallelism and other hardware
features of GPUs by writing sequential C/C++ code for a
small set of APIs. This simplifies the implementation
of parallel graph processing on the GPU. The runtime
system of Medusa automatically executes the
user-defined APIs in parallel on the GPU, with a series
of graph-centric optimizations based on the
architecture features of GPUs. We will demonstrate the
steps of developing GPU-based graph processing
algorithms with Medusa, and the superior performance of
Medusa with both real-world and synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Richter:2013:MAO,
author = "Stefan Richter and Jens Dittrich and Stefan Schuh and
Tobias Frey",
title = "{Mosquito}: another one bites the data upload stream",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1274--1277",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Mosquito is a lightweight and adaptive physical design
framework for Hadoop. Mosquito connects to existing
data pipelines in Hadoop MapReduce and/or HDFS,
observes the data, and creates better physical designs,
i.e. indexes, as a byproduct. Our approach is minimally
invasive, yet it allows users and developers to easily
improve the runtime of Hadoop. We present three
important use cases: first, how to create indexes as a
byproduct of data uploads into HDFS; second, how to
create indexes as a byproduct of map tasks; and third,
how to execute map tasks as a byproduct of HDFS data
uploads. These use cases may even be combined.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hardock:2013:NDS,
author = "Sergej Hardock and Ilia Petrov and Robert Gottstein
and Alejandro Buchmann",
title = "{NoFTL}: database systems on {FTL}-less flash
storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1278--1281",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The database architecture and workhorse algorithms
have been designed to compensate for hard disk
properties. The I/O characteristics of Flash memories
have significant impact on database systems and many
algorithms and approaches taking advantage of those
have been proposed recently. Nonetheless on system
level Flash storage devices are still treated as HDD
compatible block devices, black boxes and fast HDD
replacements. This backwards compatibility (both
software and hardware) masks the native behaviour,
incurs significant complexity and decreases I/O
performance, making it non-robust and unpredictable.
Database systems have a long tradition of operating
directly on RAW storage natively, utilising the
physical characteristics of storage media to improve
performance. In this paper we demonstrate an approach
called NoFTL that goes a step further. We show that
allowing for native Flash access and integrating parts
of the FTL functionality into the database system
yields significant performance increase and
simplification of the I/O stack. We created a real-time
data-driven Flash emulator and integrated it
accordingly into Shore-MT. We demonstrate a performance
improvement of up to $ 3.7 \times $ compared to
Shore-MT on RAW block-device Flash storage under
various TPC workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kotsakos:2013:SUS,
author = "Dimitrios Kotsakos and Panos Sakkos and Vana
Kalogeraki and Dimitirios Gunopulos",
title = "{SmartMonitor}: using smart devices to perform
structural health monitoring",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1282--1285",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we are presenting SmartMonitor,
a distributed Structural Health Monitoring (SHM) system
consisting of smart devices. Over the last few years,
the vast majority of smart devices is equipped with
accelerometers that can be utilized towards building
SHM systems with hundreds of nodes. We describe a
scalable, fault-tolerant communication protocol, that
performs best-effort time synchronization of the nodes
and is used to implement a decentralized version of the
popular peak-picking SHM method. The implemented
interactive system can be easily installed in any
accelerometer-equipped Android device and the user has
a number of options for configuring the system or
analyzing the collected data and computed outcomes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kargin:2013:LEA,
author = "Yag{\'\i}z Karg{\'\i}n and Milena Ivanova and Ying
Zhang and Stefan Manegold and Martin Kersten",
title = "{Lazy ETL} in action: {ETL} technology dates
scientific data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1286--1289",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Both scientific data and business data have analytical
needs. Analysis takes place after a scientific data
warehouse is eagerly filled with all data from external
data sources (repositories). This is similar to the
initial loading stage of Extract, Transform, and Load
(ETL) processes that drive business intelligence. ETL
can also help scientific data analysis. However, the
initial loading is a time and resource consuming
operation. It might not be entirely necessary, e.g. if
the user is interested in only a subset of the data. We
propose to demonstrate Lazy ETL, a technique to lower
costs for initial loading. With it, ETL is integrated
into the query processing of the scientific data
warehouse. For a query, only the required data items
are extracted, transformed, and loaded transparently
on-the-fly. The demo is built around concrete
implementations of Lazy ETL for seismic data analysis.
The seismic data warehouse is ready for query
processing, without waiting for long initial loading.
The audience fires analytical queries to observe the
internal mechanisms and modifications that realize each
of the steps; lazy extraction, transformation, and
loading.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dayan:2013:EED,
author = "Niv Dayan and Martin Kj{\ae}r Svendsen and Matias
Bj{\o}rling and Philippe Bonnet and Luc Bouganim",
title = "{EagleTree}: exploring the design space of {SSD}-based
algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1290--1293",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Solid State Drives (SSDs) are a moving target for
system designers: they are black boxes, their internals
are undocumented, and their performance characteristics
vary across models. There is no appropriate analytical
model and experimenting with commercial SSDs is
cumbersome, as it requires a careful experimental
methodology to ensure repeatability. Worse, performance
results obtained on a given SSD cannot be generalized.
Overall, it is impossible to explore how a given
algorithm, say a hash join or LSM-tree insertions,
leverages the intrinsic parallelism of a modern SSD, or
how a slight change in the internals of an SSD would
impact its overall performance. In this paper, we
propose a new SSD simulation framework, named
EagleTree, which addresses these problems, and enables
a principled study of SSD-Based algorithms. The
demonstration scenario illustrates the design space for
algorithms based on an SSD-based IO stack, and shows
how researchers and practitioners can use EagleTree to
perform tractable explorations of this complex design
space.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sathe:2013:EPQ,
author = "Saket Sathe and Arthur Oviedo and Dipanjan Chakraborty
and Karl Aberer",
title = "{EnviroMeter}: a platform for querying
community-sensed data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1294--1297",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficiently querying data collected from Large-area
Community driven Sensor Networks (LCSNs) is a new and
challenging problem. In our previous works, we proposed
adaptive techniques for learning models (e.g.,
statistical, nonparametric, etc.) from such data,
considering the fact that LCSN data is typically
geo-temporally skewed. In this paper, we present a
demonstration of EnviroMeter. EnviroMeter uses our
adaptive model creation techniques for processing
continuous queries on community-sensed environmental
pollution data. Subsequently, it efficiently pushes
current pollution updates to GPS-enabled smartphones
(through its Android application) or displays it via a
web-interface. We experimentally demonstrate that our
model-based query processing approach is orders of
magnitude efficient than processing the queries over
indexed raw data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Okcan:2013:SEA,
author = "Alper Okcan and Mirek Riedewald and Biswanath Panda
and Daniel Fink",
title = "{Scolopax}: exploratory analysis of scientific data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1298--1301",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The formulation of hypotheses based on patterns found
in data is an essential component of scientific
discovery. As larger and richer data sets become
available, new scalable and user-friendly tools for
scientific discovery through data analysis are needed.
We demonstrate Scolopax, which explores the idea of a
search engine for hypotheses. It has an intuitive user
interface that supports sophisticated queries. Scolopax
can explore a huge space of possible hypotheses,
returning a ranked list of those that best match the
user preferences. To scale to large and complex data
sets, Scolopax relies on parallel data management and
mining techniques. These include model training,
efficient model summary generation, and novel parallel
join techniques that together with traditional
approaches such as clustering manipulate massive
model-summary collections to find the most interesting
hypotheses. This demonstration of Scolopax uses a real
observational data set, provided by the Cornell Lab of
Ornithology. It contains more than 3.3 million bird
sightings reported by citizen scientists and has almost
2500 attributes. Conference attendees have the
opportunity to make novel discoveries in this data set,
ranging from identifying variables that strongly affect
bird populations in specific regions to detecting more
sophisticated patterns such as habitat competition and
migration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2013:PPA,
author = "Daniel Deutch and Yuval Moskovitch and Val Tannen",
title = "{PROPOLIS}: provisioned analysis of data-centric
processes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1302--1305",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider in this demonstration the (static)
analysis of data-centric process-based applications,
namely applications that depend on an underlying
database and whose control is guided by a finite state
transition system. We observe that analysts of such
applications often want to do more than analyze a
specific instance of the application's process control
and database. In particular they want to interactively
test and explore the effect on analysis results of
different hypothetical modifications applied to the
application's transition system and to the underlying
database. To that end, we propose a demonstration of
PROPOLIS, a system for PROvisioned PrOcess anaLysIS,
namely analysis of data-centric processes under
hypothetical modification scenarios. Our solution is
based on the notion of a provisioned expression (which
in turn is based on the notion of data provenance),
namely an expression that captures, in a compact way,
the analysis result with respect to all possible
combinations of scenarios, and allows for their
exploration at interactive speed. We will demonstrate
PROPOLIS in the context of an online shopping
application, letting participants play the role of
analysts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Konda:2013:FSE,
author = "Pradap Konda and Arun Kumar and Christopher R{\'e} and
Vaishnavi Sashikanth",
title = "Feature selection in enterprise analytics: a
demonstration using an {R}-based data analytics
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1306--1309",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enterprise applications are analyzing ever larger
amounts of data using advanced analytics techniques.
Recent systems from Oracle, IBM, and SAP integrate R
with a data processing system to support richer
advanced analytics on large data. A key step in
advanced analytics applications is feature selection,
which is often an iterative process that involves
statistical algorithms and data manipulations. From our
conversations with data scientists and analysts at
enterprise settings, we observe three key aspects about
feature selection. First, feature selection is
performed by many types of users, not just data
scientists. Second, high performance is critical to
perform feature selection processes on large data.
Third, the provenance of the results and steps in
feature selection processes needs to be tracked for
purposes of transparency and auditability. Based on our
discussions with data scientists and the literature on
feature selection practice, we organize a set of
operations for feature selection into the Columbus
framework. We prototype Columbus as a library usable in
the Oracle R Enterprise environment. In this
demonstration, we use Columbus to showcase how we can
support various types of users of feature selection in
one system. We then show how we optimize performance
and manage the provenance of feature selection
processes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Najafi:2013:FQP,
author = "Mohammadreza Najafi and Mohammad Sadoghi and Hans-Arno
Jacobsen",
title = "Flexible query processor on {FPGAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1310--1313",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this work, we demonstrate Flexible Query Processor
(FQP), an online reconfigurable event stream query
processor. FQP is an FPGA-based query processor that
supports select, project and join queries over event
streams at line rate. While processing incoming events,
FQP can accept new query expressions, a key
distinguishing characteristic from related approaches
employing FPGAs for acceleration. Our solution aims to
address performance limitations experienced with
general purpose processors needing to operate at line
rate and lack of on the fly reconfigurability with
custom designed hardware solutions on FPGAs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Civili:2013:MSM,
author = "Cristina Civili and Marco Console and Giuseppe {De
Giacomo} and Domenico Lembo and Maurizio Lenzerini and
Lorenzo Lepore and Riccardo Mancini and Antonella Poggi
and Riccardo Rosati and Marco Ruzzi and Valerio
Santarelli and Domenico Fabio Savo",
title = "{MASTRO STUDIO}: managing ontology-based data access
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1314--1317",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Ontology-based data access (OBDA) is a novel paradigm
for accessing large data repositories through an
ontology, that is a formal description of a domain of
interest. Supporting the management of OBDA
applications poses new challenges, as it requires to
provide effective tools for (i) allowing both expert
and non-expert users to analyze the OBDA specification,
(ii) collaboratively documenting the ontology, (iii)
exploiting OBDA services, such as query answering and
automated reasoning over ontologies, e.g., to support
data quality check, and (iv) tuning the OBDA
application towards optimized performances. To fulfill
these challenges, we have built a novel system, called
MASTRO STUDIO, based on a tool for automated reasoning
over ontologies, enhanced with a suite of tools and
optimization facilities for managing OBDA applications.
To show the effectiveness of MASTRO STUDIO, we
demonstrate its usage in one OBDA application developed
in collaboration with the Italian Ministry of Economy
and Finance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fuhry:2013:PHP,
author = "David Fuhry and Yang Zhang and Venu Satuluri and Arnab
Nandi and Srinivasan Parthasarathy",
title = "{PLASMA-HD}: probing the lattice structure and makeup
of high-dimensional data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1318--1321",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Rapidly making sense of, analyzing, and extracting
useful information from large and complex data is a
grand challenge. A user tasked with meeting this
challenge is often befuddled with questions on where
and how to begin to understand the relevant
characteristics of such data. Real-world problem
scenarios often involve scalability limitations and
time constraints. In this paper we present an
incremental interactive data analysis system as a step
to address this challenge. This system builds on recent
progress in the fields of interactive data exploration,
locality sensitive hashing, knowledge caching, and
graph visualization. Using visual clues based on rapid
incremental estimates, a user is provided a multi-level
capability to probe and interrogate the intrinsic
structure of data. Throughout the interactive process,
the output of previous probes can be used to construct
increasingly tight coherence estimates across the
parameter space, providing strong hints to the user
about promising analysis steps to perform next. We
present examples, interactive scenarios, and
experimental results on several synthetic and
real-world datasets which show the effectiveness and
efficiency of our approach. The implications of this
work are quite broad and can impact fields ranging from
top-$k$ algorithms to data clustering and from manifold
learning to similarity search.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Moyers:2013:DIP,
author = "Matthew Moyers and Emad Soroush and Spencer C. Wallace
and Simon Krughoff and Jake Vanderplas and Magdalena
Balazinska and Andrew Connolly",
title = "A demonstration of iterative parallel array processing
in support of telescope image analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1322--1325",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we present AscotDB, a new tool
for the analysis of telescope image data. AscotDB
results from the integration of ASCOT, a Web-based tool
for the collaborative analysis of telescope images and
their metadata, and SciDB, a parallel array processing
engine. We demonstrate the novel data exploration
supported by this integrated tool on a 1 TB dataset
comprising scientifically accurate, simulated telescope
images. We also demonstrate novel iterative-processing
features that we added to SciDB in order to support
this use-case.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abdelhaq:2013:EOL,
author = "Hamed Abdelhaq and Christian Sengstock and Michael
Gertz",
title = "{EvenTweet}: online localized event detection from
{Twitter}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1326--1329",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Microblogging services such as Twitter, Facebook, and
Foursquare have become major sources for information
about real-world events. Most approaches that aim at
extracting event information from such sources
typically use the temporal context of messages.
However, exploiting the location information of
georeferenced messages, too, is important to detect
localized events, such as public events or emergency
situations. Users posting messages that are close to
the location of an event serve as human sensors to
describe an event. In this demonstration, we present a
novel framework to detect localized events in real-time
from a Twitter stream and to track the evolution of
such events over time. For this, spatio-temporal
characteristics of keywords are continuously extracted
to identify meaningful candidates for event
descriptions. Then, localized event information is
extracted by clustering keywords according to their
spatial similarity. To determine the most important
events in a (recent) time frame, we introduce a scoring
scheme for events. We demonstrate the functionality of
our system, called EvenTweet, using a stream of tweets
from Europe during the 2012 UEFA European Football
Championship.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Mousavi:2013:ITM,
  author =       {Hamid Mousavi and Shi Gao and Carlo Zaniolo},
  title =        {{IBminer}: a text mining tool for constructing and
    populating {InfoBox} databases and knowledge bases},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1330--1333},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {Knowledge bases and structured summaries are playing a
    crucial role in many applications, such as text
    summarization, question answering, essay grading, and
    semantic search. Although, many systems (e.g., DBpedia
    and YaGo2) provide massive knowledge bases of such
    summaries, they all suffer from incompleteness,
    inconsistencies, and inaccuracies. These problems can
    be addressed and much improved by combining and
    integrating different knowledge bases, but their very
    large sizes and their reliance on different
    terminologies and ontologies make the task very
    difficult. In this demo, we will demonstrate a system
    that is achieving good success on this task by: (i)
    employing available interlinks in the current knowledge
    bases (e.g. external link and redirect links in
    DBpedia) to combine information on individual entities,
    and (ii) using widely available text corpora (e.g.
    Wikipedia) and our IBminer text-mining system, to
    generate and verify structured information, and
    reconcile terminologies across different knowledge
    bases. We will also demonstrate two tools designed to
    support the integration process in close collaboration
    with IBminer. The first is the InfoBox Knowledge-Base
    Browser (IBKB) which provides structured summaries and
    their provenance, and the second is the InfoBox Editor
    (IBE), which is designed to suggest relevant attributes
    for a user-specified subject, whereby the user can
    easily improve the knowledge base without requiring any
    knowledge about the internal terminology of individual
    systems.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Farnan:2013:PPA,
  author =       {Nicholas L. Farnan and Adam J. Lee and Panos K.
    Chrysanthis and Ting Yu},
  title =        {{PAQO}: a preference-aware query optimizer for
    {PostgreSQL}},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1334--1337},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {Although the declarative nature of SQL provides great
    utility to database users, its use in distributed
    database management systems can leave users unaware of
    which servers in the system are evaluating portions of
    their queries. By allowing users to merely say what
    data they are interested in accessing without providing
    guidance regarding how to retrieve it, query optimizers
    can generate plans with unintended consequences to the
    user (e.g., violating user privacy by revealing
    sensitive portions of a user's query to untrusted
    servers, or impacting result freshness by pulling data
    from stale data stores). To address these types of
    issues, we have created a framework that empowers users
    with the ability to specify constraints on the kinds of
    plans that can be produced by the optimizer to evaluate
    their queries. Such constraints are specified through
    an extended version of SQL that we have developed which
    we call PASQL. With this proposal, we aim to
    demonstrate PAQO, a version of PostgreSQL's query
    optimizer that we have modified to produce plans that
    respect constraints specified through PASQL while
    optimizing user-specified SQL queries in terms of
    performance.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Bothe:2013:EPS,
  author =       {Suvarna Bothe and Panagiotis Karras and Akrivi
    Vlachou},
  title =        {{eSkyline}: processing skyline queries over encrypted
    data},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1338--1341},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {The advent of cloud computing redefines the
    traditional query processing paradigm. Whereas
    computational overhead and memory constraints become
    less prohibitive, data privacy, security, and
    confidentiality concerns become top priorities. In
    particular, as data owners outsource the management of
    their data to service providers, query processing over
    such data has more resources to tap into, yet the data
    oftentimes has to be encrypted so as to prevent
    unauthorized access. The challenge that arises in such
    a setting is to devise an encryption scheme that still
    allows for query results to be efficiently computed
    using the encrypted data values. An important type of
    query that raises unconventional requirements in terms
    of the operator that has to be evaluated is the skyline
    query, which returns a set of objects in a dataset
    whose values are not dominated by any other object
    therein. In this demonstration, we present eSkyline, a
    prototype system and query interface that enables the
    processing of skyline queries over encrypted data, even
    without preserving the order on each attribute as
    order-preserving encryption would do. Our system
    comprises of an encryption scheme that facilitates the
    evaluation of domination relationships, hence allows
    for state-of-the-art skyline processing algorithms to
    be used. The actual data values are reconstructed only
    at the client side, where the encryption key is known.
    Our demo visualizes the details of the encryption
    scheme, allows a user to interact with a server, and
    showcases the efficiency of computing skyline queries
    and decrypting the results.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Jiang:2013:GMD,
author = "Lilong Jiang and Michael Mandel and Arnab Nandi",
title = "{GestureQuery}: a multitouch database query
interface",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1342--1345",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multitouch interfaces allow users to directly and
interactively manipulate data. We propose bringing such
interactive manipulation to the task of querying SQL
databases. This paper describes an initial
implementation of such an interface for multitouch
tablet devices called GestureQuery that translates
multitouch gestures into database queries. It provides
database users with immediate constructive feedback on
their queries, allowing rapid iteration and refinement
of those queries. Based on preliminary user studies,
GestureQuery is easier to use, and lets users
construct target queries quicker than console-based SQL
and visual query builders while maintaining interactive
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2013:MLP,
author = "Di Yang and Kaiyu Zhao and Maryam Hasan and Hanyuan Lu
and Elke Rundensteiner and Matthew Ward",
title = "Mining and linking patterns across live data streams
and stream archives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1346--1349",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We will demonstrate the visual analytics system
$V$istream$^T$, that supports interactive mining of
complex patterns within and across live data streams
and stream pattern archives. Our system is equipped
with both computational pattern mining and
visualization techniques, which allow it to not only
efficiently discover and manage patterns but also
effectively convey the mining results to human analysts
through visual displays. In our demonstration, we will
illustrate that with $V$istream$^T$, analysts can
easily submit, monitor and interact with a broad range
of query types for pattern mining. This includes novel
strategies for extracting complex patterns from streams
in real time, summarizing neighbour-based patterns
using multi-resolution compression strategies,
selectively pushing patterns into the stream archive,
validating the popularity or rarity of stream patterns
by stream archive matching, and pattern evolution
tracking to link patterns across time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Samet:2013:PMQ,
  author =       {Hanan Samet and Marco D. Adelfio and Brendan C. Fruin
    and Michael D. Lieberman and Jagan Sankaranarayanan},
  title =        {{PhotoStand}: a map query interface for a database of
    news photos},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1350--1353},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {PhotoStand enables the use of a map query interface to
    retrieve news photos associated with news articles that
    are in turn associated with the principal locations
    that they mention collected as a result of monitoring
    the output of over 10,000 RSS news feeds, made
    available within minutes of publication, and stored in
    a PostgreSQL database. The news photos are ranked
    according to their relevance to the clusters of news
    articles associated with locations at which they are
    displayed. This work differs from traditional work in
    this field as the associated locations and topics (by
    virtue of the cluster with which the articles
    containing the news photos are associated) are
    generated automatically without any human intervention
    such as tagging, and that photos are retrieved by
    location instead of just by keyword as is the case for
    many existing systems. In addition, the clusters
    provide a filtering step for detecting near-duplicate
    news photos.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Kumar:2013:HSH,
  author =       {K. Ashwin Kumar and Jonathan Gluck and Amol Deshpande
    and Jimmy Lin},
  title =        {{Hone}: ``Scaling down'' {Hadoop} on shared-memory
    systems},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1354--1357},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {The underlying assumption behind Hadoop and, more
    generally, the need for distributed processing is that
    the data to be analyzed cannot be held in memory on a
    single machine. Today, this assumption needs to be
    re-evaluated. Although petabyte-scale data-stores are
    increasingly common, it is unclear whether ``typical''
    analytics tasks require more than a single high-end
    server. Additionally, we are seeing increased
    sophistication in analytics, e.g., machine learning,
    which generally operates over smaller and more refined
    datasets. To address these trends, we propose ``scaling
    down'' Hadoop to run on shared-memory machines. This
    paper presents a prototype runtime called Hone,
    intended to be both API and binary compatible with
    standard (distributed) Hadoop. That is, Hone can take
    an existing Hadoop jar and efficiently execute it,
    without modification, on a multi-core shared memory
    machine. This allows us to take existing Hadoop
    algorithms and find the most suitable run-time
    environment for execution on datasets of varying sizes.
    Our experiments show that Hone can be an order of
    magnitude faster than Hadoop pseudo-distributed mode
    (PDM); on dataset sizes that fit into memory, Hone can
    outperform a fully-distributed 15-node Hadoop cluster
    in some cases as well.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Antenucci:2013:RGN,
author = "Dolan Antenucci and Erdong Li and Shaobo Liu and
Bochun Zhang and Michael J. Cafarella and Christopher
R{\'e}",
title = "{Ringtail}: a generalized nowcasting system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1358--1361",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social media nowcasting--using online user activity to
describe real-world phenomena--is an active area of
research to supplement more traditional and costly data
collection methods such as phone surveys. Given the
potential impact of such research, we would expect
general-purpose nowcasting systems to quickly become a
standard tool among noncomputer scientists, yet it has
largely remained a research topic. We believe a major
obstacle to widespread adoption is the nowcasting
feature selection problem. Typical nowcasting systems
require the user to choose a handful of social media
objects from a pool of billions of potential
candidates, which can be a time-consuming and
error-prone process. We have built RINGTAIL, a
nowcasting system that helps the user by automatically
suggesting high-quality signals. We demonstrate that
RINGTAIL can make nowcasting easier by suggesting
relevant features for a range of topics. The user
provides just a short topic query (e.g., unemployment)
and a small conventional dataset in order for RINGTAIL
to quickly return a usable predictive nowcasting
model.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Xie:2013:IIP,
  author =       {Min Xie and Laks V. S. Lakshmanan and Peter T. Wood},
  title =        {{IPS}: an interactive package configuration system for
    trip planning},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1362--1365},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {When planning a trip, one essential task is to find a
    set of Places-of-Interest (POIs) which can be visited
    during the trip. Using existing travel guides or
    websites such as Lonely Planet and TripAdvisor, the
    user has to either manually work out a desirable set of
    POIs or take pre-configured travel packages; the former
    can be time consuming while the latter lacks
    flexibility. In this demonstration, we propose an
    Interactive Package configuration System (IPS), which
    visualizes different candidate packages on a map, and
    enables users to configure a travel package through
    simple interactions, i.e., comparing packages and
    fixing/removing POIs from a package. Compared with
    existing trip planning systems, we believe IPS strikes
    the right balance between flexibility and manual
    effort.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Zhou:2013:RDS,
  author =       {Jingbo Zhou and Anthony K. H. Tung and Wei Wu and Wee
    Siong Ng},
  title =        {{R2-D2}: a system to support probabilistic path
    prediction in dynamic environments via ``Semi-lazy''
    learning},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1366--1369},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {Path prediction is presently an important area of
    research with a wide range of applications. However,
    most of the existing path prediction solutions are
    based on eager learning methods which commit to a model
    or a set of patterns extracted from historical
    trajectories. Such methods do not perform very well in
    dynamic environments where the objects' trajectories
    are affected by many irregular factors which are not
    captured by pre-defined models or patterns. In this
    demonstration, we present the ``R2-D2'' system that
    supports probabilistic path prediction in dynamic
    environments. The core of our system is a ``semi-lazy''
    learning approach to probabilistic path prediction
    which builds a prediction model on the fly using
    historical trajectories that are selected dynamically
    based on the trajectories of target objects. Our
    ``R2-D2'' system has a visual interface that shows how
    our path prediction algorithm works on several
    real-world datasets. It also allows us to experiment
    with various parameter settings.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Chun:2013:RRE,
  author =       {Byung-Gon Chun and Tyson Condie and Carlo Curino and
    Chris Douglas and Sergiy Matusevych and Brandon Myers
    and Shravan Narayanamurthy and Raghu Ramakrishnan and
    Sriram Rao and Josh Rosen and Russell Sears and Markus
    Weimer},
  title =        {{REEF}: retainable evaluator execution framework},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1370--1373},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {In this demo proposal, we describe REEF, a framework
    that makes it easy to implement scalable,
    fault-tolerant runtime environments for a range of
    computational models. We will demonstrate diverse
    workloads, including extract-transform-load MapReduce
    jobs, iterative machine learning algorithms, and ad-hoc
    declarative query processing. At its core, REEF builds
    atop YARN (Apache Hadoop 2's resource manager) to
    provide retainable hardware resources with lifetimes
    that are decoupled from those of computational tasks.
    This allows us to build persistent (cross-job) caches
    and cluster-wide services, but, more importantly,
    supports high-performance iterative graph processing
    and machine learning algorithms. Unlike existing
    systems, REEF aims for composability of jobs across
    computational models, providing significant performance
    and usability gains, even with legacy code. REEF
    includes a library of interoperable data management
    primitives optimized for communication and data
    movement (which are distinct from storage locality).
    The library also allows REEF applications to access
    external services, such as user-facing relational
    databases. We were careful to decouple lower levels of
    REEF from the data models and semantics of systems
    built atop it. The result was two new standalone
    systems: Tang, a configuration manager and dependency
    injector, and Wake, a state-of-the-art event-driven
    programming and data movement framework. Both are
    language independent, allowing REEF to bridge the JVM
    and .NET.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Zhang:2013:OTP,
  author =       {Shuhao Zhang and Jiong He and Bingsheng He and Mian
    Lu},
  title =        {{OmniDB}: towards portable and efficient query
    processing on parallel {CPU\slash GPU} architectures},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1374--1377},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {Driven by the rapid hardware development of parallel
    CPU/GPU architectures, we have witnessed emerging
    relational query processing techniques and
    implementations on those parallel architectures.
    However, most of those implementations are not portable
    across different architectures, because they are
    usually developed from scratch and target at a specific
    architecture. This paper proposes a kernel-adapter
    based design (OmniDB), a portable yet efficient query
    processor on parallel CPU/GPU architectures. OmniDB
    attempts to develop an extensible query processing
    kernel (qKernel) based on an abstract model for
    parallel architectures, and to leverage an
    architecture-specific layer (adapter) to make qKernel
    be aware of the target architecture. The goal of OmniDB
    is to maximize the common functionality in qKernel so
    that the development and maintenance efforts for
    adapters are minimized across different architectures.
    In this demo, we demonstrate our initial efforts in
    implementing OmniDB, and present the preliminary
    results on the portability and efficiency.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Savkovic:2013:CAI,
  author =       {Ognjen Savkovi{\'c} and Paramita Mirza and Alex Tomasi
    and Werner Nutt},
  title =        {Complete approximations of incomplete queries},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1378--1381},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {We present a system that computes for a query that may
    be incomplete, complete approximations from above and
    from below. We assume a setting where queries are posed
    over a partially complete database, that is, a database
    that is generally incomplete, but is known to contain
    complete information about specific aspects of its
    application domain. Which parts are complete, is
    described by a set of so-called table-completeness
    statements. Previous work led to a theoretical
    framework and an implementation that allowed one to
    determine whether in such a scenario a given
    conjunctive query is guaranteed to return a complete
    set of answers or not. With the present demonstrator we
    show how to reformulate the original query in such a
    way that answers are guaranteed to be complete. If
    there exists a more general complete query, there is a
    unique most specific one, which we find. If there
    exists a more specific complete query, there may even
    be infinitely many. In this case, we find the least
    specific specializations whose size is bounded by a
    threshold provided by the user. Generalizations are
    computed by a fixpoint iteration, employing an answer
    set programming engine. Specializations are found
    leveraging unification from logic programming.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Koutrika:2013:UAU,
author = "Georgia Koutrika and Qian Lin and Jerry Liu",
title = "User analytics with {UbeOne}: insights into web
printing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1382--1385",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As web and mobile applications become more sensitive
to the user context, there is a shift from purely
off-line processing of user actions (log analysis) to
real-time user analytics that can generate information
about the user context to be instantly leveraged by the
application. UbeOne is a system that enables both
real-time and aggregate analytics from user data. The
system is designed as a set of lightweight, composeable
mechanisms that can progressively and collectively
analyze a user action, such as pinning, saving or
printing a web page. We will demonstrate the system
capabilities on analyzing a live feed of URLs printed
through a proprietary, web browser plug-in. This is in
fact the first analysis of web printing activity. We
will also give a taste of how the system can enable
instant recommendations based on the user context.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Santos:2013:DDS,
  author =       {Ivo Santos and Marcel Tilly and Badrish Chandramouli
    and Jonathan Goldstein},
  title =        {{DiAl}: distributed streaming analytics anywhere,
    anytime},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1386--1389},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {Connected devices are expected to grow to 50 billion
    in 2020. Through our industrial partners and their use
    cases, we validated the importance of inflight data
    processing to produce results with low latency, in
    particular local and global data analytics
    capabilities. In order to cope with the scalability
    challenges posed by distributed streaming analytics
    scenarios, we propose two new technologies: (1)
    JStreams, a low footprint and efficient JavaScript
    complex event processing engine supporting local
    analytics on heterogeneous devices and (2) DiAlM, a
    distributed analytics management service that leverages
    cloud-edge evolving topologies. In the demonstration,
    based on a real manufacturing use case, we walk through
    a situation where operators supervise manufacturing
    equipment through global analytics, and drill down into
    alarm cases on the factory floor by locally inspecting
    the data generated by the manufacturing equipment.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Chirkova:2013:BUW,
  author =       {Rada Chirkova and Jun Yang},
  title =        {Big and useful: what's in the data for me?},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1390--1391},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Bartos:2013:UIA,
  author =       {Tom{\'a}s Bartos},
  title =        {Universal indexing of arbitrary similarity models},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1392--1397},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {The increasing amount of available unstructured
    content together with the growing number of large
    nonrelational databases put more emphasis on the
    content-based retrieval and precisely on the area of
    similarity searching. Although there exist several
    indexing methods for efficient querying, not all of
    them are best-suited for arbitrary similarity models.
    Having a metric space, we can easily apply metric
    access methods but for nonmetric models which typically
    better describe similarities between generally
    unstructured objects the situation is a little bit more
    complicated. To address this challenge, we introduce
    SIMDEX, the universal framework that is capable of
    finding alternative indexing methods that will serve
    for efficient yet effective similarity searching for
    any similarity model. Using trivial or more advanced
    methods for the incremental exploration of possible
    indexing techniques, we are able to find alternative
    methods to the widely used metric space model paradigm.
    Through experimental evaluations, we validate our
    approach and show how it outperforms the known indexing
    methods.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Bress:2013:WIT,
  author =       {Sebastian Bre{\ss} and Gunter Saake},
  title =        {Why it is time for a {HyPE}: a hybrid query processing
    engine for efficient {GPU} coprocessing in {DBMS}},
  journal =      j-PROC-VLDB-ENDOWMENT,
  volume =       {6},
  number =       {12},
  pages =        {1398--1403},
  month =        aug,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {2150-8097},
  ISSN-L =       {2150-8097},
  bibdate =      {Fri Dec 13 05:57:00 MST 2013},
  bibsource =    {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract =     {GPU acceleration is a promising approach to speed up
    query processing of database systems by using low cost
    graphic processors as coprocessors. Two major trends
    have emerged in this area: (1) The development of
    frameworks for scheduling tasks in heterogeneous
    CPU/GPU platforms, which is mainly in the context of
    coprocessing for applications and does not consider
    specifics of database-query processing and
    optimization. (2) The acceleration of database
    operations using efficient GPU algorithms, which
    typically cannot be applied easily on other database
    systems, because of their analytical-algorithm-specific
    cost models. One major challenge is how to combine
    traditional database query processing with GPU
    coprocessing techniques and efficient database
    operation scheduling in a GPU-aware query optimizer. In
    this thesis, we develop a hybrid query processing
    engine, which extends the traditional physical
    optimization process to generate hybrid query plans and
    to perform a cost-based optimization in a way that the
    advantages of CPUs and GPUs are combined. Furthermore,
    we aim at a portable solution between different
    GPU-accelerated database management systems to maximize
    applicability. Preliminary results indicate great
    potential.},
  acknowledgement = ack-nhfb,
  ajournal =     {Proc. VLDB Endowment},
  fjournal =     {Proceedings of the VLDB Endowment},
  journal-URL =  {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Mahdiraji:2013:DSU,
author = "Alireza Rezaei Mahdiraji and Peter Baumann",
title = "Database support for unstructured meshes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1404--1409",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite ubiquitous usage of unstructured mesh in many
application domains (e.g., computer aided design,
scientific simulation, climate modeling, etc.), there
is no specialized mesh database which supports storing
and querying such data structures. Existing mesh
libraries use file-based APIs which do not support
declarative querying and are difficult to maintain. A
mesh database can benefit these domains in several ways
such as: declarative query language, ease of
maintenance, query optimization, etc. In this thesis
work, the core idea is to have a very general model
which can represent objects from different domains and
specialize it to smaller object classes using
combinatorial constraints. We propose the Incidence
multi-Graph Complex (ImG-Complex) data model for
storing combinatorial aspect of meshes in a database.
We extend incidence graph (IG) representation with
multi-incidence information (ImG) to represent a class
of objects which we call ImG-Complexes. ImG-Complex can
support a wide range of application domains. We
introduce optional and application-specific constraints
to restrain the general ImG model to specific object
classes or specific geometric representations. The
constraints check validity of meshes based on the
properties of the modeled object class. Finally, we
show how graph databases can be utilized and reused to
query some combinatorial mesh queries based on the
(possibly constrained) ImG model. In particular, we
show the strengths and limitations of a graph-only
query language in expressing combinatorial mesh
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Madaan:2013:DSM,
author = "Aastha Madaan and Subhash Bhalla",
title = "Domain specific multistage query language for medical
document repositories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1410--1415",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Vast amount of medical information is increasingly
available on the Web. As a result, seeking medical
information through queries is gaining importance in
the medical domain. The existing keyword-based search
engines such as Google, Yahoo fail to suffice the needs
of the health-care workers (who are well-versed with
the domain knowledge required for querying) using these
they often face results which are irrelevant and not
useful for their tasks. In this paper, we present the
need and the challenges for a user-level,
domain-specific query language for the specialized
document repositories of the medical domain. This topic
has not been sufficiently addressed by the existing
approaches including SQL-like query languages or
general-purpose keyword-based search engines and
document-level indexing based search. We aim to bridge
the gap between information needs of the
skilled/semi-skilled domain users and the query
capability provided by the query language. Overcoming
such a challenge can facilitate effective use of large
volume of information on the Web (and in the electronic
health records (EHRs) repositories).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Taxidou:2013:RAI,
author = "Io Taxidou and Peter Fischer",
title = "Realtime analysis of information diffusion in social
media",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1416--1421",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The goal of this thesis is to investigate real-time
analysis methods on social media with a focus on
information diffusion. From a conceptual point of view,
we are interested both in the structural, sociological
and temporal aspects of information diffusion in social
media with a twist on the real time factor of what is
happening right now. From a technical side, the sheer
size of current social media services (100's of
millions of users) and the large amount of data
produced by these users renders conventional approaches
for these costly analyses impossible. For that, we need
to go beyond the state-of-the-art infrastructure for
data-intensive computation. Our high level goal is to
investigate how information diffuses in real time on
the underlying social network and the role of different
users in the propagation process. We plan to implement
these analyses with full and partially missing datasets
and compare the cost and quality of both approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bonomi:2013:MFP,
author = "Luca Bonomi and Li Xiong",
title = "Mining frequent patterns with differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1422--1427",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The mining of frequent patterns is a fundamental
component in many data mining tasks. A considerable
amount of research on this problem has led to a wide
series of efficient and scalable algorithms for mining
frequent patterns. However, releasing these patterns is
posing concerns on the privacy of the users
participating in the data. Indeed the information from
the patterns can be linked with a large amount of data
available from other sources creating opportunities for
adversaries to break the individual privacy of the
users and disclose sensitive information. In this
proposal, we study the mining of frequent patterns in a
privacy preserving setting. We first investigate the
difference between sequential and itemset patterns, and
second we extend the definition of patterns by
considering the absence and presence of noise in the
data. This leads us in distinguishing the patterns
between exact and noisy. For exact patterns, we
describe two novel mining techniques that we previously
developed. The first approach has been applied in a
privacy preserving record linkage setting, where our
solution is used to mine frequent patterns which are
employed in a secure transformation procedure to link
records that are similar. The second approach improves
the mining utility results using a two-phase strategy
which allows to effectively mine frequent substrings as
well as prefixes patterns. For noisy patterns, first we
formally define the patterns according to the type of
noise and second we provide a set of potential
applications that require the mining of these patterns.
We conclude the paper by stating the challenges in this
new setting and possible future research directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hoppe:2013:AOB,
author = "Anett Hoppe and Christophe Nicolle and Ana Roxin",
title = "Automatic ontology-based user profile learning from
heterogeneous {Web} resources in a big data context",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1428--1433",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Web has developed to the biggest source of
information and entertainment in the world. By its
size, its adaptability and flexibility, it challenged
our current paradigms on information sharing in several
areas. By offering everybody the opportunity to release
own contents in a fast and cheap way, the Web already
led to a revolution of the traditional publishing world
and just now, it commences to change the perspective on
advertisements. With the possibility to adapt the
contents displayed on a page dynamically based on the
viewer's context, campaigns launched to target rough
customer groups will become an element of the past.
However, this new ecosystem, that relates
advertisements with the user, heavily relies on the
quality of the underlying user profile. This profile
has to be able to model any combination of user
characteristics, the relations between its composing
elements and the uncertainty that stems from the
automated processing of real-world data. The work at
hand describes the beginnings of a PhD project that
aims to tackle those issues using a combination of data
analysis, ontology engineering and processing of big
data resources provided by an industrial partner. The
final goal is to automatically construct and populate a
profile ontology for each user identified by the
system. This allows to associate these users to
high-value audience segments in order to drive digital
marketing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dey:2013:STA,
author = "Akon Dey and Alan Fekete and Uwe R{\"o}hm",
title = "Scalable transactions across heterogeneous {NoSQL}
key--value data stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1434--1439",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many cloud systems provide data stores with limited
features, especially they may not provide transactions,
or else restrict transactions to a single item. We
propose an approach that gives multi-item transactions
across heterogeneous data stores, using only a minimal
set of features from each store such as single item
consistency, conditional update, and the ability to
include extra metadata within a value. We offer a
client-coordinated transaction protocol that does not
need a central coordinating infrastructure. A prototype
implementation has been built as a Java library and
measured with an extension of YCSB benchmark to
exercise multi-item transactions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ngo:2013:GUS,
author = "Nhung Ngo and Enrico Franconi",
title = "Getting unique solution in data exchange",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1440--1443",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A schema mapping is a high-level specification in
which the relationship between two database schemas is
described. In data exchange, schema mappings are
one-way mappings that describe which data can be
brought from source data to target data. Therefore,
given a source instance and a mapping, there might be
more than one valid target instance. This fact causes
many problems in query answering over target data for
non-conjunctive queries. To make query answering
feasible for all queries, we focus on a methodology for
extending the original schema mapping to guarantee the
uniqueness of target instance corresponding to a source
instance. To this end, we introduce a theoretical
framework where the problem is transformed to an
abduction problem, namely, definability abduction. We
apply the framework to relational data exchange setting
and solve the problem by pointing out minimal solutions
according to a specific semantic minimality
criterion.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaufmann:2013:SPT,
author = "Martin Kaufmann and Donald Kossmann",
title = "Storing and processing temporal data in a main memory
column store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1444--1449",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Managing and accessing temporal data is of increasing
importance in industry. So far, most companies model
the time dimension on the application layer rather than
pushing down the operators to the database, which leads
to a significant performance overhead. The goal of this
PhD thesis is to develop a native support of temporal
features for SAP HANA, which is a commercial in-memory
column store database system. We investigate different
alternatives to store temporal data physically and
analyze the trade-offs arising from different memory
layouts which cluster the data either by time or by
space dimension. Taking into account the underlying
physical representation, different temporal operators
such as temporal aggregation, time travel and temporal
join have to be executed efficiently. We present a
novel data structure called Timeline Index and
algorithms based on this index, which have a very
competitive performance for all temporal operators
beating existing best-of-breed approaches by factors,
sometimes even by orders of magnitude. The results of
this thesis are currently being integrated into HANA,
with the goal of being shipped to the customers as a
productive release within the next few months.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kozak:2013:ESS,
author = "Stepan Kozak and Pavel Zezula",
title = "Efficiency and security in similarity cloud services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1450--1455",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With growing popularity of cloud services, the trend
in the industry is to outsource the data to a 3rd party
system that provides searching in the data as a
service. This approach naturally brings privacy
concerns about the (potentially sensitive) data.
Recently, quite extensive research of outsourcing
classic exact-match or keyword search has been done.
However, not much attention has been paid to the
outsourcing of the similarity search, which becomes
more and more important in information retrieval
applications. In this work, we propose to the research
community a model of outsourcing similarity search to
the cloud environment (so called similarity cloud). We
establish privacy and efficiency requirements to be
laid down for the similarity cloud with an emphasis on
practical use of the system in real applications; this
requirement list can be used as a general guideline for
practical system analysis and we use it to analyze
current existing approaches. We propose two new
similarity indexes that ensure data privacy and thus
are suitable for search systems outsourced in a cloud.
The balance of the first proposed technique EM-Index is
more on the efficiency side while the other (DSH Index)
shifts this balance more to the privacy side.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sellam:2013:FCD,
author = "Thibault Sellam and Martin Kersten",
title = "Fast cartography for data explorers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "12",
pages = "1456--1461",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:00 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Exploration is the act of investigating unknown
regions. An analyst exploring a database cannot, by
definition, compose the right query or use the
appropriate data mining algorithm. However, current
data management tools cannot operate without well
defined instructions. Therefore, browsing an unknown
database can be a very tedious process. Our project,
Atlas, is an attempt to circumvent this problem. Atlas
is an active DBMS front-end, designed for database
exploration. It generates and ranks several data maps
from a user query. A data map is a small set of
database queries (less than a dozen), in which each
query describes an interesting region of the database.
The user can pick one and submit it for further
exploration. In order to support interaction, the
system should operate in quasi-real time, possibly at
the cost of precision, and require as little input
parameters as possible. We draft a framework to
generate such data maps, and introduce several short- to
long-terms research problems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Simoes:2013:WSP,
author = "Gon{\c{c}}alo Sim{\~o}es and Helena Galhardas and Luis
Gravano",
title = "When speed has a price: fast information extraction
using approximate algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1462--1473",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A wealth of information produced by individuals and
organizations is expressed in natural language text.
This is a problem since text lacks the explicit
structure that is necessary to support rich querying
and analysis. Information extraction systems are
sophisticated software tools to discover structured
information in natural language text. Unfortunately,
information extraction is a challenging and
time-consuming task. In this paper, we address the
limitations of state-of-the-art systems for the
optimization of information extraction programs, with
the objective of producing efficient extraction
executions. Our solution relies on exploiting a wide
range of optimization opportunities. For efficiency, we
consider a wide spectrum of execution plans, including
approximate plans whose results differ in their
precision and recall. Our optimizer accounts for these
characteristics of the competing execution plans, and
uses accurate predictors of their extraction time,
recall, and precision. We demonstrate the efficiency
and effectiveness of our optimizer through a
large-scale experimental evaluation over real-world
datasets and multiple extraction tasks and
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chasseur:2013:DES,
author = "Craig Chasseur and Jignesh M. Patel",
title = "Design and evaluation of storage organizations for
read-optimized main memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1474--1485",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing main memory data processing systems employ a
variety of storage organizations and make a number of
storage-related design choices. The focus of this paper
is on systematically evaluating a number of these key
storage design choices for main memory analytical (i.e.
read-optimized) database settings. Our evaluation
produces a number of key insights: First, it is always
beneficial to organize data into self-contained memory
blocks rather than large files. Second, both
column-stores and row-stores display performance
advantages for different types of queries, and for high
performance both should be implemented as options for
the tuple-storage layout. Third, cache-sensitive
B+-tree indices can play a major role in accelerating
query performance, especially when used in a
block-oriented organization. Finally, compression can
also play a role in accelerating query performance
depending on data distribution and query selectivity.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2013:ASA,
author = "Luying Chen and Stefano Ortona and Giorgio Orsi and
Michael Benedikt",
title = "Aggregating semantic annotators",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1486--1497",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A growing number of resources are available for
enriching documents with semantic annotations. While
originally focused on a few standard classes of
annotations, the ecosystem of annotators is now
becoming increasingly diverse. Although annotators
often have very different vocabularies, with both
high-level and specialist concepts, they also have many
semantic interconnections. We will show that both the
overlap and the diversity in annotator vocabularies
motivate the need for semantic annotation integration:
middleware that produces a unified annotation on top of
diverse semantic annotators. On the one hand, the
diversity of vocabulary allows applications to benefit
from the much richer vocabulary available in an
integrated vocabulary. On the other hand, we present
evidence that the most widely-used annotators on the
web suffer from serious accuracy deficiencies: the
overlap in vocabularies from individual annotators
allows an integrated annotator to boost accuracy by
exploiting inter-annotator agreement and disagreement.
The integration of semantic annotations leads to new
challenges, both compared to usual data integration
scenarios and to standard aggregation of machine
learning tools. We overview an approach to these
challenges that performs ontology-aware aggregation. We
introduce an approach that requires no training data,
making use of ideas from database repair. We
experimentally compare this with a supervised approach,
which adapts maximal entropy Markov models to the
setting of ontology-based annotations. We further
experimentally compare both these approaches with
respect to ontology-unaware supervised approaches, and
to individual annotators.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2013:DDC,
author = "Xu Chu and Ihab F. Ilyas and Paolo Papotti",
title = "Discovering denial constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1498--1509",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Integrity constraints (ICs) provide a valuable tool
for enforcing correct application semantics. However,
designing ICs requires experts and time. Proposals for
automatic discovery have been made for some formalisms,
such as functional dependencies and their extension
conditional functional dependencies. Unfortunately,
these dependencies cannot express many common business
rules. For example, an American citizen cannot have
lower salary and higher tax rate than another citizen
in the same state. In this paper, we tackle the
challenges of discovering dependencies in a more
expressive integrity constraint language, namely Denial
Constraints (DCs). DCs are expressive enough to
overcome the limits of previous languages and, at the
same time, have enough structure to allow efficient
discovery and application in several scenarios. We lay
out theoretical and practical foundations for DCs,
including a set of sound inference rules and a linear
algorithm for implication testing. We then develop an
efficient instance-driven DC discovery algorithm and
propose a novel scoring function to rank DCs for user
validation. Using real-world and synthetic datasets, we
experimentally evaluate scalability and effectiveness
of our solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2013:DTK,
author = "Wenfei Fan and Xin Wang and Yinghui Wu",
title = "Diversified top-$k$ graph pattern matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1510--1521",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph pattern matching has been widely used in e.g.,
social data analysis. A number of matching algorithms
have been developed that, given a graph pattern $Q$ and
a graph $G$, compute the set $ M(Q, G) $ of matches of
$Q$ in $G$. However, these algorithms often return an
excessive number of matches, and are expensive on large
real-life social graphs. Moreover, in practice many
social queries are to find matches of a specific
pattern node, rather than the entire $ M(Q, G) $. This
paper studies top-$k$ graph pattern matching. (1) We
revise graph pattern matching defined in terms of
simulation, by supporting a designated output node
$ u_o $. Given $G$ and $Q$, it is to find those nodes in
$ M(Q, G) $ that match $ u_o $, instead of the large set
$ M(Q, G) $. (2) We study two classes of functions for
ranking the matches: relevance functions $ \delta_r() $
based on, e.g., social impact, and distance functions
$ \delta_d() $ to cover diverse elements. (3) We develop
two algorithms for computing top-$k$ matches of $ u_o $
based on $ \delta_r() $, with the early termination
property, i.e., they find top-$k$ matches without
computing the entire $ M(Q, G) $. (4) We also study
diversified top-$k$ matching, a bi-criteria
optimization problem based on both $ \delta_r() $ and
$ \delta_d() $. We show that its decision problem is
NP-complete. Nonetheless, we provide an approximation
algorithm with performance guarantees and a heuristic
one with the early termination property. (5) Using
real-life and synthetic data, we experimentally verify
that our (diversified) top-$k$ matching algorithms are
effective, and outperform traditional matching
algorithms in efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rao:2013:BNF,
author = "Weixiong Rao and Lei Chen and Pan Hui and Sasu
Tarkoma",
title = "{Bitlist}: new full-text index for low space cost and
efficient keyword search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1522--1533",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowadays Web search engines are experiencing
significant performance challenges caused by a huge
amount of Web pages and increasingly larger number of
Web users. The key issue for addressing these
challenges is to design a compact structure which can
index Web documents with low space and meanwhile
process keyword search very fast. Unfortunately, the
current solutions typically separate the space
optimization from the search improvement. As a result,
such solutions either save space yet with search
inefficiency, or allow fast keyword search but with
huge space requirement. In this paper, to address the
challenges, we propose a novel structure bitlist with
both low space requirement and supporting fast keyword
search. Specifically, based on a simple and yet very
efficient encoding scheme, bitlist uses a single number
to encode a set of integer document IDs for low space,
and adopts fast bitwise operations for very efficient
boolean-based keyword search. Our extensive
experimental results on real and synthetic data sets
verify that bitlist outperforms the recent proposed
solution, inverted list compression [23, 22] by
spending 36.71\% less space and 61.91\% faster
processing time, and achieves comparable running time
as [8] but with significantly lower space.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wandelt:2013:RSS,
author = "Sebastian Wandelt and Johannes Starlinger and Marc Bux
and Ulf Leser",
title = "{RCSI}: scalable similarity search in thousand(s) of
genomes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1534--1545",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Until recently, genomics has concentrated on comparing
sequences between species. However, due to the sharply
falling cost of sequencing technology, studies of
populations of individuals of the same species are now
feasible and promise advances in areas such as
personalized medicine and treatment of genetic
diseases. A core operation in such studies is read
mapping, i.e., finding all parts of a set of genomes
which are within edit distance $k$ to a given query
sequence ($k$-approximate search). To achieve
sufficient speed, current algorithms solve this problem
only for one to-be-searched genome and compute only
approximate solutions, i.e., they miss some $k$ ---
approximate occurrences. We present RCSI, Referentially
Compressed Search Index, which scales to a thousand
genomes and computes the exact answer. It exploits the
fact that genomes of different individuals of the same
species are highly similar by first compressing the
to-be-searched genomes with respect to a reference
genome. Given a query, RCSI then searches the reference
and all genome-specific individual differences. We
propose efficient data structures for representing
compressed genomes and present algorithms for scalable
compression and similarity search. We evaluate our
algorithms on a set of 1092 human genomes, which amount
to approx. 3 TB of raw data. RCSI compresses this set
by a ratio of 450:1 (26:1 including the search index)
and answers similarity queries on a mid-class server in
15 ms on average even for comparably large error
thresholds, thereby significantly outperforming other
methods. Furthermore, we present a fast and adaptive
heuristic for choosing the best reference sequence for
referential compression, a problem that was never
studied before at this scale.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tao:2013:AMS,
author = "Yufei Tao and Xiaocheng Hu and Dong-Wan Choi and
Chin-Wan Chung",
title = "Approximate {MaxRS} in spatial databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1546--1557",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the maximizing range sum (MaxRS) problem, given (i)
a set $P$ of $2$D points each of which is associated
with a positive weight, and (ii) a rectangle $r$ of
specific extents, we need to decide where to place $r$
in order to maximize the covered weight of $r$ --- that
is, the total weight of the data points covered by $r$.
Algorithms solving the problem exactly entail expensive
CPU or I/O cost. In practice, exact answers are often
not compulsory in a MaxRS application, where slight
imprecision can often be comfortably tolerated,
provided that approximate answers can be computed
considerably faster. Motivated by this, the present
paper studies the $ (1 - \epsilon) $-approximate MaxRS
problem, which admits the same inputs as MaxRS, but
aims instead to return a rectangle whose covered weight
is at least $ (1 - \epsilon) m^* $, where $ m^* $ is
the optimal covered weight, and $ \epsilon $ can be an
arbitrarily small constant between $0$ and $1$. We
present fast algorithms that settle this problem with
strong theoretical guarantees.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimelfeld:2013:MTD,
author = "Benny Kimelfeld and Jan Vondr{\'a}k and David P.
Woodruff",
title = "Multi-tuple deletion propagation: approximations and
complexity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1558--1569",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper studies the computational complexity of the
classic problem of deletion propagation in a relational
database, where tuples are deleted from the base
relations in order to realize a desired deletion of
tuples from the view. Such an operation may result in a
(sometimes unavoidable) side effect: deletion of
additional tuples from the view, besides the
intentionally deleted ones. The goal is to minimize the
side effect. The complexity of this problem has been
well studied in the case where only a single tuple is
deleted from the view. However, only little is known
within the more realistic scenario of multi-tuple
deletion, which is the topic of this paper. The class
of conjunctive queries (CQs) is among the most well
studied in the literature, and we focus here on views
defined by CQs that are self-join free (sjf-CQs). Our
main result is a trichotomy in complexity, classifying
all sjf-CQs into three categories: those for which the
problem is in polynomial time, those for which the
problem is NP-hard but polynomial-time approximable (by
a constant-factor), and those for which even an
approximation (by any factor) is NP-hard to obtain. A
corollary of this trichotomy is a dichotomy in the
complexity of deciding whether a side-effect-free
solution exists, in the multi-tuple case. We further
extend the full classification to accommodate the
presence of a constant upper bound on the number of
view tuples to delete, and the presence of functional
dependencies. Finally, we establish (positive and
negative) complexity results on approximability for the
dual problem of maximizing the number of view tuples
surviving (rather than minimizing the side effect
incurred in) the deletion propagation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2013:SDF,
author = "Badrish Chandramouli and Suman Nath and Wenchao Zhou",
title = "Supporting distributed feed-following apps over edge
devices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1570--1581",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In feed-following applications such as Twitter and
Facebook, users (consumers) follow a large number of
other users (producers) to get personalized feeds,
generated by blending producers' feeds. With the
proliferation of Cloud-connected smart edge devices
such as smartphones, producers and consumers of many
feed-following applications reside on edge devices and
the Cloud. An important design goal of such
applications is to minimize communication (and energy)
overhead of edge devices. In this paper, we abstract
distributed feed-following applications as a view
maintenance problem, with the goal of optimally placing
the views on edge devices and in the Cloud to minimize
communication overhead between edge devices and the
Cloud. The view placement problem for general network
topology is NP-hard; however, we show that for the
special case of Cloud-edge topology, locally optimal
solutions yield a globally optimal view placement
solution. Based on this powerful result, we propose
view placement algorithms that are highly efficient,
yet provably minimize global network cost. Compared to
existing works on feed-following applications, our
algorithms are more general --- they support views with
selection, projection, correlation (join) and arbitrary
black-box operators, and can even refer to other views.
We have implemented our algorithms within a distributed
feed-following architecture over real smartphones and
the Cloud. Experiments over real datasets indicate that
our algorithms are highly scalable and
orders-of-magnitude more efficient than existing
strategies for optimal placement. Further, our results
show that optimal placements generated by our
algorithms are often several factors better than
simpler schemes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thirumuruganathan:2013:RDW,
author = "Saravanan Thirumuruganathan and Nan Zhang and Gautam
Das",
title = "Rank discovery from web databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1582--1593",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many web databases are only accessible through a
proprietary search interface which allows users to form
a query by entering the desired values for a few
attributes. After receiving a query, the system returns
the top-$k$ matching tuples according to a
pre-determined ranking function. Since the rank of a
tuple largely determines the attention it receives from
website users, ranking information for any tuple ---
not just the top-ranked ones --- is often of
significant interest to third parties such as sellers,
customers, market researchers and investors. In this
paper, we define a novel problem of rank discovery over
hidden web databases. We introduce a taxonomy of
ranking functions, and show that different types of
ranking functions require fundamentally different
approaches for rank discovery. Our technical
contributions include principled and efficient
randomized algorithms for estimating the rank of a
given tuple, as well as negative results which
demonstrate the inefficiency of any deterministic
algorithm. We show extensive experimental results over
real-world databases, including an online experiment at
Amazon.com, which illustrates the effectiveness of our
proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rekatsinas:2013:SPS,
author = "Theodoros Rekatsinas and Amol Deshpande and Ashwin
Machanavajjhala",
title = "{SPARSI}: partitioning sensitive data amongst multiple
adversaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1594--1605",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present SPARSI, a novel theoretical framework for
partitioning sensitive data across multiple
non-colluding adversaries. Most work in privacy-aware
data sharing has considered disclosing summaries where
the aggregate information about the data is preserved,
but sensitive user information is protected.
Nonetheless, there are applications, including online
advertising, cloud computing and crowdsourcing markets,
where detailed and fine-grained user data must be
disclosed. We consider a new data sharing paradigm and
introduce the problem of privacy-aware data
partitioning, where a sensitive dataset must be
partitioned among $k$ untrusted parties (adversaries).
The goal is to maximize the utility derived by
partitioning and distributing the dataset, while
minimizing the total amount of sensitive information
disclosed. The data should be distributed so that an
adversary, without colluding with other adversaries,
cannot draw additional inferences about the private
information, by linking together multiple pieces of
information released to her. The assumption of no
collusion is both reasonable and necessary in the above
application domains that require release of private
user information. SPARSI enables us to formally define
privacy-aware data partitioning using the notion of
sensitive properties for modeling private information
and a hypergraph representation for describing the
interdependencies between data entries and private
information. We show that solving privacy-aware
partitioning is, in general, NP-hard, but for specific
information disclosure functions, good approximate
solutions can be found using relaxation techniques.
Finally, we present a local search algorithm applicable
to generic information disclosure functions. We conduct
a rigorous performance evaluation with real-world and
synthetic datasets that illustrates the effectiveness
of SPARSI at partitioning sensitive data while
minimizing disclosure.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deng:2013:SCC,
author = "Dong Deng and Yu Jiang and Guoliang Li and Jian Li and
Cong Yu",
title = "Scalable column concept determination for {Web} tables
using large knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1606--1617",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tabular data on the Web has become a rich source of
structured data that is useful for ordinary users to
explore. Due to its potential, tables on the Web have
recently attracted a number of studies with the goals
of understanding the semantics of those Web tables and
providing effective search and exploration mechanisms
over them. An important part of table understanding and
search is column concept determination, i.e.,
identifying the most appropriate concepts associated
with the columns of the tables. The problem becomes
especially challenging with the availability of
increasingly rich knowledge bases that contain hundreds
of millions of entities. In this paper, we focus on an
important instantiation of the column concept
determination problem, namely, the concepts of a column
are determined by fuzzy matching its cell values to the
entities within a large knowledge base. We provide an
efficient and scalable MapReduce-based solution that is
scalable to both the number of tables and the size of
the knowledge base and propose two novel techniques:
knowledge concept aggregation and knowledge entity
partition. We prove that both the problem of finding
the optimal aggregation strategy and that of finding
the optimal partition strategy are NP-hard, and propose
efficient heuristic techniques by leveraging the
hierarchy of the knowledge base. Experimental results
on real-world datasets show that our method achieves
high annotation quality and performance, and scales
well.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2013:TKS,
author = "Xin Huang and Hong Cheng and Rong-Hua Li and Lu Qin
and Jeffrey Xu Yu",
title = "Top-$k$ structural diversity search in large
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1618--1629",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social contagion depicts a process of information
(e.g., fads, opinions, news) diffusion in the online
social networks. A recent study reports that in a
social contagion process the probability of contagion
is tightly controlled by the number of connected
components in an individual's neighborhood. Such a
number is termed structural diversity of an individual
and it is shown to be a key predictor in the social
contagion process. Based on this, a fundamental issue
in a social network is to find top-$k$ users with the
highest structural diversities. In this paper, we, for
the first time, study the top-$k$ structural diversity
search problem in a large network. Specifically, we
develop an effective upper bound of structural
diversity for pruning the search space. The upper bound
can be incrementally refined in the search process.
Based on such upper bound, we propose an efficient
framework for top-$k$ structural diversity search. To
further speed up the structural diversity evaluation in
the search process, several carefully devised heuristic
search strategies are proposed. Extensive experimental
studies are conducted in 13 real-world large networks,
and the results demonstrate the efficiency and
effectiveness of the proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cavalieri:2013:SCX,
author = "Federico Cavalieri and Alessandro Solimando and
Giovanna Guerrini",
title = "Synthetising changes in {XML} documents as {PULs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "13",
pages = "1630--1641",
month = aug,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:09 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability of efficiently detecting changes in XML
documents is crucial in many application contexts. If
such changes are represented as XQuery Update Pending
Update Lists (PULs), they can then be applied on
documents using XQuery Update engines, and document
management can take advantage of existing composition,
inversion, reconciliation approaches developed in the
update processing context. The paper presents an XML
edit-script generator with the unique characteristic of
using PULs as edit-script language and improving the
state of the art from both the performance and the
generated edit-script quality perspectives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2013:PQR,
author = "Lei Zhang and Thanh Tran and Achim Rettinger",
title = "Probabilistic query rewriting for efficient and
effective keyword search on graph data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1642--1653",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of rewriting keyword search queries on
graph data has been studied recently, where the main
goal is to clean user queries by rewriting keywords as
valid tokens appearing in the data and grouping them
into meaningful segments. The main solution to this
problem employs heuristics for ranking query rewrites
and a dynamic programming algorithm for computing them.
Based on a broader set of queries defined by an
existing benchmark, we show that the use of these
heuristics does not yield good results. We propose a
novel probabilistic framework, which enables the
optimality of a query rewrite to be estimated in a more
principled way. We show that our approach outperforms
existing work in terms of effectiveness and efficiency
of query rewriting. More importantly, we provide the
first results indicating query rewriting can indeed
improve overall keyword search runtime performance and
result quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schaler:2013:QBH,
author = "Martin Sch{\"a}ler and Alexander Grebhahn and Reimar
Schr{\"o}ter and Sandro Schulze and Veit K{\"o}ppen and
Gunter Saake",
title = "{QuEval}: beyond high-dimensional indexing {\`a} la
carte",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1654--1665",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the recent past, the amount of high-dimensional
data, such as feature vectors extracted from multimedia
data, increased dramatically. A large variety of
indexes have been proposed to store and access such
data efficiently. However, due to specific requirements
of a certain use case, choosing an adequate index
structure is a complex and time-consuming task. This
may be due to engineering challenges or open research
questions. To overcome this limitation, we present
QuEval, an open-source framework that can be flexibly
extended w.r.t. index structures, distance metrics, and
data sets. QuEval provides a unified environment for a
sound evaluation of different indexes, for instance, to
support tuning of indexes. In an empirical evaluation,
we show how to apply our framework, motivate benefits,
and demonstrate analysis possibilities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2013:DLL,
author = "Yuhong Li and Leong Hou U. and Man Lung Yiu and Zhiguo
Gong",
title = "Discovering longest-lasting correlation in sequence
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1666--1677",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most existing work on sequence databases use
correlation (e.g., Euclidean distance and Pearson
correlation) as a core function for various analytical
tasks. Typically, it requires users to set a length for
the similarity queries. However, there is no steady way
to define the proper length on different application
needs. In this work we focus on discovering
longest-lasting highly correlated subsequences in
sequence databases, which is particularly useful in
helping those analyses without prior knowledge about
the query length. Surprisingly, there has been limited
work on this problem. A baseline solution is to
calculate the correlations for every possible
subsequence combination. Obviously, the brute force
solution is not scalable for large datasets. In this
work we study a space-constrained index that gives a
tight correlation bound for subsequences of similar
length and offset by intra-object grouping and
inter-object grouping techniques. To the best of our
knowledge, this is the first index to support
normalized distance metric of arbitrary length
subsequences. Extensive experimental evaluation on both
real and synthetic sequence datasets verifies the
efficiency and effectiveness of our proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Popescu:2013:PTP,
author = "Adrian Daniel Popescu and Andrey Balmin and Vuk
Ercegovac and Anastasia Ailamaki",
title = "{PREDIcT}: towards predicting the runtime of large
scale iterative analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1678--1689",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning algorithms are widely used today for
analytical tasks such as data cleaning, data
categorization, or data filtering. At the same time,
the rise of social media motivates recent uptake in
large scale graph processing. Both categories of
algorithms are dominated by iterative subtasks, i.e.,
processing steps which are executed repetitively until
a convergence condition is met. Optimizing cluster
resource allocations among multiple workloads of
iterative algorithms motivates the need for estimating
their runtime, which in turn requires: (i) predicting
the number of iterations, and (ii) predicting the
processing time of each iteration. As both parameters
depend on the characteristics of the dataset and on the
convergence function, estimating their values before
execution is difficult. This paper proposes PREDIcT, an
experimental methodology for predicting the runtime of
iterative algorithms. PREDIcT uses sample runs for
capturing the algorithm's convergence trend and
per-iteration key input features that are well
correlated with the actual processing requirements of
the complete input dataset. Using this combination of
characteristics we predict the runtime of iterative
algorithms, including algorithms with very different
runtime patterns among subsequent iterations. Our
experimental evaluation of multiple algorithms on
scale-free graphs shows a relative prediction error of
10\%--30\% for predicting runtime, including algorithms
with up to $ 100 \times $ runtime variability among
consecutive iterations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2013:ERW,
author = "Xiaohan Zhao and Adelbert Chang and Atish Das Sarma
and Haitao Zheng and Ben Y. Zhao",
title = "On the embeddability of random walk distances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1690--1701",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analysis of large graphs is critical to the ongoing
growth of search engines and social networks. One class
of queries centers around node affinity, often
quantified by random-walk distances between node pairs,
including hitting time, commute time, and personalized
PageRank (PPR). Despite the potential of these
``metrics,'' they are rarely, if ever, used in
practice, largely due to extremely high computational
costs. In this paper, we investigate methods to
scalably and efficiently compute random-walk distances,
by ``embedding'' graphs and distances into points and
distances in geometric coordinate spaces. We show that
while existing graph coordinate systems (GCS) can
accurately estimate shortest path distances, they
produce significant errors when embedding random-walk
distances. Based on our observations, we propose a new
graph embedding system that explicitly accounts for
per-node graph properties that affect random walk.
Extensive experiments on a range of graphs show that
our new approach can accurately estimate both symmetric
and asymmetric random-walk distances. Once a graph is
embedded, our system can answer queries between any two
nodes in 8 microseconds, orders of magnitude faster
than existing methods. Finally, we show that our system
produces estimates that can replace ground truth in
applications with minimal impact on application
output.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Muhlbauer:2013:ILM,
author = "Tobias M{\"u}hlbauer and Wolf R{\"o}diger and Robert
Seilbeck and Angelika Reiser and Alfons Kemper and
Thomas Neumann",
title = "Instant loading for main memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1702--1713",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "eScience and big data analytics applications are
facing the challenge of efficiently evaluating complex
queries over vast amounts of structured text data
archived in network storage solutions. To analyze such
data in traditional disk-based database systems, it
needs to be bulk loaded, an operation whose performance
largely depends on the wire speed of the data source
and the speed of the data sink, i.e., the disk. As the
speed of network adapters and disks has stagnated in
the past, loading has become a major bottleneck. The
delays it is causing are now ubiquitous as text formats
are a preferred storage format for reasons of
portability. But the game has changed: Ever increasing
main memory capacities have fostered the development of
in-memory database systems and very fast network
infrastructures are on the verge of becoming
economical. While hardware limitations for fast loading
have disappeared, current approaches for main memory
databases fail to saturate the now available wire
speeds of tens of Gbit/s. With Instant Loading, we
contribute a novel CSV loading approach that allows
scalable bulk loading at wire speed. This is achieved
by optimizing all phases of loading for modern
super-scalar multi-core CPUs. Large main memory
capacities and Instant Loading thereby facilitate a
very efficient data staging processing model consisting
of instantaneous load-work-unload cycles across data
archives on a single node. Once data is loaded, updates
and queries are efficiently processed with the
flexibility, security, and high performance of
relational main memory databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexiou:2013:ARF,
author = "Karolina Alexiou and Donald Kossmann and Per-{\AA}ke
Larson",
title = "Adaptive range filters for cold data: avoiding trips
to {Siberia}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1714--1725",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Bloom filters are a great technique to test whether a
key is not in a set of keys. This paper presents a
novel data structure called ARF. In a nutshell, ARFs
are for range queries what Bloom filters are for point
queries. That is, an ARF can determine whether a set of
keys does not contain any keys that are part of a
specific range. This paper describes the principles and
methods for efficient implementation of ARFs and
presents the results of comprehensive experiments that
assess the precision, space, and latency of ARFs.
Furthermore, this paper shows how ARFs can be applied
to a commercial database system that partitions data
into hot and cold regions to optimize queries that
involve only hot data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2013:SPA,
author = "Badrish Chandramouli and Jonathan Goldstein and Abdul
Quamar",
title = "Scalable progressive analytics on big data in the
{Cloud}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1726--1737",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analytics over the increasing quantity of data stored
in the Cloud has become very expensive, particularly
due to the pay-as-you-go Cloud computation model. Data
scientists typically manually extract samples of
increasing data size (progressive samples) using
domain-specific sampling strategies for exploratory
querying. This provides them with user-control,
repeatable semantics, and result provenance. However,
such solutions result in tedious workflows that
preclude the reuse of work across samples. On the other
hand, existing approximate query processing systems
report early results, but do not offer the above
benefits for complex ad-hoc queries. We propose a new
progressive analytics system based on a progress model
called Prism that (1) allows users to communicate
progressive samples to the system; (2) allows efficient
and deterministic query processing over samples; and
(3) provides repeatable semantics and provenance to
data scientists. We show that one can realize this
model for atemporal relational queries using an
unmodified temporal streaming engine, by
re-interpreting temporal event fields to denote
progress. Based on Prism, we build Now!, a progressive
data-parallel computation framework for Windows Azure,
where progress is understood as a first-class citizen
in the framework. Now! works with ``progress-aware
reducers'' --- in particular, it works with streaming
engines to support progressive SQL over big data.
Extensive experiments on Windows Azure with real and
synthetic workloads validate the scalability and
benefits of Now! and its optimizations, over current
solutions for progressive analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ogden:2013:SXQ,
author = "Peter Ogden and David Thomas and Peter Pietzuch",
title = "Scalable {XML} query processing using parallel
pushdown transducers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1738--1749",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In online social networking, network monitoring and
financial applications, there is a need to query high
rate streams of XML data, but methods for executing
individual XPath queries on streaming XML data have not
kept pace with multicore CPUs. For data-parallel
processing, a single XML stream is typically split into
well-formed fragments, which are then processed
independently. Such an approach, however, introduces a
sequential bottleneck and suffers from low cache
locality, limiting its scalability across CPU cores. We
describe a data-parallel approach for the processing of
streaming XPath queries based on pushdown transducers.
Our approach permits XML data to be split into
arbitrarily-sized chunks, with each chunk processed by
a parallel automaton instance. Since chunks may be
malformed, our automata consider all possible starting
states for XML elements and build mappings from
starting to finishing states. These mappings can be
constructed independently for each chunk by different
CPU cores. For streaming queries from the XPathMark
benchmark, we show a processing throughput of 2.5 GB/s,
with near linear scaling up to 64 CPU cores.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huai:2013:UIB,
author = "Yin Huai and Siyuan Ma and Rubao Lee and Owen O'Malley
and Xiaodong Zhang",
title = "Understanding insights into the basic structure and
essential issues of table placement methods in
clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1750--1761",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A table placement method is a critical component in
big data analytics on distributed systems. It
determines the way how data values in a two-dimensional
table are organized and stored in the underlying
cluster. Based on Hadoop computing environments,
several table placement methods have been proposed and
implemented. However, a comprehensive and systematic
study to understand, to compare, and to evaluate
different table placement methods has not been done.
Thus, it is highly desirable to gain important insights
into the basic structure and essential issues of table
placement methods in the context of big data processing
infrastructures. In this paper, we present such a
study. The basic structure of a data placement method
consists of three core operations: row reordering,
table partitioning, and data packing. All the existing
placement methods are formed by these core operations
with variations made by the three key factors: (1) the
size of a horizontal logical subset of a table (or the
size of a row group), (2) the function of mapping
columns to column groups, and (3) the function of
packing columns or column groups in a row group into
physical blocks. We have designed and implemented a
benchmarking tool to provide insights into how
variations of each factor affect the I/O performance of
reading data of a table stored by a table placement
method. Based on our results, we give suggested actions
to optimize table reading performance. Results from
large-scale experiments have also confirmed that our
findings are valid for production workloads. Finally,
we present ORC File as a case study to show the
effectiveness of our findings and suggested actions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mottin:2013:POF,
author = "Davide Mottin and Alice Marascu and Senjuti Basu Roy
and Gautam Das and Themis Palpanas and Yannis
Velegrakis",
title = "A probabilistic optimization framework for the
empty-answer problem",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1762--1773",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a principled optimization-based interactive
query relaxation framework for queries that return no
answers. Given an initial query that returns an empty
answer set, our framework dynamically computes and
suggests alternative queries with less conditions than
those the user has initially requested, in order to
help the user arrive at a query with a non-empty
answer, or at a query for which no matter how many
additional conditions are ignored, the answer will
still be empty. Our proposed approach for suggesting
query relaxations is driven by a novel probabilistic
framework based on optimizing a wide variety of
application-dependent objective functions. We describe
optimal and approximate solutions of different
optimization problems using the framework. We analyze
these solutions, experimentally verify their efficiency
and effectiveness, and illustrate their advantage over
the existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2013:SAG,
author = "Yinghui Wu and Shengqi Yang and Mudhakar Srivatsa and
Arun Iyengar and Xifeng Yan",
title = "Summarizing answer graphs induced by keyword queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1774--1785",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Keyword search has been popularly used to query graph
data. Due to the lack of structure support, a keyword
query might generate an excessive number of matches,
referred to as ``answer graphs'', that could include
different relationships among keywords. An ignored yet
important task is to group and summarize answer graphs
that share similar structures and contents for better
query interpretation and result understanding. This
paper studies the summarization problem for the answer
graphs induced by a keyword query $Q$. (1) A notion of
summary graph is proposed to characterize the
summarization of answer graphs. Given $Q$ and a set of
answer graphs $G$, a summary graph preserves the
relation of the keywords in $Q$ by summarizing the
paths connecting the keywords nodes in $G$. (2) A
quality metric of summary graphs, called coverage
ratio, is developed to measure information loss of
summarization. (3) Based on the metric, a set of
summarization problems are formulated, which aim to
find minimized summary graphs with certain coverage
ratio. (a) We show that the complexity of these
summarization problems ranges from ptime to
NP-complete. (b) We provide exact and heuristic
summarization algorithms. (4) Using real-life and
synthetic graphs, we experimentally verify the
effectiveness and the efficiency of our techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Duan:2013:SKS,
author = "Huizhong Duan and ChengXiang Zhai and Jinxing Cheng
and Abhishek Gattani",
title = "Supporting keyword search in product database: a
probabilistic approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1786--1797",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to let users search for products
conveniently in product database is critical to the
success of e-commerce. Although structured query
languages (e.g. SQL) can be used to effectively access
the product database, it is very difficult for end
users to learn and use. In this paper, we study how to
optimize search over structured product entities
(represented by specifications) with keyword queries
such as ``cheap gaming laptop''. One major difficulty
in this problem is the vocabulary gap between the
specifications of products in the database and the
keywords people use in search queries. To solve the
problem, we propose a novel probabilistic entity
retrieval model based on query generation, where the
entities would be ranked for a given keyword query
based on the likelihood that a user who likes an entity
would pose the query. Different ways to estimate the
model parameters would lead to different variants of
ranking functions. We start with simple estimates based
on the specifications of entities, and then leverage
user reviews and product search logs to improve the
estimation. Multiple estimation algorithms are
developed based on Maximum Likelihood and Maximum a
Posteriori estimators. We evaluate the proposed product
entity retrieval models on two newly created product
search test collections. The results show that the
proposed model significantly outperforms the existing
retrieval models, benefiting from the modeling of
attribute-level relevance. Despite the focus on product
retrieval, the proposed modeling method is general and
opens up many new opportunities in analyzing structured
entity data with unstructured text data. We show the
proposed probabilistic model can be easily adapted for
many interesting applications including facet
generation and review annotation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nirkhiwale:2013:SAA,
author = "Supriya Nirkhiwale and Alin Dobra and Christopher
Jermaine",
title = "A sampling algebra for aggregate estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1798--1809",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As of 2005, sampling has been incorporated in all
major database systems. While efficient sampling
techniques are realizable, determining the accuracy of
an estimate obtained from the sample is still an
unresolved problem. In this paper, we present a
theoretical framework that allows an elegant treatment
of the problem. We base our work on generalized uniform
sampling (GUS), a class of sampling methods that
subsumes a wide variety of sampling techniques. We
introduce a key notion of equivalence that allows GUS
sampling operators to commute with selection and join,
and derivation of confidence intervals. We illustrate
the theory through extensive examples and give
indications on how to use it to provide meaningful
estimates in database systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dylla:2013:TPD,
author = "Maximilian Dylla and Iris Miliaraki and Martin
Theobald",
title = "A temporal-probabilistic database model for
information extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1810--1821",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Temporal annotations of facts are a key component both
for building a high-accuracy knowledge base and for
answering queries over the resulting temporal knowledge
base with high precision and recall. In this paper, we
present a temporal-probabilistic database model for
cleaning uncertain temporal facts obtained from
information extraction methods. Specifically, we
consider a combination of temporal deduction rules,
temporal consistency constraints and probabilistic
inference based on the common possible-worlds semantics
with data lineage, and we study the theoretical
properties of this data model. We further develop a
query engine which is capable of scaling to very large
temporal knowledge bases, with nearly interactive query
response times over millions of uncertain facts and
hundreds of thousands of grounded rules. Our
experiments over two real-world datasets demonstrate
the increased robustness of our approach compared to
related techniques based on constraint solving via
Integer Linear Programming (ILP) and probabilistic
inference via Markov Logic Networks (MLNs). We are also
able to show that our runtime performance is more than
competitive to current ILP solvers and the fastest
available, probabilistic but non-temporal, database
engines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fender:2013:CSG,
author = "Pit Fender and Guido Moerkotte",
title = "Counter strike: generic top-down join enumeration for
hypergraphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1822--1833",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding the optimal execution order of join operations
is a crucial task of today's cost-based query
optimizers. There are two approaches to identify the
best plan: bottom-up and top-down join enumeration. But
only the top-down approach allows for branch-and-bound
pruning, which can improve compile time by several
orders of magnitude while still preserving optimality.
For both optimization strategies, efficient enumeration
algorithms have been published. However, there are two
severe limitations for the top-down approach: The
published algorithms can handle only (1) simple
(binary) join predicates and (2) inner joins. Since
real queries may contain complex join predicates
involving more than two relations, and outer joins as
well as other non-inner joins, efficient top-down join
enumeration cannot be used in practice yet. We develop
a novel top-down join enumeration algorithm that
overcomes these two limitations. Furthermore, we show
that our new algorithm is competitive when compared to
the state of the art in bottom-up processing even
without playing out its advantage by making use of its
branch-and-bound pruning capabilities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Achakeev:2013:EBU,
author = "Daniar Achakeev and Bernhard Seeger",
title = "Efficient bulk updates on multiversion {B}-trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1834--1845",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Partial persistent index structures support efficient
access to current and past versions of objects, while
updates are allowed on the current version. The
Multiversion B-Tree (MVBT) represents a partially
persistent index-structure with both, asymptotic
worst-case performance and excellent performance in
real life applications. Updates are performed
tuple-by-tuple with the same asymptotic performance as
for standard B+trees. To the best of our knowledge,
there is no efficient algorithm for bulk loading and
bulk update of MVBT and other partially persistent
index structures. In this paper, we propose the first
loading algorithm for MVBT that meets the lower-bound
of external sorting. In addition, our approach is also
applicable to bulk updates. This is achieved by
combining two basic technologies, weight balancing and
buffer tree. Our extensive set of experiments confirm
the theoretical findings: Our loading algorithm runs
considerably faster than performing updates
tuple-by-tuple.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Altwaijry:2013:QDA,
author = "Hotham Altwaijry and Dmitri V. Kalashnikov and Sharad
Mehrotra",
title = "Query-driven approach to entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1846--1857",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper explores ``on-the-fly'' data cleaning in
the context of a user query. A novel Query-Driven
Approach (QDA) is developed that performs a minimal
number of cleaning steps that are only necessary to
answer a given selection query correctly. The
comprehensive empirical evaluation of the proposed
approach demonstrates its significant advantage in
terms of efficiency over traditional techniques for
query-driven applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Szlichta:2013:ECO,
author = "Jaros{\l}aw Szlichta and Parke Godfrey and Jarek Gryz
and Calisto Zuzarte",
title = "Expressiveness and complexity of order dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1858--1869",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Dependencies play an important role in databases. We
study order dependencies (ODs)--and unidirectional
order dependencies (UODs), a proper sub-class of
ODs--which describe the relationships among
lexicographical orderings of sets of tuples. We
consider lexicographical ordering, as by the order-by
operator in SQL, because this is the notion of order
used in SQL and within query optimization. Our main
goal is to investigate the inference problem for ODs,
both in theory and in practice. We show the usefulness
of ODs in query optimization. We establish the
following theoretical results: (i) a hierarchy of order
dependency classes; (ii) a proof of co-NP-completeness
of the inference problem for the subclass of UODs (and
ODs); (iii) a proof of co-NP-completeness of the
inference problem of functional dependencies (FDs) from
ODs in general, but demonstrate linear time complexity
for the inference of FDs from UODs; (iv) a sound and
complete elimination procedure for inference over ODs;
and (v) a sound and complete polynomial inference
algorithm for sets of UODs over restricted domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pavan:2013:CST,
author = "A. Pavan and Kanat Tangwongsan and Srikanta Tirthapura
and Kun-Lung Wu",
title = "Counting and sampling triangles from a graph stream",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1870--1881",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a new space-efficient algorithm
for counting and sampling triangles--and more
generally, constant-sized cliques--in a massive graph
whose edges arrive as a stream. Compared to prior work,
our algorithm yields significant improvements in the
space and time complexity for these fundamental
problems. Our algorithm is simple to implement and has
very good practical performance on large graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sowell:2013:EAI,
author = "Benjamin Sowell and Marcos Vaz Salles and Tuan Cao and
Alan Demers and Johannes Gehrke",
title = "An experimental analysis of iterated spatial joins in
main memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1882--1893",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many modern applications rely on high-performance
processing of spatial data. Examples include
location-based services, games, virtual worlds, and
scientific simulations such as molecular dynamics and
behavioral simulations. These applications deal with
large numbers of moving objects that continuously sense
their environment, and their data access can often be
abstracted as a repeated spatial join. Updates to
object positions are interspersed with these join
operations, and batched for performance. Even for the
most demanding scenarios, the data involved in these
joins fits comfortably in the main memory of a cluster
of machines, and most applications run completely in
main memory for performance reasons. Choosing
appropriate spatial join algorithms is challenging due
to the large number of techniques in the literature. In
this paper, we perform an extensive evaluation of
repeated spatial join algorithms for distance (range)
queries in main memory. Our study is unique in breadth
when compared to previous work: We implement, tune, and
compare ten distinct algorithms on several workloads
drawn from the simulation and spatial indexing
literature. We explore the design space of both index
nested loops algorithms and specialized join
algorithms, as well as the use of moving object indices
that can be incrementally maintained. Surprisingly, we
find that when queries and updates can be batched,
repeatedly re-computing the join result from scratch
outperforms using a moving object index in all but the
most extreme cases. This suggests that--given the code
complexity of index structures for moving objects--
specialized join strategies over simple index
structures, such as Synchronous Traversal over R-Trees,
should be the methods of choice for the above
applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2013:SQB,
author = "Kisung Lee and Ling Liu",
title = "Scaling queries over big {RDF} graphs with semantic
hash partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1894--1905",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive volumes of big RDF data are growing beyond the
performance capacity of conventional RDF data
management systems operating on a single node.
Applications using large RDF data demand efficient data
partitioning solutions for supporting RDF data access
on a cluster of compute nodes. In this paper we present
a novel semantic hash partitioning approach and
implement a Semantic HAsh Partitioning-Enabled
distributed RDF data management system, called Shape.
This paper makes three original contributions. First,
the semantic hash partitioning approach we propose
extends the simple hash partitioning method through
direction-based triple groups and direction-based
triple replications. The latter enhances the former by
controlled data replication through intelligent
utilization of data access locality, such that queries
over big RDF graphs can be processed with zero or very
small amount of inter-machine communication cost.
Second, we generate locality-optimized query execution
plans that are more efficient than popular multi-node
RDF data management systems by effectively minimizing
the inter-machine communication cost for query
processing. Third but not the least, we provide a suite
of locality-aware optimization techniques to further
reduce the partition size and cut down on the
inter-machine communication cost during distributed
query processing. Experimental results show that our
system scales well and can process big RDF datasets
more efficiently than existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Seo:2013:DSD,
author = "Jiwon Seo and Jongsoo Park and Jaeho Shin and Monica
S. Lam",
title = "Distributed {SociaLite}: a {Datalog}-based language for
large-scale graph analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1906--1917",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale graph analysis is becoming important with
the rise of world-wide social network services.
Recently in SociaLite, we proposed extensions to
Datalog to efficiently and succinctly implement graph
analysis programs on sequential machines. This paper
describes novel extensions and optimizations of
SociaLite for parallel and distributed executions to
support large-scale graph analysis. With distributed
SociaLite, programmers simply annotate how data are to
be distributed, then the necessary communication is
automatically inferred to generate parallel code for
cluster of multi-core machines. It optimizes the
evaluation of recursive monotone aggregate functions
using a delta stepping technique. In addition,
approximate computation is supported in SociaLite,
allowing programmers to trade off accuracy for less
time and space. We evaluated SociaLite with six core
graph algorithms used in many social network analyses.
Our experiment with 64 Amazon EC2 8-core instances
shows that SociaLite programs performed within a factor
of two with respect to ideal weak scaling. Compared to
optimized Giraph, an open-source alternative of Pregel,
SociaLite programs are 4 to 12 times faster across
benchmark algorithms, and 22 times more succinct on
average. As a declarative query language, SociaLite,
with the help of a compiler that generates efficient
parallel and approximate code, can be used easily to
create many social apps that operate on large-scale
distributed graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarwat:2013:HDS,
author = "Mohamed Sarwat and Sameh Elnikety and Yuxiong He and
Mohamed F. Mokbel",
title = "{Horton+}: a distributed system for processing
declarative reachability queries over partitioned
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1918--1929",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Horton+ is a graph query processing system that
executes declarative reachability queries on a
partitioned attributed multi-graph. It employs a query
language, query optimizer, and a distributed execution
engine. The query language expresses declarative
reachability queries, and supports closures and
predicates on node and edge attributes to match graph
paths. We introduce three algebraic operators, select,
traverse, and join, and a query is compiled into an
execution plan containing these operators. As
reachability queries access the graph elements in a
random access pattern, the graph is therefore
maintained in the main memory of a cluster of servers
to reduce query execution time. We develop a
distributed execution engine that processes a query
plan in parallel on the graph servers. Since the query
language is declarative, we build a query optimizer
that uses graph statistics to estimate predicate
selectivity. We experimentally evaluate the system
performance on a cluster of 16 graph servers using
synthetic graphs as well as a real graph from an
application that uses reachability queries. The
evaluation shows (1) the efficiency of the optimizer in
reducing query execution time, (2) system scalability
with the size of the graph and with the number of
servers, and (3) the convenience of using declarative
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sundaram:2013:SSS,
author = "Narayanan Sundaram and Aizana Turmukhametova and
Nadathur Satish and Todd Mostak and Piotr Indyk and
Samuel Madden and Pradeep Dubey",
title = "Streaming similarity search over one billion tweets
using parallel locality-sensitive hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1930--1941",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding nearest neighbors has become an important
operation on databases, with applications to text
search, multimedia indexing, and many other areas. One
popular algorithm for similarity search, especially for
high dimensional data (where spatial indexes like
kd-trees do not perform well) is Locality Sensitive
Hashing (LSH), an approximation algorithm for finding
similar objects. In this paper, we describe a new
variant of LSH, called Parallel LSH (PLSH) designed to
be extremely efficient, capable of scaling out on
multiple nodes and multiple cores, and which supports
high-throughput streaming of new data. Our approach
employs several novel ideas, including: cache-conscious
hash table layout, using a 2-level merge algorithm for
hash table construction; an efficient algorithm for
duplicate elimination during hash-table querying; an
insert-optimized hash table structure and efficient
data expiration algorithm for streaming data; and a
performance model that accurately estimates performance
of the algorithm and can be used to optimize parameter
settings. We show that on a workload where we perform
similarity search on a dataset of $>$ 1 Billion tweets,
with hundreds of millions of new tweets per day, we can
achieve query times of 1--2.5 ms. We show that this is
an order of magnitude faster than existing indexing
schemes, such as inverted indexes. To the best of our
knowledge, this is the fastest implementation of LSH,
with table construction times up to $ 3.7 \times $
faster and query times that are $ 8.3 \times $ faster
than a basic implementation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeBrabant:2013:ACN,
author = "Justin DeBrabant and Andrew Pavlo and Stephen Tu and
Michael Stonebraker and Stan Zdonik",
title = "Anti-caching: a new approach to database management
system architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1942--1953",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The traditional wisdom for building disk-based
relational database management systems (DBMS) is to
organize data in heavily-encoded blocks stored on disk,
with a main memory block cache. In order to improve
performance given high disk latency, these systems use
a multi-threaded architecture with dynamic record-level
locking that allows multiple transactions to access the
database at the same time. Previous research has shown
that this results in substantial overhead for on-line
transaction processing (OLTP) applications [15]. The
next generation DBMSs seek to overcome these
limitations with architecture based on main memory
resident data. To overcome the restriction that all
data fit in main memory, we propose a new technique,
called anti-caching, where cold data is moved to disk
in a transactionally-safe manner as the database grows
in size. Because data initially resides in memory, an
anti-caching architecture reverses the traditional
storage hierarchy of disk-based systems. Main memory is
now the primary storage device. We implemented a
prototype of our anti-caching proposal in a
high-performance, main memory OLTP DBMS and performed a
series of experiments across a range of database sizes,
workload skews, and read/write mixes. We compared its
performance with an open-source, disk-based DBMS
optionally fronted by a distributed main memory cache.
Our results show that for higher skewed workloads the
anti-caching architecture has a performance advantage
over either of the other architectures tested of up to
$ 9 \times $ for a data size $ 8 \times $ larger than
memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qardaji:2013:UHM,
author = "Wahbeh Qardaji and Weining Yang and Ninghui Li",
title = "Understanding hierarchical methods for differentially
private histograms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1954--1965",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, many approaches to differentially
privately publish histograms have been proposed.
Several approaches rely on constructing tree structures
in order to decrease the error when answering large range
queries. In this paper, we examine the factors
affecting the accuracy of hierarchical approaches by
studying the mean squared error (MSE) when answering
range queries. We start with one-dimensional
histograms, and analyze how the MSE changes with
different branching factors, after employing
constrained inference, and with different methods to
allocate the privacy budget among hierarchy levels. Our
analysis and experimental results show that combining
the choice of a good branching factor with constrained
inference outperforms the current state of the art.
Finally, we extend our analysis to multi-dimensional
histograms. We show that the benefits from employing
hierarchical methods beyond a single dimension are
significantly diminished, and when there are 3 or more
dimensions, it is almost always better to use the Flat
method instead of a hierarchy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2013:TSD,
author = "Rui Li and Shengjie Wang and Kevin Chen-Chuan Chang",
title = "Towards social data platform: automatic topic-focused
monitor for {Twitter} stream",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1966--1977",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many novel applications have been built based on
analyzing tweets about specific topics. While these
applications provide different kinds of analysis, they
share a common task of monitoring ``target'' tweets
from the Twitter stream for a topic. The current
solution for this task tracks a set of manually
selected keywords with Twitter APIs. Obviously, this
manual approach has many limitations. In this paper, we
propose a data platform to automatically monitor target
tweets from the Twitter stream for any given topic. To
monitor target tweets in an optimal and continuous way,
we design Automatic Topic-focused Monitor (ATM), which
iteratively (1) samples tweets from the stream and (2)
selects keywords to track based on the samples. To
realize ATM, we develop a tweet sampling algorithm to
sample sufficient unbiased tweets with available
Twitter APIs, and a keyword selection algorithm to
efficiently select keywords that have a near-optimal
coverage of target tweets under cost constraints. We
conduct extensive experiments to show the effectiveness
of ATM. E.g., ATM covers 90\% of target tweets for a
topic and improves the manual approach by 49\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jin:2013:SFS,
author = "Ruoming Jin and Guan Wang",
title = "Simple, fast, and scalable reachability oracle",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1978--1989",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A reachability oracle (or hop labeling) assigns each
vertex $v$ two sets of vertices: $ {\rm Lout}(v) $ and
$ {\rm Lin}(v) $, such that $u$ reaches $v$ iff $ {\rm
Lout}(u) \cap {\rm Lin}(v) \neq \emptyset $. Despite their
simplicity and elegance, reachability oracles have
failed to achieve efficiency in more than ten years
since their introduction: The main problem is high
construction cost, which stems from a set-cover
framework and the need to materialize transitive
closure. In this paper, we present two simple and
efficient labeling algorithms, Hierarchical-Labeling
and Distribution-Labeling, which can work on massive
real-world graphs: Their construction time is an order
of magnitude faster than the set-cover based labeling
approach, and transitive closure materialization is not
needed. On large graphs, their index sizes and their
query performance can now beat the state-of-the-art
transitive closure compression and online search
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bakibayev:2013:AOF,
author = "Nurzhan Bakibayev and Tom{\'a}s Kocisk{\'y} and Dan
Olteanu and Jakub Z{\'a}vodn{\'y}",
title = "Aggregation and ordering in factorised databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "1990--2001",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A common approach to data analysis involves
understanding and manipulating succinct representations
of data. In earlier work, we put forward a succinct
representation system for relational data called
factorised databases and reported on the main-memory
query engine FDB for select-project-join queries on
such databases. In this paper, we extend FDB to support
a larger class of practical queries with aggregates and
ordering. This requires novel optimisation and
evaluation techniques. We show how factorisation
coupled with partial aggregation can effectively reduce
the number of operations needed for query evaluation.
We also show how factorisations of query results can
support enumeration of tuples in desired orders as
efficiently as listing them from the unfactorised,
sorted results. We experimentally observe that FDB can
outperform off-the-shelf relational engines by orders
of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2013:PCS,
author = "Yoonjae Park and Jun-Ki Min and Kyuseok Shim",
title = "Parallel computation of skyline and reverse skyline
queries using {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "2002--2013",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The skyline operator and its variants such as dynamic
skyline and reverse skyline operators have attracted
considerable attention recently due to their broad
applications. However, computations of such operators
are challenging today since there is an increasing
trend of applications to deal with big data. For such
data-intensive applications, the MapReduce framework
has been widely used recently. In this paper, we
propose efficient parallel algorithms for processing
the skyline and its variants using MapReduce. We first
build histograms to effectively prune out nonskyline
(non-reverse skyline) points in advance. We next
partition data based on the regions divided by the
histograms and compute candidate (reverse) skyline
points for each region independently using MapReduce.
Finally, we check whether each candidate point is
actually a (reverse) skyline point in every region
independently. Our performance study confirms the
effectiveness and scalability of the proposed
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xie:2013:FIG,
author = "Wenlei Xie and Guozhang Wang and David Bindel and Alan
Demers and Johannes Gehrke",
title = "Fast iterative graph computation with block updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "6",
number = "14",
pages = "2014--2025",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Dec 13 05:57:13 MST 2013",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Scaling iterative graph processing applications to
large graphs is an important problem. Performance is
critical, as data scientists need to execute graph
programs many times with varying parameters. The need
for a high-level, high-performance programming model
has inspired much research on graph programming
frameworks. In this paper, we show that the important
class of computationally light graph applications ---
applications that perform little computation per vertex
--- has severe scalability problems across multiple
cores as these applications hit an early ``memory
wall'' that limits their speedup. We propose a novel
block-oriented computation model, in which computation
is iterated locally over blocks of highly connected
nodes, significantly improving the amount of
computation per cache miss. Following this model, we
describe the design and implementation of a block-aware
graph processing runtime that keeps the familiar
vertex-centric programming paradigm while reaping the
benefits of block-oriented execution. Our experiments
show that block-oriented execution significantly
improves the performance of our framework for several
graph applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2013:EEK,
author = "Xiaoli Wang and Xiaofeng Ding and Anthony K. H. Tung
and Zhenjie Zhang",
title = "Efficient and effective {KNN} sequence search with
approximate $n$-grams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "1--12",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we address the problem of finding
$k$-nearest neighbors (KNN) in sequence databases using
the edit distance. Unlike most existing works using
short and exact $n$-gram matchings together with a
filter-and-refine framework for KNN sequence search,
our new approach allows us to use longer but
approximate $n$-gram matchings as a basis of KNN
candidates pruning. Based on this new idea, we devise a
pipeline framework over a two-level index for searching
KNN in the sequence database. By coupling this
framework together with several efficient filtering
strategies, i.e. the frequency queue and the well-known
Combined Algorithm (CA), our proposal brings various
enticing advantages over existing works, including (1)
huge reduction on false positive candidates to avoid
large overheads on candidate verifications; (2)
progressive result update and early termination; and
(3) good extensibility to parallel computation. We
conduct extensive experiments on three real datasets to
verify the superiority of the proposed framework.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2013:MSE,
author = "Weiren Yu and Xuemin Lin and Wenjie Zhang and Lijun
Chang and Jian Pei",
title = "More is simpler: effectively and efficiently assessing
node-pair similarities based on hyperlinks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "13--24",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Similarity assessment is one of the core tasks in
hyperlink analysis. Recently, with the proliferation of
applications, e.g., web search and collaborative
filtering, SimRank has been a well-studied measure of
similarity between two nodes in a graph. It recursively
follows the philosophy that ``two nodes are similar if
they are referenced (have incoming edges) from similar
nodes'', which can be viewed as an aggregation of
similarities based on incoming paths. Despite its
popularity, SimRank has an undesirable property, i.e.,
``zero-similarity'': It only accommodates paths with
equal length from a common ``center'' node. Thus, a
large portion of other paths are fully ignored. This
paper attempts to remedy this issue. (1) We propose and
rigorously justify SimRank*, a revised version of
SimRank, which resolves such counter-intuitive
``zero-similarity'' issues while inheriting merits of
the basic SimRank philosophy. (2) We show that the
series form of SimRank* can be reduced to a fairly
succinct and elegant closed form, which looks even
simpler than SimRank, yet enriches semantics without
suffering from increased computational cost. This leads
to a fixed-point iterative paradigm of SimRank* in $ O
(K n m) $ time on a graph of $n$ nodes and $m$ edges
for $K$ iterations, which is comparable to SimRank. (3)
To further optimize SimRank* computation, we leverage a
novel clustering strategy via edge concentration. Due
to its NP-hardness, we devise an efficient and
effective heuristic to speed up SimRank* computation to
$ O(K n \tilde{m}) $ time, where $ \tilde{m} $ is generally much
smaller than $m$. (4) Using real and synthetic data, we
empirically verify the rich semantics of SimRank*, and
demonstrate its high computation efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gyssens:2013:ATS,
author = "Marc Gyssens and Jan Paredaens and Dirk {Van Gucht}
and Jef Wijsen and Yuqing Wu",
title = "An approach towards the study of symmetric queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "25--36",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many data-intensive applications have to query a
database that involves sequences of sets of objects. It
is not uncommon that the order of the sets in such a
sequence does not affect the result of the query. Such
queries are called symmetric. In this paper, the
authors wish to initiate research on symmetric queries.
Thereto, a data model is proposed in which a binary
relation between objects and set names encodes set
membership. On this data model, two query languages are
introduced, QuineCALC and SyCALC. They are correlated
in a manner that is made precise with the symmetric
Boolean functions of Quine, respectively symmetric
relational functions, on sequences of sets of given
length. The latter do not only involve the Boolean
operations union, intersection, and complement, but
also projection and Cartesian product. Quine's
characterization of symmetric Boolean functions in
terms of incidence information is generalized to
QuineCALC queries. In the process, an incidence-based
normal form for QuineCALC queries is proposed. Inspired
by these desirable incidence-related properties of
QuineCALC queries, counting-only queries are introduced
as SyCALC queries for which the result only depends on
incidence information. Counting-only queries are then
characterized as quantified Boolean combinations of
QuineCALC queries, and a normal form is proposed for
them as well. Finally, it is shown that, while it is
undecidable whether a SyCALC query is counting-only, it
is decidable whether a counting-only query is a
QuineCALC query.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2013:CST,
author = "Sudipto Das and Vivek R. Narasayya and Feng Li and
Manoj Syamala",
title = "{CPU} sharing techniques for performance isolation in
multi-tenant relational database-as-a-service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "37--48",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-tenancy and resource sharing are essential to
make a Database-as-a-Service (DaaS) cost-effective.
However, one major consequence of resource sharing is
that the performance of one tenant's workload can be
significantly affected by the resource demands of
co-located tenants. The lack of performance isolation
in a shared environment can make DaaS less attractive
to performance-sensitive tenants. Our approach to
performance isolation in a DaaS is to isolate the key
resources needed by the tenants' workload. In this
paper, we focus on the problem of effectively sharing
and isolating CPU among co-located tenants in a
multi-tenant DaaS. We show that traditional CPU sharing
abstractions and algorithms are inadequate to support
several key new requirements that arise in DaaS: (a)
absolute and fine-grained CPU reservations without
static allocation; (b) support elasticity by
dynamically adapting to bursty resource demands; and
(c) enable the DaaS provider to suitably tradeoff
revenue with fairness. We implemented these new
scheduling algorithms in a commercial DaaS prototype
and extensive experiments demonstrate the effectiveness
of our techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2013:ATK,
author = "Qian Chen and Haibo Hu and Jianliang Xu",
title = "Authenticating top-$k$ queries in location-based
services with confidentiality",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "49--60",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "State-of-the-art location-based services (LBSs)
involve data owners, requesting clients, and service
providers. As LBSs become new business opportunities,
there is an increasing necessity to verify the
genuineness of service results. Unfortunately, while
traditional query authentication techniques can address
this issue, they fail to protect the confidentiality of
data, which is sensitive location information when LBSs
are concerned. Recent work has studied how to preserve
such location privacy in query authentication. However,
the prior work is limited to range queries, where
private values only appear on one side of the range
comparison. In this paper, we address the more
challenging authentication problem on top-$k$ queries,
where private values appear on both sides of a
comparison. To start with, we propose two novel
cryptographic building blocks, followed by a
comprehensive design of authentication schemes for
top-$k$ queries based on R-tree and Power Diagram
indexes. Optimizations, security analysis, and
experimental results consistently show the
effectiveness and robustness of the proposed schemes
under various system settings and query workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qi:2013:TDO,
author = "Zichao Qi and Yanghua Xiao and Bin Shao and Haixun
Wang",
title = "Toward a distance oracle for billion-node graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "61--72",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The emergence of real life graphs with billions of
nodes poses significant challenges for managing and
querying these graphs. One of the fundamental queries
submitted to graphs is the shortest distance query.
Online BFS (breadth-first search) and offline
pre-computing pairwise shortest distances are
prohibitive in time or space complexity for
billion-node graphs. In this paper, we study the
feasibility of building distance oracles for
billion-node graphs. A distance oracle provides
approximate answers to shortest distance queries by
using a pre-computed data structure for the graph.
Sketch-based distance oracles are good candidates
because they assign each vertex a sketch of bounded
size, which means they have linear space complexity.
However, state-of-the-art sketch-based distance oracles
lack efficiency or accuracy when dealing with big
graphs. In this paper, we address the scalability and
accuracy issues by focusing on optimizing the three key
factors that affect the performance of distance
oracles: landmark selection, distributed BFS, and
answer generation. We conduct extensive experiments on
both real networks and synthetic networks to show that
we can build distance oracles of affordable cost and
efficiently answer shortest distance queries even for
billion-node graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaul:2013:FSP,
author = "Manohar Kaul and Raymond Chi-Wing Wong and Bin Yang
and Christian S. Jensen",
title = "Finding shortest paths on terrains by killing two
birds with one stone",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "73--84",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing availability of terrain data,
e.g., from aerial laser scans, the management of such
data is attracting increasing attention in both
industry and academia. In particular, spatial queries,
e.g., $k$-nearest neighbor and reverse nearest neighbor
queries, in Euclidean and spatial network spaces are
being extended to terrains. Such queries all rely on an
important operation, that of finding shortest surface
distances. However, shortest surface distance
computation is very time consuming. We propose
techniques that enable efficient computation of lower
and upper bounds of the shortest surface distance,
which enable faster query processing by eliminating
expensive distance computations. Empirical studies show
that our bounds are much tighter than the best-known
bounds in many cases and that they enable speedups of
up to 43 times for some well-known spatial queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Balkesen:2013:MCM,
author = "Cagri Balkesen and Gustavo Alonso and Jens Teubner and
M. Tamer {\"O}zsu",
title = "Multi-core, main-memory joins: sort vs. hash
revisited",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "1",
pages = "85--96",
month = sep,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:56 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we experimentally study the performance
of main-memory, parallel, multi-core join algorithms,
focusing on sort-merge and (radix-)hash join. The
relative performance of these two join approaches have
been a topic of discussion for a long time. With the
advent of modern multi-core architectures, it has been
argued that sort-merge join is now a better choice than
radix-hash join. This claim is justified based on the
width of SIMD instructions (sort-merge outperforms
radix-hash join once SIMD is sufficiently wide), and
NUMA awareness (sort-merge is superior to hash join in
NUMA architectures). We conduct extensive experiments
on the original and optimized versions of these
algorithms. The experiments show that, contrary to
these claims, radix-hash join is still clearly
superior, and sort-merge approaches to performance of
radix only when very large amounts of data are
involved. The paper also provides the fastest
implementations of these algorithms, and covers many
aspects of modern hardware architectures relevant not
only for joins but for any parallel data processing
operator.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schuhknecht:2013:UPD,
author = "Felix Martin Schuhknecht and Alekh Jindal and Jens
Dittrich",
title = "The uncracked pieces in database cracking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "2",
pages = "97--108",
month = oct,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:58 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database cracking has been an area of active research
in recent years. The core idea of database cracking is
to create indexes adaptively and incrementally as a
side-product of query processing. Several works have
proposed different cracking techniques for different
aspects including updates, tuple-reconstruction,
convergence, concurrency-control, and robustness.
However, there is a lack of any comparative study of
these different methods by an independent group. In
this paper, we conduct an experimental study on
database cracking. Our goal is to critically review
several aspects, identify the potential, and propose
promising directions in database cracking. With this
study, we hope to expand the scope of database cracking
and possibly leverage cracking in database engines
other than MonetDB. We repeat several prior database
cracking works including the core cracking algorithms
as well as three other works on convergence (hybrid
cracking), tuple-reconstruction (sideways cracking),
and robustness (stochastic cracking) respectively. We
evaluate these works and show possible directions to do
even better. We further test cracking under a variety
of experimental settings, including high selectivity
queries, low selectivity queries, and multiple query
access patterns. Finally, we compare cracking against
different sorting algorithms as well as against
different main-memory optimised indexes, including the
recently proposed Adaptive Radix Tree (ART). Our
results show that: (i) the previously proposed cracking
algorithms are repeatable, (ii) there is still enough
room to significantly improve the previously proposed
cracking algorithms, (iii) cracking depends heavily on
query selectivity, (iv) cracking needs to catch up with
modern indexing trends, and (v) different indexing
algorithms have different indexing signatures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eravci:2013:DBR,
author = "Bahaeddin Eravci and Hakan Ferhatosmanoglu",
title = "Diversity based relevance feedback for time series
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "2",
pages = "109--120",
month = oct,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:58 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a diversity based relevance feedback
approach for time series data to improve the accuracy
of search results. We first develop the concept of
relevance feedback for time series based on dual-tree
complex wavelet (CWT) and SAX based approaches. We aim
to enhance the search quality by incorporating
diversity in the results presented to the user for
feedback. We then propose a method which utilizes the
representation type as part of the feedback, as opposed
to a human choosing based on a preprocessing or
training phase. The proposed methods utilize a
weighting to handle the relevance feedback of important
properties for both single and multiple representation
cases. Our experiments on a large variety of time
series data sets show that the proposed diversity based
relevance feedback improves the retrieval performance.
Results confirm that representation feedback
incorporates item diversity implicitly and achieves
good performance even when using simple nearest
neighbor as the retrieval method. To the best of our
knowledge, this is the first study on diversification
of time series search to improve retrieval accuracy and
representation feedback.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pelley:2013:SMN,
author = "Steven Pelley and Thomas F. Wenisch and Brian T. Gold
and Bill Bridge",
title = "Storage management in the {NVRAM} era",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "2",
pages = "121--132",
month = oct,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732228.2732231",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:21:58 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Emerging nonvolatile memory technologies (NVRAM) offer
an alternative to disk that is persistent, provides
read latency similar to DRAM, and is byte-addressable.
Such NVRAMs could revolutionize online transaction
processing (OLTP), which today must employ
sophisticated optimizations with substantial software
overheads to overcome the long latency and poor random
access performance of disk. Nevertheless, many
candidate NVRAM technologies exhibit their own
limitations, such as greater-than-DRAM latency,
particularly for writes. In this paper, we reconsider
OLTP durability management to optimize recovery
performance and forward-processing throughput for
emerging NVRAMs. First, we demonstrate that using NVRAM
as a drop-in replacement for disk allows
near-instantaneous recovery, but software complexity
necessary for disk (i.e., Write Ahead Logging/ARIES)
limits transaction throughput. Next, we consider the
possibility of removing software-managed DRAM
buffering. Finally, we measure the cost of ordering
writes to NVRAM, which is vital for correct recovery.
We consider three recovery mechanisms: NVRAM
Disk-Replacement, In-Place Updates (transactions
persist data in-place), and NVRAM Group Commit
(transactions commit/persist atomically in batches).
Whereas In-Place Updates offers the simplest design, it
introduces persist synchronizations at every page
update. NVRAM Group Commit minimizes persist
synchronization, offering up to a 50\% throughput
improvement for large synchronous persist latencies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Salloum:2013:OOO,
author = "Mariam Salloum and Xin Luna Dong and Divesh Srivastava
and Vassilis J. Tsotras",
title = "Online ordering of overlapping data sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "133--144",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732233",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data integration systems offer a uniform interface for
querying a large number of autonomous and heterogeneous
data sources. Ideally, answers are returned as sources
are queried and the answer list is updated as more
answers arrive. Choosing a good ordering in which the
sources are queried is critical for increasing the rate
at which answers are returned. However, this problem is
challenging since we often do not have complete or
precise statistics of the sources, such as their
coverage and overlap. It is further exacerbated in the
Big Data era, which is witnessing two trends in
Deep-Web data: first, obtaining a full coverage of data
in a particular domain often requires extracting data
from thousands of sources; second, there is often a big
variation in overlap between different data sources. In
this paper we present OASIS, an {Online} query
{Answering} {System} for {overlappIng} {Sources}. OASIS
has three key components for source ordering. First,
the Overlap Estimation component estimates overlaps
between sources according to available statistics under
the Maximum Entropy principle. Second, the Source
Ordering component orders the sources according to the
new contribution they are expected to provide, and
adjusts the ordering based on statistics collected
during query answering. Third, the Statistics
Enrichment component selects critical missing
statistics to enrich at runtime. Experimental results
on both real and synthetic data show high efficiency
and scalability of our algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2013:MQO,
author = "Guoping Wang and Chee-Yong Chan",
title = "Multi-query optimization in {MapReduce} framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "145--156",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732234",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce has recently emerged as a new paradigm for
large-scale data analysis due to its high scalability,
fine-grained fault tolerance and easy programming
model. Since different jobs often share similar work
(e.g., several jobs scan the same input file or produce
the same map output), there are many opportunities to
optimize the performance for a batch of jobs. In this
paper, we propose two new techniques for multi-job
optimization in the MapReduce framework. The first is a
generalized grouping technique (which generalizes the
recently proposed MRShare technique) that merges
multiple jobs into a single job thereby enabling the
merged jobs to share both the scan of the input file as
well as the communication of the common map output. The
second is a materialization technique that enables
multiple jobs to share both the scan of the input file
as well as the communication of the common map output
via partial materialization of the map output of some
jobs (in the map and/or reduce phase). Our second
contribution is the proposal of a new optimization
algorithm that given an input batch of jobs, produces
an optimal plan by a judicious partitioning of the jobs
into groups and an optimal assignment of the processing
technique to each group. Our experimental results on
Hadoop demonstrate that our new approach significantly
outperforms the state-of-the-art technique, MRShare, by
up to 107\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2013:AAD,
author = "Zhenhui Li and Bolin Ding and Fei Wu and Tobias Kin
Hou Lei and Roland Kays and Margaret C. Crofoot",
title = "Attraction and avoidance detection from movements",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "157--168",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732235",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the development of positioning technology,
movement data has become widely available nowadays. An
important task in movement data analysis is to mine the
relationships among moving objects based on their
spatiotemporal interactions. Among all relationship
types, attraction and avoidance are arguably the most
natural ones. However, rather surprisingly, there is no
existing method that addresses the problem of mining
significant attraction and avoidance relationships in a
well-defined and unified framework. In this paper, we
propose a novel method to measure the significance
value of relationship between any two objects by
examining the background model of their movements via
permutation test. Since permutation test is
computationally expensive, two effective pruning
strategies are developed to reduce the computation
time. Furthermore, we show how the proposed method can
be extended to efficiently answer the classic threshold
query: given an object, retrieve all the objects in the
database that have relationships, whose significance
values are above certain threshold, with the query
object. Empirical studies on both synthetic data and
real movement data demonstrate the effectiveness and
efficiency of our method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2013:PBA,
author = "Xiang Zhao and Chuan Xiao and Xuemin Lin and Qing Liu
and Wenjie Zhang",
title = "A partition-based approach to structure similarity
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "169--180",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732236",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs are widely used to model complex data in many
applications, such as bioinformatics, chemistry, social
networks, pattern recognition, etc. A fundamental and
critical query primitive is to efficiently search
similar structures in a large collection of graphs.
This paper studies the graph similarity queries with
edit distance constraints. Existing solutions to the
problem utilize fixed-size overlapping substructures to
generate candidates, and thus become susceptible to
large vertex degrees or large distance thresholds. In
this paper, we present a partition-based approach to
tackle the problem. By dividing data graphs into
variable-size non-overlapping partitions, the edit
distance constraint is converted to a graph containment
constraint for candidate generation. We develop
efficient query processing algorithms based on the new
paradigm. A candidate pruning technique and an improved
graph edit distance algorithm are also developed to
further boost the performance. In addition, a
cost-aware graph partitioning technique is devised to
optimize the index. Extensive experiments demonstrate
our approach significantly outperforms existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bailis:2013:HAT,
author = "Peter Bailis and Aaron Davidson and Alan Fekete and
Ali Ghodsi and Joseph M. Hellerstein and Ion Stoica",
title = "Highly available transactions: virtues and
limitations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "181--192",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732237",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To minimize network latency and remain online during
server failures and network partitions, many modern
distributed data storage systems eschew transactional
functionality, which provides strong semantic
guarantees for groups of multiple operations over
multiple data items. In this work, we consider the
problem of providing Highly Available Transactions
(HATs): transactional guarantees that do not suffer
unavailability during system partitions or incur high
network latency. We introduce a taxonomy of highly
available systems and analyze existing ACID isolation
and distributed data consistency guarantees to identify
which can and cannot be achieved in HAT systems. This
unifies the literature on weak transactional isolation,
replica consistency, and highly available systems. We
analytically and experimentally quantify the
availability and performance benefits of HATs --- often
two to three orders of magnitude over wide-area
networks --- and discuss their necessary semantic
compromises.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tian:2013:TLV,
author = "Yuanyuan Tian and Andrey Balmin and Severin Andreas
Corsten and Shirish Tatikonda and John McPherson",
title = "From {``think like a vertex''} to {``think like a
graph''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "193--204",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732238",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To meet the challenge of processing rapidly growing
graph and network data created by modern applications,
a number of distributed graph processing systems have
emerged, such as Pregel and GraphLab. All these systems
divide input graphs into partitions, and employ a
``think like a vertex'' programming model to support
iterative graph computation. This vertex-centric model
is easy to program and has been proved useful for many
graph algorithms. However, this model hides the
partitioning information from the users, thus prevents
many algorithm-specific optimizations. This often
results in longer execution time due to excessive
network messages (e.g. in Pregel) or heavy scheduling
overhead to ensure data consistency (e.g. in GraphLab).
To address this limitation, we propose a new ``think
like a graph'' programming paradigm. Under this
graph-centric model, the partition structure is opened
up to the users, and can be utilized so that
communication within a partition can bypass the heavy
message passing or scheduling machinery. We implemented
this model in a new system, called Giraph++, based on
Apache Giraph, an open source implementation of Pregel.
We explore the applicability of the graph-centric model
to three categories of graph algorithms, and
demonstrate its flexibility and superior performance,
especially on well-partitioned data. For example, on a
web graph with 118 million vertices and 855 million
edges, the graph-centric version of connected component
detection algorithm runs 63X faster and uses 204X fewer
network messages than its vertex-centric counterpart.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Niedermayer:2013:PNN,
author = "Johannes Niedermayer and Andreas Z{\"u}fle and Tobias
Emrich and Matthias Renz and Nikos Mamoulis and Lei
Chen and Hans-Peter Kriegel",
title = "Probabilistic nearest neighbor queries on uncertain
moving object trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "3",
pages = "205--216",
month = nov,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732232.2732239",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:00 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nearest neighbor (NN) queries in trajectory databases
have received significant attention in the past, due to
their applications in spatio-temporal data analysis.
More recent work has considered the realistic case
where the trajectories are uncertain; however, only
simple uncertainty models have been proposed, which do
not allow for accurate probabilistic search. In this
paper, we fill this gap by addressing probabilistic
nearest neighbor queries in databases with uncertain
trajectories modeled by stochastic processes,
specifically the Markov chain model. We study three
nearest neighbor query semantics that take as input a
query state or trajectory $q$ and a time interval, and
theoretically evaluate their runtime complexity.
Furthermore we propose a sampling approach which uses
Bayesian inference to guarantee that sampled
trajectories conform to the observation data stored in
the database. This sampling approach can be used in
Monte-Carlo based approximation solutions. We include
an extensive experimental study to support our
theoretical results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karanasos:2013:DSD,
author = "Konstantinos Karanasos and Asterios Katsifodimos and
Ioana Manolescu",
title = "{Delta}: scalable data dissemination under capacity
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "217--228",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732241",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In content-based publish-subscribe (pub/sub) systems,
users express their interests as queries over a stream
of publications. Scaling up content-based pub/sub to
very large numbers of subscriptions is challenging:
users are interested in low latency, that is, getting
subscription results fast, while the pub/sub system
provider is mostly interested in scaling, i.e., being
able to serve large numbers of subscribers, with low
computational resources utilization. We present a novel
approach for scalable content-based pub/sub in the
presence of constraints on the available CPU and
network resources, implemented within our pub/sub
system Delta. We achieve scalability by off-loading
some subscriptions from the pub/sub server, and
leveraging view-based query rewriting to feed these
subscriptions from the data accumulated in others. Our
main contribution is a novel algorithm for organizing
views in a multi-level dissemination network,
exploiting view-based rewriting and powerful linear
programming capabilities to scale to many views,
respect capacity constraints, and minimize latency. The
efficiency and effectiveness of our algorithm are
confirmed through extensive experiments and a large
deployment in a WAN.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Budak:2013:GOD,
author = "Ceren Budak and Theodore Georgiou and Divyakant
Agrawal and Amr {El Abbadi}",
title = "{GeoScope}: online detection of geo-correlated
information trends in social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "229--240",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732242",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The First Law of Geography states ``Everything is
related to everything else, but near things are more
related than distant things''. This spatial
significance has implications in various applications,
trend detection being one of them. In this paper we
propose a new algorithmic tool, GeoScope, to detect
geo-trends. GeoScope is a data streams solution that
detects correlations between topics and locations in a
sliding window, in addition to analyzing topics and
locations independently. GeoScope offers theoretical
guarantees for detecting all trending correlated pairs
while requiring only sub-linear space and running time.
We perform various human validation tasks to
demonstrate the value of GeoScope. The results show
that human judges prefer GeoScope to the best
performing baseline solution 4:1 in terms of the
geographical significance of the presented information.
As the Twitter analysis demonstrates, GeoScope
successfully filters out topics without geo-intent and
detects various local interests such as emergency
events, political demonstrations or cultural events.
Experiments on Twitter show that GeoScope has perfect
recall and near-perfect precision.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Onizuka:2013:OIQ,
author = "Makoto Onizuka and Hiroyuki Kato and Soichiro Hidaka
and Keisuke Nakano and Zhenjiang Hu",
title = "Optimization for iterative queries on {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "241--252",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732243",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose OptIQ, a query optimization approach for
iterative queries in distributed environment. OptIQ
removes redundant computations among different
iterations by extending the traditional techniques of
view materialization and incremental view evaluation.
First, OptIQ decomposes iterative queries into
invariant and variant views, and materializes the
former view. Redundant computations are removed by
reusing the materialized view among iterations. Second,
OptIQ incrementally evaluates the variant view, so that
redundant computations are removed by skipping the
evaluation on converged tuples in the variant view. We
verify the effectiveness of OptIQ through the queries
of PageRank and $k$-means clustering on real datasets.
The results show that OptIQ achieves high efficiency,
up to five times faster than is possible without
removing the redundant computations among iterations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shuai:2013:WOS,
author = "Hong-Han Shuai and De-Nian Yang and Philip S. Yu and
Ming-Syan Chen",
title = "Willingness optimization for social group activity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "253--264",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732244",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Studies show that a person is willing to join a social
group activity if the activity is interesting, and if
some close friends also join the activity as
companions. The literature has demonstrated that the
interests of a person and the social tightness among
friends can be effectively derived and mined from
social networking websites. However, even with the
above two kinds of information widely available, social
group activities still need to be coordinated manually,
and the process is tedious and time-consuming for
users, especially for a large social group activity,
due to complications of social connectivity and the
diversity of possible interests among friends. To
address the above important need, this paper proposes
to automatically select and recommend potential
attendees of a social group activity, which could be
very useful for social networking websites as a
value-added service. We first formulate a new problem,
named Willingness mAximization for Social grOup (WASO).
This paper points out that the solution obtained by a
greedy algorithm is likely to be trapped in a local
optimal solution. Thus, we design a new randomized
algorithm to effectively and efficiently solve the
problem. Given the available computational budgets, the
proposed algorithm is able to optimally allocate the
resources and find a solution with an approximation
ratio. We implement the proposed algorithm in Facebook,
and the user study demonstrates that social groups
obtained by the proposed algorithm significantly
outperform the solutions manually configured by
users.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2013:HPS,
author = "Lei Cao and Elke A. Rundensteiner",
title = "High performance stream query processing with
correlation-aware partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "265--276",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732245",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "State-of-the-art optimizers produce one single optimal
query plan for all stream data, in spite of such a
singleton plan typically being sub-optimal or even poor
for highly correlated data. Recently a new stream
processing paradigm, called multi-route approach, has
emerged as a promising approach for tackling this
problem. Multi-route first divides data streams into
several partitions and then creates a separate query
plan for each combination of partitions. Unfortunately
current approaches suffer from severe shortcomings, in
particular, the lack of an effective partitioning
strategy and the prohibitive query optimization
expense. In this work we propose the first practical
multi-route optimizer named correlation-aware
multi-route stream query optimizer (or CMR) that solves
both problems. By exploiting both intra- and
inter-stream correlations of streams, CMR produces
effective partitions without having to undertake
repeated expensive query plan generation. The produced
partitions not only are best served by distinct optimal
query plans, but also leverage the partition-driven
pruning opportunity. Experimental results with both
synthetic and real life stream data confirm that CMR
outperforms the state-of-the-art solutions up to an
order of magnitude in both the query optimization time
and the run-time execution performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Difallah:2013:OBE,
author = "Djellel Eddine Difallah and Andrew Pavlo and Carlo
Curino and Philippe Cudre-Mauroux",
title = "{OLTP-Bench}: an extensible testbed for benchmarking
relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "277--288",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732246",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Benchmarking is an essential aspect of any database
management system (DBMS) effort. Despite several recent
advancements, such as pre-configured cloud database
images and database-as-a-service (DBaaS) offerings, the
deployment of a comprehensive testing platform with a
diverse set of datasets and workloads is still far from
being trivial. In many cases, researchers and
developers are limited to a small number of workloads
to evaluate the performance characteristics of their
work. This is due to the lack of a universal
benchmarking infrastructure, and to the difficulty of
gaining access to real data and workloads. This results
in lots of unnecessary engineering efforts and makes
the performance evaluation results difficult to
compare. To remedy these problems, we present
OLTP-Bench, an extensible ``batteries included'' DBMS
benchmarking testbed. The key contributions of
OLTP-Bench are its ease of use and extensibility,
support for tight control of transaction mixtures,
request rates, and access distributions over time, as
well as the ability to support all major DBMSs and
DBaaS platforms. Moreover, it is bundled with fifteen
workloads that all differ in complexity and system
demands, including four synthetic workloads, eight
workloads from popular benchmarks, and three workloads
that are derived from real-world applications. We
demonstrate through a comprehensive set of experiments
conducted on popular DBMS and DBaaS offerings the
different features provided by OLTP-Bench and the
effectiveness of our testbed in characterizing the
performance of database services.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nandi:2013:GQS,
author = "Arnab Nandi and Lilong Jiang and Michael Mandel",
title = "Gestural query specification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "289--300",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732247",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Direct, ad-hoc interaction with databases has
typically been performed over console-oriented
conversational interfaces using query languages such as
SQL. With the rise in popularity of gestural user
interfaces and computing devices that use gestures as
their exclusive modes of interaction, database query
interfaces require a fundamental rethinking to work
without keyboards. We present a novel query
specification system that allows the user to query
databases using a series of gestures. We present a
novel gesture recognition system that uses both the
interaction and the state of the database to classify
gestural input into relational database queries. We
conduct exhaustive systems performance tests and user
studies to demonstrate that our system is not only
performant and capable of interactive latencies, but it
is also more usable, faster to use and more intuitive
than existing systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Heise:2013:SDU,
author = "Arvid Heise and Jorge-Arnulfo Quian{\'e}-Ruiz and
Ziawasch Abedjan and Anja Jentzsch and Felix Naumann",
title = "Scalable discovery of unique column combinations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "301--312",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.14778/2732240.2732248",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The discovery of all unique (and non-unique) column
combinations in a given dataset is at the core of any
data profiling effort. The results are useful for a
large number of areas of data management, such as
anomaly detection, data integration, data modeling,
duplicate detection, indexing, and query optimization.
However, discovering all unique and non-unique column
combinations is an NP-hard problem, which in principle
requires to verify an exponential number of column
combinations for uniqueness on all data values. Thus,
achieving efficiency and scalability in this context is
a tremendous challenge by itself. In this paper, we
devise Ducc, a scalable and efficient approach to the
problem of finding all unique and non-unique column
combinations in big datasets. We first model the
problem as a graph coloring problem and analyze the
pruning effect of individual combinations. We then
present our hybrid column-based pruning technique,
which traverses the lattice in a depth-first and random
walk combination. This strategy allows Ducc to
typically depend on the solution set size and hence to
prune large swaths of the lattice. Ducc also
incorporates row-based pruning to run uniqueness checks
in just few milliseconds. To achieve even higher
scalability, Ducc runs on several CPU cores (scale-up)
and compute nodes (scale-out) with a very low overhead.
We exhaustively evaluate Ducc using three datasets (two
real and one synthetic) with several millions rows and
hundreds of attributes. We compare Ducc with related
work: Gordian and HCA. The results show that Ducc is up
to more than 2 orders of magnitude faster than Gordian
and HCA (631x faster than Gordian and 398x faster than
HCA). Finally, a series of scalability experiments
shows the efficiency of Ducc to scale up and out.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tang:2013:EMD,
author = "Yu Tang and Leong Hou U and Yilun Cai and Nikos
Mamoulis and Reynold Cheng",
title = "{Earth Mover's Distance} based similarity search at
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "313--324",
month = dec,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Earth Mover's Distance (EMD), as a similarity measure,
has received a lot of attention in the fields of
multimedia and probabilistic databases, computer
vision, image retrieval, machine learning, etc. EMD on
multidimensional histograms provides better
distinguishability between the objects approximated by
the histograms (e.g., images), compared to classic
measures like Euclidean distance. Despite its
usefulness, EMD has a high computational cost;
therefore, a number of effective filtering methods have
been proposed, to reduce the pairs of histograms for
which the exact EMD has to be computed, during
similarity search. Still, EMD calculations in the
refinement step remain the bottleneck of the whole
similarity search process. In this paper, we focus on
optimizing the refinement phase of EMD-based similarity
search by (i) adapting an efficient min-cost flow
algorithm (SIA) for EMD computation, (ii) proposing a
dynamic distance bound, which can be used to terminate
an EMD refinement early, and (iii) proposing a dynamic
refinement order for the candidates which, paired with
a concurrent EMD refinement strategy, reduces the
amount of needless computations. Our proposed
techniques are orthogonal to and can be easily
integrated with the state-of-the-art filtering
techniques, reducing the cost of EMD-based similarity
queries by orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Parameswaran:2013:SVD,
author = "Aditya Parameswaran and Neoklis Polyzotis and Hector
Garcia-Molina",
title = "{SeeDB}: visualizing database queries efficiently",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "4",
pages = "325--328",
month = dec,
year = "2013",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:02 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data scientists rely on visualizations to interpret
the data returned by queries, but finding the right
visualization remains a manual task that is often
laborious. We propose a DBMS that partially automates
the task of finding the right visualizations for a
query. In a nutshell, given an input query Q, the new
DBMS optimizer will explore not only the space of
physical plans for Q, but also the space of possible
visualizations for the results of Q. The output will
comprise a recommendation of potentially
``interesting'' or ``useful'' visualizations, where
each visualization is coupled with a suitable query
execution plan. We discuss the technical challenges in
building this system and outline an agenda for future
research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mahmoud:2014:MES,
author = "Hatem A. Mahmoud and Vaibhav Arora and Faisal Nawab
and Divyakant Agrawal and Amr {El Abbadi}",
title = "{MaaT}: effective and scalable coordination of
distributed transactions in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "329--340",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The past decade has witnessed an increasing adoption
of cloud database technology, which provides better
scalability, availability, and fault-tolerance via
transparent partitioning and replication, and automatic
load balancing and fail-over. However, only a small
number of cloud databases provide strong consistency
guarantees for distributed transactions, despite
decades of research on distributed transaction
processing, due to practical challenges that arise in
the cloud setting, where failures are the norm, and
human administration is minimal. For example, dealing
with locks left by transactions initiated by failed
machines, and determining a multi-programming level
that avoids thrashing without under-utilizing available
resources, are some of the challenges that arise when
using lock-based transaction processing mechanisms in
the cloud context. Even in the case of optimistic
concurrency control, most proposals in the literature
deal with distributed validation but still require the
database to acquire locks during two-phase commit when
installing updates of a single transaction on multiple
machines. Very little theoretical work has been done to
entirely eliminate the need for locking in distributed
transactions, including locks acquired during two-phase
commit. In this paper, we re-design optimistic
concurrency control to eliminate any need for locking
even for atomic commitment, while handling the
practical issues in earlier theoretical work related to
this problem. We conduct an extensive experimental
study to evaluate our approach against lock-based
methods under various setups and workloads, and
demonstrate that our approach provides many practical
advantages in the cloud context.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:DWA,
author = "Chao Li and Michael Hay and Gerome Miklau and Yue
Wang",
title = "A data- and workload-aware algorithm for range queries
under differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "341--352",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We describe a new algorithm for answering a given set
of range queries under $ \epsilon $-differential
privacy which often achieves substantially lower error
than competing methods. Our algorithm satisfies
differential privacy by adding noise that is adapted to
the input data and to the given query set. We first
privately learn a partitioning of the domain into
buckets that suit the input data well. Then we
privately estimate counts for each bucket, doing so in
a manner well-suited for the given query set. Since the
performance of the algorithm depends on the input
database, we evaluate it on a wide range of real
datasets, showing that we can achieve the benefits of
data-dependence on both ``easy'' and ``hard''
databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Greco:2014:CQA,
author = "Sergio Greco and Fabian Pijcke and Jef Wijsen",
title = "Certain query answering in partially consistent
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "353--364",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A database is called uncertain if two or more tuples
of the same relation are allowed to agree on their
primary key. Intuitively, such tuples act as
alternatives for each other. A repair (or possible
world) of such uncertain database is obtained by
selecting a maximal number of tuples without ever
selecting two tuples of the same relation that agree on
their primary key. For a Boolean query $q$, the problem
$ {\rm CERTAINTY}(q)$ takes as input an uncertain
database db and asks whether $q$ evaluates to true on
every repair of db. In recent years, the complexity of
$ {\rm CERTAINTY}(q)$ has been studied under different
restrictions on $q$. These complexity studies have
assumed no restrictions on the uncertain databases that
are input to $ {\rm CERTAINTY}(q)$. In practice,
however, it may be known that these input databases are
partially consistent, in the sense that they satisfy
some dependencies (e.g., functional dependencies). In
this article, we introduce the problem $ {\rm
CERTAINTY}(q)$ in the presence of a set $ \Sigma $ of
dependencies. The problem $ {\rm CERTAINTY}(q, \Sigma)$
takes as input an uncertain database db that satisfies
$ \Sigma $, and asks whether every repair of db
satisfies $q$. We focus on the complexity of $ {\rm
CERTAINTY}(q, \Sigma)$ when $q$ is an acyclic
conjunctive query without self-join, and $ \Sigma $ is
a set of functional dependencies and join dependencies,
the latter of a particular form. We provide an
algorithm that, given $q$ and $ \Sigma $, decides
whether $ {\rm CERTAINTY}(q, \Sigma)$ is first-order
expressible. Moreover, we show how to effectively
construct a first-order definition of $ {\rm
CERTAINTY}(q, \Sigma)$ if it exists.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mottin:2014:EQG,
author = "Davide Mottin and Matteo Lissandrini and Yannis
Velegrakis and Themis Palpanas",
title = "Exemplar queries: give me an example of what you
need",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "365--376",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Search engines are continuously employing advanced
techniques that aim to capture user intentions and
provide results that go beyond the data that simply
satisfy the query conditions. Examples include the
personalized results, related searches, similarity
search, popular and relaxed queries. In this work we
introduce a novel query paradigm that considers a user
query as an example of the data in which the user is
interested. We call these queries exemplar queries and
claim that they can play an important role in dealing
with the information deluge. We provide a formal
specification of the semantics of such queries and show
that they are fundamentally different from notions like
queries by example, approximate and related queries. We
provide an implementation of these semantics for
graph-based data and present an exact solution with a
number of optimizations that improve performance
without compromising the quality of the answers. We
also provide an approximate solution that prunes the
search space and achieves considerably better
time-performance with minimal or no impact on
effectiveness. We experimentally evaluate the
effectiveness and efficiency of these solutions with
synthetic and real datasets, and illustrate the
usefulness of exemplar queries in practice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Korula:2014:ERA,
author = "Nitish Korula and Silvio Lattanzi",
title = "An efficient reconciliation algorithm for social
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "377--388",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "People today typically use multiple online social
networks (Facebook, Twitter, Google+, LinkedIn, etc.).
Each online network represents a subset of their
``real'' ego-networks. An interesting and challenging
problem is to reconcile these online networks, that is,
to identify all the accounts belonging to the same
individual. Besides providing a richer understanding of
social dynamics, the problem has a number of practical
applications. At first sight, this problem appears
algorithmically challenging. Fortunately, a small
fraction of individuals explicitly link their accounts
across multiple networks; our work leverages these
connections to identify a very large fraction of the
network. Our main contributions are to mathematically
formalize the problem for the first time, and to design
a simple, local, and efficient parallel algorithm to
solve it. We are able to prove strong theoretical
guarantees on the algorithm's performance on
well-established network models (Random Graphs,
Preferential Attachment). We also experimentally
confirm the effectiveness of the algorithm on synthetic
and real social network data sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chester:2014:CKR,
author = "Sean Chester and Alex Thomo and S. Venkatesh and Sue
Whitesides",
title = "Computing $k$-regret minimizing sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "389--400",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Regret minimizing sets are a recent approach to
representing a dataset $D$ by a small subset $R$ of
size $r$ of representative data points. The set $R$ is
chosen such that executing any top-1 query on $R$
rather than $D$ is minimally perceptible to any user.
However, such a subset $R$ may not exist, even for
modest sizes, $r$. In this paper, we introduce the
relaxation to $k$-regret minimizing sets, whereby a
top-$1$ query on $R$ returns a result imperceptibly
close to the top-$k$ on $D$. We show that, in general,
with or without the relaxation, this problem is
NP-hard. For the specific case of two dimensions, we
give an efficient dynamic programming, plane sweep
algorithm based on geometric duality to find an optimal
solution. For arbitrary dimension, we give an
empirically effective, greedy, randomized algorithm
based on linear programming. With these algorithms, we
can find subsets $R$ of much smaller size that better
summarize $D$, using small values of $k$ larger than
$1$.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2014:RTK,
author = "Adams Wei Yu and Nikos Mamoulis and Hao Su",
title = "Reverse top-$k$ search using random walk with
restart",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "401--412",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing popularity of social networks,
large volumes of graph data are becoming available.
Large graphs are also derived by structure extraction
from relational, text, or scientific data (e.g.,
relational tuple networks, citation graphs, ontology
networks, protein-protein interaction graphs).
Node-to-node proximity is the key building block for
many graph-based applications that search or analyze
the data. Among various proximity measures, random walk
with restart (RWR) is widely adopted because of its
ability to consider the global structure of the whole
network. Although RWR-based similarity search has been
well studied before, there is no prior work on reverse
top-$k$ proximity search in graphs based on RWR. We
discuss the applicability of this query and show that
its direct evaluation using existing methods on
RWR-based similarity search has very high computational
and storage demands. To address this issue, we propose
an indexing technique, paired with an on-line reverse
top-$k$ search algorithm. Our experiments show that our
technique is efficient and has manageable storage
requirements even when applied on very large graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Viglas:2014:WLS,
author = "Stratis D. Viglas",
title = "Write-limited sorts and joins for persistent memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "413--424",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To mitigate the impact of the widening gap between the
memory needs of CPUs and what standard memory
technology can deliver, system architects have
introduced a new class of memory technology termed
persistent memory. Persistent memory is
byte-addressable, but exhibits asymmetric I/O: writes
are typically one order of magnitude more expensive
than reads. Byte addressability combined with I/O
asymmetry render the performance profile of persistent
memory unique. Thus, it becomes imperative to find new
ways to seamlessly incorporate it into database
systems. We do so in the context of query processing.
We focus on the fundamental operations of sort and join
processing. We introduce the notion of write-limited
algorithms that effectively minimize the I/O cost. We
give a high-level API that enables the system to
dynamically optimize the workflow of the algorithms;
or, alternatively, allows the developer to tune the
write profile of the algorithms. We present four
different techniques to incorporate persistent memory
into the database processing stack in light of this
API. We have implemented and extensively evaluated all
our proposals. Our results show that the algorithms
deliver on their promise of I/O-minimality and tunable
performance. We showcase the merits and deficiencies of
each implementation technique, thus taking a solid
first step towards incorporating persistent memory into
query processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Anciaux:2014:FOD,
author = "N. Anciaux and L. Bouganim and T. Delot and S. Ilarri
and L. Kloul and N. Mitton and P. Pucheral",
title = "{Folk-IS}: opportunistic data services in least
developed countries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "5",
pages = "425--428",
month = jan,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:04 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "According to a wide range of studies, IT should become
a key facilitator in establishing primary education,
reducing mortality and supporting commercial
initiatives in Least Developed Countries (LDCs). The
main barrier to the development of IT services in these
regions is not only the lack of communication
facilities, but also the lack of consistent information
systems, security procedures, economic and legal
support, as well as political commitment. In this
paper, we propose the vision of an infrastructureless
data platform well suited for the development of
innovative IT services in LDCs. We propose a
participatory approach, where each individual
implements a small subset of a complete information
system thanks to highly secure, portable and low-cost
personal devices as well as opportunistic networking,
without the need of any form of infrastructure. We
review the technical challenges that are specific to
this approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Giannikis:2014:SWO,
author = "Georgios Giannikis and Darko Makreshanski and Gustavo
Alonso and Donald Kossmann",
title = "Shared workload optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "429--440",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As a result of increases in both the query load and
the data managed, as well as changes in hardware
architecture (multicore), the last years have seen a
shift from query-at-a-time approaches towards shared
work (SW) systems where queries are executed in groups.
Such groups share operators like scans and joins,
leading to systems that process hundreds to thousands
of queries in one go. SW systems range from storage
engines that use in-memory co-operative scans to more
complex query processing engines that share joins over
analytical and star schema queries. In all cases, they
rely on either single query optimizers, predicate
sharing, or on manually generated plans. In this paper
we explore the problem of shared workload optimization
(SWO) for SW systems. The challenge in doing so is that
the optimization has to be done for the entire workload
and that results in a class of stochastic knapsack with
uncertain weights optimization, which can only be
addressed with heuristics to achieve a reasonable
runtime. In this paper we focus on hash joins and
shared scans and present a first algorithm capable of
optimizing the execution of entire workloads by
deriving a global executing plan for all the queries in
the system. We evaluate the optimizer over the TPC-W
and the TPC-H benchmarks. The results prove the
feasibility of this approach and demonstrate the
performance gains that can be obtained from SW
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elseidy:2014:SAO,
author = "Mohammed Elseidy and Abdallah Elguindy and Aleksandar
Vitorovic and Christoph Koch",
title = "Scalable and adaptive online joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "441--452",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Scalable join processing in a parallel shared-nothing
environment requires a partitioning policy that evenly
distributes the processing load while minimizing the
size of state maintained and number of messages
communicated. Previous research proposes static
partitioning schemes that require statistics
beforehand. In an online or streaming environment in
which no statistics about the workload are known,
traditional static approaches perform poorly. This
paper presents a novel parallel online dataflow join
operator that supports arbitrary join predicates. The
proposed operator continuously adjusts itself to the
data dynamics through adaptive dataflow routing and
state repartitioning. The operator is resilient to data
skew, maintains high throughput rates, avoids blocking
behavior during state repartitioning, takes an eventual
consistency approach for maintaining its local state,
and behaves strongly consistently as a black-box
dataflow operator. We prove that the operator ensures a
constant competitive ratio 3.75 in data distribution
optimality and that the cost of processing an input
tuple is amortized constant, taking into account
adaptivity costs. Our evaluation demonstrates that our
operator outperforms the state-of-the-art static
partitioning schemes in resource utilization,
throughput, and execution time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Morton:2014:SDE,
author = "Kristi Morton and Magdalena Balazinska and Dan
Grossman and Jock Mackinlay",
title = "Support the data enthusiast: challenges for
next-generation data-analysis systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "453--456",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a vision of next-generation visual
analytics services. We argue that these services should
have three related capabilities: support visual and
interactive data exploration as they do today, but also
suggest relevant data to enrich visualizations, and
facilitate the integration and cleaning of that data.
Most importantly, they should provide all these
capabilities seamlessly in the context of an
uninterrupted data analysis cycle. We present the
challenges and opportunities in building
next-generation visual analytics services.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2014:PFD,
author = "Daniel Deutch and Yuval Moskovitch and Val Tannen",
title = "A provenance framework for data-dependent process
analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "457--468",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A data-dependent process (DDP) models an application
whose control flow is guided by a finite state machine,
as well as by the state of an underlying database. DDPs
are commonly found e.g., in e-commerce. In this paper
we develop a framework supporting the use of provenance
in static (temporal) analysis of possible DDP
executions. Using provenance support, analysts can
interactively test and explore the effect of
hypothetical modifications to a DDP's state machine
and/or to the underlying database. They can also extend
the analysis to incorporate the propagation of
annotations from meta-domains of interest, e.g., cost
or access privileges. Toward this goal we note that the
framework of semiring-based provenance was proven
highly effective in fulfilling similar needs in the
context of database queries. In this paper we consider
novel constructions that generalize the semiring
approach to the context of DDP analysis. These
constructions address two interacting new challenges:
(1) to combine provenance annotations for both
information that resides in the database and
information about external inputs (e.g., user choices),
and (2) to finitely capture infinite process
executions. We analyze our solution from theoretical
and experimental perspectives, proving its
effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chiang:2014:TED,
author = "Yueh-Hsuan Chiang and AnHai Doan and Jeffrey F.
Naughton",
title = "Tracking entities in the dynamic world: a fast
algorithm for matching temporal records",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "469--480",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Identifying records referring to the same real world
entity over time enables longitudinal data analysis.
However, difficulties arise from the dynamic nature of
the world: the entities described by a temporal data
set often evolve their states over time. While the
state of the art approach to temporal entity matching
achieves high accuracy, this approach is
computationally expensive and cannot handle large data
sets. In this paper, we present an approach that
achieves equivalent matching accuracy but takes far
less time. Our key insight is ``static first, dynamic
second.'' Our approach first runs an
evidence-collection pass, grouping records without
considering the possibility of entity evolution, as if
the world were ``static.'' Then, it merges clusters
from the initial grouping by determining whether an
entity might evolve from the state described in one
cluster to the state described in another cluster. This
intuitively reduces a difficult problem, record
matching with evolution, to two simpler problems:
record matching without evolution, then ``evolution
detection'' among the resulting clusters. Experimental
results on several temporal data sets show that our
approach provides an order of magnitude improvement in
run time over the state-of-the-art approach while
producing equivalent matching accuracy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Conway:2014:EAS,
author = "Neil Conway and Peter Alvaro and Emily Andrews and
Joseph M. Hellerstein",
title = "{Edelweiss}: automatic storage reclamation for
distributed programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "6",
pages = "481--492",
month = feb,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:06 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Event Log Exchange (ELE) is a common programming
pattern based on immutable state and messaging. ELE
sidesteps traditional challenges in distributed
consistency, at the expense of introducing new
challenges in designing space reclamation protocols to
avoid consuming unbounded storage. We introduce
Edelweiss, a sublanguage of Bloom that provides an ELE
programming model, yet automatically reclaims space
without programmer assistance. We describe techniques
to analyze Edelweiss programs and automatically
generate application-specific distributed space
reclamation logic. We show how Edelweiss can be used to
elegantly implement a variety of communication and
distributed storage protocols; the storage reclamation
code generated by Edelweiss effectively
garbage-collects state and often matches hand-written
protocols from the literature.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Ntarmos:2014:RJQ,
author = "Nikos Ntarmos and Ioannis Patlakas and Peter
Triantafillou",
title = "Rank join queries in {NoSQL} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "493--504",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Rank (i.e., top-$k$) join queries play a key role in
modern analytics tasks. However, despite their
importance and unlike centralized settings, they have
been completely overlooked in cloud NoSQL settings. We
attempt to fill this gap: We contribute a suite of
solutions and study their performance comprehensively.
Baseline solutions are offered using SQL-like languages
(like Hive and Pig), based on MapReduce jobs. We first
provide solutions that are based on specialized
indices, which may themselves be accessed using either
MapReduce or coordinator-based strategies. The first
index-based solution is based on inverted indices,
which are accessed with MapReduce jobs. The second
index-based solution adapts a popular centralized
rank-join algorithm. We further contribute a novel
statistical structure comprising histograms and Bloom
filters, which forms the basis for the third
index-based solution. We provide (i) MapReduce
algorithms showing how to build these indices and
statistical structures, (ii) algorithms to allow for
online updates to these indices, and (iii) query
processing algorithms utilizing them. We implemented
all algorithms in Hadoop (HDFS) and HBase and tested
them on TPC-H datasets of various scales, utilizing
different queries on tables of various sizes and
different score-attribute distributions. We ported our
implementations to Amazon EC2 and ``in-house'' lab
clusters of various scales. We provide performance
results for three metrics: query execution time,
network bandwidth consumption, and dollar-cost for
query execution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Gupta:2014:BOS,
author = "Rahul Gupta and Alon Halevy and Xuezhi Wang and Steven
Euijong Whang and Fei Wu",
title = "{Biperpedia}: an ontology for search applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "505--516",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Search engines make significant efforts to recognize
queries that can be answered by structured data and
invest heavily in creating and maintaining
high-precision databases. While these databases have a
relatively wide coverage of entities, the number of
attributes they model (e.g., GDP, CAPITAL, ANTHEM) is
relatively small. Extending the number of attributes
known to the search engine can enable it to more
precisely answer queries from the long and heavy tail,
extract a broader range of facts from the Web, and
recover the semantics of tables on the Web. We describe
Biperpedia, an ontology with 1.6M (class, attribute)
pairs and 67K distinct attribute names. Biperpedia
extracts attributes from the query stream, and then
uses the best extractions to seed attribute extraction
from text. For every attribute Biperpedia saves a set
of synonyms and text patterns in which it appears,
thereby enabling it to recognize the attribute in more
contexts. In addition to a detailed analysis of the
quality of Biperpedia, we show that it can increase the
number of Web tables whose semantics we can recover by
more than a factor of 4 compared with Freebase.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Elseidy:2014:GFS,
author = "Mohammed Elseidy and Ehab Abdelhamid and Spiros
Skiadopoulos and Panos Kalnis",
title = "{GraMi}: frequent subgraph and pattern mining in a
single large graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "517--528",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Mining frequent subgraphs is an important operation on
graphs; it is defined as finding all subgraphs that
appear frequently in a database according to a given
frequency threshold. Most existing work assumes a
database of many small graphs, but modern applications,
such as social networks, citation graphs, or
protein-protein interactions in bioinformatics, are
modeled as a single large graph. In this paper we
present GraMi, a novel framework for frequent subgraph
mining in a single large graph. GraMi undertakes a
novel approach that only finds the minimal set of
instances to satisfy the frequency threshold and avoids
the costly enumeration of all instances required by
previous approaches. We accompany our approach with a
heuristic and optimizations that significantly improve
performance. Additionally, we present an extension of
GraMi that mines frequent patterns. Compared to
subgraphs, patterns offer a more powerful version of
matching that captures transitive interactions between
graph nodes (like friend of a friend) which are very
common in modern applications. Finally, we present
CGraMi, a version supporting structural and semantic
constraints, and AGraMi, an approximate version
producing results with no false positives. Our
experiments on real data demonstrate that our framework
is up to 2 orders of magnitude faster and discovers
more interesting patterns than existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Wang:2014:LIO,
author = "Sheng Wang and David Maier and Beng Chin Ooi",
title = "Lightweight indexing of observational data in
log-structured storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "529--540",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Huge amounts of data are being generated by sensing
devices every day, recording the status of objects and
the environment. Such observational data is widely used
in scientific research. As the capabilities of sensors
keep improving, the data produced are drastically
expanding in precision and quantity, making it a
write-intensive domain. Log-structured storage is
capable of providing high write throughput, and hence
is a natural choice for managing large-scale
observational data. In this paper, we propose an
approach to indexing and querying observational data in
log-structured storage. Based on key traits of
observational data, we design a novel index approach
called the CR-index (Continuous Range Index), which
provides fast query performance without compromising
write throughput. It is a lightweight structure that is
fast to construct and often small enough to reside in
RAM. Our experimental results show that the CR-index is
superior in handling observational data compared to
other indexing techniques. While our focus is
scientific data, we believe our index will be effective
for other applications with similar properties, such as
process monitoring in manufacturing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Jiang:2014:EES,
author = "Dawei Jiang and Gang Chen and Beng Chin Ooi and
Kian-Lee Tan and Sai Wu",
title = "{epiC}: an extensible and scalable system for
processing big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "541--552",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Big Data problem is characterized by the so called
3V features: Volume --- a huge amount of data, Velocity
--- a high data ingestion rate, and Variety --- a mix
of structured data, semi-structured data, and
unstructured data. The state-of-the-art solutions to
the Big Data problem are largely based on the MapReduce
framework (aka its open source implementation Hadoop).
Although Hadoop handles the data volume challenge
successfully, it does not deal with the data variety
well since the programming interfaces and its
associated data processing model is inconvenient and
inefficient for handling structured data and graph
data. This paper presents epiC, an extensible system to
tackle the Big Data's data variety challenge. epiC
introduces a general Actor-like concurrent programming
model, independent of the data processing models, for
specifying parallel computations. Users process
multi-structured datasets with appropriate epiC
extensions, the implementation of a data processing
model best suited for the data type and auxiliary code
for mapping that data processing model into epiC's
concurrent programming model. Like Hadoop, programs
written in this way can be automatically parallelized
and the runtime system takes care of fault tolerance
and inter-machine communications. We present the design
and implementation of epiC's concurrent programming
model. We also present two customized data processing
model, an optimized MapReduce extension and a
relational model, on top of epiC. Experiments
demonstrate the effectiveness and efficiency of our
proposed epiC.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Boehm:2014:HPS,
author = "Matthias Boehm and Shirish Tatikonda and Berthold
Reinwald and Prithviraj Sen and Yuanyuan Tian and
Douglas R. Burdick and Shivakumar Vaithyanathan",
title = "Hybrid parallelization strategies for large-scale
machine learning in {SystemML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "553--564",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SystemML aims at declarative, large-scale machine
learning (ML) on top of MapReduce, where high-level ML
scripts with R-like syntax are compiled to programs of
MR jobs. The declarative specification of ML algorithms
enables --- in contrast to existing large-scale machine
learning libraries --- automatic optimization.
SystemML's primary focus is on data parallelism but
many ML algorithms inherently exhibit opportunities for
task parallelism as well. A major challenge is how to
efficiently combine both types of parallelism for
arbitrary ML scripts and workloads. In this paper, we
present a systematic approach for combining task and
data parallelism for large-scale machine learning on
top of MapReduce. We employ a generic Parallel FOR
construct (ParFOR) as known from high performance
computing (HPC). Our core contributions are (1)
complementary parallelization strategies for exploiting
multi-core and cluster parallelism, as well as (2) a
novel cost-based optimization framework for
automatically creating optimal parallel execution
plans. Experiments on a variety of use cases showed
that this achieves both efficiency and scalability due
to automatic adaptation to ad-hoc workloads and unknown
data characteristics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Yang:2014:SSG,
author = "Shengqi Yang and Yinghui Wu and Huan Sun and Xifeng
Yan",
title = "Schemaless and structureless graph querying",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "565--576",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Querying complex graph databases such as knowledge
graphs is a challenging task for non-professional
users. Due to their complex schemas and variational
information descriptions, it becomes very hard for
users to formulate a query that can be properly
processed by the existing systems. We argue that for a
user-friendly graph query engine, it must support
various kinds of transformations such as synonym,
abbreviation, and ontology. Furthermore, the derived
query results must be ranked in a principled manner. In
this paper, we introduce a novel framework enabling
schemaless and structureless graph querying (SLQ),
where a user need not describe queries precisely as
required by most databases. The query engine is built
on a set of transformation functions that automatically
map keywords and linkages from a query to their matches
in a graph. It automatically learns an effective
ranking model, without assuming manually labeled
training examples, and can efficiently return top
ranked matches using graph sketch and belief
propagation. The architecture of SLQ is elastic for
``plug-in'' new transformation functions and query
logs. Our experimental results show that this new graph
querying paradigm is promising: It identifies
high-quality matches for both keyword and graph queries
over real-life knowledge graphs, and outperforms
existing methods significantly in terms of
effectiveness and efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Salihoglu:2014:OGA,
author = "Semih Salihoglu and Jennifer Widom",
title = "Optimizing graph algorithms on {Pregel}-like systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "577--588",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of implementing graph algorithms
efficiently on Pregel-like systems, which can be
surprisingly challenging. Standard graph algorithms in
this setting can incur unnecessary inefficiencies such
as slow convergence or high communication or
computation cost, typically due to structural
properties of the input graphs such as large diameters
or skew in component sizes. We describe several
optimization techniques to address these
inefficiencies. Our most general technique is based on
the idea of performing some serial computation on a
tiny fraction of the input graph, complementing
Pregel's vertex-centric parallelism. We base our study
on thorough implementations of several fundamental
graph algorithms, some of which have, to the best of
our knowledge, not been implemented on Pregel-like
systems before. The algorithms and optimizations we
describe are fully implemented in our open-source
Pregel implementation. We present detailed experiments
showing that our optimization techniques improve
runtime significantly on a variety of very large graph
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Wu:2014:TCF,
author = "You Wu and Pankaj K. Agarwal and Chengkai Li and Jun
Yang and Cong Yu",
title = "Toward computational fact-checking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "7",
pages = "589--600",
month = mar,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:07 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Our news are saturated with claims of ``facts'' made
from data. Database research has in the past focused on
how to answer queries, but has not devoted much
attention to discerning more subtle qualities of the
resulting claims, e.g., is a claim ``cherry-picking''?
This paper proposes a framework that models claims
based on structured data as parameterized queries. A
key insight is that we can learn a lot about a claim by
perturbing its parameters and seeing how its conclusion
changes. This framework lets us formulate practical
fact-checking tasks --- reverse-engineering (often
intentionally) vague claims, and countering
questionable claims --- as computational problems.
Along with the modeling framework, we develop an
algorithmic framework that enables efficient
instantiations of ``meta'' algorithms by supplying
appropriate algorithmic building blocks. We present
real-world examples and experiments that demonstrate
the power of our model, efficiency of our algorithms,
and usefulness of their results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Arenas:2014:PAB,
author = "Marcelo Arenas and Gonzalo D{\'\i}az and Achille
Fokoue and Anastasios Kementsietsidis and Kavitha
Srinivas",
title = "A principled approach to bridging the gap between
graph data and their schemas",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "8",
pages = "601--612",
month = apr,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:10 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Although RDF graph data often come with an associated
schema, recent studies have proven that real RDF data
rarely conform to their perceived schemas. Since a
number of data management decisions, including storage
layouts, indexing, and efficient query processing, use
schemas to guide the decision making, it is imperative
to have an accurate description of the structuredness
of the data at hand (how well the data conform to the
schema). In this paper, we have approached the study of
the structuredness of an RDF graph in a principled way:
we propose a framework for specifying structuredness
functions, which gauge the degree to which an RDF graph
conforms to a schema. In particular, we first define a
formal language for specifying structuredness functions
with expressions we call rules. This language allows a
user to state a rule to which an RDF graph may fully or
partially conform. Then we consider the issue of
discovering a refinement of a sort (type) by
partitioning the dataset into subsets whose
structuredness is over a specified threshold. In
particular, we prove that the natural decision problem
associated to this refinement problem is NP-complete,
and we provide a natural translation of this problem
into Integer Linear Programming (ILP). Finally, we test
this ILP solution with three real world datasets and
three different and intuitive rules, which gauge the
structuredness in different ways. We show that the
rules give meaningful refinements of the datasets,
showing that our language can be a powerful tool for
understanding the structure of RDF data, and we show
that the ILP solution is practical for a large fraction
of existing data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Zhang:2014:EPS,
author = "Dongxiang Zhang and Chee-Yong Chan and Kian-Lee Tan",
title = "An efficient publish\slash subscribe index for
e-commerce databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "8",
pages = "613--624",
month = apr,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:10 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many of today's publish/subscribe (pub/sub) systems
have been designed to cope with a large volume of
subscriptions and high event arrival rate (velocity).
However, in many novel applications (such as
e-commerce), there is an increasing variety of items,
each with different attributes. This leads to a very
high-dimensional and sparse database that existing
pub/sub systems can no longer support effectively. In
this paper, we propose an efficient in-memory index
that is scalable to the volume and update of
subscriptions, the arrival rate of events and the
variety of subscribable attributes. The index is also
extensible to support complex scenarios such as
prefix/suffix filtering and regular expression
matching. We conduct extensive experiments on synthetic
datasets and two real datasets (AOL query log and Ebay
products). The results demonstrate the superiority of
our index over state-of-the-art methods: our index
incurs orders of magnitude less index construction
time, consumes a small amount of memory and performs
event matching efficiently.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Jiang:2014:SSJ,
author = "Yu Jiang and Guoliang Li and Jianhua Feng and Wen-Syan
Li",
title = "String similarity joins: an experimental evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "8",
pages = "625--636",
month = apr,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:10 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "String similarity join is an important operation in
data integration and cleansing that finds similar
string pairs from two collections of strings. More than
ten algorithms have been proposed to address this
problem in the recent two decades. However, existing
algorithms have not been thoroughly compared under the
same experimental framework. For example, some
algorithms are tested only on specific datasets. This
makes it rather difficult for practitioners to decide
which algorithms should be used for various scenarios.
To address this problem, in this paper we provide a
comprehensive survey on a wide spectrum of existing
string similarity join algorithms, classify them into
different categories based on their main techniques,
and compare them through extensive experiments on a
variety of real-world datasets with different
characteristics. We also report comprehensive findings
obtained from the experiments and provide new insights
about the strengths and weaknesses of existing
similarity join algorithms which can guide
practitioners to select appropriate algorithms for
various scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Proserpio:2014:CDS,
author = "Davide Proserpio and Sharon Goldberg and Frank
McSherry",
title = "Calibrating data to sensitivity in private data
analysis: a platform for differentially-private
analysis of weighted datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "8",
pages = "637--648",
month = apr,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:10 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present an approach to differentially private
computation in which one does not scale up the
magnitude of noise for challenging queries, but rather
scales down the contributions of challenging records.
While scaling down all records uniformly is equivalent
to scaling up the noise magnitude, we show that scaling
records non-uniformly can result in substantially
higher accuracy by bypassing the worst-case
requirements of differential privacy for the noise
magnitudes. This paper details the data analysis
platform wPINQ, which generalizes the Privacy
Integrated Query (PINQ) to weighted datasets. Using a
few simple operators (including a non-uniformly scaling
Join operator) wPINQ can reproduce (and improve)
several recent results on graph analysis and introduce
new generalizations (e.g., counting triangles with
given degrees). We also show how to integrate
probabilistic inference techniques to synthesize
datasets respecting more complicated (and less easily
interpreted) measurements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Wang:2014:EMM,
author = "Wei Wang and Beng Chin Ooi and Xiaoyan Yang and
Dongxiang Zhang and Yueting Zhuang",
title = "Effective multi-modal retrieval based on stacked
auto-encoders",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "8",
pages = "649--660",
month = apr,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 09:22:10 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-modal retrieval is emerging as a new search
paradigm that enables seamless information retrieval
from various types of media. For example, users can
simply snap a movie poster to search relevant reviews
and trailers. To solve the problem, a set of mapping
functions are learned to project high-dimensional
features extracted from data of different media types
into a common low-dimensional space so that metric
distance measures can be applied. In this paper, we
propose an effective mapping mechanism based on deep
learning (i.e., stacked auto-encoders) for multi-modal
retrieval. Mapping functions are learned by optimizing
a new objective function, which captures both
intra-modal and inter-modal semantic relationships of
data from heterogeneous sources effectively. Compared
with previous works which require a substantial amount
of prior knowledge such as similarity matrices of
intra-modal data and ranking examples, our method
requires little prior knowledge. Given a large training
dataset, we split it into mini-batches and continually
adjust the mapping functions for each batch of input.
Hence, our method is memory efficient with respect to
the data volume. Experiments on three real datasets
illustrate that our proposed method achieves
significant improvement in search accuracy over the
state-of-the-art methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Song:2014:PNF,
author = "Renchu Song and Weiwei Sun and Baihua Zheng and Yu
Zheng",
title = "{PRESS}: a novel framework of trajectory compression
in road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "661--672",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Location data becomes more and more important. In this
paper, we focus on the trajectory data, and propose a
new framework, namely PRESS (Paralleled
Road-Network-Based Trajectory Compression), to
effectively compress trajectory data under road network
constraints. Different from existing work, PRESS
proposes a novel representation for trajectories to
separate the spatial representation of a trajectory
from the temporal representation, and proposes a Hybrid
Spatial Compression (HSC) algorithm and error Bounded
Temporal Compression (BTC) algorithm to compress the
spatial and temporal information of trajectories
respectively. PRESS also supports common
spatial-temporal queries without fully decompressing
the data. Through an extensive experimental study on
real trajectory dataset, PRESS significantly
outperforms existing approaches in terms of saving
storage cost of trajectory data with bounded errors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Yang:2014:FCO,
author = "Yajun Yang and Hong Gao and Jeffrey Xu Yu and
Jianzhong Li",
title = "Finding the cost-optimal path with time constraint
over time-dependent graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "673--684",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Shortest path query is an important problem and has
been well studied in static graphs. However, in
practice, the costs of edges in graphs always change
over time. We call such graphs as time-dependent
graphs. In this paper, we study how to find a
cost-optimal path with time constraint in
time-dependent graphs. Most existing works regarding
the Time-Dependent Shortest Path (TDSP) problem focus
on finding a shortest path with the minimum travel
time. All these works are based on the following fact:
the earliest arrival time at a vertex $v$ can be
derived from the earliest arrival time at $v$'s
neighbors. Unfortunately, this fact does not hold for
our problem. In this paper, we propose a novel
algorithm to compute a cost-optimal path with time
constraint in time-dependent graphs. We show that the
time and space complexities of our algorithm are $ O(k
n \log n + m k)$ and $ O((n + m) k)$ respectively. We
confirm the effectiveness and efficiency of our
algorithm through conducting experiments on real
datasets with synthetic cost.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NOTE(review): DOI field absent; earlier PVLDB entries in this file include DOIs --- TODO confirm and add.
@Article{Parameswaran:2014:OCP,
author = "Aditya Parameswaran and Stephen Boyd and Hector
Garcia-Molina and Ashish Gupta and Neoklis Polyzotis
and Jennifer Widom",
title = "Optimal crowd-powered rating and filtering
algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "685--696",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We focus on crowd-powered filtering, i.e., filtering a
large set of items using humans. Filtering is one of
the most commonly used building blocks in crowdsourcing
applications and systems. While solutions for
crowd-powered filtering exist, they make a range of
implicit assumptions and restrictions, ultimately
rendering them not powerful enough for real-world
applications. We describe two approaches to discard
these implicit assumptions and restrictions: one, that
carefully generalizes prior work, leading to an
optimal, but often-times intractable solution, and
another, that provides a novel way of reasoning about
filtering strategies, leading to a sometimes
suboptimal, but efficiently computable solution (that
is asymptotically close to optimal). We demonstrate
that our techniques lead to significant reductions in
error of up to 30\% for fixed cost over prior work in a
novel crowdsourcing application: peer evaluation in
online courses.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gruenheid:2014:IRL,
author = "Anja Gruenheid and Xin Luna Dong and Divesh
Srivastava",
title = "Incremental record linkage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "697--708",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Record linkage clusters records such that each cluster
corresponds to a single distinct real-world entity. It
is a crucial step in data cleaning and data
integration. In the big data era, the velocity of data
updates is often high, quickly making previous linkage
results obsolete. This paper presents an end-to-end
framework that can incrementally and efficiently update
linkage results when data updates arrive. Our
algorithms not only allow merging records in the
updates with existing clusters, but also allow
leveraging new evidence from the updates to fix
previous linkage errors. Experimental results on three
real and synthetic data sets show that our algorithms
can significantly reduce linkage time without
sacrificing linkage quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Roy:2014:LLH,
author = "Pratanu Roy and Jens Teubner and Rainer Gemulla",
title = "Low-latency handshake join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "709--720",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This work revisits the processing of stream joins on
modern hardware architectures. Our work is based on the
recently proposed handshake join algorithm, which is a
mechanism to parallelize the processing of stream joins
in a NUMA-aware and hardware-friendly manner. Handshake
join achieves high throughput and scalability, but it
suffers from a high latency penalty and a
non-deterministic ordering of the tuples in the
physical result stream. In this paper, we first
characterize the latency behavior of the handshake join
and then propose a new low-latency handshake join
algorithm, which substantially reduces latency without
sacrificing throughput or scalability. We also present
a technique to generate punctuated result streams with
very little overhead; such punctuations allow the
generation of correctly ordered physical output streams
with negligible effect on overall throughput and
latency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2014:PPT,
author = "Huanhuan Wu and James Cheng and Silu Huang and Yiping
Ke and Yi Lu and Yanyan Xu",
title = "Path problems in temporal graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "721--732",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Shortest path is a fundamental graph problem with
numerous applications. However, the concept of classic
shortest path is insufficient or even flawed in a
temporal graph, as the temporal information determines
the order of activities along any path. In this paper,
we show the shortcomings of classic shortest path in a
temporal graph, and study various concepts of
``shortest'' path for temporal graphs. Computing these
temporal paths is challenging as subpaths of a
``shortest'' path may not be ``shortest'' in a temporal
graph. We investigate properties of the temporal paths
and propose efficient algorithms to compute them. We
tested our algorithms on real world temporal graphs to
verify their efficiency, and also show that temporal
paths are essential for studying temporal graphs by
comparing shortest paths in normal static graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2014:RRI,
author = "Xin Cao and Gao Cong and Christian S. Jensen and Man
Lung Yiu",
title = "Retrieving regions of interest for user exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "733--744",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider an application scenario where points of
interest (PoIs) each have a web presence and where a
web user wants to identify a region that contains
PoIs that are relevant to a set of keywords,
e.g., in preparation for deciding where to go to
conveniently explore the PoIs. Motivated by this, we
propose the length-constrained maximum-sum region
(LCMSR) query that returns a spatial-network region
that is located within a general region of interest,
that does not exceed a given size constraint, and that
best matches query keywords. Such a query maximizes the
total weight of the PoIs in it w.r.t. the query
keywords. We show that it is NP-hard to answer this
query. We develop an approximation algorithm with a $ (5
+ \epsilon) $ approximation ratio utilizing a technique
that scales node weights into integers. We also propose
a more efficient heuristic algorithm and a greedy
algorithm. Empirical studies on real data offer
detailed insight into the accuracy of the proposed
algorithms and show that the proposed algorithms are
capable of computing results efficiently and
effectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2014:SLE,
author = "Yingfan Liu and Jiangtao Cui and Zi Huang and Hui Li
and Heng Tao Shen",
title = "{SK--LSH}: an efficient index structure for
approximate nearest neighbor search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "745--756",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Approximate Nearest Neighbor (ANN) search in high
dimensional space has become a fundamental paradigm in
many applications. Recently, Locality Sensitive Hashing
(LSH) and its variants are acknowledged as the most
promising solutions to ANN search. However,
state-of-the-art LSH approaches suffer from a drawback:
accesses to candidate objects require a large number of
random I/O operations. In order to guarantee the
quality of returned results, sufficient objects should
be verified, which would consume enormous I/O cost. To
address this issue, we propose a novel method, called
SortingKeys-LSH (SK-LSH), which reduces the number of
page accesses through locally arranging candidate
objects. We firstly define a new measure to evaluate
the distance between the compound hash keys of two
points. A linear order relationship on the set of
compound hash keys is then created, and the
corresponding data points can be sorted accordingly.
Hence, data points that are close to each other
according to the distance measure can be stored locally
in an index file. During the ANN search, only a limited
number of disk pages among few index files are
necessary to be accessed for sufficient candidate
generation and verification, which not only
significantly reduces the response time but also
improves the accuracy of the returned results. Our
exhaustive empirical study over several real-world data
sets demonstrates the superior efficiency and accuracy
of SK-LSH for the ANN search, compared with
state-of-the-art methods, including LSB, C2LSH and
CK-Means.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2014:AFP,
author = "Bing-Rong Lin and Daniel Kifer",
title = "On arbitrage-free pricing for general data queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "757--768",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data is a commodity. Recent research has considered
the mathematical problem of setting prices for
different queries over data. Ideal pricing functions
need to be flexible --- defined for arbitrary queries
(select-project-join, aggregate, random sample, and
noisy privacy-preserving queries). They should be
fine-grained --- a consumer should not be required to
buy the entire database to get answers to simple
``low-information'' queries (such as selecting only a
few tuples or aggregating over only one attribute).
Similarly, a consumer may not want to pay a large
amount of money, only to discover that the database is
empty. Finally, pricing functions should satisfy
consistency conditions such as being ``arbitrage-free''
--- consumers should not be able to circumvent the
pricing function by deducing the answer to an expensive
query from a few cheap queries. Previously proposed
pricing functions satisfy some of these criteria (i.e.
they are defined for restricted subclasses of queries
and/or use relaxed conditions for avoiding arbitrage).
In this paper, we study arbitrage-free pricing
functions defined for arbitrary queries. We propose new
necessary conditions for avoiding arbitrage and provide
new arbitrage-free pricing functions. We also prove
several negative results related to the tension between
flexible pricing and avoiding arbitrage, and show how
this tension often results in unreasonable prices.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:SMF,
author = "Chao Zhang and Jiawei Han and Lidan Shou and Jiajun Lu
and Thomas {La Porta}",
title = "{Splitter}: mining fine-grained sequential patterns in
semantic trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "769--780",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Driven by the advance of positioning technology and
the popularity of location-sharing services,
semantic-enriched trajectory data have become
unprecedentedly available. The sequential patterns
hidden in such data, when properly defined and
extracted, can greatly benefit tasks like targeted
advertising and urban planning. Unfortunately, classic
sequential pattern mining algorithms developed for
transactional data cannot effectively mine patterns in
semantic trajectories, mainly because the places in the
continuous space cannot be regarded as independent
``items''. Instead, similar places need to be grouped
to collaboratively form frequent sequential patterns.
That said, it remains a challenging task to mine what
we call fine-grained sequential patterns, which must
satisfy spatial compactness, semantic consistency and
temporal continuity simultaneously. We propose Splitter
to effectively mine such fine-grained sequential
patterns in two steps. In the first step, it retrieves
a set of spatially coarse patterns, each attached with
a set of trajectory snippets that precisely record the
pattern's occurrences in the database. In the second
step, Splitter breaks each coarse pattern into
fine-grained ones in a top-down manner, by
progressively detecting dense and compact clusters in a
higher-dimensional space spanned by the snippets.
Splitter uses an effective algorithm called weighted
snippet shift to detect such clusters, and leverages a
divide-and-conquer strategy to speed up the top-down
pattern splitting process. Our experiments on both real
and synthetic data sets demonstrate the effectiveness
and efficiency of Splitter.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Floratou:2014:TBW,
author = "Avrilia Floratou and Frank Bertsch and Jignesh M.
Patel and Georgios Laskaris",
title = "Towards building wind tunnels for data center design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "9",
pages = "781--784",
month = may,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:18 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data center design is a tedious and expensive process.
Recently, this process has become even more challenging
as users of cloud services expect to have guaranteed
levels of availability, durability and performance. A
new challenge for the service providers is to find the
most cost-effective data center design and
configuration that will accommodate the users'
expectations, on ever-changing workloads, and
constantly evolving hardware and software components.
In this paper, we argue that data center design should
become a systematic process. First, it should be done
using an integrated approach that takes into account
both the hardware and the software interdependencies,
and their impact on users' expectations. Second, it
should be performed in a ``wind tunnel'', which uses
large-scale simulation to systematically explore the
impact of a data center configuration on both the
users' and the service providers' requirements. We
believe that this is the first step towards systematic
data center design --- an exciting area for future
research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:RRQ,
author = "Zhao Zhang and Cheqing Jin and Qiangqiang Kang",
title = "Reverse $k$-ranks query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "785--796",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding matching customers for a given product based
on individual user's preference is critical for many
applications, especially in e-commerce. Recently, the
reverse top-$k$ query is proposed to return a number of
customers who regard a given product as one of the $k$
most favorite products based on a linear model.
Although a few ``hot'' products can be returned to some
customers via reverse top-$k$ query, a large proportion
of products (over 90\%, as our example illustrates, see
Figure 2) cannot find any matching customers. Inspired
by this observation, we propose a new kind of query
($R$-$k$ Ranks) which finds for a given product, the
top-$k$ customers whose rank for the product is highest
among all customers, to ensure 100\% coverage for any
given product, no matter whether it is hot or niche. Not
limited to e-commerce, the concept of customer ---
product can be extended to a wider range of
applications, such as dating and job-hunting.
Unfortunately, existing approaches for reverse top-$k$
query cannot be used to handle $R$-$k$ Ranks
conveniently due to infeasibility of getting enough
elements for the query result. Hence, we propose three
novel approaches to efficiently process $R$-$k$ Ranks
query, including one tree-based method and two
batch-pruning-based methods. Analysis of theoretical
and experimental results on real and synthetic data
sets illustrates the efficacy of the proposed
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jugel:2014:MVO,
author = "Uwe Jugel and Zbigniew Jerzak and Gregor Hackenbroich
and Volker Markl",
title = "{M4}: a visualization-oriented time series data
aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "797--808",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Visual analysis of high-volume time series data is
ubiquitous in many industries, including finance,
banking, and discrete manufacturing. Contemporary,
RDBMS-based systems for visualization of high-volume
time series data have difficulty to cope with the hard
latency requirements and high ingestion rates of
interactive visualizations. Existing solutions for
lowering the volume of time series data disregard the
semantics of visualizations and result in visualization
errors. In this work, we introduce M4, an
aggregation-based time series dimensionality reduction
technique that provides error-free visualizations at
high data reduction rates. Focusing on line charts, as
the predominant form of time series visualization, we
explain in detail the drawbacks of existing data
reduction techniques and how our approach outperforms
state of the art, by respecting the process of line
rasterization. We describe how to incorporate
aggregation-based dimensionality reduction at the query
level in a visualization-driven query rewriting system.
Our approach is generic and applicable to any
visualization system that uses an RDBMS as data source.
Using real world data sets from high tech
manufacturing, stock markets, and sports analytics
domains we demonstrate that our visualization-oriented
data aggregation can reduce data volumes by up to two
orders of magnitude, while preserving perfect
visualizations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ghashami:2014:CMA,
author = "Mina Ghashami and Jeff M. Phillips and Feifei Li",
title = "Continuous matrix approximation on distributed data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "809--820",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tracking and approximating data matrices in streaming
fashion is a fundamental challenge. The problem
requires more care and attention when data comes from
multiple distributed sites, each receiving a stream of
data. This paper considers the problem of ``tracking
approximations to a matrix'' in the distributed
streaming model. In this model, there are $m$
distributed sites each observing a distinct stream of
data (where each element is a row of a distributed
matrix) and has a communication channel with a
coordinator, and the goal is to track an $ \epsilon
$-approximation to the norm of the matrix along any
direction. To that end, we present novel algorithms to
address the matrix approximation problem. Our
algorithms maintain a smaller matrix $B$, as an
approximation to a distributed streaming matrix $A$,
such that for any unit vector $x$: $ | \, || A x ||^2 -
|| B x ||^2 | \leq \epsilon || A ||^2_F$. Our
algorithms work in streaming fashion and incur small
communication, which is critical for distributed
computation. Our best method is deterministic and uses
only $ O((m / \epsilon) \log (\beta N))$ communication,
where $N$ is the size of stream (at the time of the
query) and $ \beta $ is an upperbound on the squared
norm of any row of the matrix. In addition to proving
all algorithmic properties theoretically, extensive
experiments with real large datasets demonstrate the
efficiency of these protocols.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2014:EAD,
author = "Kun Ren and Alexander Thomson and Daniel J. Abadi",
title = "An evaluation of the advantages and disadvantages of
deterministic database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "821--832",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent proposals for deterministic database system
designs argue that deterministic database systems
facilitate replication since the same input can be
independently sent to two different replicas without
concern for replica divergence. In addition, they argue
that determinism yields performance benefits due to (1)
the introduction of deadlock avoidance techniques, (2)
the reduction (or elimination) of distributed commit
protocols, and (3) light-weight locking. However, these
performance benefits are not universally applicable,
and there exist several disadvantages of determinism,
including (1) the additional overhead of processing
transactions for which it is not known in advance what
data will be accessed, (2) an inability to abort
transactions arbitrarily (e.g., in the case of database
or partition overload), and (3) the increased latency
required by a preprocessing layer that ensures that the
same input is sent to every replica. This paper
presents a thorough experimental study that carefully
investigates both the advantages and disadvantages of
determinism, in order to give a database user a more
complete understanding of which database to use for a
given database workload and cluster configuration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:EMD,
author = "Hao Zhang and Bogdan Marius Tudor and Gang Chen and
Beng Chin Ooi",
title = "Efficient in-memory data management: an analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "833--836",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper analyzes the performance of three systems
for in-memory data management: Memcached, Redis and the
Resilient Distributed Datasets (RDD) implemented by
Spark. By performing a thorough performance analysis of
both analytics operations and fine-grained object
operations such as set/get, we show that neither system
handles efficiently both types of workloads. For
Memcached and Redis the CPU and I/O performance of the
TCP stack are the bottlenecks --- even when serving
in-memory objects within a single server node. RDD does
not support efficient get operation for random objects,
due to a large startup cost of the get job. Our
analysis reveals a set of features that a system must
support in order to achieve efficient in-memory data
management.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aluc:2014:WMW,
author = "G{\"u}nes Alu{\c{c}} and M. Tamer {\"O}zsu and
Khuzaima Daudjee",
title = "Workload matters: why {RDF} databases need a new
design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "837--840",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Resource Description Framework (RDF) is a standard
for conceptually describing data on the Web, and SPARQL
is the query language for RDF. As RDF is becoming
widely utilized, RDF data management systems are being
exposed to more diverse and dynamic workloads. Existing
systems are workload-oblivious, and are therefore
unable to provide consistently good performance. We
propose a vision for a workload-aware and adaptive
system. To realize this vision, we re-evaluate relevant
existing physical design criteria for RDF and address
the resulting set of new challenges.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alsubaiee:2014:SMA,
author = "Sattam Alsubaiee and Alexander Behm and Vinayak Borkar
and Zachary Heilbron and Young-Seok Kim and Michael J.
Carey and Markus Dreseler and Chen Li",
title = "Storage management in {AsterixDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "841--852",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social networks, online communities, mobile devices,
and instant messaging applications generate complex,
unstructured data at a high rate, resulting in large
volumes of data. This poses new challenges for data
management systems that aim to ingest, store, index,
and analyze such data efficiently. In response, we
released the first public version of AsterixDB, an
open-source Big Data Management System (BDMS), in June
of 2013. This paper describes the storage management
layer of AsterixDB, providing a detailed description of
its ingestion-oriented approach to local storage and a
set of initial measurements of its ingestion-related
performance characteristics. In order to support high
frequency insertions, AsterixDB has wholly adopted
Log-Structured Merge-trees as the storage technology
for all of its index structures. We describe how the
AsterixDB software framework enables ``LSM-ification''
(conversion from an in-place update, disk-based data
structure to a deferred-update, append-only data
structure) of any kind of index structure that supports
certain primitive operations, enabling the index to
ingest data efficiently. We also describe how AsterixDB
ensures the ACID properties for operations involving
multiple heterogeneous LSM-based indexes. Lastly, we
highlight the challenges related to managing the
resources of a system when many LSM indexes are used
concurrently and present AsterixDB's initial
solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Klonatos:2014:BEQ,
author = "Yannis Klonatos and Christoph Koch and Tiark Rompf and
Hassan Chafi",
title = "Building efficient query engines in a high-level
language",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "853--864",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See errata \cite{Klonatos:2014:EBE}.",
abstract = "In this paper we advocate that it is time for a
radical rethinking of database systems design.
Developers should be able to leverage high-level
programming languages without having to pay a price in
efficiency. To realize our vision of abstraction
without regret, we present LegoBase, a query engine
written in the high-level programming language Scala.
The key technique to regain efficiency is to apply
generative programming: the Scala code that constitutes
the query engine, despite its high-level appearance, is
actually a program generator that emits specialized,
low-level C code. We show how the combination of
high-level and generative programming allows to easily
implement a wide spectrum of optimizations that are
difficult to achieve with existing low-level query
compilers, and how it can continuously optimize the
query engine. We evaluate our approach with the TPC-H
benchmark and show that: (a) with all optimizations
enabled, our architecture significantly outperforms a
commercial in-memory database system as well as an
existing query compiler, (b) these performance
improvements require programming just a few hundred
lines of high-level code instead of complicated
low-level code that is required by existing query
compilers and, finally, that (c) the compilation
overhead is low compared to the overall execution time,
thus making our approach usable in practice for
efficiently compiling query engines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2014:SLT,
author = "Tianzheng Wang and Ryan Johnson",
title = "Scalable logging through emerging non-volatile
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "865--876",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Emerging byte-addressable, non-volatile memory (NVM)
is fundamentally changing the design principle of
transaction logging. It potentially invalidates the
need for flush-before-commit as log records are
persistent immediately upon write. Distributed
logging---a once prohibitive technique for single node
systems in the DRAM era---becomes a promising solution
to easing the logging bottleneck because of the
non-volatility and high performance of NVM. In this
paper, we advocate NVM and distributed logging on
multicore and multi-socket hardware. We identify the
challenges brought by distributed logging and discuss
solutions. To protect committed work in NVM-based
systems, we propose passive group commit, a
lightweight, practical approach that leverages existing
hardware and group commit. We expect that durable
processor cache is the ultimate solution to protecting
committed work and building reliable, scalable
NVM-based systems in general. We evaluate distributed
logging with logging-intensive workloads and show that
distributed logging can achieve as much as $ \approx 3
\times $ speedup over centralized logging in a modern
DBMS and that passive group commit only induces
minuscule overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2014:WDM,
author = "Bingsheng He",
title = "When data management systems meet approximate
hardware: challenges and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "877--880",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, approximate hardware designs have got many
research interests in the computer architecture
community. The essential idea of approximate hardware
is that the hardware components such as CPU, memory and
storage can trade off the accuracy of results for
increased performance, reduced energy consumption, or
both. We propose a DBMS ApproxiDB with its design,
implementation and optimization aware of the underlying
approximate hardware. ApproxiDB will run on a hybrid
machine consisting of both approximate hardware and
precise hardware (i.e., the conventional hardware
without sacrificing the accuracy). With approximate
hardware, ApproxiDB can efficiently support the concept
of approximate query processing, without the overhead
of pre-computed synopses or sampling techniques. More
importantly, ApproxiDB is also beneficial to precise
query processing, by developing non-trivial hybrid
execution mechanisms on both precise and approximate
hardware. In this vision paper, we sketch the initial
design of ApproxiDB, discuss the technical challenges
in building this system and outline an agenda for
future research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2014:DFK,
author = "Xin Luna Dong and Evgeniy Gabrilovich and Geremy Heitz
and Wilko Horn and Kevin Murphy and Shaohua Sun and Wei
Zhang",
title = "From data fusion to knowledge fusion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "881--892",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The task of data fusion is to identify the true values
of data items (e.g., the true date of birth for Tom
Cruise) among multiple observed values drawn from
different sources (e.g., Web sites) of varying (and
unknown) reliability. A recent survey [20] has provided
a detailed comparison of various fusion methods on Deep
Web data. In this paper, we study the applicability and
limitations of different fusion techniques on a more
challenging problem: knowledge fusion. Knowledge fusion
identifies true subject-predicate-object triples
extracted by multiple information extractors from
multiple information sources. These extractors perform
the tasks of entity linkage and schema alignment, thus
introducing an additional source of noise that is quite
different from that traditionally considered in the
data fusion literature, which only focuses on factual
errors in the original sources. We adapt
state-of-the-art data fusion techniques and apply them
to a knowledge base with 1.6B unique knowledge triples
extracted by 12 extractors from over 1B Web pages,
which is three orders of magnitude larger than the data
sets used in previous data fusion papers. We show great
promise of the data fusion approaches in solving the
knowledge fusion problem, and suggest interesting
research directions through a detailed error analysis
of the methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Funke:2014:KPC,
author = "Stefan Funke and Andr{\'e} Nusser and Sabine
Storandt",
title = "On $k$-path covers and their applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "893--902",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "For a directed graph G with vertex set V we call a
subset $ C \subseteq V $ a $k$-(All-)Path Cover if C contains
a node from any path consisting of $k$ nodes. This
paper considers the problem of constructing small
$k$-Path Covers in the context of road networks with
millions of nodes and edges. In many application
scenarios the set C and its induced overlay graph
constitute a very compact synopsis of G which is the
basis for the currently fastest data structure for
personalized shortest path queries, visually pleasing
overlays of subsampled paths, and efficient reporting,
retrieval and aggregation of associated data in spatial
network databases. Apart from a theoretical
investigation of the problem, we provide efficient
algorithms that produce very small $k$-Path Covers for
large real-world road networks (with a posteriori
guarantees via instance-based lower bounds).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2014:CDV,
author = "Eugene Wu and Leilani Battle and Samuel R. Madden",
title = "The case for data visualization management systems:
vision paper",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "903--906",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most visualizations today are produced by retrieving
data from a database and using a specialized
visualization tool to render it. This decoupled
approach results in significant duplication of
functionality, such as aggregation and filters, and
misses tremendous opportunities for cross-layer
optimizations. In this paper, we present the case for
an integrated Data Visualization Management System
(DVMS) based on a declarative visualization language
that fully compiles the end-to-end visualization
pipeline into a set of relational algebra queries. Thus
the DVMS can be both expressive via the visualization
language, and performant by leveraging traditional and
visualization-specific optimizations to scale
interactive visualizations to massive datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:WAA,
author = "Yinan Li and Jignesh M. Patel",
title = "{WideTable}: an accelerator for analytical data
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "907--918",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a technique called WideTable that
aims to improve the speed of analytical data processing
systems. A WideTable is built by denormalizing the
database, and then converting complex queries into
simple scans on the underlying (wide) table. To avoid
the pitfalls associated with denormalization, e.g.
space overheads, WideTable uses a combination of
techniques including dictionary encoding and columnar
storage. When denormalizing the data, WideTable uses
outer joins to ensure that queries on tables in the
schema graph, which are now nested as embedded tables
in the WideTable, are processed correctly. Then, using
a packed code scan technique, even complex queries on
the original database can be answered by using simple
scans on the WideTable(s). We experimentally evaluate
our methods in a main memory setting using the queries
in TPC-H, and demonstrate the effectiveness of our
methods, both in terms of raw query performance and
scalability when running on many-core machines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{To:2014:FPW,
author = "Hien To and Gabriel Ghinita and Cyrus Shahabi",
title = "A framework for protecting worker location privacy in
spatial crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "10",
pages = "919--930",
month = jun,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:21 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Spatial Crowdsourcing (SC) is a transformative
platform that engages individuals, groups and
communities in the act of collecting, analyzing, and
disseminating environmental, social and other
spatio-temporal information. The objective of SC is to
outsource a set of spatio-temporal tasks to a set of
workers, i.e., individuals with mobile devices that
perform the tasks by physically traveling to specified
locations of interest. However, current solutions
require the workers, who in many cases are simply
volunteering for a cause, to disclose their locations
to untrustworthy entities. In this paper, we introduce
a framework for protecting location privacy of workers
participating in SC tasks. We argue that existing
location privacy techniques are not sufficient for SC,
and we propose a mechanism based on differential
privacy and geocasting that achieves effective SC
services while offering privacy guarantees to workers.
We investigate analytical models and task assignment
strategies that balance multiple crucial aspects of SC
functionality, such as task completion rate, worker
travel distance and system overhead. Extensive
experimental results on real-world datasets show that
the proposed technique protects workers' location
privacy without incurring significant performance
metrics penalties.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eldawy:2014:TTS,
author = "Ahmed Eldawy and Justin Levandoski and Per-{\AA}ke
Larson",
title = "Trekking through {Siberia}: managing cold data in a
memory-optimized database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "931--942",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Main memories are becoming sufficiently large that
most OLTP databases can be stored entirely in main
memory, but this may not be the best solution. OLTP
workloads typically exhibit skewed access patterns
where some records are hot (frequently accessed) but
many records are cold (infrequently or never accessed).
It is still more economical to store the coldest
records on secondary storage such as flash. This paper
introduces Siberia, a framework for managing cold data
in the Microsoft Hekaton main-memory database engine.
We discuss how to migrate cold data to secondary
storage while providing an interface to the user to
manipulate both hot and cold data that hides the actual
data location. We describe how queries of different
isolation levels can read and modify data stored in
both hot and cold stores without restriction while
minimizing number of accesses to cold storage. We also
show how records can be migrated between hot and cold
stores while the DBMS is online and active. Experiments
reveal that for cold data access rates appropriate for
main-memory optimized databases, we incur an acceptable
7--14\% throughput loss.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Duggan:2014:CPD,
author = "Jennie Duggan",
title = "The case for personal data-driven decision making",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "943--946",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-driven decision making (D3M) has shown great
promise in professional pursuits such as business and
government. Here, policymakers collect and analyze data
to make their operations more efficient and equitable.
Progress in bringing the benefits of D3M to everyday
life has been slow. For example, a student asks, ``If I
pursue an undergraduate degree at this university, what
are my expected lifetime earnings?''. Presently there
is no principled way to search for this, because an
accurate answer depends on the student and school. Such
queries are personalized, winnowing down large datasets
for specific circumstances, rather than applying
well-defined predicates. They predict decision outcomes
by extrapolating from relevant examples. This vision
paper introduces a new approach to D3M that is designed
to empower the individual to make informed choices.
Here, we highlight research opportunities for the data
management community arising from this proposal.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chairunnanda:2014:CMM,
author = "Prima Chairunnanda and Khuzaima Daudjee and M. Tamer
{\"O}zsu",
title = "{ConfluxDB}: multi-master replication for partitioned
snapshot isolation databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "947--958",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Lazy replication with snapshot isolation (SI) has
emerged as a popular choice for distributed databases.
However, lazy replication often requires execution of
update transactions at one (master) site so that it is
relatively easy for a total SI order to be determined
for consistent installation of updates in the lazily
replicated system. We propose a set of techniques that
support update transaction execution over multiple
partitioned sites, thereby allowing the master to
scale. Our techniques determine a total SI order for
update transactions over multiple master sites without
requiring global coordination in the distributed
system, and ensure that updates are installed in this
order at all sites to provide consistent and scalable
replication with SI. We present ConfluxDB, a
PostgreSQL-based implementation of our techniques, and
demonstrate its effectiveness through experimental
evaluation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goncalves:2014:DMS,
author = "Bernardo Gon{\c{c}}alves and Fabio Porto",
title = "{$ \gamma $-DB}: managing scientific hypotheses as
uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "959--962",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In view of the paradigm shift that makes science ever
more data-driven, we consider deterministic scientific
hypotheses as uncertain data. This vision comprises a
probabilistic database (p-DB) design methodology for
the systematic construction and management of
U-relational hypothesis DBs, viz., $ \gamma $-DBs. It
introduces hypothesis management as a promising new
class of applications for p-DBs. We illustrate the
potential of $ \gamma $-DB as a tool for deep
predictive analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Woods:2014:IIS,
author = "Louis Woods and Zsolt Istv{\'a}n and Gustavo Alonso",
title = "{Ibex}: an intelligent storage engine with support for
advanced {SQL} offloading",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "963--974",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern data appliances face severe bandwidth
bottlenecks when moving vast amounts of data from
storage to the query processing nodes. A possible
solution to mitigate these bottlenecks is query
off-loading to an intelligent storage engine, where
partial or whole queries are pushed down to the storage
engine. In this paper, we present Ibex, a prototype of
an intelligent storage engine that supports off-loading
of complex query operators. Besides increasing
performance, Ibex also reduces energy consumption, as
it uses an FPGA rather than conventional CPUs to
implement the off-load engine. Ibex is a hybrid engine,
with dedicated hardware that evaluates SQL expressions
at line-rate and a software fallback for tasks that the
hardware engine cannot handle. Ibex supports GROUP BY
aggregation, as well as projection --- and selection
--- based filtering. GROUP BY aggregation has a higher
impact on performance but is also a more challenging
operator to implement on an FPGA.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yun:2014:NNL,
author = "Hyokun Yun and Hsiang-Fu Yu and Cho-Jui Hsieh and S.
V. N. Vishwanathan and Inderjit Dhillon",
title = "{NOMAD}: non-locking, stochastic multi-machine
algorithm for asynchronous and decentralized matrix
completion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "975--986",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We develop an efficient parallel distributed algorithm
for matrix completion, named NOMAD (Non-locking,
stOchastic Multi-machine algorithm for Asynchronous and
Decentralized matrix completion). NOMAD is a
decentralized algorithm with non-blocking communication
between processors. One of the key features of NOMAD is
that the ownership of a variable is asynchronously
transferred between processors in a decentralized
fashion. As a consequence it is a lock-free parallel
algorithm. In spite of being asynchronous, the variable
updates of NOMAD are serializable, that is, there is an
equivalent update ordering in a serial implementation.
NOMAD outperforms synchronous algorithms which require
explicit bulk synchronization after every iteration:
our extensive empirical evaluation shows that not only
does our algorithm perform well in distributed setting
on commodity hardware, but also outperforms
state-of-the-art algorithms on a HPC cluster both in
multi-core and distributed memory settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2014:RVL,
author = "Shaoxu Song and Hong Cheng and Jeffrey Xu Yu and Lei
Chen",
title = "Repairing vertex labels under neighborhood
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "987--998",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A broad class of data, ranging from similarity
networks, workflow networks to protein networks, can be
modeled as graphs with data values as vertex labels.
The vertex labels (data values) are often dirty for
various reasons such as typos or erroneous reporting of
results in scientific experiments. Neighborhood
constraints, specifying label pairs that are allowed to
appear on adjacent vertexes in the graph, are employed
to detect and repair erroneous vertex labels. In this
paper, we study the problem of repairing vertex labels
to make graphs satisfy neighborhood constraints.
Unfortunately, the relabeling problem is proved to be
NP hard, which motivates us to devise approximation
methods for repairing, and identify interesting special
cases (star and clique constraints) that can be
efficiently solved. We propose several approximate
repairing algorithms including greedy heuristics,
contraction method and a hybrid approach. The
performances of algorithms are also analyzed for the
special case. Our extensive experimental evaluation, on
both synthetic and real data, demonstrates the
effectiveness of eliminating frauds in several types of
application networks. Remarkably, the hybrid method
performs well in practice, i.e., guarantees
termination, while achieving high effectiveness at the
same time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Altowim:2014:PAR,
author = "Yasser Altowim and Dmitri V. Kalashnikov and Sharad
Mehrotra",
title = "Progressive approach to relational entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "999--1010",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper proposes a progressive approach to entity
resolution (ER) that allows users to explore a
trade-off between the resolution cost and the achieved
quality of the resolved data. In particular, our
approach aims to produce the highest quality result
given a constraint on the resolution budget, specified
by the user. Our proposed method monitors and
dynamically reassesses the resolution progress to
determine which parts of the data should be resolved
next and how they should be resolved. The comprehensive
empirical evaluation of the proposed approach
demonstrates its significant advantage in terms of
efficiency over the traditional ER techniques for the
given problem settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2014:CAQ,
author = "Kaibo Wang and Kai Zhang and Yuan Yuan and Siyuan Ma
and Rubao Lee and Xiaoning Ding and Xiaodong Zhang",
title = "Concurrent analytical query processing with {GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "11",
pages = "1011--1022",
month = jul,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In current databases, GPUs are used as dedicated
accelerators to process each individual query. Sharing
GPUs among concurrent queries is not supported, causing
serious resource underutilization. Based on the
profiling of an open-source GPU query engine running
commonly used single-query data warehousing workloads,
we observe that the utilization of main GPU resources
is only up to 25\%. The underutilization leads to low
system throughput. To address the problem, this paper
proposes concurrent query execution as an effective
solution. To efficiently share GPUs among concurrent
queries for high throughput, the major challenge is to
provide software support to control and resolve
resource contention incurred by the sharing. Our
solution relies on GPU query scheduling and device
memory swapping policies to address this challenge. We
have implemented a prototype system and evaluated it
intensively. The experiment results confirm the
effectiveness and performance advantage of our
approach. By executing multiple GPU queries
concurrently, system throughput can be improved by up
to 55\% compared with dedicated processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Maehara:2014:CPP,
author = "Takanori Maehara and Takuya Akiba and Yoichi Iwata and
Ken-ichi Kawarabayashi",
title = "Computing personalized {PageRank} quickly by
exploiting graph structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1023--1034",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a new scalable algorithm that can compute
Personalized PageRank (PPR) very quickly. The Power
method is a state-of-the-art algorithm for computing
exact PPR; however, it requires many iterations. Thus
reducing the number of iterations is the main
challenge. We achieve this by exploiting graph
structures of web graphs and social networks. The
convergence of our algorithm is very fast. In fact, it
requires up to 7.5 times fewer iterations than the
Power method and is up to five times faster in actual
computation time. To the best of our knowledge, this is
the first time to use graph structures explicitly to
solve PPR quickly. Our contributions can be summarized
as follows. 1. We provide an algorithm for computing a
tree decomposition, which is more efficient and
scalable than any previous algorithm. 2. Using the
above algorithm, we can obtain a core-tree
decomposition of any web graph and social network. This
allows us to decompose a web graph and a social network
into (1) the core, which behaves like an expander
graph, and (2) a small tree-width graph, which behaves
like a tree in an algorithmic sense. 3. We apply a
direct method to the small tree-width graph to
construct an LU decomposition. 4. Building on the LU
decomposition and using it as pre-conditioner, we apply
GMRES method (a state-of-the-art advanced iterative
method) to compute PPR for whole web graphs and social
networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Serafini:2014:AES,
author = "Marco Serafini and Essam Mansour and Ashraf Aboulnaga
and Kenneth Salem and Taha Rafiq and Umar Farooq
Minhas",
title = "{Accordion}: elastic scalability for database systems
supporting distributed transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1035--1046",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Providing the ability to elastically use more or fewer
servers on demand (scale out and scale in) as the load
varies is essential for database management systems
(DBMSes) deployed on today's distributed computing
platforms, such as the cloud. This requires solving the
problem of dynamic (online) data placement, which has
so far been addressed only for workloads where all
transactions are local to one server. In DBMSes where
ACID transactions can access more than one partition,
distributed transactions represent a major performance
bottleneck. Scaling out and spreading data across a
larger number of servers does not necessarily result in
a linear increase in the overall system throughput,
because transactions that used to access only one
server may become distributed. In this paper we present
Accordion, a dynamic data placement system for
partition-based DBMSes that support ACID transactions
(local or distributed). It does so by explicitly
considering the affinity between partitions, which
indicates the frequency in which they are accessed
together by the same transactions. Accordion estimates
the capacity of a server by explicitly considering the
impact of distributed transactions and affinity on the
maximum throughput of the server. It then integrates
this estimation in a mixed-integer linear program to
explore the space of possible configurations and decide
whether to scale out. We implemented Accordion and
evaluated it using H-Store, a shared-nothing in-memory
DBMS. Our results using the TPC-C and YCSB benchmarks
show that Accordion achieves benefits compared to
alternative heuristics of up to an order of magnitude
reduction in the number of servers used and in the
amount of data migrated.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2014:ECP,
author = "Minyang Han and Khuzaima Daudjee and Khaled Ammar and
M. Tamer {\"O}zsu and Xingfang Wang and Tianqi Jin",
title = "An experimental comparison of {Pregel}-like graph
processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1047--1058",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The introduction of Google's Pregel generated much
interest in the field of large-scale graph data
processing, inspiring the development of Pregel-like
systems such as Apache Giraph, GPS, Mizan, and
GraphLab, all of which have appeared in the past two
years. To gain an understanding of how Pregel-like
systems perform, we conduct a study to experimentally
compare Giraph, GPS, Mizan, and GraphLab on equal
ground by considering graph and algorithm agnostic
optimizations and by using several metrics. The systems
are compared with four different algorithms (PageRank,
single source shortest path, weakly connected
components, and distributed minimum spanning tree) on
up to 128 Amazon EC2 machines. We find that the system
optimizations present in Giraph and GraphLab allow them
to perform well. Our evaluation also shows Giraph
1.0.0's considerable improvement since Giraph 0.1 and
identifies areas of improvement for all systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sarma:2014:CSJ,
author = "Akash {Das Sarma} and Yeye He and Surajit Chaudhuri",
title = "{ClusterJoin}: a similarity joins framework using
map-reduce",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1059--1070",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Similarity join is the problem of finding pairs of
records with similarity score greater than some
threshold. In this paper we study the problem of
scaling up similarity join for different metric
distance functions using MapReduce. We propose a
ClusterJoin framework that partitions the data space
based on the underlying data distribution, and
distributes each record to partitions in which they may
produce join results based on the distance threshold.
We design a set of strong candidate filters specific to
different distance functions using a novel
bisector-based framework, so that each record only
needs to be distributed to a small number of partitions
while still guaranteeing correctness. To address data
skewness, which is common for high dimensional data, we
further develop a dynamic load balancing scheme using
sampling, which provides strong probabilistic
guarantees on the size of partitions, and greatly
improves scalability. Experimental evaluation using
real data sets shows that our approach is considerably
more scalable compared to state-of-the-art algorithms,
especially for high dimensional data with low distance
thresholds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vesdapunt:2014:CAE,
author = "Norases Vesdapunt and Kedar Bellare and Nilesh Dalvi",
title = "Crowdsourcing algorithms for entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1071--1082",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we study a hybrid human-machine
approach for solving the problem of Entity Resolution
(ER). The goal of ER is to identify all records in a
database that refer to the same underlying entity, and
are therefore duplicates of each other. Our input is a
graph over all the records in a database, where each
edge has a probability denoting our prior belief (based
on Machine Learning models) that the pair of records
represented by the given edge are duplicates. Our
objective is to resolve all the duplicates by asking
humans to verify the equality of a subset of edges,
leveraging the transitivity of the equality relation to
infer the remaining edges (e.g. $ a = c $ can be
inferred given $ a = b $ and $ b = c$). We consider the
problem of designing optimal strategies for asking
questions to humans that minimize the expected number
of questions asked. Using our theoretical framework, we
analyze several strategies, and show that a strategy,
claimed as ``optimal'' for this problem in a recent
work, can perform arbitrarily bad in theory. We propose
alternate strategies with theoretical guarantees. Using
both public datasets as well as the production system
at Facebook, we show that our techniques are effective
in practice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2014:DGS,
author = "Wenfei Fan and Xin Wang and Yinghui Wu and Dong Deng",
title = "Distributed graph simulation: impossibility and
possibility",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1083--1094",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper studies fundamental problems for
distributed graph simulation. Given a pattern query Q
and a graph G that is fragmented and distributed, a
graph simulation algorithm A is to compute the matches
Q (G) of Q in G. We say that A is parallel scalable in
(a) response time if its parallel computational cost is
determined by the largest fragment F$_m$ of G and the
size | Q | of query Q, and (b) data shipment if its
total amount of data shipped is determined by | Q | and
the number of fragments of G, independent of the size
of graph G. (1) We prove an impossibility theorem:
there exists no distributed graph simulation algorithm
that is parallel scalable in either response time or
data shipment. (2) However, we show that distributed
graph simulation is partition bounded, i.e., its
response time depends only on | Q |, | F$_m$ | and the
number | V$_f$ | of nodes in G with edges across
different fragments; and its data shipment depends on |
Q | and the number | E$_f$ | of crossing edges only. We
provide the first algorithms with these performance
guarantees. (3) We also identify special cases of
patterns and graphs when parallel scalability is
possible. (4) We experimentally verify the scalability
and efficiency of our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nagel:2014:CGE,
author = "Fabian Nagel and Gavin Bierman and Stratis D. Viglas",
title = "Code generation for efficient query processing in
managed runtimes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1095--1106",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we examine opportunities arising from
the convergence of two trends in data management:
in-memory database systems (imdbs), which have received
renewed attention following the availability of
affordable, very large main memory systems; and
language-integrated query, which transparently
integrates database queries with programming languages
(thus addressing the famous ``impedance mismatch''
problem). Language-integrated query not only gives
application developers a more convenient way to query
external data sources like imdbs, but also to use the
same querying language to query an application's
in-memory collections. The latter offers further
transparency to developers as the query language and
all data is represented in the data model of the host
programming language. However, compared to imdbs, this
additional freedom comes at a higher cost for query
evaluation. Our vision is to improve in-memory query
processing of application objects by introducing
database technologies to managed runtimes. We focus on
querying and we leverage query compilation to improve
query processing on application objects. We explore
different query compilation strategies and study how
they improve the performance of query processing over
application data. We take C\# as the host programming
language as it supports language-integrated query
through the linq framework. Our techniques deliver
significant performance improvements over the default
linq implementation. Our work makes important first
steps towards a future where data processing
applications will commonly run on machines that can
store their entire datasets in-memory, and will be
written in a single programming language employing
language-integrated query and imdb-inspired runtimes to
provide transparent and highly efficient querying.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2014:AED,
author = "Weimo Liu and Saravanan Thirumuruganathan and Nan
Zhang and Gautam Das",
title = "Aggregate estimation over dynamic hidden web
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1107--1118",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many databases on the web are ``hidden'' behind (i.e.,
accessible only through) their restrictive, form-like,
search interfaces. Recent studies have shown that it is
possible to estimate aggregate query answers over such
hidden web databases by issuing a small number of
carefully designed search queries through the
restrictive web interface. A problem with these
existing work, however, is that they all assume the
underlying database to be static, while most real-world
web databases (e.g., Amazon, eBay) are frequently
updated. In this paper, we study the novel problem of
estimating/tracking aggregates over dynamic hidden web
databases while adhering to the stringent query-cost
limitation they enforce (e.g., at most 1,000 search
queries per day). Theoretical analysis and extensive
real-world experiments demonstrate the effectiveness of
our proposed algorithms and their superiority over
baseline solutions (e.g., the repeated execution of
algorithms designed for static web databases).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karpathiotakis:2014:AQP,
author = "Manos Karpathiotakis and Miguel Branco and Ioannis
Alagiannis and Anastasia Ailamaki",
title = "Adaptive query processing on {RAW} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1119--1130",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database systems deliver impressive performance for
large classes of workloads as the result of decades of
research into optimizing database engines. High
performance, however, is achieved at the cost of
versatility. In particular, database systems only
operate efficiently over loaded data, i.e., data
converted from its original raw format into the
system's internal data format. At the same time, data
volume continues to increase exponentially and data
varies increasingly, with an escalating number of new
formats. The consequence is a growing impedance
mismatch between the original structures holding the
data in the raw files and the structures used by query
engines for efficient processing. In an ideal scenario,
the query engine would seamlessly adapt itself to the
data and ensure efficient query processing regardless
of the input data formats, optimizing itself to each
instance of a file and of a query by leveraging
information available at query time. Today's systems,
however, force data to adapt to the query engine during
data loading. This paper proposes adapting the query
engine to the formats of raw data. It presents RAW, a
prototype query engine which enables querying
heterogeneous data sources transparently. RAW employs
Just-In-Time access paths, which efficiently couple
heterogeneous raw files to the query engine and reduce
the overheads of traditional general-purpose scan
operators. There are, however, inherent overheads with
accessing raw data directly that cannot be eliminated,
such as converting the raw values. Therefore, RAW also
uses column shreds, ensuring that we pay these costs
only for the subsets of raw data strictly needed by a
query. We use RAW in a real-world scenario and achieve
a two-order of magnitude speedup against the existing
hand-written solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Afrati:2014:SQT,
author = "Foto N. Afrati and Dan Delorey and Mosha Pasumansky
and Jeffrey D. Ullman",
title = "Storing and querying tree-structured records in
{Dremel}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1131--1142",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In Dremel, data is stored as nested relations. The
schema for a relation is a tree, all of whose nodes are
attributes, and whose leaf attributes hold values. We
explore filter and aggregate queries that are given in
the Dremel dialect of SQL. Complications arise because
of repeated attributes, i.e., attributes that are
allowed to have more than one value. We focus on the
common class of Dremel queries that are processed on
column-stored data in a way that results in query
processing time that is linear on the size of the
relevant data, i.e., data in the columns that
participate in the query. We formally define the data
model, the query language and the algorithms for query
processing in column-stored data. The concepts of
repetition context and semi-flattening are introduced
here and play a central role in understanding this
class of queries and their algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Starlinger:2014:SSS,
author = "Johannes Starlinger and Bryan Brancotte and Sarah
Cohen-Boulakia and Ulf Leser",
title = "Similarity search for scientific workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1143--1154",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing popularity of scientific
workflows, public repositories are gaining importance
as a means to share, find, and reuse such workflows. As
the sizes of these repositories grow, methods to
compare the scientific workflows stored in them become
a necessity, for instance, to allow duplicate detection
or similarity search. Scientific workflows are complex
objects, and their comparison entails a number of
distinct steps from comparing atomic elements to
comparison of the workflows as a whole. Various studies
have implemented methods for scientific workflow
comparison and came up with often contradicting
conclusions upon which algorithms work best. Comparing
these results is cumbersome, as the original studies
mixed different approaches for different steps and used
different evaluation data and metrics. We contribute to
the field (i) by dissecting each previous approach into
an explicitly defined and comparable set of subtasks,
(ii) by comparing in isolation different approaches
taken at each step of scientific workflow comparison,
reporting on a number of unexpected findings, (iii) by
investigating how these can best be combined into
aggregated measures, and (iv) by making available a
gold standard of over 2000 similarity ratings
contributed by 15 workflow experts on a corpus of
almost 1500 workflows and re-implementations of all
methods we evaluated.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kellaris:2014:DPE,
author = "Georgios Kellaris and Stavros Papadopoulos and Xiaokui
Xiao and Dimitris Papadias",
title = "Differentially private event sequences over infinite
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1155--1166",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Numerous applications require continuous publication
of statistics for monitoring purposes, such as real-time
traffic analysis, timely disease outbreak discovery,
and social trends observation. These statistics may be
derived from sensitive user data and, hence,
necessitate privacy preservation. A notable paradigm
for offering strong privacy guarantees in statistics
publishing is $ \epsilon $-differential privacy. However,
there is limited literature that adapts this concept to
settings where the statistics are computed over an
infinite stream of ``events'' (i.e., data items
generated by the users), and published periodically.
These works aim at hiding a single event over the
entire stream. We argue that, in most practical
scenarios, sensitive information is revealed from
multiple events occurring at contiguous time instances.
Towards this end, we put forth the novel notion of
$w$-event privacy over infinite streams, which protects
any event sequence occurring in $w$ successive time
instants. We first formulate our privacy concept,
motivate its importance, and introduce a methodology
for achieving it. We next design two instantiations,
whose utility is independent of the stream length.
Finally, we confirm the practicality of our solutions
experimenting with real data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Londhe:2014:MTC,
author = "Nikhil Londhe and Vishrawas Gopalakrishnan and Aidong
Zhang and Hung Q. Ngo and Rohini Srihari",
title = "Matching titles with cross title web-search enrichment
and community detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1167--1178",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Title matching refers roughly to the following
problem. We are given two strings of text obtained from
different data sources. The texts refer to some
underlying physical entities and the problem is to
report whether the two strings refer to the same
physical entity or not. There are manifestations of
this problem in a variety of domains, such as product
or bibliography matching, and location or person
disambiguation. We propose a new approach to solving
this problem, consisting of two main components. The
first component uses Web searches to ``enrich'' the
given pair of titles: making titles that refer to the
same physical entity more similar, and those which do
not, much less similar. A notion of similarity is then
measured using the second component, where the tokens
from the two titles are modelled as vertices of a
``social'' network graph. A ``strength of ties'' style
of clustering algorithm is then applied on this to see
whether they form one cohesive ``community'' (matching
titles), or separately clustered communities
(mismatching titles). Experimental results confirm the
effectiveness of our approach over existing title
matching methods across several input domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2014:CSR,
author = "Shaoxu Song and Lei Chen and Hong Cheng",
title = "On concise set of relative candidate keys",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1179--1190",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Matching keys, specifying what attributes to compare
and how to compare them for identifying the same
real-world entities, are found to be useful in
applications like record matching, blocking and
windowing [7]. Owing to the complex redundant semantics
among matching keys, capturing a proper set of matching
keys is highly non-trivial. Analogous to
minimal/candidate keys w.r.t. functional dependencies,
relative candidate keys (RCKs [7], with a minimal
number of compared attributes, see a more formal
definition in Section 2) can clear up redundant
semantics w.r.t. ``what attributes to compare''.
However, we note that redundancy issues may still exist
among rcks on the same attributes about ``how to
compare them''. In this paper, we propose to find a
concise set of matching keys, which has less redundancy
and can still meet the requirements on coverage and
validity. Specifically, we study approximation
algorithms to efficiently discover a near optimal set.
To ensure the quality of matching keys, the returned
results are guaranteed to be RCKs (minimal on compared
attributes), and most importantly, minimal w.r.t.
distance restrictions (i.e., redundancy free w.r.t.
``how to compare the attributes''). The experimental
evaluation demonstrates that our concise RCK set is
more effective than the existing rck choosing method.
Moreover, the proposed pruning methods show up to 2
orders of magnitude improvement w.r.t. time costs on
concise RCK set discovery.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wei:2014:RQI,
author = "Hao Wei and Jeffrey Xu Yu and Can Lu and Ruoming Jin",
title = "Reachability querying: an independent permutation
labeling approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1191--1202",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Reachability query is a fundamental graph operation
which answers whether a vertex can reach another vertex
over a large directed graph G with $n$ vertices and $m$
edges, and has been extensively studied. In the
literature, all the approaches compute a label for
every vertex in a graph G by index construction
offline. The query time for answering reachability
queries online is affected by the quality of the labels
computed in index construction. The three main costs
are the index construction time, the index size, and
the query time. Some of the up-to-date approaches can
answer reachability queries efficiently, but spend
non-linear time to construct an index. Some of the
up-to-date approaches construct an index in linear time
and space, but may need to depth-first search G at
run-time in $ O(n + m)$. In this paper, as the first,
we propose a new randomized labeling approach to answer
reachability queries, and the randomness is by
independent permutation. We conduct extensive
experimental studies to compare with the up-to-date
approaches using 19 large real datasets used in the
existing work and synthetic datasets. We confirm the
efficiency of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jiang:2014:HDL,
author = "Minhao Jiang and Ada Wai-Chee Fu and Raymond Chi-Wing
Wong and Yanyan Xu",
title = "Hop doubling label indexing for point-to-point
distance querying on scale-free networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1203--1214",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of point-to-point distance
querying for massive scale-free graphs, which is
important for numerous applications. Given a directed
or undirected graph, we propose to build an index for
answering such queries based on a novel hop-doubling
labeling technique. We derive bounds on the index size,
the computation costs and I/O costs based on the
properties of unweighted scale-free graphs. We show
that our method is much more efficient and effective
compared to the state-of-the-art techniques, in terms
of both querying time and indexing costs. Our empirical
study shows that our method can handle graphs that are
orders of magnitude larger than existing methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Suchanek:2014:SC,
author = "Fabian M. Suchanek and Nicoleta Preda",
title = "Semantic culturomics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1215--1218",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Newspapers are testimonials of history. The same is
increasingly true of social media such as online
forums, online communities, and blogs. By looking at
the sequence of articles over time, one can discover
the birth and the development of trends that marked
society and history --- a field known as
``Culturomics''. But Culturomics has so far been
limited to statistics on keywords. In this vision
paper, we argue that the advent of large knowledge
bases (such as YAGO [37], NELL [5], DBpedia [3], and
Freebase) will revolutionize the field. If their
knowledge is combined with the news articles, it can
breathe life into what is otherwise just a sequence of
words for a machine. This will allow discovering trends
in history and culture, explaining them through
explicit logical rules, and making predictions about
the events of the future. We predict that this could
open up a new field of research, ``Semantic
Culturomics'', in which no longer human text helps
machines build up knowledge bases, but knowledge bases
help humans understand their society.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kuhlenkamp:2014:BSE,
author = "J{\"o}rn Kuhlenkamp and Markus Klems and Oliver
R{\"o}ss",
title = "Benchmarking scalability and elasticity of distributed
database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1219--1230",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed database system performance benchmarks are
an important source of information for decision makers
who must select the right technology for their data
management problems. Since important decisions rely on
trustworthy experimental data, it is necessary to
reproduce experiments and verify the results. We
reproduce performance and scalability benchmarking
experiments of HBase and Cassandra that have been
conducted by previous research and compare the results.
The scope of our reproduced experiments is extended
with a performance evaluation of Cassandra on different
Amazon EC2 infrastructure configurations, and an
evaluation of Cassandra and HBase elasticity by
measuring scaling speed and performance impact while
scaling.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2014:BCQ,
author = "Yang Cao and Wenfei Fan and Tianyu Wo and Wenyuan Yu",
title = "Bounded conjunctive queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1231--1242",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A query Q is said to be effectively bounded if for all
datasets D, there exists a subset D$_Q$ of D such that
Q (D) = Q (D$_Q$), and the size of D$_Q$ and time for
fetching D$_Q$ are independent of the size of D. The
need for studying such queries is evident, since it
allows us to compute Q (D) by accessing a bounded
dataset D$_Q$, regardless of how big D is. This paper
investigates effectively bounded conjunctive queries
(SPC) under an access schema A, which specifies indices
and cardinality constraints commonly used. We provide
characterizations (sufficient and necessary conditions)
for determining whether an SPC query Q is effectively
bounded under A. We study several problems for deciding
whether Q is bounded, and if not, for identifying a
minimum set of parameters of Q to instantiate and make
Q bounded. We show that these problems range from
quadratic-time to NP-complete, and develop efficient
(heuristic) algorithms for them. We also provide an
algorithm that, given an effectively bounded SPC query
Q and an access schema A, generates a query plan for
evaluating Q by accessing a bounded amount of data in
any (possibly big) dataset. We experimentally verify
that our algorithms substantially reduce the cost of
query evaluation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shanbhag:2014:OJE,
author = "Anil Shanbhag and S. Sudarshan",
title = "Optimizing join enumeration in transformation-based
query optimizers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1243--1254",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimizers built on the Volcano/Cascades
framework, which is based on transformation rules, are
used in many commercial databases. Transformation
rulesets proposed earlier for join order enumeration in
such a framework either allow enumeration of joins with
cross-products (which can significantly increase the
cost of optimization), or generate a large number of
duplicate derivations. In this paper we propose two new
rulesets for generating cross-product free trees. One
of the rulesets is a minor extension of a simple but
inefficient ruleset, which we prove is complete (we
also show that a naive extension of an efficient
ruleset leads to incompleteness). We then propose an
efficient new ruleset, which is based on techniques
proposed recently for top-down join order enumeration,
but unlike earlier work it is cleanly integrated into
the Volcano/Cascades framework, and can be used in
conjunction with other transformation rules. We show
that our ruleset is complete (i.e., it generates the
entire search space without cross products) while
avoiding inefficiency due to duplicate derivations. We
have implemented this ruleset in the PyroJ Optimizer
(an implementation of the Volcano optimizer framework)
and show that it significantly outperforms the
alternatives, in some cases by up to two orders of
magnitude, in terms of time taken.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jacob:2014:SMA,
author = "Marie Jacob and Benny Kimelfeld and Julia
Stoyanovich",
title = "A system for management and analysis of preference
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1255--1258",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Preference data arises in a wide variety of domains.
Over the past decade, we have seen a sharp increase in
the volume of preference data, in the diversity of
applications that use it, and in the richness of
preference data analysis methods. Examples of
applications include rank aggregation in genomic data
analysis, management of votes in elections, and
recommendation systems in e-commerce. However, little
attention has been paid to the challenges of building a
system for preference-data management, which would help
incorporate sophisticated analytics into larger
applications, support computational abstractions for
usability by data scientists, and enable scaling up to
modern volumes. This vision paper proposes a management
system for preference data that aims to address these
challenges. We adopt the relational database model, and
propose extensions that are specialized to handling
preference data. Specifically, we introduce a special
type of a relation that is designed for preference
data, and describe composable operators on preference
relations that can be embedded in SQL statements, for
convenient reuse across applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2014:MGR,
author = "Ashish Gupta and Fan Yang and Jason Govig and Adam
Kirsch and Kelvin Chan and Kevin Lai and Shuo Wu and
Sandeep Govind Dhoot and Abhilash Rajesh Kumar and
Ankur Agiwal and Sanjay Bhansali and Mingsheng Hong and
Jamie Cameron and Masood Siddiqi and David Jones and
Jeff Shute and Andrey Gubarev and Shivakumar
Venkataraman and Divyakant Agrawal",
title = "{Mesa}: geo-replicated, near real-time, scalable data
warehousing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1259--1270",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Mesa is a highly scalable analytic data warehousing
system that stores critical measurement data related to
Google's Internet advertising business. Mesa is
designed to satisfy a complex and challenging set of
user and systems requirements, including near real-time
data ingestion and queryability, as well as high
availability, reliability, fault tolerance, and
scalability for large data and query volumes.
Specifically, Mesa handles petabytes of data, processes
millions of row updates per second, and serves billions
of queries that fetch trillions of rows per day. Mesa
is geo-replicated across multiple datacenters and
provides consistent and repeatable query answers at low
latency, even when an entire datacenter fails. This
paper presents the Mesa system and reports the
performance and scale that it achieves.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liagouris:2014:EES,
author = "John Liagouris and Nikos Mamoulis and Panagiotis
Bouros and Manolis Terrovitis",
title = "An effective encoding scheme for spatial {RDF} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1271--1282",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The RDF data model has recently been extended to
support representation and querying of spatial
information (i.e., locations and geometries), which is
associated with RDF entities. Still, there are limited
efforts towards extending RDF stores to efficiently
support spatial queries, such as range selections
(e.g., find entities within a given range) and spatial
joins (e.g., find pairs of entities whose locations are
close to each other). In this paper, we propose an
extension for RDF stores that supports efficient
spatial data management. Our contributions include an
effective encoding scheme for entities having spatial
locations, the introduction of on-the-fly spatial
filters and spatial join algorithms, and several
optimizations that minimize the overhead of geometry
and dictionary accesses. We implemented the proposed
                 techniques as an extension to the open-source RDF-3X
engine and we experimentally evaluated them using real
RDF knowledge bases. The results show that our system
offers robust performance for spatial queries, while
introducing little overhead to the original query
engine.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:DSM,
author = "Ce Zhang and Christopher R{\'e}",
title = "{DimmWitted}: a study of main-memory statistical
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1283--1294",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We perform the first study of the tradeoff space of
access methods and replication to support statistical
analytics using first-order methods executed in the
main memory of a Non-Uniform Memory Access (NUMA)
machine. Statistical analytics systems differ from
conventional SQL-analytics in the amount and types of
memory incoherence that they can tolerate. Our goal is
to understand tradeoffs in accessing the data in row-
or column-order and at what granularity one should
share the model and data for a statistical task. We
study this new tradeoff space and discover that there
are tradeoffs between hardware and statistical
efficiency. We argue that our tradeoff study may
provide valuable information for designers of analytics
engines: for each system we consider, our prototype
engine can run at least one popular task at least 100$
\times $ faster. We conduct our study across five
architectures using popular models, including SVMs,
logistic regression, Gibbs sampling, and neural
networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Floratou:2014:SHF,
author = "Avrilia Floratou and Umar Farooq Minhas and Fatma
{\"O}zcan",
title = "{SQL-on-Hadoop}: full circle back to shared-nothing
database architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1295--1306",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SQL query processing for analytics over Hadoop data
has recently gained significant traction. Among many
systems providing some SQL support over Hadoop, Hive is
the first native Hadoop system that uses an underlying
framework such as MapReduce or Tez to process SQL-like
statements. Impala, on the other hand, represents the
new emerging class of SQL-on-Hadoop systems that
exploit a shared-nothing parallel database architecture
over Hadoop. Both systems optimize their data ingestion
via columnar storage, and promote different file
formats: ORC and Parquet. In this paper, we compare the
performance of these two systems by conducting a set of
cluster experiments using a TPC-H like benchmark and
two TPC-DS inspired workloads. We also closely study
the I/O efficiency of their columnar formats using a
set of micro-benchmarks. Our results show that Impala
is 3.3 X to 4.4 X faster than Hive on MapReduce and 2.1
                 X to 2.8 X faster than Hive on Tez for the overall TPC-H
experiments. Impala is also 8.2 X to 10 X faster than
Hive on MapReduce and about 4.3 X faster than Hive on
Tez for the TPC-DS inspired experiments. Through
detailed analysis of experimental results, we identify
the reasons for this performance gap and examine the
strengths and limitations of each system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guarnieri:2014:OSA,
author = "Marco Guarnieri and David Basin",
title = "Optimal security-aware query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "12",
pages = "1307--1318",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:26 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Security-Aware Query Processing is the problem of
computing answers to queries in the presence of access
control policies. We present general impossibility
results for the existence of optimal algorithms for
Security-Aware Query Processing and classify query
languages for which such algorithms exist. In
particular, we show that for the relational calculus
there are no optimal algorithms, whereas optimal
algorithms exist for some of its fragments, such as the
existential fragment. We also establish relationships
between two different models of Fine-Grained Access
Control, called Truman and Non-Truman models, which
have been previously presented in the literature as
distinct. For optimal Security-Aware Query Processing,
we show that the Non-Truman model is a special case of
the Truman model for boolean queries in the relational
calculus, moreover the two models coincide for more
powerful languages, such as the relational calculus
with aggregation operators. In contrast, these two
models are distinct for non-boolean queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shi:2014:MTE,
author = "Juwei Shi and Jia Zou and Jiaheng Lu and Zhao Cao and
Shiqiang Li and Chen Wang",
title = "{MRTuner}: a toolkit to enable holistic optimization
for {MapReduce} jobs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1319--1330",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce based data-intensive computing solutions are
increasingly deployed as production systems. Unlike
Internet companies who invent and adopt the technology
from the very beginning, traditional enterprises demand
easy-to-use software due to the limited capabilities of
administrators. Automatic job optimization software for
MapReduce is a promising technique to satisfy such
requirements. In this paper, we introduce a toolkit
from IBM, called MRTuner, to enable holistic
optimization for MapReduce jobs. In particular, we
propose a novel Producer-Transporter-Consumer (PTC)
model, which characterizes the tradeoffs in the
parallel execution among tasks. We also carefully
investigate the complicated relations among about
twenty parameters, which have significant impact on the
job performance. We design an efficient search
algorithm to find the optimal execution plan. Finally,
we conduct a thorough experimental evaluation on two
different types of clusters using the HiBench suite
which covers various Hadoop workloads from GB to TB
size levels. The results show that the search latency
of MRTuner is a few orders of magnitude faster than
that of the state-of-the-art cost-based optimizer, and
the effectiveness of the optimized execution plan is
also significantly improved.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sadoghi:2014:RDL,
author = "Mohammad Sadoghi and Mustafa Canim and Bishwaranjan
Bhattacharjee and Fabian Nagel and Kenneth A. Ross",
title = "Reducing database locking contention through
multi-version concurrency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1331--1342",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In multi-version databases, updates and deletions of
records by transactions require appending a new record
to tables rather than performing in-place updates. This
mechanism incurs non-negligible performance overhead in
the presence of multiple indexes on a table, where
changes need to be propagated to all indexes.
Additionally, an uncommitted record update will block
other active transactions from using the index to fetch
the most recently committed values for the updated
record. In general, in order to support snapshot
isolation and/or multi-version concurrency, either each
active transaction is forced to search a database
temporary area (e.g., roll-back segments) to fetch old
values of desired records, or each transaction is
forced to scan the entire table to find the older
versions of the record in a multi-version database (in
the absence of specialized temporal indexes). In this
work, we describe a novel kV-Indirection structure to
enable efficient (parallelizable) optimistic and
pessimistic multi-version concurrency control by
utilizing the old versions of records (at most two
versions of each record) to provide direct access to
the recent changes of records without the need of
temporal indexes. As a result, our technique results in
higher degree of concurrency by reducing the clashes
between readers and writers of data and avoiding
extended lock delays. We have a working prototype of
our concurrency model and kV-Indirection structure in a
commercial database and conducted an extensive
evaluation to demonstrate the benefits of our
multi-version concurrency control, and we obtained
orders of magnitude speed up over the single-version
concurrency control.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Su:2014:CEM,
author = "Xueyuan Su and Garret Swart and Brian Goetz and Brian
Oliver and Paul Sandoz",
title = "Changing engines in midstream: a {Java} stream
computational model for big data processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1343--1354",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the addition of lambda expressions and the Stream
API in Java 8, Java has gained a powerful and
expressive query language that operates over in-memory
collections of Java objects, making the transformation
and analysis of data more convenient, scalable and
efficient. In this paper, we build on Java 8 Stream and
add a DistributableStream abstraction that supports
federated query execution over an extensible set of
distributed compute engines. Each query eventually
results in the creation of a materialized result that
is returned either as a local object or as an engine
defined distributed Java Collection that can be saved
and/or used as a source for future queries.
Distinctively, DistributableStream supports the
changing of compute engines both between and within a
query, allowing different parts of a computation to be
executed on different platforms. At execution time, the
query is organized as a sequence of pipelined stages,
each stage potentially running on a different engine.
Each node that is part of a stage executes its portion
of the computation on the data available locally or
produced by the previous stage of the computation. This
approach allows for computations to be assigned to
engines based on pricing, data locality, and resource
availability. Coupled with the inherent laziness of
stream operations, this brings great flexibility to
query planning and separates the semantics of the query
from the details of the engine used to execute it. We
currently support three engines, Local, Apache Hadoop
MapReduce and Oracle Coherence, and we illustrate how
new engines and data sources can be added.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2014:JEP,
author = "Jae-Gil Lee and Gopi Attaluri and Ronald Barber and
Naresh Chainani and Oliver Draese and Frederick Ho and
Stratos Idreos and Min-Soo Kim and Sam Lightstone and
Guy Lohman and Konstantinos Morfonios and Keshava
Murthy and Ippokratis Pandis and Lin Qiao and
Vijayshankar Raman and Vincent Kulandai Samy and
Richard Sidle and Knut Stolze and Liping Zhang",
title = "Joins on encoded and partitioned data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1355--1366",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Compression has historically been used to reduce the
cost of storage, I/Os from that storage, and buffer
pool utilization, at the expense of the CPU required to
decompress data every time it is queried. However,
significant additional CPU efficiencies can be achieved
by deferring decompression as late in query processing
as possible and performing query processing operations
directly on the still-compressed data. In this paper,
we investigate the benefits and challenges of
performing joins on compressed (or encoded) data. We
demonstrate the benefit of independently optimizing the
compression scheme of each join column, even though
join predicates relating values from multiple columns
may require translation of the encoding of one join
column into the encoding of the other. We also show the
benefit of compressing ``payload'' data other than the
join columns ``on the fly,'' to minimize the size of
hash tables used in the join. By partitioning the
domain of each column and defining separate
dictionaries for each partition, we can achieve even
better overall compression as well as increased
flexibility in dealing with new values introduced by
updates. Instead of decompressing both join columns
participating in a join to resolve their different
compression schemes, our system performs a light-weight
mapping of only qualifying rows from one of the join
columns to the encoding space of the other at run time.
Consequently, join predicates can be applied directly
on the compressed data. We call this procedure encoding
translation. Two alternatives of encoding translation
are developed and compared in the paper. We provide a
comprehensive evaluation of these alternatives using
product implementations of each on the TPC-H data set,
and demonstrate that performing joins on encoded and
partitioned data achieves both superior performance and
excellent compression.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Poess:2014:TFI,
author = "Meikel Poess and Tilmann Rabl and Hans-Arno Jacobsen
and Brian Caufield",
title = "{TPC--DI}: the first industry benchmark for data
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1367--1378",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Historically, the process of synchronizing a decision
support system with data from operational systems has
been referred to as Extract, Transform, Load (ETL) and
the tools supporting such process have been referred to
as ETL tools. Recently, ETL was replaced by the more
comprehensive acronym, data integration (DI). DI
describes the process of extracting and combining data
from a variety of data source formats, transforming
that data into a unified data model representation and
loading it into a data store. This is done in the
context of a variety of scenarios, such as data
acquisition for business intelligence, analytics and
data warehousing, but also synchronization of data
between operational applications, data migrations and
conversions, master data management, enterprise data
sharing and delivery of data services in a
service-oriented architecture context, amongst others.
With these scenarios relying on up-to-date information
it is critical to implement a highly performing,
scalable and easy to maintain data integration system.
This is especially important as the complexity, variety
and volume of data is constantly increasing and
performance of data integration systems is becoming
very critical. Despite the significance of having a
highly performing DI system, there has been no industry
standard for measuring and comparing their performance.
The TPC, acknowledging this void, has released TPC-DI,
an innovative benchmark for data integration. This
paper motivates the reasons behind its development,
describes its main characteristics including workload,
run rules, metric, and explains key decisions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2014:RTT,
author = "Pankaj Gupta and Venu Satuluri and Ajeet Grewal and
Siva Gurumurthy and Volodymyr Zhabiuk and Quannan Li
and Jimmy Lin",
title = "Real-time {Twitter} recommendation: online motif
detection in large dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1379--1380",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We describe a production Twitter system for generating
relevant, personalized, and timely recommendations
based on observing the temporally-correlated actions of
each user's followings. The system currently serves
millions of recommendations daily to tens of millions
of mobile users. The approach can be viewed as a
specific instance of the novel problem of online motif
detection in large dynamic graphs. Our current solution
partitions the graph across a number of machines, and
with the construction of appropriate data structures,
motif detection can be translated into the lookup and
intersection of adjacency lists in each partition. We
conclude by discussing a generalization of the problem
that perhaps represents a new class of data management
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cha:2014:IDN,
author = "Sang K. Cha and Kunsoo Park and Changbin Song and
Kihong Kim and Cheol Ryu and Sunho Lee",
title = "Interval disaggregate: a new operator for business
planning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1381--1392",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Business planning as well as analytics on top of
large-scale database systems is valuable to decision
makers, but planning operations known and implemented
so far are very basic. In this paper we propose a new
planning operation called interval disaggregate, which
goes as follows. Suppose that the planner, typically
the management of a company, plans sales revenues of
its products in the current year. An interval of the
expected revenue for each product in the current year
is computed from historical data in the database as the
prediction interval of linear regression on the data. A
total target revenue for the current year is given by
the planner. The goal of the interval disaggregate
operation is to find an appropriate disaggregation of
the target revenue, considering the intervals. We
formulate the problem of interval disaggregation more
precisely and give solutions for the problem.
Multidimensional geometry plays a crucial role in the
problem formulation and the solutions. We implemented
interval disaggregation into the planning engine of SAP
HANA and did experiments on real-world data. Our
experiments show that interval disaggregation gives
more appropriate solutions with respect to historical
data than the known basic disaggregation called
referential disaggregation. We also show that interval
disaggregation can be combined with the
deseasonalization technique when the dataset shows
seasonal fluctuations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:FFT,
author = "Zhuo Zhang and Chao Li and Yangyu Tao and Renyu Yang
and Hong Tang and Jie Xu",
title = "{Fuxi}: a fault-tolerant resource management and job
scheduling system at {Internet} scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1393--1404",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Scalability and fault-tolerance are two fundamental
challenges for all distributed computing at Internet
scale. Despite many recent advances from both academia
and industry, these two problems are still far from
settled. In this paper, we present Fuxi, a resource
management and job scheduling system that is capable of
handling the kind of workload at Alibaba where hundreds
of terabytes of data are generated and analyzed
everyday to help optimize the company's business
operations and user experiences. We employ several
novel techniques to enable Fuxi to perform efficient
scheduling of hundreds of thousands of concurrent tasks
over large clusters with thousands of nodes: (1) an
incremental resource management protocol that supports
multi-dimensional resource allocation and data
locality; (2) user-transparent failure recovery where
failures of any Fuxi components will not impact the
execution of user jobs; and (3) an effective detection
mechanism and a multi-level blacklisting scheme that
prevents them from affecting job execution. Our
evaluation results demonstrate that 95\% and 91\%
scheduled CPU/memory utilization can be fulfilled under
synthetic workloads, and Fuxi is capable of achieving
                 2.36TB/minute throughput in GraySort. Additionally,
the same Fuxi job only experiences approximately 16\%
slowdown under a 5\% fault-injection rate. The slowdown
only grows to 20\% when we double the fault-injection
rate to 10\%. Fuxi has been deployed in our production
environment since 2009, and it now manages hundreds of
thousands of server nodes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Simmen:2014:LSG,
author = "David Simmen and Karl Schnaitter and Jeff Davis and
Yingjie He and Sangeet Lohariwala and Ajay Mysore and
Vinayak Shenoi and Mingfeng Tan and Yu Xiao",
title = "Large-scale graph analytics in {Aster 6}: bringing
context to big data discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1405--1416",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph analytics is an important big data discovery
technique. Applications include identifying influential
employees for retention, detecting fraud in a complex
interaction network, and determining product affinities
by exploiting community buying patterns. Specialized
platforms have emerged to satisfy the unique processing
requirements of large-scale graph analytics; however,
these platforms do not enable graph analytics to be
combined with other analytics techniques, nor do they
work well with the vast ecosystem of SQL-based business
applications. Teradata Aster 6.0 adds support for
large-scale graph analytics to its repertoire of
analytics capabilities. The solution extends the
multi-engine processing architecture with support for
bulk synchronous parallel execution, and a specialized
graph engine that enables iterative analysis of graph
structures. Graph analytics functions written to the
vertex-oriented API exposed by the graph engine can be
invoked from the context of an SQL query and composed
with existing SQL-MR functions, thereby enabling data
scientists and business applications to express
computations that combine large-scale graph analytics
with techniques better suited to a different style of
processing. The solution includes a suite of pre-built
graph analytic functions adapted for parallel
execution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2014:FFK,
author = "Zhimin Chen and Vivek Narasayya and Surajit
Chaudhuri",
title = "Fast foreign-key detection in {Microsoft SQL} server
{PowerPivot} for {Excel}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1417--1428",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Microsoft SQL Server PowerPivot for Excel, or
PowerPivot for short, is an in-memory business
intelligence (BI) engine that enables Excel users to
interactively create pivot tables over large data sets
imported from sources such as relational databases,
text files and web data feeds. Unlike traditional pivot
tables in Excel that are defined on a single table,
PowerPivot allows analysis over multiple tables
connected via foreign-key joins. In many cases however,
these foreign-key relationships are not known a priori,
                 and information workers are often not sophisticated
enough to define these relationships. Therefore, the
ability to automatically discover foreign-key
relationships in PowerPivot is valuable, if not
essential. The key challenge is to perform this
detection interactively and with high precision even
when data sets scale to hundreds of millions of rows
and the schema contains tens of tables and hundreds of
columns. In this paper, we describe techniques for fast
foreign-key detection in PowerPivot and experimentally
evaluate its accuracy, performance and scale on both
synthetic benchmarks and real-world data sets. These
techniques have been incorporated into PowerPivot for
Excel.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2014:BDS,
author = "Meng-Chieh Yu and Tong Yu and Shao-Chen Wang and
Chih-Jen Lin and Edward Y. Chang",
title = "Big data small footprint: the design of a low-power
classifier for detecting transportation modes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1429--1440",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Sensors on mobile phones and wearables, and in general
sensors on IoT (Internet of Things), bring forth a
couple of new challenges to big data research. First,
the power consumption for analyzing sensor data must be
low, since most wearables and portable devices are
power-strapped. Second, the velocity of analyzing big
data on these devices must be high, otherwise the
limited local storage may overflow. This paper presents
our hardware-software co-design of a classifier for
wearables to detect a person's transportation mode
(i.e., still, walking, running, biking, and on a
vehicle). We particularly focus on addressing the
big-data small-footprint requirement by designing a
classifier that is low in both computational complexity
and memory requirement. Together with a sensor-hub
configuration, we are able to drastically reduce power
consumption by 99\%, while maintaining competitive
mode-detection accuracy. The data used in the paper is
made publicly available for conducting research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Boykin:2014:SFI,
author = "Oscar Boykin and Sam Ritchie and Ian O'Connell and
Jimmy Lin",
title = "{Summingbird}: a framework for integrating batch and
online {MapReduce} computations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1441--1451",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Summingbird is an open-source domain-specific language
implemented in Scala and designed to integrate online
and batch MapReduce computations in a single framework.
Summingbird programs are written using dataflow
abstractions such as sources, sinks, and stores, and
can run on different execution platforms: Hadoop for
batch processing (via Scalding/Cascading) and Storm for
online processing. Different execution modes require
different bindings for the dataflow abstractions (e.g.,
HDFS files or message queues for the source) but do not
require any changes to the program logic. Furthermore,
Summingbird can operate in a hybrid processing mode
that transparently integrates batch and online results
to efficiently generate up-to-date aggregations over
long time spans. The language was designed to improve
developer productivity and address pain points in
building analytics solutions at Twitter where often,
the same code needs to be written twice (once for batch
processing and again for online processing) and
indefinitely maintained in parallel. Our key insight is
that certain algebraic structures provide the
theoretical foundation for integrating batch and online
processing in a seamless fashion. This means that
Summingbird imposes constraints on the types of
aggregations that can be performed, although in
practice we have not found these constraints to be
overly restrictive for a broad range of analytics tasks
at Twitter.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ahmed:2014:SBT,
author = "Rafi Ahmed and Rajkumar Sen and Meikel Poess and Sunil
Chakkappen",
title = "Of snowstorms and bushy trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1452--1461",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many workloads for analytical processing in commercial
RDBMSs are dominated by snowstorm queries, which are
characterized by references to multiple large fact
tables and their associated smaller dimension tables.
This paper describes a technique for bushy join tree
optimization for snowstorm queries in Oracle database
system. This technique generates bushy join trees
containing subtrees that produce substantially reduced
sets of rows and, therefore, their joins with other
subtrees are generally much more efficient than joins
in the left-deep trees. The generation of bushy join
trees within an existing commercial physical optimizer
requires extensive changes to the optimizer. Further,
the optimizer will have to consider a large join
permutation search space to generate efficient bushy
join trees. The novelty of the approach is that bushy
join trees can be generated outside the physical
optimizer using logical query transformation that
explores a considerably pruned search space. The paper
describes an algorithm for generating optimal bushy
join trees for snowstorm queries using an existing
query transformation framework. It also presents
performance results for this optimization, which show
significant execution time improvements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vemuri:2014:EPS,
author = "Srinivas Vemuri and Maneesh Varshney and Krishna
Puttaswamy and Rui Liu",
title = "Execution primitives for scalable joins and
aggregations in {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1462--1473",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analytics on Big Data is critical to derive business
insights and drive innovation in today's Internet
companies. Such analytics involve complex computations
on large datasets, and are typically performed on
MapReduce based frameworks such as Hive and Pig.
However, in our experience, these systems are still
quite limited in performing at scale. In particular,
calculations that involve complex joins and
aggregations, e.g. statistical calculations, scale
poorly on these systems. In this paper we propose novel
primitives for scaling such calculations. We propose a
new data model for organizing datasets into calculation
data units that are organized based on user-defined
cost functions. We propose new operators that take
advantage of these organized data units to
significantly speed up joins and aggregations. Finally,
we propose strategies for dividing the aggregation load
uniformly across worker processes that are very
effective in avoiding skews and reducing (or in some
cases even removing) the associated overheads. We have
implemented all our proposed primitives in a framework
called Rubix, which has been in production at LinkedIn
for nearly a year. Rubix powers several applications
and processes TBs of data each day. We have seen
remarkable improvements in speed and cost of complex
calculations due to these primitives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arauz:2014:CLT,
author = "Javier Arauz",
title = "{CAP} limits in telecom subscriber database design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1474--1483",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While the notion of a Distributed DBMS has been
familiar to the IT industry for several decades, within
telecom networks the subscriber data management based
on DDBMS technology is a novel addition to a service
provider's infrastructure. Service providers are used
to telecom networks that are efficient, reliable and
easy to maintain and operate, in part thanks to the
node model used in designing such networks. A DDBMS
spanning a large geographical area however incurs into
distributed systems issues not previously seen in
telecom networks. Identifying and delivering the right
set of trade-offs that satisfies the service providers'
needs while staying within the known physical bounds of
a distributed system is therefore crucial if DDBMS are
to conquer the subscriber management space within
telecom networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bruno:2014:AJS,
author = "Nicolas Bruno and YongChul Kwon and Ming-Chuan Wu",
title = "Advanced join strategies for large-scale distributed
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1484--1495",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Companies providing cloud-scale data services have
increasing needs to store and analyze massive data sets
(e.g., search logs, click streams, and web graph data).
For cost and performance reasons, processing is
typically done on large clusters of thousands of
commodity machines by using high level scripting
languages. In the recent past, there has been
significant progress in adapting well-known techniques
from traditional relational DBMSs to this new scenario.
However, important challenges remain open. In this
paper we study the very common join operation, discuss
some unique challenges in the large-scale distributed
scenario, and explain how to efficiently and robustly
process joins in a distributed way. Specifically, we
introduce novel execution strategies that leverage
opportunities not available in centralized scenarios,
and others that robustly handle data skew. We report
experimental validations of our approaches on Scope
production clusters, which power the Applications and
Services Group at Microsoft.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2014:DSG,
author = "Yue Liu and Songlin Hu and Tilmann Rabl and Wantao Liu
and Hans-Arno Jacobsen and Kaifeng Wu and Jian Chen and
Jintao Li",
title = "{DGFIndex} for smart grid: enhancing {Hive} with a
cost-effective multidimensional range index",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1496--1507",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In Smart Grid applications, as the number of deployed
electric smart meters increases, massive amounts of
valuable meter data is generated and collected every
day. To enable reliable data collection and make
business decisions fast, high throughput storage and
high-performance analysis of massive meter data become
crucial for grid companies. Considering the advantage
of high efficiency, fault tolerance, and
price-performance of Hadoop and Hive systems, they are
frequently deployed as underlying platform for big data
processing. However, in real business use cases, these
data analysis applications typically involve
multidimensional range queries (MDRQ) as well as batch
reading and statistics on the meter data. While Hive is
high-performance at complex data batch reading and
analysis, it lacks efficient indexing techniques for
MDRQ. In this paper, we propose DGFIndex, an index
structure for Hive that efficiently supports MDRQ for
massive meter data. DGFIndex divides the data space
into cubes using the grid file technique. Unlike the
existing indexes in Hive, which stores all combinations
of multiple dimensions, DGFIndex only stores the
information of cubes. This leads to smaller index size
and faster query processing. Furthermore, with
pre-computing user-defined aggregations of each cube,
DGFIndex only needs to access the boundary region for
aggregation query. Our comprehensive experiments show
that DGFIndex can save significant disk space in
comparison with the existing indexes in Hive and the
query performance with DGFIndex is 2-50 times faster
than existing indexes in Hive and HadoopDB for
aggregation query, 2-5 times faster than both for
non-aggregation query, 2-75 times faster than scanning
the whole table in different query selectivity.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2014:EBS,
author = "Ying Yan and Liang Jeff Chen and Zheng Zhang",
title = "Error-bounded sampling for analytics on big sparse
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1508--1519",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Aggregation queries are at the core of business
intelligence and data analytics. In the big data era,
many scalable shared-nothing systems have been
developed to process aggregation queries over massive
amount of data. Microsoft's SCOPE is a well-known
instance in this category. Nevertheless, aggregation
queries are still expensive, because query processing
needs to consume the entire data set, which is often
hundreds of terabytes. Data sampling is a technique
that samples a small portion of data to process and
returns an approximate result with an error bound,
thereby reducing the query's execution time. While
similar problems were studied in the database
literature, we encountered new challenges that disable
most of prior efforts: (1) error bounds are dictated by
end users and cannot be compromised, (2) data is
sparse, meaning data has a limited population but a
wide range. For such cases, conventional uniform
sampling often yield high sampling rates and thus
deliver limited or no performance gains. In this paper,
we propose error-bounded stratified sampling to reduce
sample size. The technique relies on the insight that
we may only reduce the sampling rate with the knowledge
of data distributions. The technique has been
implemented into Microsoft internal search query
platform. Results show that the proposed approach can
reduce up to 99\% sample size comparing with uniform
sampling, and its performance is robust against data
volume and other key performance metrics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gankidi:2014:IHD,
author = "Vinitha Reddy Gankidi and Nikhil Teletia and Jignesh
M. Patel and Alan Halverson and David J. DeWitt",
title = "Indexing {HDFS} data in {PDW}: splitting the data from
the index",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1520--1528",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is a growing interest in making relational DBMSs
work synergistically with MapReduce systems. However,
there are interesting technical challenges associated
with figuring out the right balance between the use and
co-deployment of these systems. This paper focuses on
one specific aspect of this balance, namely how to
leverage the superior indexing and query processing
power of a relational DBMS for data that is often more
cost-effectively stored in Hadoop/HDFS. We present a
method to use conventional B+-tree indices in an RDBMS
for data stored in HDFS and demonstrate that our
approach is especially effective for highly selective
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2014:CLS,
author = "Chong Sun and Narasimhan Rampalli and Frank Yang and
AnHai Doan",
title = "{Chimera}: large-scale classification using machine
learning, rules, and crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1529--1540",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale classification is an increasingly critical
Big Data problem. So far, however, very little has been
published on how this is done in practice. In this
paper we describe Chimera, our solution to classify
tens of millions of products into 5000+ product types
at WalmartLabs. We show that at this scale, many
conventional assumptions regarding learning and
crowdsourcing break down, and that existing solutions
cease to work. We describe how Chimera employs a
combination of learning, rules (created by in-house
analysts), and crowdsourcing to achieve accurate,
continuously improving, and cost-effective
classification. We discuss a set of lessons learned for
other similar Big Data systems. In particular, we argue
that at large scales crowdsourcing is critical, but
must be used in combination with learning, rules, and
in-house analysts. We also argue that using rules (in
conjunction with learning) is a must, and that more
research attention should be paid to helping analysts
create and manage (tens of thousands of) rules more
effectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bonifati:2014:IJQ,
author = "Angela Bonifati and Radu Ciucanu and S{\l}awek
Staworko",
title = "Interactive join query inference with {JIM}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1541--1544",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Specifying join predicates may become a cumbersome
task in many situations e.g., when the relations to be
joined come from disparate data sources, when the
values of the attributes carry little or no knowledge
of metadata, or simply when the user is unfamiliar with
querying formalisms. Such task is recurrent in many
traditional data management applications, such as data
integration, constraint inference, and database
denormalization, but it is also becoming pivotal in
novel crowdsourcing applications. We present Jim (Join
Inference Machine), a system for interactive join
specification tasks, where the user infers an $n$-ary
join predicate by selecting tuples that are part of the
join result via Boolean membership queries. The user
can label tuples as positive or negative, while the
system allows to identify and gray out the
uninformative tuples i.e., those that do not add any
information to the final learning goal. The tool also
guides the user to reach her join inference goal with a
minimal number of interactions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2014:MMS,
author = "Yuxin Zheng and Zhifeng Bao and Lidan Shou and Anthony
K. H. Tung",
title = "{MESA}: a map service to support fuzzy type-ahead
search over geo-textual data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1545--1548",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Geo-textual data are ubiquitous these days. Recent
study on spatial keyword search focused on the
processing of queries which retrieve objects that match
certain keywords within a spatial region. To ensure
effective data retrieval, various extensions were done
including the tolerance of errors in keyword matching
and the search-as-you-type feature using prefix
matching. We present MESA, a map application to support
different variants of spatial keyword query. In this
demonstration, we adopt the autocompletion paradigm
that generates the initial query as a prefix matching
query. If there are few matching results, other
variants are performed as a form of relaxation that
reuses the processing done in earlier phases. The types
of relaxation allowed include spatial region expansion
and exact/approximate prefix/substring matching. MESA
adopts the client-server architecture. It provides
fuzzy type-ahead search over geo-textual data. The core
of MESA is to adopt a unifying search strategy, which
incrementally applies the relaxation in an appropriate
order to maximize the efficiency of query processing.
In addition, MESA equips a user-friendly interface to
interact with users and visualize results. MESA also
provides customized search to meet the needs of
different users.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2014:RRT,
author = "Henan Wang and Guoliang Li and Huiqi Hu and Shuo Chen
and Bingwen Shen and Hao Wu and Wen-Syan Li and
Kian-Lee Tan",
title = "{R3}: a real-time route recommendation system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1549--1552",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing route recommendation systems have two main
weaknesses. First, they usually recommend the same
route for all users and cannot help control traffic
jam. Second, they do not take full advantage of
real-time traffic to recommend the best routes. To
address these two problems, we develop a real-time
route recommendation system, called R3, aiming to
provide users with the real-time-traffic-aware routes.
R3 recommends diverse routes for different users to
alleviate the traffic pressure. R3 utilizes historical
taxi driving data and real-time traffic data and
integrates them together to provide users with
real-time route recommendation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2014:PPD,
author = "Michael Benedikt and Julien Leblay and Efthymia
Tsamoura",
title = "{PDQ}: proof-driven query answering over {Web}-based
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1553--1556",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The data needed to answer queries is often available
through Web-based APIs. Indeed, for a given query there
may be many Web-based sources which can be used to
answer it, with the sources overlapping in their
vocabularies, and differing in their access
restrictions (required arguments) and cost. We
introduce PDQ (Proof-Driven Query Answering), a system
for determining a query plan in the presence of
web-based sources. It is: (i) constraint-aware ---
exploiting relationships between sources to rewrite an
expensive query into a cheaper one, (ii) access-aware
--- abiding by any access restrictions known in the
sources, and (iii) cost-aware --- making use of any
cost information that is available about services. PDQ
takes the novel approach of generating query plans from
proofs that a query is answerable. We demonstrate the
use of PDQ and its effectiveness in generating low-cost
plans.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassan:2014:DFA,
author = "Naeemul Hassan and Afroza Sultana and You Wu and
Gensheng Zhang and Chengkai Li and Jun Yang and Cong
Yu",
title = "Data in, fact out: automated monitoring of facts by
{FactWatcher}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1557--1560",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Towards computational journalism, we present
FactWatcher, a system that helps journalists identify
data-backed, attention-seizing facts which serve as
leads to news stories. FactWatcher discovers three
types of facts, including situational facts,
one-of-the-few facts, and prominent streaks, through a
unified suite of data model, algorithm framework, and
fact ranking measure. Given an append-only database,
upon the arrival of a new tuple, FactWatcher monitors
if the tuple triggers any new facts. Its algorithms
efficiently search for facts without exhaustively
testing all possible ones. Furthermore, FactWatcher
provides multiple features in striving for an
end-to-end system, including fact ranking,
fact-to-statement translation and keyword-based fact
search.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2014:ODA,
author = "Mingxuan Yuan and Ke Deng and Jia Zeng and Yanhua Li
and Bing Ni and Xiuqiang He and Fei Wang and Wenyuan
Dai and Qiang Yang",
title = "{OceanST}: a distributed analytic system for
large-scale spatiotemporal mobile broadband data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1561--1564",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing prevalence of versatile mobile
devices and the fast deployment of broadband mobile
networks, a huge volume of Mobile Broadband (MBB) data
has been generated over time. The MBB data naturally
contain rich information of a large number of mobile
users, covering a considerable fraction of whole
population nowadays, including the mobile applications
they are using at different locations and time; the MBB
data may present the unprecedentedly large knowledge
base of human behavior which has highly recognized
commercial and social value. However, the storage,
management and analysis of the huge and fast growing
volume of MBB data post new and significant challenges
to the industrial practitioners and research community.
In this demonstration, we present a new, MBB data
tailored, distributed analytic system named OceanST
which has addressed a series of problems and weaknesses
of the existing systems, originally designed for more
general purpose and capable to handle MBB data to some
extent. OceanST is featured by (i) efficiently loading
of ever-growing MBB data, (ii) a bunch of
spatiotemporal aggregate queries and basic analysis
APIs frequently found in various MBB data application
scenarios, and (iii) sampling-based approximate
solution with provable accuracy bound to cope with huge
volume of MBB data. The demonstration will show the
advantage of OceanST in a cluster of 5 machines using
3TB data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Geerts:2014:TAF,
author = "Floris Geerts and Giansalvatore Mecca and Paolo
Papotti and Donatello Santoro",
title = "That's all folks!: {Llunatic} goes open source",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1565--1568",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is widely recognized that whenever different data
sources need to be integrated into a single target
database errors and inconsistencies may arise, so that
there is a strong need to apply data-cleaning
techniques to repair the data. Despite this need,
database research has so far investigated mappings and
data repairing essentially in isolation. Unfortunately,
schema-mappings and data quality rules interact with
each other, so that applying existing algorithms in a
pipelined way --- i.e., first exchange then data, then
repair the result --- does not lead to solutions even
in simple settings. We present the Llunatic mapping and
cleaning system, the first comprehensive proposal to
handle schema mappings and data repairing in a uniform
way. Llunatic is based on the intuition that
transforming and cleaning data are different facets of
the same problem, unified by their declarative nature.
This holistic approach allows us to incorporate unique
features into the system, such as configurable user
interaction and a tunable trade-off between efficiency
and quality of the solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2014:HMA,
author = "Weimo Liu and Saad Bin Suhaim and Saravanan
Thirumuruganathan and Nan Zhang and Gautam Das and Ali
Jaoua",
title = "{HDBTracker}: monitoring the aggregates on dynamic
hidden web databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1569--1572",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Numerous web databases, e.g., amazon.com, eBay.com,
are ``hidden'' behind (i.e., accessible only through)
their restrictive search and browsing interfaces. This
demonstration showcases HDBTracker, a web-based system
that reveals and tracks (the changes of) user-specified
aggregate queries over such hidden web databases,
especially those that are frequently updated, by
issuing a small number of search queries through the
public web interfaces of these databases. The ability
to track and monitor aggregates has applications over a
wide variety of domains --- e.g., government agencies
can track COUNT of openings at online job hunting
websites to understand key economic indicators, while
businesses can track the AVG price of a product over a
basket of e-commerce websites to understand the
competitive landscape and/or material costs. A key
technique used in HDBTracker is RS-ESTIMATOR, the first
algorithm that can efficiently monitor changes to
aggregate query answers over a hidden web database.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xia:2014:BBA,
author = "Fan Xia and Ye Li and Chengcheng Yu and Haixin Ma and
Weining Qian",
title = "{BSMA}: a benchmark for analytical queries over social
media data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1573--1576",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The demonstration of a benchmark, named as BSMA, for
Benchmarking Social Media Analytics, is introduced in
this paper. BSMA is designed to benchmark data
management systems supporting analytical queries over
social media. It is different to existing benchmarks in
that: (1) Both real-life data and a synthetic data
generator are provided. The real-life dataset contains
a social network of 1.6 million users, and all their
tweeting and retweeting activities. The data generator
can generate both social networks and synthetic
timelines that follow data distributions determined by
predefined parameters. (2) A set of workloads are
provided. The data generator is in responsible for
producing updates. A workload generator produces
queries based on predefined query templates by
generating query arguments online. BSMA workloads cover
a large amount of queries with graph operations,
temporal queries, hotspot queries, and aggregate
queries. Furthermore, the argument generator is capable
of sampling data items in the timeline following
power-law distribution online. (3) A toolkit is
provided to measure and report the performance of
systems that implement the benchmark. Furthermore, a
prototype system based on dataset and workloads of BSMA
is also implemented. The demonstration will include two
parts, i.e. the internals of data and workload
generator, as well as the performance testing of
reference implementations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Petermann:2014:GBD,
author = "Andr{\'e} Petermann and Martin Junghanns and Robert
M{\"u}ller and Erhard Rahm",
title = "Graph-based data integration and business intelligence
with {BIIIG}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1577--1580",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate BIIIG (Business Intelligence with
Integrated Instance Graphs), a new system for
graph-based data integration and analysis. It aims at
improving business analytics compared to traditional
OLAP approaches by comprehensively tracking
relationships between entities and making them
available for analysis. BIIIG supports a largely
automatic data integration pipeline for metadata and
instance data. Metadata from heterogeneous sources are
integrated in a so-called Unified Metadata Graph (UMG)
while instance data is combined in a single integrated
instance graph (IIG). A unique feature of BIIIG is the
concept of business transaction graphs, which are
derived from the IIG and which reflect all steps
involved in a specific business process. Queries and
analysis tasks can refer to the entire instance graph
or sets of business transaction graphs. In the
demonstration, we perform all data integration steps
and present analytic queries including pattern matching
and graph-based aggregation of business measures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vartak:2014:SAG,
author = "Manasi Vartak and Samuel Madden and Aditya
Parameswaran and Neoklis Polyzotis",
title = "{SeeDB}: automatically generating query
visualizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1581--1584",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts operating on large volumes of data often
rely on visualizations to interpret the results of
queries. However, finding the right visualization for a
query is a laborious and time-consuming task. We
demonstrate SeeDB, a system that partially automates
this task: given a query, SeeDB explores the space of
all possible visualizations, and automatically
identifies and recommends to the analyst those
visualizations it finds to be most ``interesting'' or
``useful''. In our demonstration, conference attendees
will see SeeDB in action for a variety of queries on
multiple real-world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dutt:2014:QEA,
author = "Anshuman Dutt and Sumit Neelam and Jayant R. Haritsa",
title = "{QUEST}: an exploratory approach to robust query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1585--1588",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lei:2014:RIR,
author = "Chuan Lei and Zhongfang Zhuang and Elke A.
Rundensteiner and Mohamed Y. Eltabakh",
title = "Redoop infrastructure for recurring big data queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1589--1592",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demonstration presents the Redoop infrastructure,
the first full-fledged MapReduce framework with native
support for recurring big data queries. Recurring
queries, repeatedly being executed for long periods of
time over evolving high-volume data, have become a
bedrock component in most large-scale data analytic
applications. Redoop is a comprehensive extension to
Hadoop that pushes the support and optimization of
recurring queries into Hadoop's core functionality.
While backward compatible with regular MapReduce jobs,
Redoop achieves an order of magnitude better
performance than Hadoop for recurring workloads. Redoop
employs innovative window-aware optimization techniques
for such recurring workloads including adaptive
window-aware data partitioning, cache-aware task
scheduling, and inter-window caching mechanisms. We
will demonstrate Redoop's capabilities on a compute
cluster against real life workloads including
click-stream and sensor data analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brucato:2014:PTP,
author = "Matteo Brucato and Rahul Ramakrishna and Azza Abouzied
and Alexandra Meliou",
title = "{PackageBuilder}: from tuples to packages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1593--1596",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demo, we present PackageBuilder, a system that
extends database systems to support package queries. A
package is a collection of tuples that individually
satisfy base constraints and collectively satisfy
global constraints. The need for package support arises
in a variety of scenarios: For example, in the creation
of meal plans, users are not only interested in the
nutritional content of individual meals (base
constraints), but also care to specify daily
consumption limits and control the balance of the
entire plan (global constraints). We introduce PaQL, a
declarative SQL-based package query language, and the
interface abstractions which allow users to
interactively specify package queries and easily
navigate through their results. To efficiently evaluate
queries, the system employs pruning and heuristics, as
well as state-of-the-art constraint optimization
solvers. We demonstrate PackageBuilder by allowing
attendees to interact with the system's interface, to
define PaQL queries and to observe how query evaluation
is performed.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amsterdamer:2014:OAC,
author = "Yael Amsterdamer and Susan B. Davidson and Tova Milo
and Slava Novgorodov and Amit Somech",
title = "Ontology assisted crowd mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1597--1600",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present OASSIS (for Ontology ASSISted crowd
mining), a prototype system which allows users to
declaratively specify their information needs, and
mines the crowd for answers. The answers that the
system computes are concise and relevant, and represent
frequent, significant data patterns. The system is
based on (1) a generic model that captures both
ontological knowledge, as well as the individual
knowledge of crowd members from which frequent patterns
are mined; (2) a query language in which users can
specify their information needs and types of data
patterns they seek; and (3) an efficient query
evaluation algorithm, for mining semantically concise
answers while minimizing the number of questions posed
to the crowd. We will demonstrate OASSIS using a couple
of real-life scenarios, showing how users can formulate
and execute queries through the OASSIS UI and how the
relevant data is mined from the crowd.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2014:SSE,
author = "Lisi Chen and Yan Cui and Gao Cong and Xin Cao",
title = "{SOPS}: a system for efficient processing of
spatial-keyword publish\slash subscribe",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1601--1604",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive amount of data that are geo-tagged and
associated with text information are being generated at
an unprecedented scale. These geo-textual data cover a
wide range of topics. Users are interested in receiving
up-to-date geo-textual objects (e.g., geo-tagged
Tweets) such that their locations meet users' need and
their texts are interesting to users. For example, a
user may want to be updated with tweets near her home
on the topic ``dengue fever headache''. In this
demonstration, we present SOPS, the Spatial-Keyword
Publish/Subscribe System, that is capable of
efficiently processing spatial keyword continuous
queries. SOPS supports two types of queries: (1)
Boolean Range Continuous (BRC) query that can be used
to subscribe the geo-textual objects satisfying a
boolean keyword expression and falling in a specified
spatial region; (2) Temporal Spatial-Keyword Top-$k$
Continuous (TaSK) query that continuously maintains
up-to-date top-$k$ most relevant results over a stream
of geo-textual objects. SOPS enables users to formulate
their queries and view the real-time results over a
stream of geo-textual objects by browser-based user
interfaces. On the server side, we propose solutions to
efficiently processing a large number of BRC queries
(tens of millions) and TaSK queries over a stream of
geo-textual objects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shirakawa:2014:MLI,
author = "Masumi Shirakawa and Takahiro Hara and Shojiro
Nishio",
title = "{MLJ}: language-independent real-time search of tweets
reported by media outlets and journalists",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1605--1608",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we introduce MLJ (MultiLingual
Journalism, http://mljournalism.com), a first Web-based
system that enables users to search any topic of latest
tweets posted by media outlets and journalists beyond
languages. Handling multilingual tweets in real time
involves many technical challenges: language barrier,
sparsity of words, and real-time data stream. To
overcome the language barrier and the sparsity of
words, MLJ harnesses CL-ESA, a Wikipedia-based
language-independent method to generate a vector of
Wikipedia pages (entities) from an input text. To
continuously deal with tweet stream, we propose
one-pass DP-means, an online clustering method based on
DP-means. Given a new tweet as an input, MLJ generates
a vector using CL-ESA and classifies it into one of
clusters using one-pass DP-means. By interpreting a
search query as a vector, users can instantly search
clusters containing latest related tweets from the
query without being aware of language differences. MLJ
as of March 2014 supports nine languages including
English, Japanese, Korean, Spanish, Portuguese, German,
French, Italian, and Arabic covering 24 countries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bress:2014:OHO,
author = "Sebastian Bre{\ss} and Bastian K{\"o}cher and Max
Heimel and Volker Markl and Michael Saecker and Gunter
Saake",
title = "{Ocelot\slash HyPE}: optimized data processing on
heterogeneous hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1609--1612",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The past years saw the emergence of highly
heterogeneous server architectures that feature
multiple accelerators in addition to the main
processor. Efficiently exploiting these systems for
data processing is a challenging research problem that
comprises many facets, including how to find an optimal
operator placement strategy, how to estimate runtime
costs across different hardware architectures, and how
to manage the code and maintenance blowup caused by
having to support multiple architectures. In prior
work, we already discussed solutions to some of these
problems: First, we showed that specifying operators in
a hardware-oblivious way can prevent code blowup while
still maintaining competitive performance when
supporting multiple architectures. Second, we presented
learning cost functions and several heuristics to
efficiently place operators across all available
devices. In this demonstration, we provide further
insights into this line of work by presenting our
combined system Ocelot/HyPE. Our system integrates a
hardware-oblivious data processing engine with a
learning query optimizer for placement decisions,
resulting in a highly adaptive DBMS that is
specifically tailored towards heterogeneous hardware
environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2014:MMO,
author = "Fei Wu and Tobias Kin Hou Lei and Zhenhui Li and
Jiawei Han",
title = "{MoveMine 2.0}: mining object relationships from
movement data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1613--1616",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The development in positioning technology has enabled
us to collect a huge amount of movement data from
moving objects, such as human, animals, and vehicles.
The data embed rich information about the relationships
among moving objects and have applications in many
fields, e.g., in ecological study and human behavioral
study. Previously, we have proposed a system MoveMine
that integrates several state-of-the-art movement mining
methods. However, it does not include recent methods on
relationship pattern mining. Thus, we propose to extend
MoveMine to MoveMine 2.0 by adding substantial new
methods in mining dynamic relationship patterns. Newly
added methods focus on two types of pairwise
relationship patterns: (i) attraction/avoidance
relationship, and (ii) following pattern. A
user-friendly interface is designed to support
interactive exploration of the result and provides
flexibility in tuning parameters. MoveMine 2.0 is
tested on multiple types of real datasets to ensure its
practical use. Our system provides useful tools for
domain experts to gain insights on real dataset.
Meanwhile, it will promote further research in
relationship mining from moving objects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2014:PFA,
author = "Liwen Sun and Sanjay Krishnan and Reynold S. Xin and
Michael J. Franklin",
title = "A partitioning framework for aggressive data
skipping",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1617--1620",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose to demonstrate a fine-grained partitioning
framework that reorganizes the data tuples into small
blocks at data loading time. The goal is to enable
queries to maximally skip scanning data blocks. The
partition framework consists of four steps: (1)
workload analysis, which extracts features from a query
workload, (2) augmentation, which augments each data
tuple with a feature vector, (3) reduce, which
succinctly represents a set of data tuples using a set
of feature vectors, and (4) partitioning, which
performs a clustering algorithm to partition the
feature vectors and uses the clustering result to guide
the actual data partitioning. Our experiments show that
our techniques result in a 3-7x query response time
improvement over traditional range partitioning due to
more effective data skipping.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2014:IOE,
author = "Lei Cao and Qingyang Wang and Elke A. Rundensteiner",
title = "Interactive outlier exploration in big data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1621--1624",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate our VSOutlier system for supporting
interactive exploration of outliers in big data
streams. VSOutlier not only supports a rich variety of
outlier types supported by innovative and efficient
outlier detection strategies, but also provides a rich
set of interactive interfaces to explore outliers in
real time. Using the stock transactions dataset from
the US stock market and the moving objects dataset from
MITRE, we demonstrate that the VSOutlier system enables
analysts to more efficiently identify, understand, and
respond to phenomena of interest in near real-time even
when applied to high volume streams.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{To:2014:SAE,
author = "Quoc-Cuong To and Benjamin Nguyen and Philippe
Pucheral",
title = "{SQL\slash AA}: executing {SQL} on an asymmetric
architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1625--1628",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Current applications, from complex sensor systems
(e.g. quantified self) to online e-markets acquire vast
quantities of personal information which usually end-up
on central servers. This information represents an
unprecedented potential for user customized
applications and business (e.g., car insurance billing,
carbon tax, traffic decongestion, resource optimization
in smart grids, healthcare surveillance, participatory
sensing). However, the PRISM affair has shown that
public opinion is starting to wonder whether these new
services are not bringing us closer to science fiction
dystopias. It has become clear that centralizing and
processing all one's data on a single server is a major
problem with regards to privacy concerns. Conversely,
decentralized architectures, devised to help
individuals keep full control of their data, complexify
global treatments and queries, often impeding the
development of innovative services and applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2014:GGS,
author = "Zhao Chen and Rui Fu and Ziyuan Zhao and Zheng Liu and
Leihao Xia and Lei Chen and Peng Cheng and Caleb Chen
Cao and Yongxin Tong and Chen Jason Zhang",
title = "{gMission}: a general spatial crowdsourcing platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1629--1632",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As one of the successful forms of using Wisdom of
Crowd, crowdsourcing, has been widely used for many
human intrinsic tasks, such as image labeling, natural
language understanding, market predication and opinion
mining. Meanwhile, with advances in pervasive
technology, mobile devices, such as mobile phones and
tablets, have become extremely popular. These mobile
devices can work as sensors to collect multimedia
data (audios, images and videos) and location
information. This power makes it possible to implement
the new crowdsourcing mode: spatial crowdsourcing. In
spatial crowdsourcing, a requester can ask for
resources related a specific location, the mobile users
who would like to take the task will travel to that
place and get the data. Due to the rapid growth of
mobile device uses, spatial crowdsourcing is likely to
become more popular than general crowdsourcing, such as
Amazon Turk and Crowdflower. However, to implement such
a platform, effective and efficient solutions for
worker incentives, task assignment, result aggregation
and data quality control must be developed. In this
demo, we will introduce gMission, a general spatial
crowdsourcing platform, which features with a
collection of novel techniques, including geographic
sensing, worker detection, and task recommendation. We
introduce the sketch of system architecture and
illustrate scenarios via several case analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cetintemel:2014:SSN,
author = "Ugur Cetintemel and Jiang Du and Tim Kraska and Samuel
Madden and David Maier and John Meehan and Andrew Pavlo
and Michael Stonebraker and Erik Sutherland and Nesime
Tatbul and Kristin Tufte and Hao Wang and Stanley
Zdonik",
title = "{S-Store}: a streaming {NewSQL} system for big
velocity applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1633--1636",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "First-generation streaming systems did not pay much
attention to state management via ACID transactions
(e.g., [3, 4]). S-Store is a data management system
that combines OLTP transactions with stream processing.
To create S-Store, we begin with H-Store, a main-memory
transaction processing engine, and add primitives to
support streaming. This includes triggers and
transaction workflows to implement push-based
processing, windows to provide a way to bound the
computation, and tables with hidden state to implement
scoping for proper isolation. This demo explores the
benefits of this approach by showing how a na{\"\i}ve
implementation of our benchmarks using only H-Store can
yield incorrect results. We also show that by
exploiting push-based semantics and our implementation
of triggers, we can achieve significant improvement in
transaction throughput. We demo two modern
applications: (i) leaderboard maintenance for a version
of ``American Idol'', and (ii) a city-scale bicycle
rental scenario.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xie:2014:CRT,
author = "Runquan Xie and Feida Zhu and Hui Ma and Wei Xie and
Chen Lin",
title = "{CLEar}: a real-time online observatory for bursty and
viral events",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1637--1640",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We describe our demonstration of CLEar (CLairaudient
Ear), a real-time online platform for detecting,
monitoring, summarizing, contextualizing and
visualizing bursty and viral events, those triggering a
sudden surge of public interest and going viral on
micro-blogging platforms. This task is challenging for
existing methods as they either use complicated topic
models to analyze topics in a off-line manner or define
temporal structure of fixed granularity on the data
stream for online topic learning, leaving them hardly
scalable for real-time stream like that of Twitter. In
this demonstration of CLEar, we present a three-stage
system: First, we show a real-time bursty event
detection module based on a data-sketch topic model
which makes use of acceleration of certain stream
quantities as the indicators of topic burstiness to
trigger efficient topic inference. Second, we
demonstrate popularity prediction for the detected
bursty topics and event summarization based on
clustering related topics detected in successive time
periods. Third, we illustrate CLEar's module for
contextualizing and visualizing the event evolution
both along time-line and across other news media to
offer an easier understanding of the events.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Suh:2014:ALI,
author = "Young-Kyoon Suh and Richard T. Snodgrass and Rui
Zhang",
title = "{AZDBLab}: a laboratory information system for
large-scale empirical {DBMS} studies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1641--1644",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the database field, while very strong mathematical
and engineering work has been done, the scientific
approach has been much less prominent. The deep
understanding of query optimizers obtained through the
scientific approach can lead to better engineered
designs. Unlike other domains, there have been few
DBMS-dedicated laboratories, focusing on such
scientific investigation. In this demonstration, we
present a novel DBMS-oriented research infrastructure,
called Arizona Database Laboratory (AZDBLab), to assist
database researchers in conducting a large-scale
empirical study across multiple DBMSes. For them to
test their hypotheses on the behavior of query
optimizers, AZDBLab can run and monitor a large-scale
experiment with thousands (or millions) of queries on
different DBMSes. Furthermore, AZDBLab can help users
automatically analyze these queries. In the demo, the
audience will interact with AZDBLab through the
stand-alone application and the mobile app to conduct
such a large-scale experiment for a study. The audience
will then run a Tucson Timing Protocol analysis on the
finished experiment and then see the analysis (data
sanity check and timing) results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2014:TTM,
author = "Qi Wang and Manohar Kaul and Cheng Long and Raymond
Chi-Wing Wong",
title = "{Terrain-Toolkit}: a multi-functional tool for terrain
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1645--1648",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Terrain data is becoming increasingly popular both in
industry and in academia. Many tools have been
developed for visualizing terrain data. However, we
find that (1) they usually accept very few data formats
of terrain data only; (2) they do not support terrain
simplification well which, as will be shown, is used
heavily for query processing in spatial databases; and
(3) they do not provide the surface distance operator
which is fundamental for many applications based on
terrain data. Motivated by this, we developed a tool
called Terrain-Toolkit for terrain data which accepts a
comprehensive set of data formats, supports terrain
simplification and provides the surface distance
operator.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fu:2014:FDC,
author = "Yupeng Fu and Kian Win Ong and Yannis Papakonstantinou
and Erick Zamora",
title = "{FORWARD}: data-centric {UIs} using declarative
templates that efficiently wrap third-party
{JavaScript} components",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1649--1652",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While Ajax programming and the plethora of JavaScript
component libraries enable high-quality UIs in web
applications, integrating them with page data is
laborious and error-prone as a developer has to
handcode incremental modifications with trigger-based
programming and manual coordination of data
dependencies. The FORWARD web framework simplifies the
development of Ajax applications through declarative,
state-based templates. This declarative, data-centric
approach is characterized by the principle of
logical/physical independence, which the database
community has often deployed successfully. It enables
FORWARD to leverage database techniques, such as
incremental view maintenance, updatable views,
capability-based component wrappers and cost-based
optimization to automate efficient live visualizations.
We demonstrate an end-to-end system implementation,
including a web-based IDE (itself built in FORWARD),
academic and commercial applications built in FORWARD
and a wide variety of JavaScript components supported
by the declarative templates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2014:SSP,
author = "Xika Lin and Abhishek Mukherji and Elke A.
Rundensteiner and Matthew O. Ward",
title = "{SPIRE}: supporting parameter-driven interactive rule
mining and exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1653--1656",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate our SPIRE technology for supporting
interactive mining of both positive and negative rules
at the speed of thought. It is often misleading to
learn only about positive rules, yet extremely
revealing to find strongly supported negative rules.
Key technical contributions of SPIRE including
region-wise abstractions of rules, positive-negative
rule relationship analysis, rule redundancy management
and rule visualization supporting novel exploratory
queries will be showcased. The audience can
interactively explore complex rule relationships in a
visual manner, such as comparing negative rules with
their positive counterparts, that would otherwise take
prohibitive time. Overall, our SPIRE system provides
data analysts with rich insights into rules and rule
relationships while significantly reducing manual
effort and time investment required.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Anderson:2014:IDE,
author = "Michael R. Anderson and Michael Cafarella and Yixing
Jiang and Guan Wang and Bochun Zhang",
title = "An integrated development environment for faster
feature engineering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1657--1660",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The application of machine learning to large datasets
has become a core component of many important and
exciting software systems being built today. The
extreme value in these trained systems is tempered,
however, by the difficulty of constructing them. As
shown by the experience of Google, Netflix, IBM, and
many others, a critical problem in building trained
systems is that of feature engineering. High-quality
machine learning features are crucial for the system's
performance but are difficult and time-consuming for
engineers to develop. Data-centric developer tools that
improve the productivity of feature engineers will thus
likely have a large impact on an important area of
work. We have built a demonstration integrated
development environment for feature engineers. It
accelerates one particular step in the feature
engineering development cycle: evaluating the
effectiveness of novel feature code. In particular, it
uses an index and runtime execution planner to process
raw data objects (e.g., Web pages) in order of
descending likelihood that the data object will be
relevant to the user's feature code. This demonstration
IDE allows the user to write arbitrary feature code,
evaluate its impact on learner quality, and observe
exactly how much faster our technique performs compared
to a baseline system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiong:2014:PSD,
author = "Pengcheng Xiong and Hakan Hacig{\"u}m{\"u}s",
title = "{Pronto}: a software-defined networking based system
for performance management of analytical queries on
distributed data stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1661--1664",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowadays data analytics applications are accessing
more and more data from distributed data stores,
creating large amount of data traffic on the network.
Therefore, distributed analytic queries are prone to
suffer from bad performance in terms of query execution
time when they encounter a network resource contention,
which is quite common in a shared network. Typical
distributed query optimizers do not have a way to solve
this problem because historically they have been
treating the network underneath as a black-box: they
are unable to monitor it, let alone to control it.
However, we are entering a new era of software-defined
networking (SDN), which provides visibility into and
control of the network's state for the applications
including distributed database systems. In this
demonstration, we present a system, called Pronto that
leverages the SDN capabilities for a distributed query
processor to achieve performance improvement and
differentiation for analytical queries. The system is
the real implementation of our recently developed
methods on commercial SDN products. The demonstration
shows the shortcomings of a distributed query
optimizer, which treats the underlying network as a
black box, and the advantages of the SDN-based approach
by allowing the users to selectively explore various
relevant and interesting settings in a distributed
query processing environment.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:GYB,
author = "Rui Zhang and Reshu Jain and Prasenjit Sarkar and
Lukas Rupprecht",
title = "Getting your big data priorities straight: a
demonstration of priority-based {QoS} using
social-network-driven stock recommendation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1665--1668",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As we come to terms with various big data challenges,
one vital issue remains largely untouched. That is the
optimal multiplexing and prioritization of different
big data applications sharing the same underlying
infrastructure, for example, a public cloud platform.
Given these demanding applications and the necessary
practice to avoid over-provisioning, resource
contention between applications is inevitable. Priority
must be given to important applications (or sub
workloads in an application) in these circumstances.
This demo highlights the compelling impact
prioritization could make, using an example application
that recommends promising combinations of stocks to
purchase based on relevant Twitter sentiment. The
application consists of a batch job and an interactive
query, ran simultaneously. Our underlying solution
provides a unique capability to identify and
differentiate application workloads throughout a
complex big data platform. Its current implementation
is based on Apache Hadoop and the IBM GPFS distributed
storage system. The demo showcases the superior
interactive query performance achievable by
prioritizing its workloads and thereby avoiding I/O
bandwidth contention. The query time is 3.6 $ \times $
better compared to no prioritization. Such a
performance is within 0.3\% of that of an idealistic
system where the query runs without contention. The
demo is conducted on around 3 months of Twitter data,
pertinent to the S \& P 100 index, with about 4 $
\times $ 10$^{12}$ potential stock combinations
considered.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jindal:2014:VYR,
author = "Alekh Jindal and Praynaa Rawlani and Eugene Wu and
Samuel Madden and Amol Deshpande and Mike Stonebraker",
title = "{Vertexica}: your relational friend for graph
analytics!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1669--1672",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present Vertexica, a graph analytics
tools on top of a relational database, which is user
friendly and yet highly efficient. Instead of
constraining programmers to SQL, Vertexica offers a
popular vertex-centric query interface, which is more
natural for analysts to express many graph queries. The
programmers simply provide their vertex-compute
functions and Vertexica takes care of efficiently
executing them in the standard SQL engine. The
advantage of using Vertexica is its ability to leverage
the relational features and enable much more
sophisticated graph analysis. These include expressing
graph algorithms which are difficult in vertex-centric
but straightforward in SQL and the ability to compose
end-to-end data processing pipelines, including pre-
and post- processing of graphs as well as combining
multiple algorithms for deeper insights. Vertexica has
a graphical user interface and we outline several
demonstration scenarios including, interactive graph
analysis, complex graph analysis, and continuous and
time series analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Quamar:2014:NNC,
author = "Abdul Quamar and Amol Deshpande and Jimmy Lin",
title = "{NScale}: neighborhood-centric analytics on large
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1673--1676",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is an increasing interest in executing rich and
complex analysis tasks over large-scale graphs, many of
which require processing and reasoning about a large
number of multi-hop neighborhoods or subgraphs in the
graph. Examples of such tasks include ego network
analysis, motif counting in biological networks,
finding social circles, personalized recommendations,
link prediction, anomaly detection, analyzing influence
cascades, and so on. These tasks are not well served by
existing vertex-centric graph processing frameworks
whose computation and execution models limit the user
program to directly access the state of a single
vertex, resulting in high communication, scheduling,
and memory overheads in executing such tasks. Further,
most existing graph processing frameworks also
typically ignore the challenges in extracting the
relevant portions of the graph that an analysis task is
interested in, and loading it onto distributed memory.
In this demonstration proposal, we describe NScale, a
novel end-to-end graph processing framework that
enables the distributed execution of complex
neighborhood-centric analytics over large-scale graphs
in the cloud. NScale enables users to write programs at
the level of neighborhoods or subgraphs. NScale uses
Apache YARN for efficient and fault-tolerant
distribution of data and computation; it features GEL,
a novel graph extraction and loading phase, that
extracts the relevant portions of the graph and loads
them into distributed memory using as few machines as
possible. NScale utilizes novel techniques for the
distributed execution of user computation that minimize
memory consumption by exploiting overlap among the
neighborhoods of interest. A comprehensive experimental
evaluation shows orders-of-magnitude improvements in
performance and total cost over vertex-centric
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:DDP,
author = "Haoran Li and Li Xiong and Lifan Zhang and Xiaoqian
Jiang",
title = "{DPSynthesizer}: differentially private data
synthesizer for privacy preserving data sharing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1677--1680",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy has recently emerged in private
statistical data release as one of the strongest
privacy guarantees. Releasing synthetic data that mimic
original data with differential privacy provides a
promising way for privacy preserving data sharing and
analytics while providing a rigorous privacy guarantee.
However, to this date there is no open-source tools
that allow users to generate differentially private
synthetic data, in particular, for high dimensional and
large domain data. Most of the existing techniques that
generate differentially private histograms or synthetic
data only work well for single dimensional or
low-dimensional histograms. They become problematic for
high dimensional and large domain data due to increased
perturbation error and computation complexity. We
propose DPSynthesizer, a toolkit for differentially
private data synthesization. The core of DPSynthesizer
is DPCopula designed for high-dimensional and
large-domain data. DPCopula computes a differentially
private copula function from which synthetic data can
be sampled. Copula functions are used to describe the
dependence between multivariate random vectors and
allow us to build the multivariate joint distribution
using one-dimensional marginal distributions.
DPSynthesizer also implements a set of state-of-the-art
methods for building differentially private histograms,
suitable for low-dimensional data, from which synthetic
data can be generated. We will demonstrate the system
using DPCopula as well as other methods with various
data sets and show the feasibility, utility, and
efficiency of various methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kong:2014:SLS,
author = "Longbo Kong and Zhi Liu and Yan Huang",
title = "{SPOT}: locating social media users based on social
network context",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1681--1684",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A tremendous amount of information is being shared
everyday on social media sites such as Facebook,
Twitter or Google+. But only a small portion of users
provide their location information, which can be
helpful in targeted advertisement and many other
services. In this demo we present our large scale user
location estimation system, SPOT, which showcase
different location estimating models on real world data
sets. The demo shows three different location
estimation algorithms: a friend-based, a social
closeness-based, and an energy and local social
coefficient based. The first algorithm is a baseline
and the other two new algorithms utilize social
closeness information which was traditionally treated
as a binary friendship. The two algorithms are based on
the premise that friends are different and close
friends can help to estimate location better. The demo
will also show that all three algorithms benefit from a
confidence-based iteration method. The demo is
web-based. A user can specify different settings,
explore the estimation results on a map, and observe
the statistical information, e.g. accuracy and average
friends used in the estimation, dynamically. The demo
provides two datasets: Twitter (148,860 located users)
and Gowalla (99,563 located users). Furthermore, a user
can filter users with certain features, e.g. with more
than 100 friends, to see how the estimating models work
on a particular case. The estimated and real locations
of those users as well as their friends will be
displayed on the map.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alavi:2014:RQE,
author = "Zohreh Alavi and Lu Zhou and James Powers and Keke
Chen",
title = "{RASP-QS}: efficient and confidential query services
in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1685--1688",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hosting data query services in public clouds is an
attractive solution for its great scalability and
significant cost savings. However, data owners also
have concerns on data privacy due to the lost control
of the infrastructure. This demonstration shows a
prototype for efficient and confidential range/kNN
query services built on top of the random space
perturbation (RASP) method. The RASP approach provides
a privacy guarantee practical to the setting of
cloud-based computing, while enabling much faster query
processing compared to the encryption-based approach.
This demonstration will allow users to more intuitively
understand the technical merits of the RASP approach
via interactive exploration of the visual interface.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kunjir:2014:TTM,
author = "Mayuresh Kunjir and Prajakta Kalmegh and Shivnath
Babu",
title = "{Thoth}: towards managing a multi-system cluster",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1689--1692",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Following the 'no one size fits all' philosophy,
active research in big data platforms is focusing on
creating an environment for multiple 'one-size' systems
to co-exist and cooperate in the same cluster.
Consequently, it has now become imperative to provide
an integrated management solution that provides a
database-centric view of the underlying multi-system
environment. We outline the proposal of DBMS$^+$, a
database management platform over multiple 'one-size'
systems. Our prototype implementation of DBMS$^+$,
called Thoth, adaptively chooses a best-fit system
based on application requirements. In this
demonstration, we propose to showcase Thoth DM, a data
management framework for Thoth which consists of a data
collection pipeline utility, data consolidation and
dispatcher module, and a warehouse for storing this
data. We further introduce the notion of apps; an app
is a utility that registers with Thoth DM and
interfaces with its warehouse to provide core database
management functionalities like dynamic provisioning of
resources, designing a multi-system-aware optimizer,
tuning of configuration parameters on each system, data
storage, and layout schemes. We will demonstrate Thoth
DM in action over Hive, Hadoop, Shark, Spark, and the
Hadoop Distributed File System. This demonstration will
focus on the following apps: (i) Dashboard for
administration and control that will let the audience
monitor and visualize a database-centric view of the
multi-system cluster, and (ii) Data Layout Recommender
app will allow searching for the optimal data layout in
the multi-system setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:XLC,
author = "Lei Zhang and Achim Rettinger",
title = "{X-LiSA}: cross-lingual semantic annotation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1693--1696",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ever-increasing quantities of structured knowledge
on the Web and the impending need of multilinguality
and cross-linguality for information access pose new
challenges but at the same time open up new
opportunities for knowledge extraction research. In
this regard, cross-lingual semantic annotation has
emerged as a topic of major interest and it is
essential to build tools that can link words and
phrases in unstructured text in one language to
resources in structured knowledge bases in any other
language. In this paper, we demonstrate X-LiSA, an
infrastructure for cross-lingual semantic annotation,
which supports both service-oriented and user-oriented
interfaces for annotating text documents and web pages
in different languages using resources from Wikipedia
and Linked Open Data (LOD).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jayachandran:2014:CUI,
author = "Prasanth Jayachandran and Karthik Tunga and Niranjan
Kamat and Arnab Nandi",
title = "Combining user interaction, speculative query
execution and sampling in the {DICE} system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1697--1700",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The interactive exploration of data cubes has become a
popular application, especially over large datasets. In
this paper, we present DICE, a combination of a novel
frontend query interface and distributed aggregation
backend that enables interactive cube exploration. DICE
provides a convenient, practical alternative to the
typical offline cube materialization strategy by
allowing the user to explore facets of the data cube,
trading off accuracy for interactive response-times, by
sampling the data. We consider the time spent by the
user perusing the results of their current query as an
opportunity to execute and cache the most likely
followup queries. The frontend presents a novel
intuitive interface that allows for sampling-aware
aggregations, and encourages interaction via our
proposed faceted model. The design of our backend is
tailored towards the low-latency user interaction at
the frontend, and vice-versa. We discuss the
synergistic design behind both the frontend user
experience and the backend architecture of DICE; and,
present a demonstration that allows the user to fluidly
interact with billion-tuple datasets within sub-second
interactive response times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Su:2014:SSM,
author = "Han Su and Kai Zheng and Kai Zeng and Jiamin Huang and
Xiaofang Zhou",
title = "{STMaker}: a system to make sense of trajectory data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1701--1704",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Widely adoption of GPS-enabled devices generates large
amounts of trajectories every day. The raw trajectory
data describes the movement history of moving objects
by a sequence of (longitude, latitude, time-stamp)
triples, which are nonintuitive for human to perceive
the prominent features of the trajectory, such as where
and how the moving object travels. In this demo, we
present the STMaker system to help users make sense of
individual trajectories. Given a trajectory, STMaker
can automatically extract the significant semantic
behavior of the trajectory, and summarize the behavior
by a short human-readable text. In this paper, we first
introduce the phrases of generating trajectory
summarizations, and then show several real trajectory
summarization cases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jugel:2014:FVA,
author = "Uwe Jugel and Zbigniew Jerzak and Gregor Hackenbroich
and Volker Markl",
title = "Faster visual analytics through pixel-perfect
aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1705--1708",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "State-of-the-art visual data analysis tools ignore
bandwidth limitations. They fetch millions of records
of high-volume time series data from an underlying
RDBMS to eventually draw only a few thousand pixels on
the screen. In this work, we demonstrate a pixel-aware
big data visualization system that dynamically adapts
the number of data points transmitted and thus the data
rate, while preserving pixel-perfect visualizations. We
show how to carefully select the data points to fetch
for each pixel of a visualization, using a
visualization-driven data aggregation that models the
visualization process. Defining all required data
reduction operators at the query level, our system
trades off a few milliseconds of query execution time
for dozens of seconds of data transfer time. The
results are significantly reduced response times and a
near real-time visualization of millions of data
points. Using our pixel-aware system, the audience will
be able to enjoy the speed and ease of big data
visualizations and learn about the scientific
background of our system through an interactive
evaluation component, allowing the visitor to measure,
visualize, and compare competing visualization-related
data reduction techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khan:2014:SBG,
author = "Arijit Khan and Sameh Elnikety",
title = "Systems for big-graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1709--1710",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs have become increasingly important to represent
highly-interconnected structures and schema-less data
including the World Wide Web, social networks,
knowledge graphs, genome and scientific databases,
medical and government records. The massive scale of
graph data easily overwhelms the main memory and
computation resources on commodity servers. In these
cases, achieving low latency and high throughput
requires partitioning the graph and processing the
graph data in parallel across a cluster of servers.
However, the software and hardware advances that have
worked well for developing parallel databases and
scientific applications are not necessarily effective
for big-graph problems. Graph processing poses
interesting system challenges: graphs represent
relationships which are usually irregular and
unstructured; and therefore, the computation and data
access patterns have poor locality. Hence, the last few
years has seen an unprecedented interest in building
systems for big-graphs by various communities including
databases, systems, semantic web, machine learning, and
operations research. In this tutorial, we discuss the
design of the emerging systems for processing of
big-graphs, key features of distributed graph
algorithms, as well as graph partitioning and workload
balancing techniques. We emphasize the current
challenges and highlight some future research
directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gal:2014:UER,
author = "Avigdor Gal",
title = "Uncertain entity resolution: re-evaluating entity
resolution in the big data era: tutorial",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1711--1712",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity resolution is a fundamental problem in data
integration dealing with the combination of data from
different sources to a unified view of the data. Entity
resolution is inherently an uncertain process because
the decision to map a set of records to the same entity
cannot be made with certainty unless these are
identical in all of their attributes or have a common
key. In the light of recent advancement in data
accumulation, management, and analytics landscape
(known as big data) the tutorial re-evaluates the
entity resolution process and in particular looks at
best ways to handle data veracity. The tutorial ties
entity resolution with recent advances in probabilistic
database research, focusing on sources of uncertainty
in the entity resolution process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Suchanek:2014:KBA,
author = "Fabian M. Suchanek and Gerhard Weikum",
title = "Knowledge bases in the age of big data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1713--1714",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This tutorial gives an overview on state-of-the-art
methods for the automatic construction of large
knowledge bases and harnessing them for data and text
analytics. It covers both big-data methods for building
knowledge bases and knowledge bases being assets for
big-data applications. The tutorial also points out
challenges and research opportunities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meliou:2014:CED,
author = "Alexandra Meliou and Sudeepa Roy and Dan Suciu",
title = "Causality and explanations in databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1715--1716",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the surge in the availability of information,
there is a great demand for tools that assist users in
understanding their data. While today's exploration
tools rely mostly on data visualization, users often
want to go deeper and understand the underlying causes
of a particular observation. This tutorial surveys
research on causality and explanation for data-oriented
applications. We will review and summarize the research
thus far into causality and explanation in the database
and AI communities, giving researchers a snapshot of
the current state of the art on this topic, and propose
a unified framework as well as directions for future
research. We will cover both the theory of
causality/explanation and some applications; we also
discuss the connections with other topics in database
research like provenance, deletion propagation, why-not
queries, and OLAP techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:ESB,
author = "Yunyao Li and Ziyang Liu and Huaiyu Zhu",
title = "Enterprise search in the big data era: recent
developments and open challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1717--1718",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enterprise search allows users in an enterprise to
retrieve desired information through a simple search
interface. It is widely viewed as an important
productivity tool within an enterprise. While Internet
search engines have been highly successful, enterprise
search remains notoriously challenging due to a variety
of unique challenges, and is being made more so by the
increasing heterogeneity and volume of enterprise data.
On the other hand, enterprise search also presents
opportunities to succeed in ways beyond current
Internet search capabilities. This tutorial presents an
organized overview of these challenges and
opportunities, and reviews the state-of-the-art
techniques for building a reliable and high quality
enterprise search engine, in the context of the rise of
big data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:VPD,
author = "Yunyao Li and Erich Neuhold",
title = "{VLDB 2014} {Ph.D.} workshop: an overview",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1719--1719",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract =     "The VLDB 2014 PhD Workshop is a one-day event to be
held in Hangzhou, China on September 1st, 2014, in
conjunction with VLDB 2014. The aim of this workshop is
to provide helpful feedback, useful information and
networking opportunities that can benefit the students'
dissertation work as well as their long-term career.
The selection process and the workshop program were
carefully designed with this specific goal in mind. The
accepted submissions are included in the online
proceedings for the Workshop at
\path|http://www.vldb.org/2014/phd_workshop_proceedings.html|",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Venkataraman:2014:DCG,
author = "Shivakumar Venkataraman and Divyakant Agrawal",
title = "Datacenters as computers: {Google} engineering \&
database research perspectives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1720--1721",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this collaborative keynote address, we will share
Google's experience in building a scalable data
infrastructure that leverages datacenters for managing
Google's advertising data over the last decade. In
order to support the massive online advertising
platform at Google, the data infrastructure must
simultaneously support both transactional and
analytical workloads. The focus of this talk will be to
highlight how the datacenter architecture and the cloud
computing paradigm has enabled us to manage the
exponential growth in data volumes and user queries,
make our services highly available and fault tolerant
to massive datacenter outages, and deliver results with
very low latencies. We note that other Internet
companies have also undergone similar growth in data
volumes and user queries. In fact, this phenomenon has
resulted in at least two new terms in the technology
lexicon: big data and cloud computing. Cloud computing
(and datacenters) have been largely responsible for
scaling the data volumes from terabytes range just a
few years ago to now reaching in the exabyte range over
the next couple of years. Delivering solutions at this
scale that are fault-tolerant, latency sensitive, and
highly available requires a combination of research
advances with engineering ingenuity at Google and
elsewhere. Next, we will try to answer the following
question: is a datacenter just another (very large)
computer? Or, does it fundamentally change the design
principles for data-centric applications and systems.
We will conclude with some of the unique research
challenges that need to be addressed in order to
sustain continuous growth in data volumes while
supporting high throughput and low latencies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Plattner:2014:ICM,
author = "Hasso Plattner",
title = "The impact of columnar in-memory databases on
enterprise systems: implications of eliminating
transaction-maintained aggregates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1722--1729",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Five years ago I proposed a common database approach
for transaction processing and analytical systems using
a columnar in-memory database, disputing the common
belief that column stores are not suitable for
transactional workloads. Today, the concept has been
widely adopted in academia and industry and it is
proven that it is feasible to run analytical queries on
large data sets directly on a redundancy-free schema,
eliminating the need to maintain pre-built aggregate
tables during data entry transactions. The resulting
reduction in transaction complexity leads to a dramatic
simplification of data models and applications,
redefining the way we build enterprise systems. First
analyses of productive applications adopting this
concept confirm that system architectures enabled by
in-memory column stores are conceptually superior for
business transaction processing compared to row-based
approaches. Additionally, our analyses show a shift of
enterprise workloads to even more read-oriented
processing due to the elimination of updates of
transaction-maintained aggregates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Markl:2014:BCD,
author = "Volker Markl",
title = "Breaking the chains: on declarative data analysis and
data independence in the big data era",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1730--1733",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data management research, systems, and technologies
have drastically improved the availability of data
analysis capabilities, particularly for non-experts,
due in part to low-entry barriers and reduced ownership
costs (e.g., for data management infrastructures and
applications). Major reasons for the widespread success
of database systems and today's multi-billion dollar
data management market include data independence,
separating physical representation and storage from the
actual information, and declarative languages,
separating the program specification from its intended
execution environment. In contrast, today's big data
solutions do not offer data independence and
declarative specification. As a result, big data
technologies are mostly employed in newly-established
companies with IT-savvy employees or in large
well-established companies with big IT departments. We
argue that current big data solutions will continue to
fall short of widespread adoption, due to usability
problems, despite the fact that in-situ data analytics
technologies achieve a good degree of schema
independence. In particular, we consider the lack of a
declarative specification to be a major road-block,
contributing to the scarcity in available data
scientists available and limiting the application of
big data to the IT-savvy industries. In particular,
data scientists currently have to spend a lot of time
on tuning their data analysis programs for specific
data characteristics and a specific execution
environment. We believe that the research community
needs to bring the powerful concepts of declarative
specification to current data analysis systems, in
order to achieve the broad big data technology adoption
and effectively deliver the promise that novel big data
technologies offer.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Neumann:2014:EHP,
author = "Thomas Neumann",
title = "Engineering high-performance database engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1734--1741",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Developing a database engine is both challenging and
rewarding. Database engines are very complex software
artifacts that have to scale to large data sizes and
large hardware configurations, and developing such
systems usually means choosing between different
trade-offs at various points of development. This
papers gives a survey over two different database
engines, the disk-based SPARQL-processing engine
RDF-3X, and the relational main-memory engine HyPer. It
discusses the design choices that were made during
development, and highlights optimization techniques
that are important for both systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2014:RLC,
author = "Wei Cao and Feng Yu and Jiasen Xie",
title = "Realization of the low cost and high performance
{MySQL} cloud database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1742--1747",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MySQL is a low cost, high performance, good
reliability and open source database product, widely
used in many Internet companies. For example, there are
thousands of MySQL servers being used in Taobao.
Although NoSQL developed very quickly in past two
years, and new products emerged in endlessly, but in
the actual business application of NoSQL, the
requirements to developers are relatively high.
Moreover, MySQL has many more mature middleware,
maintenance tools and a benign ecological circle, so
from this perspective, MySQL dominates in the whole
situation, while NoSQL is as a supplement. We (the core
system database team of Taobao) have done a lot of work
in the field of MySQL hosting platform, designed and
implemented a UMP (Unified MySQL Platform) system, to
provide a low cost and high performance MySQL cloud
database service.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qin:2014:FCS,
author = "An Qin and Dianming Hu and Jun Liu and Wenjun Yang and
Dai Tan",
title = "{Fatman}: cost-saving and reliable archival storage
based on volunteer resources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1748--1753",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present Fatman, an enterprise-scale archival
storage based on volunteer contribution resources from
underutilized web servers, usually deployed on
thousands of nodes with spare storage capacity. Fatman
is specifically designed for enhancing the utilization
of existing storage resources and cutting down the
hardware purchase cost. Two major concerned issues of
the system design are maximizing the resource
utilization of volunteer nodes without violating
Service Level Objectives (SLOs) and minimizing the cost
without reducing the availability of archival system.
Fatman has been widely deployed on tens of thousands of
server nodes across several datacenters, provided more
than 100PB storage capacity and served dozens of
internal mass-data applications. The system realizes an
efficient storage quota consolidation by strong
isolation and budget limitation, to maximally support
resources contribution without any degradation on
host-level SLOs. It firstly improves data reliability
by applying disk failure prediction to diminish failure
recovery cost, named fault-aware data management,
dramatically reduces the MTTR by 76.3\% and decreases
file crash ratio by 35\% on real-life product
workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:DIR,
author = "Shiming Zhang and Yin Yang and Wei Fan and Marianne
Winslett",
title = "Design and implementation of a real-time interactive
analytics system for large spatio-temporal data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1754--1759",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In real-time interactive data analytics, the user
expects to receive the results of each query within a
short time period such as seconds. This is especially
challenging when the data is big (e.g., on the scale of
petabytes), and the analytics system runs on top of
cloud infrastructure (e.g., thousands of interconnected
commodity servers). We have been building such a
system, called OceanRT, for managing large
spatio-temporal data such as call logs and mobile web
browsing records collected by a telecommunication
company. Although there already exist systems for
querying big data in real time, OceanRT's performance
stands out due to several novel designs and components
that address key efficiency and scalability issues that
were largely overlooked in existing systems. First,
OceanRT makes extensive use of software RDMA one-sided
operations, which reduce networking costs without
requiring specialized hardware. Second, OceanRT
exploits the parallel computing capabilities of each
node in the cloud through a novel architecture
consisting of Access-Query Engines (AQEs) connected
with minimal overhead. Third, OceanRT contains a novel
storage scheme that optimizes for queries with joins
and multi-dimensional selections, which are common for
large spatio-temporal data. Experiments using the
TPC-DS benchmark show that OceanRT is usually more than
an order of magnitude faster than the current
state-of-the-art systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dai:2014:PRS,
author = "Chaoyue Dai and Feng Qian and Wei Jiang and Zhoutian
Wang and Zenghong Wu",
title = "A personalized recommendation system for {NetEase}
dating site",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1760--1765",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the popularity of internet, more and more people
try to find friends or dating partners on online dating
web sites. Recommending appropriate partners from a
large amount of candidates becomes an interesting and
challenging problem in the field of recommendation
system. Various types of recommendation techniques
(e.g., content based recommendation, collaborative
filtering and association rule mining) have be proposed
to tackle this problem. However most of them ignore the
personalization concerns that they (1) mainly consider
the hot users or frequent items, (2) cover only a
portion of users especially ignoring the long tails,
(3) and cannot deal with the cold start problem
properly. In this paper, we present a regression based
hybrid recommendation system that makes use of matching
degree, fancy degree, activity, sincerity, popularity
and enthusiasm, to recommend appropriate partners. The
experimental evaluation of our recommendation system on
a real dating web site shows our strategy is more
effective and efficient than its previous version which
follows the principle of giving higher priority to the
recent active users.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ling:2014:GIH,
author = "Zheng Jye Ling and Quoc Trung Tran and Ju Fan and
Gerald C. H. Koh and Thi Nguyen and Chuen Seng Tan and
James W. L. Yip and Meihui Zhang",
title = "{GEMINI}: an integrative healthcare analytics system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1766--1771",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Healthcare systems around the world are facing the
challenge of information overload in caring for
patients in an affordable, safe and high-quality manner
in a system with limited healthcare resources and
increasing costs. To alleviate this problem, we develop
an integrative healthcare analytics system called
GEMINI which allows point of care analytics for doctors
where real-time usable and relevant information of
their patients are required through the questions they
asked about the patients they are caring for. GEMINI
extracts data of each patient from various data sources
and stores them as information in a patient profile
graph. The data sources are complex and varied
consisting of both structured data (such as, patients'
demographic data, laboratory results and medications)
and unstructured data (such as, doctors' notes). Hence,
the patient profile graph provides a holistic and
comprehensive information of patients' healthcare
profile, from which GEMINI can infer implicit
information useful for administrative and clinical
purposes, and extract relevant information for
performing predictive analytics. At the core, GEMINI
keeps interacting with the healthcare professionals as
part of a feedback loop to gather, infer, ascertain and
enhance the self-learning knowledge base. We present a
case study on using GEMINI to predict the risk of
unplanned patient readmissions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2014:MTD,
author = "Yongqiang Zou and Xing Jin and Yi Li and Zhimao Guo
and Eryu Wang and Bin Xiao",
title = "{Mariana}: {Tencent} deep learning platform and its
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1772--1777",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deep learning gains lots of attentions in recent years
and is more and more important for mining values in big
data. However, to make deep learning practical for a
wide range of applications in Tencent Inc., three
requirements must be considered: (1) Lots of
computational power are required to train a practical
model with tens of millions of parameters and billions
of samples for products such as automatic speech
recognition (ASR), and the number of parameters and
training data is still growing. (2) The capability of
training larger model is necessary for better model
quality. (3) Easy to use frameworks are valuable to do
many experiments to perform model selection, such as
finding an appropriate optimization algorithm and
tuning optimal hyper-parameters. To accelerate
training, support large models, and make experiments
easier, we built Mariana, the Tencent deep learning
platform, which utilizes GPU and CPU cluster to train
models parallelly with three frameworks: (1) a
multi-GPU data parallelism framework for deep neural
networks (DNNs). (2) a multi-GPU model parallelism and
data parallelism framework for deep convolutional
neural networks (CNNs). (3) a CPU cluster framework for
large scale DNNs. Mariana also provides built-in
algorithms and features to facilitate experiments.
Mariana is in production usage for more than one year,
achieves state-of-the-art acceleration performance, and
plays a key role in training models and improving
quality for automatic speech recognition and image
recognition in Tencent WeChat, a mobile social
platform, and for Ad click-through rate prediction
(pCTR) in Tencent QQ, an instant messaging platform,
and Tencent Qzone, a social networking service.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2014:YPC,
author = "Sai Wu and Chun Chen and Gang Chen and Ke Chen and
Lidan Shou and Hui Cao and He Bai",
title = "{YZStack}: provisioning customizable solution for big
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1778--1783",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "YZStack is our developing solution which implements
many well-established big data techniques as selectable
modules and allows users to customize their systems as
a process of module selection. In particular, it
includes an openstack based IaaS (Infrastructure as a
Service) layer, a distributed file system based DaaS
(Data as a Service) layer, a PaaS (Platform as a
Service) layer equipped with parallel processing
techniques and a SaaS (Software as a Service) layer
with popular data analytic algorithms. Layers of
YZStack are loosely connected, so that customization of
one layer does not affect the other layers and their
interactions. In this paper, we use a smart financial
system developed for the Zhejiang Provincial Department
of Finance to demonstrate how to leverage YZStack to
speed up the implementation of big data system. We also
introduce two popular applications of the financial
system, economic prediction and detection of improper
payment.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Klonatos:2014:EBE,
author = "Yannis Klonatos and Christoph Koch and Tiark Rompf and
Hassan Chafi",
title = "Errata for {``Building efficient query engines in a
high-level language'': PVLDB {\bf 7}(10):853--864}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "13",
pages = "1784--1784",
month = aug,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:31 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{Klonatos:2014:BEQ}.",
abstract = "This is in response to recent feedback from our peers
that calls for a number of clarifications regarding the
experimental section of our paper.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2014:SMM,
author = "Wei Lu and Shanshan Chen and Keqian Li and Laks V. S.
Lakshmanan",
title = "Show me the money: dynamic recommendations for revenue
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1785--1796",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recommender Systems (RS) play a vital role in
applications such as e-commerce and on-demand content
streaming. Research on RS has mainly focused on the
customer perspective, i.e., accurate prediction of user
preferences and maximization of user utilities. As a
result, most existing techniques are not explicitly
built for revenue maximization, the primary business
goal of enterprises. In this work, we explore and
exploit a novel connection between RS and the
profitability of a business. As recommendations can be
seen as an information channel between a business and
its customers, it is interesting and important to
investigate how to make strategic dynamic
recommendations leading to maximum possible revenue. To
this end, we propose a novel revenue model that takes
into account a variety of factors including prices,
valuations, saturation effects, and competition amongst
products. Under this model, we study the problem of
finding revenue-maximizing recommendation strategies
over a finite time horizon. We show that this problem
is NP-hard, but approximation guarantees can be
obtained for a slightly relaxed version, by
establishing an elegant connection to matroid theory.
Given the prohibitively high complexity of the
approximation algorithm, we also design intelligent
heuristics for the original problem. Finally, we
conduct extensive experiments on two real and synthetic
datasets and demonstrate the efficiency, scalability,
and effectiveness of our algorithms, and that they
significantly outperform several intuitive baselines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2014:SSG,
author = "Peng Lu and Gang Chen and Beng Chin Ooi and Hoang Tam
Vo and Sai Wu",
title = "{ScalaGiST}: scalable generalized search trees for
{MapReduce} systems [innovative systems paper]",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1797--1808",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce has become the state-of-the-art for data
parallel processing. Nevertheless, Hadoop, an
open-source equivalent of MapReduce, has been noted to
have sub-optimal performance in the database context
since it is initially designed to operate on raw data
without utilizing any type of indexes. To alleviate the
problem, we present ScalaGiST --- scalable generalized
search tree that can be seamlessly integrated with
Hadoop, together with a cost-based data access
optimizer for efficient query processing at run-time.
ScalaGiST provides extensibility in terms of data and
query types, hence is able to support unconventional
queries (e.g., multi-dimensional range and $k$-NN
queries) in MapReduce systems, and can be dynamically
deployed in large cluster environments for handling big
users and data. We have built ScalaGiST and
demonstrated that it can be easily instantiated to
common B$^+$ -tree and R-tree indexes yet for dynamic
distributed environments. Our extensive performance
study shows that ScalaGiST can provide efficient write
and read performance, elastic scaling property, as well
as effective support for MapReduce execution of ad-hoc
analytic queries. Performance comparisons with recent
proposals of specialized distributed index structures,
such as SpatialHadoop, Data Mapping, and RT-CAN further
confirm its efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2014:FPK,
author = "Mohan Yang and Bolin Ding and Surajit Chaudhuri and
Kaushik Chakrabarti",
title = "Finding patterns in a knowledge base using keywords to
compose table answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1809--1820",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We aim to provide table answers to keyword queries
using a knowledge base. For queries referring to
multiple entities, like ``Washington cities
population'' and ``Mel Gibson movies'', it is better to
represent each relevant answer as a table which
aggregates a set of entities or joins of entities
within the same table scheme or pattern. In this paper,
we study how to find highly relevant patterns in a
knowledge base for user-given keyword queries to
compose table answers. A knowledge base is modeled as a
directed graph called knowledge graph, where nodes
represent its entities and edges represent the
relationships among them. Each node/edge is labeled
with type and text. A pattern is an aggregation of
subtrees which contain all keywords in the texts and
have the same structure and types on node/edges. We
propose efficient algorithms to find patterns that are
relevant to the query for a class of scoring functions.
We show the hardness of the problem in theory, and
propose path-based indexes that are affordable in
memory. Two query-processing algorithms are proposed:
one is fast in practice for small queries (with small
numbers of patterns as answers) by utilizing the
indexes; and the other one is better in theory, with
running time linear in the sizes of indexes and
answers, which can handle large queries better. We also
conduct extensive experimental study to compare our
approaches with a naive adaption of known techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2014:PAG,
author = "Da Yan and James Cheng and Kai Xing and Yi Lu and
Wilfred Ng and Yingyi Bu",
title = "{Pregel} algorithms for graph connectivity problems
with performance guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1821--1832",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs in real life applications are often huge, such
as the Web graph and various social networks. These
massive graphs are often stored and processed in
distributed sites. In this paper, we study graph
algorithms that adopt Google's Pregel, an iterative
vertex-centric framework for graph processing in the
Cloud. We first identify a set of desirable properties
of an efficient Pregel algorithm, such as linear space,
communication and computation cost per iteration, and
logarithmic number of iterations. We define such an
algorithm as a practical Pregel algorithm (PPA). We
then propose PPAs for computing connected components
(CCs), biconnected components (BCCs) and strongly
connected components (SCCs). The PPAs for computing
BCCs and SCCs use the PPAs of many fundamental graph
problems as building blocks, which are of interest by
themselves. Extensive experiments over large real
graphs verified the efficiency of our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shang:2014:AAG,
author = "Zechao Shang and Jeffrey Xu Yu",
title = "Auto-approximation of graph computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1833--1844",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the big data era, graph computing is one of the
challenging issues because there are numerous large
graph datasets emerging from real applications. A
question is: do we need to know the final exact answer
for a large graph? When it is impossible to know the
exact answer in a limited time, is it possible to
approximate the final answer in an automatic and
systematic way without having to design new
approximate algorithms? The main idea behind the
question is: it is more important to find out something
meaningful quickly from a large graph, and we should
focus on finding a way of making use of large graphs
instead of spending time on designing approximate
algorithms. In this paper, we give an innovative
approach which automatically and systematically
synthesizes a program to approximate the original
program. We show that we can give users some answers
with reasonable accuracy and high efficiency for a wide
spectrum of graph algorithms, without having to know
the details of graph algorithms. We have conducted
extensive experimental studies using many graph
algorithms that are supported in the existing graph
systems and large real graphs. Our extensive
experimental results reveal that our automatically
approximating approach is highly feasible.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Furche:2014:DTW,
author = "Tim Furche and Georg Gottlob and Giovanni Grasso and
Xiaonan Guo and Giorgio Orsi and Christian Schallhart
and Cheng Wang",
title = "{DIADEM}: thousands of websites to a single database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1845--1856",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The web is overflowing with implicitly structured
data, spread over hundreds of thousands of sites,
hidden deep behind search forms, or siloed in
marketplaces, only accessible as HTML. Automatic
extraction of structured data at the scale of thousands
of websites has long proven elusive, despite its
central role in the ``web of data''. Through an
extensive evaluation spanning over 10000 web sites from
multiple application domains, we show that automatic,
yet accurate full-site extraction is no longer a
distant dream. diadem is the first automatic full-site
extraction system that is able to extract structured
data from different domains at very high accuracy. It
combines automated exploration of websites,
identification of relevant data, and induction of
exhaustive wrappers. Automating these components is the
first challenge. diadem overcomes this challenge by
combining phenomenological and ontological knowledge.
Integrating these components is the second challenge.
diadem overcomes this challenge through a self-adaptive
network of relational transducers that produces
effective wrappers for a wide variety of websites. Our
extensive and publicly available evaluation shows that,
for more than 90\% of sites from three domains, diadem
obtains an effective wrapper that extracts all relevant
data with 97\% average precision. diadem also tolerates
noisy entity recognisers, and its components
individually outperform comparable approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2014:UAQ,
author = "Wentao Wu and Xi Wu and Hakan Hacig{\"u}m{\"u}s and
Jeffrey F. Naughton",
title = "Uncertainty aware query execution time prediction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1857--1868",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Predicting query execution time is a fundamental issue
underlying many database management tasks. Existing
predictors rely on information such as cardinality
estimates and system performance constants that are
difficult to know exactly. As a result, accurate
prediction still remains elusive for many queries.
However, existing predictors provide a single, point
estimate of the true execution time, but fail to
characterize the uncertainty in the prediction. In this
paper, we take a first step towards providing
uncertainty information along with query execution time
predictions. We use the query optimizer's cost model to
represent the query execution time as a function of the
selectivities of operators in the query plan as well as
the constants that describe the cost of CPU and I/O
operations in the system. By treating these quantities
as random variables rather than constants, we show that
with low overhead we can infer the distribution of
likely prediction errors. We further show that the
estimated prediction errors by our proposed techniques
are strongly correlated with the actual prediction
errors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Konstantinidis:2014:OCS,
author = "George Konstantinidis and Jos{\'e} Luis Ambite",
title = "Optimizing the chase: scalable data integration under
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1869--1880",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We are interested in scalable data integration and
data exchange under constraints/dependencies. In data
exchange the problem is how to materialize a target
database instance, satisfying the source-to-target and
target dependencies, that provides the certain answers.
In data integration, the problem is how to rewrite a
query over the target schema into a query over the
source schemas that provides the certain answers. In
both these problems we make use of the chase algorithm,
the main tool to reason with dependencies. Our first
contribution is to introduce the frugal chase, which
produces smaller universal solutions than the standard
chase, still remaining polynomial in data complexity.
Our second contribution is to use the frugal chase to
scale up query answering using views under LAV weakly
acyclic target constraints, a useful language capturing
RDF/S. The latter problem can be reduced to query
rewriting using views without constraints by chasing
the source-to-target mappings with the target
constraints. We construct a compact graph-based
representation of the mappings and the constraints and
develop an efficient algorithm to run the frugal chase
on this representation. We show experimentally that our
approach scales to large problems, speeding up the
compilation of the dependencies into the mappings by
close to 2 and 3 orders of magnitude, compared to the
standard and the core chase, respectively. Compared to
the standard chase, we improve online query rewriting
time by a factor of 3, while producing equivalent, but
smaller, rewritings of the original query.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Athanassoulis:2014:BTA,
author = "Manos Athanassoulis and Anastasia Ailamaki",
title = "{BF}-tree: approximate tree indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1881--1892",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The increasing volume of time-based generated data and
the shift in storage technologies suggest that we might
need to reconsider indexing. Several workloads --- like
social and service monitoring --- often include
attributes with implicit clustering because of their
time-dependent nature. In addition, solid state disks
(SSD) (using flash or other low-level technologies)
emerge as viable competitors of hard disk drives (HDD).
Capacity and access times of storage devices create a
trade-off between SSD and HDD. Slow random accesses in
HDD have been replaced by efficient random accesses in
SSD, but their available capacity is one or more orders
of magnitude more expensive than the one of HDD.
Indexing, however, is designed assuming HDD as
secondary storage, thus minimizing random accesses at
the expense of capacity. Indexing data using SSD as
secondary storage requires treating capacity as a
scarce resource. To this end, we introduce approximate
tree indexing, which employs probabilistic data
structures (Bloom filters) to trade accuracy for size
and produce smaller, yet powerful, tree indexes, which
we name Bloom filter trees (BF-Trees). BF-Trees exploit
pre-existing data ordering or partitioning to offer
competitive search performance. We demonstrate, both by
an analytical study and by experimental results, that
by using workload knowledge and reducing indexing
accuracy up to some extent, we can save substantially
on capacity when indexing on ordered or partitioned
attributes. In particular, in experiments with a
synthetic workload, approximate indexing offers
2.22x-48x smaller index footprint with competitive
response times, and in experiments with TPCH and a
monitoring real-life dataset from an energy company, it
offers 1.6x-4x smaller index footprint with competitive
search times as well.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tozun:2014:AAI,
author = "P{\i}nar T{\"o}z{\"u}n and Islam Atta and Anastasia
Ailamaki and Andreas Moshovos",
title = "{ADDICT}: advanced instruction chasing for
transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1893--1904",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent studies highlight that traditional transaction
processing systems utilize the micro-architectural
features of modern processors very poorly. L1
instruction cache and long-latency data misses dominate
execution time. As a result, more than half of the
execution cycles are wasted on memory stalls. Previous
works on reducing stall time aim at improving locality
through either hardware or software techniques.
However, exploiting hardware resources based on the
hints given by the software-side has not been widely
studied for data management systems. In this paper, we
observe that, independently of their high-level
functionality, transactions running in parallel on a
multicore system execute actions chosen from a limited
sub-set of predefined database operations. Therefore,
we initially perform a memory characterization study of
modern transaction processing systems using
standardized benchmarks. The analysis demonstrates that
same-type transactions exhibit at most 6\% overlap in
their data footprints whereas there is up to 98\%
overlap in instructions. Based on the findings, we
design ADDICT, a transaction scheduling mechanism that
aims at maximizing the instruction cache locality.
ADDICT determines the most frequent actions of database
operations, whose instruction footprint can fit in an
L1 instruction cache, and assigns a core to execute
each of these actions. Then, it schedules each action
on its corresponding core. Our prototype implementation
of ADDICT reduces L1 instruction misses by 85\% and the
long latency data misses by 20\%. As a result, ADDICT
leads up to a 50\% reduction in the total execution
time for the evaluated workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alsubaiee:2014:ASO,
author = "Sattam Alsubaiee and Yasser Altowim and Hotham
Altwaijry and Alexander Behm and Vinayak Borkar and
Yingyi Bu and Michael Carey and Inci Cetindil and
Madhusudan Cheelangi and Khurram Faraaz and Eugenia
Gabrielova and Raman Grover and Zachary Heilbron and
Young-Seok Kim and Chen Li and Guangqiang Li and Ji
Mahn Ok and Nicola Onose and Pouria Pirzadeh and
Vassilis Tsotras and Rares Vernica and Jian Wen and
Till Westmann",
title = "{AsterixDB}: a scalable, open source {BDMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1905--1916",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "AsterixDB is a new, full-function BDMS (Big Data
Management System) with a feature set that
distinguishes it from other platforms in today's open
source Big Data ecosystem. Its features make it
well-suited to applications like web data warehousing,
social data storage and analysis, and other use cases
related to Big Data. AsterixDB has a flexible NoSQL
style data model; a query language that supports a wide
range of queries; a scalable runtime; partitioned,
LSM-based data storage and indexing (including
B$^+$-tree, R-tree, and text indexes); support for
external as well as natively stored data; a rich set of
built-in types; support for fuzzy, spatial, and
temporal types and queries; a built-in notion of data
feeds for ingestion of data; and transaction support
akin to that of a NoSQL store. Development of AsterixDB
began in 2009 and led to a mid-2013 initial open source
release. This paper is the first complete description
of the resulting open source AsterixDB system. Covered
herein are the system's data model, its query language,
and its software architecture. Also included are a
summary of the current status of the project and a
first glimpse into how AsterixDB performs when compared
to alternative technologies, including a parallel
relational DBMS, a popular NoSQL store, and a popular
Hadoop-based SQL data analytics platform, for things
that both technologies can do. Also included is a brief
description of some initial trials that the system has
undergone and the lessons learned (and plans laid)
based on those early ``customer'' engagements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2014:LLB,
author = "Ning Xu and Lei Chen and Bin Cui",
title = "{LogGP}: a log-based dynamic graph partitioning
method",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1917--1928",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing availability and scale of graph
data from Web 2.0, graph partitioning becomes one of
efficient preprocessing techniques to balance the
computing workload. Since the cost of partitioning the
entire graph is strictly prohibitive, there are some
recent tentative works towards streaming graph
partitioning which can run faster, be easily
paralleled, and be incrementally updated.
Unfortunately, the experiments show that the running
time of each partitioning is still unbalanced due to
the variation of workload access patterns during the
supersteps. In addition, the one-pass streaming
partitioning result is not always satisfactory for the
algorithms' local view of the graph. In this paper, we
present LogGP, a log-based graph partitioning system
that records, analyzes and reuses the historical
statistical information to refine the partitioning
result. LogGP can be used as a middle-ware and deployed
to many state-of-the-art paralleled graph processing
systems easily. LogGP utilizes the historical
partitioning results to generate a hyper-graph and uses
a novel hyper-graph streaming partitioning approach to
generate a better initial streaming graph partitioning
result. During the execution, the system uses running
logs to optimize graph partitioning which prevents
performance degradation. Moreover, LogGP can
dynamically repartition the massive graphs in
accordance with the structural changes. Extensive
experiments conducted on a moderate size of computing
cluster with real-world graph datasets demonstrate the
superiority of our approach against the
state-of-the-art solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadakis:2014:SMB,
author = "George Papadakis and George Papastefanatos and Georgia
Koutrika",
title = "Supervised meta-blocking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1929--1940",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity Resolution matches mentions of the same entity.
Being an expensive task for large data, its performance
can be improved by blocking, i.e., grouping similar
entities and comparing only entities in the same group.
Blocking improves the run-time of Entity Resolution,
but it still involves unnecessary comparisons that
limit its performance. Meta-blocking is the process of
restructuring a block collection in order to prune such
comparisons. Existing unsupervised meta-blocking
methods use simple pruning rules, which offer a rather
coarse-grained filtering technique that can be
conservative (i.e., keeping too many unnecessary
comparisons) or aggressive (i.e., pruning good
comparisons). In this work, we introduce supervised
meta-blocking techniques that learn classification
models for distinguishing promising comparisons. For
this task, we propose a small set of generic features
that combine a low extraction cost with high
discriminatory power. We show that supervised
meta-blocking can achieve high performance with small
training sets that can be manually created. We
analytically compare our supervised approaches with
baseline and competitor methods over 10 large-scale
datasets, both real and synthetic.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xie:2014:GTK,
author = "Min Xie and Laks V. S. Lakshmanan and Peter T. Wood",
title = "Generating top-$k$ packages via preference
elicitation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1941--1952",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There are several applications, such as play lists of
songs or movies, and shopping carts, where users are
interested in finding top-$k$ packages, consisting of
sets of items. In response to this need, there has been
a recent flurry of activity around extending classical
recommender systems (RS), which are effective at
recommending individual items, to recommend packages,
or sets of items. The few recent proposals for package
RS suffer from one of the following drawbacks: they
either rely on hard constraints which may be difficult
to be specified exactly by the user or on returning
Pareto-optimal packages which are too numerous for the
user to sift through. To overcome these limitations, we
propose an alternative approach for finding
personalized top-$k$ packages for users, by capturing
users' preferences over packages using a linear utility
function which the system learns. Instead of asking a
user to specify this function explicitly, which is
unrealistic, we explicitly model the uncertainty in the
utility function and propose a preference
elicitation-based framework for learning the utility
function through feedback provided by the user. We
propose several sampling-based methods which, given
user feedback, can capture the updated utility
function. We develop an efficient algorithm for
generating top-$k$ packages using the learned utility
function, where the rank ordering respects any of a
variety of ranking semantics proposed in the
literature. Through extensive experiments on both real
and synthetic datasets, we demonstrate the efficiency
and effectiveness of the proposed system for finding
top-$k$ packages.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:FRQ,
author = "Rui Li and Alex X. Liu and Ann L. Wang and Bezawada
Bruhadeshwar",
title = "Fast range query processing with strong privacy
protection for cloud computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1953--1964",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Privacy has been the key road block to cloud computing
as clouds may not be fully trusted. This paper concerns
the problem of privacy preserving range query
processing on clouds. Prior schemes are weak in privacy
protection as they cannot achieve index
indistinguishability, and therefore allow the cloud to
statistically estimate the values of data and queries
using domain knowledge and history query results. In
this paper, we propose the first range query processing
scheme that achieves index indistinguishability under
the indistinguishability against chosen keyword attack
(IND-CKA). Our key idea is to organize indexing
elements in a complete binary tree called PBtree, which
satisfies structure indistinguishability (i.e., two
sets of data items have the same PBtree structure if
and only if the two sets have the same number of data
items) and node indistinguishability (i.e., the values
of PBtree nodes are completely random and have no
statistical meaning). We prove that our scheme is
secure under the widely adopted IND-CKA security model.
We propose two algorithms, namely PBtree traversal
width minimization and PBtree traversal depth
minimization, to improve query processing efficiency.
We prove that the worst case complexity of our query
processing algorithm using PBtree is $ O(| R | \log n)
$, where $n$ is the total number of data items and $R$
is the set of data items in the query result. We
implemented and evaluated our scheme on a real world
data set with 5 million items. For example, for a query
whose results contain ten data items, it takes only
0.17 milliseconds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2014:FTP,
author = "Yihan Gao and Aditya Parameswaran",
title = "Finish them!: pricing algorithms for human
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1965--1976",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a batch of human computation tasks, a commonly
ignored aspect is how the price (i.e., the reward paid
to human workers) of these tasks must be set or varied
in order to meet latency or cost constraints. Often,
the price is set up-front and not modified, leading to
either a much higher monetary cost than needed (if the
price is set too high), or to a much larger latency
than expected (if the price is set too low). Leveraging
a pricing model from prior work, we develop algorithms
to optimally set and then vary price over time in order
to meet a (a) user-specified deadline while minimizing
total monetary cost (b) user-specified monetary budget
constraint while minimizing total elapsed time. We
leverage techniques from decision theory (specifically,
Markov Decision Processes) for both these problems, and
demonstrate that our techniques lead to up to 30\%
reduction in cost over schemes proposed in prior work.
Furthermore, we develop techniques to speed-up the
computation, enabling users to leverage the price
setting algorithms on-the-fly.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Catasta:2014:TTC,
author = "Michele Catasta and Alberto Tonon and Djellel Eddine
Difallah and Gianluca Demartini and Karl Aberer and
Philippe Cudr{\'e}-Mauroux",
title = "{TransactiveDB}: tapping into collective human
memories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1977--1980",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database Management Systems (DBMSs) have been rapidly
evolving in the recent years, exploring ways to store
multi-structured data or to involve human processes
during query execution. In this paper, we outline a
future avenue for DBMSs supporting transactive memory
queries that can only be answered by a collection of
individuals connected through a given interaction
graph. We present TransactiveDB and its ecosystem,
which allow users to pose queries in order to
reconstruct collective human memories. We describe a
set of new transactive operators including TUnion,
TFill, TJoin, and TProjection. We also describe how
TransactiveDB leverages transactive operators---by
mixing query execution, social network analysis and
human computation---in order to effectively and
efficiently tap into the memories of all targeted
users.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2014:BBC,
author = "Da Yan and James Cheng and Yi Lu and Wilfred Ng",
title = "{Blogel}: a block-centric framework for distributed
computation on real-world graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1981--1992",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The rapid growth in the volume of many real-world
graphs (e.g., social networks, web graphs, and spatial
networks) has led to the development of various
vertex-centric distributed graph computing systems in
recent years. However, real-world graphs from different
domains have very different characteristics, which
often create bottlenecks in vertex-centric parallel
graph computation. We identify three such important
characteristics from a wide spectrum of real-world
graphs, namely (1) skewed degree distribution, (2)
large diameter, and (3) (relatively) high density.
Among them, only (1) has been studied by existing
systems, but many real-world power-law graphs also
exhibit the characteristics of (2) and (3). In this
paper, we propose a block-centric framework, called
Blogel, which naturally handles all the three adverse
graph characteristics. Blogel programmers may think
like a block and develop efficient algorithms for
various graph problems. We propose parallel algorithms
to partition an arbitrary graph into blocks
efficiently, and block-centric programs are then run
over these blocks. Our experiments on large real-world
graphs verified that Blogel is able to achieve orders
of magnitude performance improvements over the
state-of-the-art distributed graph computing systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liagouris:2014:EII,
author = "John Liagouris and Manolis Terrovitis",
title = "Efficient identification of implicit facts in
incomplete {OWL2-EL} knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "1993--2004",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Integrating incomplete and possibly inconsistent data
from various sources is a challenge that arises in
several application areas, especially in the management
of scientific data. A rising trend for data integration
is to model the data as axioms in the Web Ontology
Language (OWL) and use inference rules to identify new
facts. Although there are several approaches that
employ OWL for data integration, there is little work
on scalable algorithms able to handle large datasets
that do not fit in main memory. The main contribution
of this paper is an algorithm that allows the effective
use of OWL for integrating data in an environment with
limited memory. The core idea is to exhaustively apply
a set of complex inference rules on large disk-resident
datasets. To the best of our knowledge, this is the
first work that proposes an I/O-aware algorithm for
tackling with such an expressive subset of OWL like the
one we address here. Previous approaches considered
either simpler models (e.g. RDFS) or main-memory
algorithms. In the paper we detail the proposed
algorithm, prove its correctness, and experimentally
evaluate it on real and synthetic data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:WCA,
author = "Chen Jason Zhang and Yongxin Tong and Lei Chen",
title = "Where to: crowd-aided path selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "2005--2016",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the widespread use of geo-positioning services
(GPS), GPS-based navigation systems have become ever
more of an integral part of our daily lives. GPS-based
navigation systems usually suggest multiple paths for
any given pair of source and target, leaving users
perplexed when trying to select the best one among
them, namely the problem of best path selection. Too
many suggested paths may jeopardize the usability of
the recommendation data, and decrease user
satisfaction. Although existing studies have already
partially relieved this problem through integrating
historical traffic logs or updating traffic conditions
periodically, their solutions neglect the potential
contribution of human experience. In this paper, we
resort to crowdsourcing to ease the pain of the best
path selection. The first step of appropriately using
the crowd is to ask proper questions. For the best path
selection problem, simple questions (e.g. binary
voting) over complete paths cannot be directly applied
to road networks due to their being too complex for
crowd workers. Thus, this paper makes the first
contribution by designing two types of questions,
namely Routing Query (RQ) and Binary Routing Query
(BRQ), to ask the crowd to decide which direction to
take at each road intersection. Furthermore, we propose
a series of efficient algorithms to dynamically manage
the questions in order to reduce the selection hardness
within a limited budget. Finally, we compare the
proposed methods against two baselines, and the
effectiveness and efficiency of our proposals are
verified by the results from simulations and
experiments on a real-world crowdsourcing platform.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2014:LSR,
author = "Yan Huang and Favyen Bastani and Ruoming Jin and
Xiaoyang Sean Wang",
title = "Large scale real-time ridesharing with service
guarantee on road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "7",
number = "14",
pages = "2017--2028",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 4 17:20:43 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Urban traffic gridlock is a familiar scene. At the
same time, the mean occupancy rate of personal vehicle
trips in the United States is only 1.6 persons per
vehicle mile. Ridesharing has the potential to solve
many environmental, congestion, pollution, and energy
problems. In this paper, we introduce the problem of
large scale real-time ridesharing with service
guarantee on road networks. Trip requests are
dynamically matched to vehicles while trip waiting and
service time constraints are satisfied. We first
propose two scheduling algorithms: a branch-and-bound
algorithm and an integer programming algorithm.
However, these algorithms do not adapt well to the
dynamic nature of the ridesharing problem. Thus, we
propose kinetic tree algorithms which are better suited
to efficient scheduling of dynamic requests and adjust
routes on-the-fly. We perform experiments on a large
Shanghai taxi dataset. Results show that the kinetic
tree algorithms outperform other algorithms
significantly.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2014:SSA,
author = "Yifang Sun and Wei Wang and Jianbin Qin and Ying Zhang
and Xuemin Lin",
title = "{SRS}: solving $c$-approximate nearest neighbor
queries in high dimensional {Euclidean} space with a
tiny index",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "1--12",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nearest neighbor searches in high-dimensional space
have many important applications in domains such as
data mining, and multimedia databases. The problem is
challenging due to the phenomenon called ``curse of
dimensionality''. An alternative solution is to
consider algorithms that returns a $c$-approximate
nearest neighbor ($c$-ANN) with guaranteed
probabilities. Locality Sensitive Hashing (LSH) is
among the most widely adopted method, and it achieves
high efficiency both in theory and practice. However,
it is known to require an extremely high amount of
space for indexing, hence limiting its scalability. In
this paper, we propose several surprisingly simple
methods to answer $c$-ANN queries with theoretical
guarantees requiring only a single tiny index. Our
methods are highly flexible and support a variety of
functionalities, such as finding the exact nearest
neighbor with any given probability. In the experiment,
our methods demonstrate superior performance against
the state-of-the-art LSH-based methods, and scale up
well to 1 billion high-dimensional points on a single
commodity PC.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dallachiesa:2014:TKN,
author = "Michele Dallachiesa and Themis Palpanas and Ihab F.
Ilyas",
title = "Top-$k$ nearest neighbor search in uncertain data
series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "13--24",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many real applications consume data that is
intrinsically uncertain, noisy and error-prone. In this
study, we investigate the problem of finding the
top-$k$ nearest neighbors in uncertain data series,
which occur in several different domains. We formalize
the top-$k$ nearest neighbor problem for uncertain data
series, and describe a model for uncertain data series
that captures both uncertainty and correlation. This
distinguishes our approach from prior work that
compromises the accuracy of the model by assuming
independence of the value distribution at neighboring
time-stamps. We introduce the Holistic-P$k$NN
algorithm, which uses novel metric bounds for uncertain
series and an efficient refinement strategy to reduce
the overall number of required probability estimates.
We evaluate our proposal under a variety of settings
using a combination of synthetic and 45 real datasets
from diverse domains. The results demonstrate the
significant advantages of the proposed approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:RBP,
author = "Jiexing Li and Jeffrey Naughton and Rimma V. Nehme",
title = "Resource bricolage for parallel database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "25--36",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Running parallel database systems in an environment
with heterogeneous resources has become increasingly
common, due to cluster evolution and increasing
interest in moving applications into public clouds. For
database systems running in a heterogeneous cluster,
the default uniform data partitioning strategy may
overload some of the slow machines while at the same
time it may under-utilize the more powerful machines.
Since the processing time of a parallel query is
determined by the slowest machine, such an allocation
strategy may result in a significant query performance
degradation. We take a first step to address this
problem by introducing a technique we call resource
bricolage that improves database performance in
heterogeneous environments. Our approach quantifies the
performance differences among machines with various
resources as they process workloads with diverse
resource requirements. We formalize the problem of
minimizing workload execution time and view it as an
optimization problem, and then we employ linear
programming to obtain a recommended data partitioning
scheme. We verify the effectiveness of our technique
with an extensive experimental study on a commercial
database system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Graefe:2014:MPB,
author = "Goetz Graefe and Haris Volos and Hideaki Kimura and
Harumi Kuno and Joseph Tucek and Mark Lillibridge and
Alistair Veitch",
title = "In-memory performance for big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "37--48",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "When a working set fits into memory, the overhead
imposed by the buffer pool renders traditional
databases non-competitive with in-memory designs that
sacrifice the benefits of a buffer pool. However,
despite the large memory available with modern
hardware, data skew, shifting workloads, and complex
mixed workloads make it difficult to guarantee that a
working set will fit in memory. Hence, some recent work
has focused on enabling in-memory databases to protect
performance when the working data set almost fits in
memory. Contrary to those prior efforts, we enable
buffer pool designs to match in-memory performance
while supporting the ``big data'' workloads that
continue to require secondary storage, thus providing
the best of both worlds. We introduce here a novel
buffer pool design that adapts pointer swizzling for
references between system objects (as opposed to
application objects), and uses it to practically
eliminate buffer pool overheads for memory-resident
data. Our implementation and experimental evaluation
demonstrate that we achieve graceful performance
degradation when the working set grows to exceed the
buffer pool size, and graceful improvement when the
working set shrinks towards and below the memory and
buffer pool sizes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Long:2014:TSM,
author = "Cheng Long and Raymond Chi-Wing Wong and H. V.
Jagadish",
title = "Trajectory simplification: on minimizing the
direction-based error",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "49--60",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Trajectory data is central to many applications with
moving objects. Raw trajectory data is usually very
large, and so is simplified before it is stored and
processed. Many trajectory simplification notions have
been proposed, and among them, the direction-preserving
trajectory simplification (DPTS) which aims at
protecting the direction information has been shown to
perform quite well. However, existing studies on DPTS
require users to specify an error tolerance which users
might not know how to set properly in some cases (e.g.,
the error tolerance could only be known at some future
time and simply setting one error tolerance does not
meet the needs since the simplified trajectories would
usually be used in many different applications which
accept different error tolerances). In these cases, a
better solution is to minimize the error while
achieving a pre-defined simplification size. For this
purpose, in this paper, we define a problem called
Min-Error and develop two exact algorithms and one
2-factor approximate algorithm for the problem.
Extensive experiments on real datasets verified our
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{ElGebaly:2014:IIE,
author = "Kareem {El Gebaly} and Parag Agrawal and Lukasz Golab
and Flip Korn and Divesh Srivastava",
title = "Interpretable and informative explanations of
outcomes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "61--72",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we solve the following data
summarization problem: given a multi-dimensional data
set augmented with a binary attribute, how can we
construct an interpretable and informative summary of
the factors affecting the binary attribute in terms of
the combinations of values of the dimension attributes?
We refer to such summaries as explanation tables. We
show the hardness of constructing optimally-informative
explanation tables from data, and we propose effective
and efficient heuristics. The proposed heuristics are
based on sampling and include optimizations related to
computing the information content of a summary from a
sample of the data. Using real data sets, we
demonstrate the advantages of explanation tables
compared to related approaches that can be adapted to
solve our problem, and we show significant performance
benefits of our optimizations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:CIN,
author = "Fei Li and H. V. Jagadish",
title = "Constructing an interactive natural language interface
for relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "73--84",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Natural language has been the holy grail of query
interface designers, but has generally been considered
too hard to work with, except in limited specific
circumstances. In this paper, we describe the
architecture of an interactive natural language query
interface for relational databases. Through a carefully
limited interaction with the user, we are able to
correctly interpret complex natural language queries,
in a generic manner across a range of domains. By these
means, a logically complex English language sentence is
correctly translated into a SQL query, which may
include aggregation, nesting, and various types of
joins, among other things, and can be evaluated against
an RDBMS. We have constructed a system, NaLIR (Natural
Language Interface for Relational databases), embodying
these ideas. Our experimental assessment, through user
studies, demonstrates that NaLIR is good enough to be
usable in practice: even naive users are able to
specify quite complex ad-hoc queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2014:LGD,
author = "Yuanyuan Zhu and Jeffrey Xu Yu and Lu Qin",
title = "Leveraging graph dimensions in online graph search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "85--96",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs have been widely used due to its expressive
power to model complicated relationships. However,
given a graph database $ D_g = \{ g_1, g_2, \ldots, g_n
\} $, it is challenging to process graph queries since
a basic graph query usually involves costly graph
operations such as maximum common subgraph and graph
edit distance computation, which are NP-hard. In this
paper, we study a novel DS-preserved mapping which maps
graphs in a graph database $ D_g $ onto a
multidimensional space $ M_g $ under a structural
dimension $M$ using a mapping function $ \phi $ (). The
DS-preserved mapping preserves two things: distance and
structure. By the distance-preserving, it means that
any two graphs $ g_i$ and $ g_j$ in $ D_g$ must map to
two data objects $ \phi (g_i)$ and $ \phi (g_j)$ in $
M_g$, such that the distance, $ d(\phi (g_i), \phi
(g_j))$, between $ \phi (g_i)$ and $ \phi (g_j)$ in $
M_g$ approximates the graph dissimilarity $ \delta
(g_i, g_j)$ in $ D_g$. By the structure-preserving, it
further means that for a given unseen query graph $q$,
the distance between $q$ and any graph $ g_i$ in $ D_g$
needs to be preserved such that $ \delta (q, g_i)
\approx d(\phi (q), \phi (g_i))$. We discuss the
rationality of using graph dimension $M$ for online
graph processing, and show how to identify a small set
of subgraphs to form $M$ efficiently. We propose an
iterative algorithm DSPM to compute the graph
dimension, and discuss its optimization techniques. We
also give an approximate algorithm DSPMap in order to
handle a large graph database. We conduct extensive
performance studies on both real and synthetic datasets
to evaluate the top-$k$ similarity query which is to
find top-$k$ similar graphs from $ D_g$ for a query
graph, and show the effectiveness and efficiency of our
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sidlauskas:2014:SJM,
author = "Darius Sidlauskas and Christian S. Jensen",
title = "Spatial joins in main memory: implementation
matters!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "1",
pages = "97--100",
month = sep,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:33 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A recent PVLDB paper reports on experimental analyses
of ten spatial join techniques in main memory. We build
on this comprehensive study to raise awareness of the
fact that empirical running time performance findings
in main-memory settings are results of not only the
algorithms and data structures employed, but also their
implementation, which complicates the interpretation of
the results. In particular, we re-implement the worst
performing technique without changing the underlying
high-level algorithm, and we then offer evidence that
the resulting re-implementation is capable of
outperforming all the other techniques. This study
demonstrates that in main memory, where no
time-consuming I/O can mask variations in
implementation, implementation details are very
important; and it offers a concrete illustration of how
it is difficult to make conclusions from empirical
running time performance findings in main-memory
settings about data structures and algorithms
studied.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2014:SES,
author = "Xiaoyang Wang and Ying Zhang and Wenjie Zhang and
Xuemin Lin and Wei Wang",
title = "Selectivity estimation on streaming spatio-textual
data using local correlations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "101--112",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we investigate the selectivity
estimation problem for streaming spatio-textual data,
which arises in many social network and geo-location
applications. Specifically, given a set of continuously
and rapidly arriving spatio-textual objects, each of
which is described by a geo-location and a short text,
we aim to accurately estimate the cardinality of a
spatial keyword query on objects seen so far, where a
spatial keyword query consists of a search region and a
set of query keywords. To the best of our knowledge,
this is the first work to address this important
problem. We first extend two existing techniques to
solve this problem, and show their limitations.
Inspired by two key observations on the ``locality'' of
the correlations among query keywords, we propose a
local correlation based method by utilizing an
augmented adaptive space partition tree ($ A^2 $SP-tree
for short) to approximately learn a local Bayesian
network on-the-fly for a given query and estimate its
selectivity. A novel local boosting approach is
presented to further enhance the learning accuracy of
local Bayesian networks. Our comprehensive experiments
on real-life datasets demonstrate the superior
performance of the local correlation based algorithm in
terms of estimation accuracy compared to other
competitors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:PMK,
author = "Chuanwen Li and Yu Gu and Jianzhong Qi and Ge Yu and
Rui Zhang and Wang Yi",
title = "Processing moving $k$ {NN} queries using influential
neighbor sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "113--124",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The moving $k$ nearest neighbor query, which computes
one's $k$ nearest neighbor set and maintains it while
at move, is gaining importance due to the prevalent use
of smart mobile devices such as smart phones. Safe
region is a popular technique in processing the moving
$k$ nearest neighbor query. It is a region where the
movement of the query object does not cause the current
$k$ nearest neighbor set to change. Processing a moving
$k$ nearest neighbor query is a continuing process of
checking the validity of the safe region and
recomputing it if invalidated. The size of the safe
region largely decides the frequency of safe region
recomputation and hence query processing efficiency.
Existing moving $k$ nearest neighbor algorithms lack
efficiency due to either computing small safe regions
and have to recompute frequently or computing large
safe regions (i.e., an order-$k$ Voronoi cell) with a
high cost. In this paper, we take a third approach.
Instead of safe regions, we use a small set of safe
guarding objects. We prove that, as long as the
current $k$ nearest neighbors are closer to the query
object than the safe guarding objects, the current $k$
nearest neighbors stay valid and no recomputation is
required. This way, we avoid the high cost of safe
region recomputation. We also prove that, the region
defined by the safe guarding objects is the largest
possible safe region. This means that the recomputation
frequency of our method is also minimized. We conduct
extensive experiments comparing our method with the
state-of-the-art method on both real and synthetic data
sets. The results confirm the superiority of our
method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mozafari:2014:SCS,
author = "Barzan Mozafari and Purna Sarkar and Michael Franklin
and Michael Jordan and Samuel Madden",
title = "Scaling up crowd-sourcing to very large datasets: a
case for active learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "125--136",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowd-sourcing has become a popular means of acquiring
labeled data for many tasks where humans are more
accurate than computers, such as image tagging, entity
resolution, and sentiment analysis. However, due to the
time and cost of human labor, solutions that rely
solely on crowd-sourcing are often limited to small
datasets (i.e., a few thousand items). This paper
proposes algorithms for integrating machine learning
into crowd-sourced databases in order to combine the
accuracy of human labeling with the speed and
cost-effectiveness of machine learning classifiers. By
using active learning as our optimization strategy for
labeling tasks in crowd-sourced databases, we can
minimize the number of questions asked to the crowd,
allowing crowd-sourced applications to scale (i.e.,
label much larger datasets at lower costs). Designing
active learning algorithms for a crowd-sourced database
poses many practical challenges: such algorithms need
to be generic, scalable, and easy to use, even for
practitioners who are not machine learning experts. We
draw on the theory of nonparametric bootstrap to
design, to the best of our knowledge, the first active
learning algorithms that meet all these requirements.
Our results, on 3 real-world datasets collected with
Amazon's Mechanical Turk, and on 15 UCI datasets, show
that our methods on average ask 1--2 orders of
magnitude fewer questions than the baseline, and $
4.5$--$ 44 \times $ fewer than existing active learning
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2014:CCO,
author = "Dingyu Yang and Dongxiang Zhang and Kian-Lee Tan and
Jian Cao and Fr{\'e}d{\'e}ric {Le Mou{\"e}l}",
title = "{CANDS}: continuous optimal navigation via distributed
stream processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "137--148",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Shortest path query over a dynamic road network is a
prominent problem for the optimization of real-time
traffic systems. Existing solutions rely either on a
centralized index system with tremendous
pre-computation overhead, or on a distributed graph
processing system such as Pregel that requires much
synchronization effort. However, the performance of
these systems degenerates with frequent route path
updates caused by continuous traffic condition change.
In this paper, we build CANDS, a distributed stream
processing platform for continuous optimal shortest
path queries. It provides an asynchronous solution to
answering a large quantity of shortest path queries. It
is able to efficiently detect affected paths and adjust
their paths in the face of traffic updates. Moreover,
the affected paths can be quickly updated to the
optimal solutions throughout the whole navigation
process. Experimental results demonstrate that the
performance for answering shortest path queries by
CANDS is two orders of magnitude better than that of
GPS, an open-source implementation of Pregel. In
addition, CANDS provides fast response to traffic
updates to guarantee the optimality of answering
shortest path queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Begum:2014:RTS,
author = "Nurjahan Begum and Eamonn Keogh",
title = "Rare time series motif discovery from unbounded
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "149--160",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The detection of time series motifs, which are
approximately repeated subsequences in time series
streams, has been shown to have great utility as a
subroutine in many higher-level data mining algorithms.
However, this detection becomes much harder in cases
where the motifs of interest are vanishingly rare or
when faced with a never-ending stream of data. In this
work we investigate algorithms to find such rare
motifs. We demonstrate that under reasonable
assumptions we must abandon any hope of an exact
solution to the motif problem as it is normally
defined; however, we introduce algorithms that allow us
to solve the underlying problem with high
probability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bu:2014:PBG,
author = "Yingyi Bu and Vinayak Borkar and Jianfeng Jia and
Michael J. Carey and Tyson Condie",
title = "Pregelix: {Big(ger)} graph analytics on a dataflow
engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "161--172",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is a growing need for distributed graph
processing systems that are capable of gracefully
scaling to very large graph datasets. Unfortunately,
this challenge has not been easily met due to the
intense memory pressure imposed by process-centric,
message passing designs that many graph processing
systems follow. Pregelix is a new open source
distributed graph processing system that is based on an
iterative dataflow design that is better tuned to
handle both in-memory and out-of-core workloads. As
such, Pregelix offers improved performance
characteristics and scaling properties over current
open source systems (e.g., we have seen up to $ 15
\times $ speedup compared to Apache Giraph and up to $
35 \times $ speedup compared to distributed GraphLab),
and more effective use of available machine resources
to support Big(ger) Graph Analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sridharan:2014:PRC,
author = "Shriram Sridharan and Jignesh M. Patel",
title = "Profiling {R} on a contemporary processor",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "2",
pages = "173--184",
month = oct,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/s-plus.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "R is a popular data analysis language, but there is
scant experimental data characterizing the run-time
profile of R programs. This paper addresses this
limitation by systematically cataloging where time is
spent when running R programs. Our evaluation using
four different workloads shows that when analyzing
large datasets, R programs (a) spend more than 85\% of
their time in processor stalls, which leads to slower
execution times, (b) trigger the garbage collector
frequently, which leads to higher memory stalls, and
(c) create a large number of unnecessary temporary
objects that causes R to swap to disk quickly even for
datasets that are far smaller than the available main
memory. Addressing these issues should allow R programs
to run faster than they do today, and allow R to be
used for analyzing even larger datasets. As outlined in
this paper, the results presented in this paper
motivate a number of future research investigations in
the database, architecture, and programming language
communities. All data and code that is used in this
paper (which includes the R programs, and changes to
the R source code for instrumentation) can be found at:
{\tt http://quickstep.cs.wisc.edu/dissecting-R/}.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bailis:2014:CAD,
author = "Peter Bailis and Alan Fekete and Michael J. Franklin
and Ali Ghodsi and Joseph M. Hellerstein and Ion
Stoica",
title = "Coordination avoidance in database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "185--196",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Minimizing coordination, or blocking communication
between concurrently executing operations, is key to
maximizing scalability, availability, and high
performance in database systems. However, uninhibited
coordination-free execution can compromise application
correctness, or consistency. When is coordination
necessary for correctness? The classic use of
serializable transactions is sufficient to maintain
correctness but is not necessary for all applications,
sacrificing potential scalability. In this paper, we
develop a formal framework, invariant confluence, that
determines whether an application requires coordination
for correct execution. By operating on
application-level invariants over database states
(e.g., integrity constraints), invariant confluence
analysis provides a necessary and sufficient condition
for safe, coordination-free execution. When programmers
specify their application invariants, this analysis
allows databases to coordinate only when anomalies that
might violate invariants are possible. We analyze the
invariant confluence of common invariants and
operations from real-world database systems (i.e.,
integrity constraints) and applications and show that
many are invariant confluent and therefore achievable
without coordination. We apply these results to a
proof-of-concept coordination-avoiding database
prototype and demonstrate sizable performance gains
compared to serializable execution, notably a 25-fold
improvement over prior TPC-C New-Order performance on a
200 server cluster.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2014:QSI,
author = "Qiang Zeng and Jignesh M. Patel and David Page",
title = "{QuickFOIL}: scalable inductive logic programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "197--208",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Inductive Logic Programming (ILP) is a classic machine
learning technique that learns first-order rules from
relational-structured data. However, to-date most ILP
systems can only be applied to small datasets (tens of
thousands of examples). A long-standing challenge in
the field is to scale ILP methods to larger data sets.
This paper presents a method called QuickFOIL that
addresses this limitation. QuickFOIL employs a new
scoring function and a novel pruning strategy that
enables the algorithm to find high-quality rules.
QuickFOIL can also be implemented as an in-RDBMS
algorithm. Such an implementation presents a host of
query processing and optimization challenges that we
address in this paper. Our empirical evaluation shows
that QuickFOIL can scale to large datasets consisting
of hundreds of millions tuples, and is often more than
order of magnitude more efficient than other existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2014:SAE,
author = "Xiangyao Yu and George Bezerra and Andrew Pavlo and
Srinivas Devadas and Michael Stonebraker",
title = "Staring into the abyss: an evaluation of concurrency
control with one thousand cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "209--220",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Computer architectures are moving towards an era
dominated by many-core machines with dozens or even
hundreds of cores on a single chip. This unprecedented
level of on-chip parallelism introduces a new dimension
to scalability that current database management systems
(DBMSs) were not designed for. In particular, as the
number of cores increases, the problem of concurrency
control becomes extremely challenging. With hundreds of
threads running in parallel, the complexity of
coordinating competing accesses to data will likely
diminish the gains from increased core counts. To
better understand just how unprepared current DBMSs are
for future CPU architectures, we performed an
evaluation of concurrency control for on-line
transaction processing (OLTP) workloads on many-core
chips. We implemented seven concurrency control
algorithms on a main-memory DBMS and using computer
simulations scaled our system to 1024 cores. Our
analysis shows that all algorithms fail to scale to
this magnitude but for different reasons. In each case,
we identify fundamental bottlenecks that are
independent of the particular database implementation
and argue that even state-of-the-art DBMSs suffer from
these limitations. We conclude that rather than
pursuing incremental solutions, many-core chips may
require a completely redesigned DBMS architecture that
is built from ground up and is tightly coupled with the
hardware.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Trummer:2014:MOP,
author = "Immanuel Trummer and Christoph Koch",
title = "Multi-objective parametric query optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "221--232",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Classical query optimization compares query plans
according to one cost metric and associates each plan
with a constant cost value. In this paper, we introduce
the Multi-Objective Parametric Query Optimization (MPQ)
problem where query plans are compared according to
multiple cost metrics and the cost of a given plan
according to a given metric is modeled as a function
that depends on multiple parameters. The cost metrics
may for instance include execution time or monetary
fees; a parameter may represent the selectivity of a
query predicate that is unspecified at optimization
time. MPQ generalizes parametric query optimization
(which allows multiple parameters but only one cost
metric) and multi-objective query optimization (which
allows multiple cost metrics but no parameters). We
formally analyze the novel MPQ problem and show why
existing algorithms are inapplicable. We present a
generic algorithm for MPQ and a specialized version for
MPQ with piecewise-linear plan cost functions. We prove
that both algorithms find all relevant query plans and
experimentally evaluate the performance of our second
algorithm in a Cloud computing scenario.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Giceva:2014:DQP,
author = "Jana Giceva and Gustavo Alonso and Timothy Roscoe and
Tim Harris",
title = "Deployment of query plans on multicores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "233--244",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficient resource scheduling of multithreaded
software on multicore hardware is difficult given the
many parameters involved and the hardware heterogeneity
of existing systems. In this paper we explore the
efficient deployment of query plans over a multicore
machine. We focus on shared query systems, and
implement the proposed ideas using SharedDB. The goal
of the paper is to explore how to deliver maximum
performance and predictability, while minimizing
resource utilization when deploying query plans on
multicore machines. We propose to use resource activity
vectors to characterize the behavior of individual
database operators. We then present a novel deployment
algorithm which uses these vectors together with
dataflow information from the query plan to optimally
assign relational operators to physical cores.
Experiments demonstrate that this approach
significantly reduces resource requirements while
preserving performance and is robust across different
server architectures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Taft:2014:SFG,
author = "Rebecca Taft and Essam Mansour and Marco Serafini and
Jennie Duggan and Aaron J. Elmore and Ashraf Aboulnaga
and Andrew Pavlo and Michael Stonebraker",
title = "{E-Store}: fine-grained elastic partitioning for
distributed transaction processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "245--256",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "On-line transaction processing (OLTP) database
management systems (DBMSs) often serve time-varying
workloads due to daily, weekly or seasonal fluctuations
in demand, or because of rapid growth in demand due to
a company's business success. In addition, many OLTP
workloads are heavily skewed to ``hot'' tuples or
ranges of tuples. For example, the majority of NYSE
volume involves only 40 stocks. To deal with such
fluctuations, an OLTP DBMS needs to be elastic; that
is, it must be able to expand and contract resources in
response to load fluctuations and dynamically balance
load as hot tuples vary over time. This paper presents
E-Store, an elastic partitioning framework for
distributed OLTP DBMSs. It automatically scales
resources in response to demand spikes, periodic
events, and gradual changes in an application's
workload. E-Store addresses localized bottlenecks
through a two-tier data placement strategy: cold data
is distributed in large chunks, while smaller ranges of
hot tuples are assigned explicitly to individual nodes.
This is in contrast to traditional single-tier hash and
range partitioning strategies. Our experimental
evaluation of E-Store shows the viability of our
approach and its efficacy under variations in load
across a cluster of machines. Compared to single-tier
approaches, E-Store improves throughput by up to 130\%
while reducing latency by 80\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thirumuruganathan:2014:BIM,
author = "Saravanan Thirumuruganathan and Habibur Rahman and
Sofiane Abbar and Gautam Das",
title = "Beyond itemsets: mining frequent featuresets over
structured items",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "257--268",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We assume a dataset of transactions generated by a set
of users over structured items where each item could be
described through a set of features. In this paper, we
are interested in identifying the frequent featuresets
(set of features) by mining item transactions. For
example, in a news website, items correspond to news
articles, the features are the named-entities/topics in
the articles and an item transaction would be the set
of news articles read by a user within the same
session. We show that mining frequent featuresets over
structured item transactions is a novel problem and
show that straightforward extensions of existing
frequent itemset mining techniques provide
unsatisfactory results. This is due to the fact that
while users are drawn to each item in the transaction
due to a subset of its features, the transaction by
itself does not provide any information about such
underlying preferred features of users. In order to
overcome this hurdle, we propose a featureset
uncertainty model where each item transaction could
have been generated by various featuresets with
different probabilities. We describe a novel approach
to transform item transactions into uncertain
transaction over featuresets and estimate their
probabilities using constrained least squares based
approach. We propose diverse algorithms to mine
frequent featuresets. Our experimental evaluation
provides a comparative analysis of the different
approaches proposed.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2014:ICD,
author = "Jun Zhang and Chaokun Wang and Jianmin Wang and
Jeffrey Xu Yu",
title = "Inferring continuous dynamic social influence and
personal preference for temporal behavior prediction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "269--280",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is always attractive and challenging to explore the
intricate behavior data and uncover people's
motivations, preference and habits, which can greatly
benefit many tasks including link prediction, item
recommendation, etc. Traditional work usually studies
people's behaviors without time information in a static
or discrete manner, assuming the underlying factors
stay invariant in a long period. However, we believe
people's behaviors are dynamic, and the contributing
factors including the social influence and personal
preference for behaviors are varying continuously over
time. Such continuous dynamics convey important
knowledge about people's behavior patterns; ignoring
them would lead to inaccurate models. In this work, we
address the continuous dynamic modeling of temporal
behaviors. To model the fully continuous temporal
dynamics of behaviors and the underlying factors, we
propose the DP-Space, a dynamic preference probability
space, which can capture their smooth variation in
various shapes over time with flexible basis functions.
Upon that we propose a generative dynamic behavior
model, ConTyor, which considers the temporal
item-adoption behaviors as joint effect of dynamic
social influence and varying personal preference over
continuous time. We also develop effective inference
methods for ConTyor and present its applications. We
conduct a comprehensive experimental study using
real-world datasets to evaluate the effectiveness of
our model and the temporal modeling. Results verify
that ConTyor outperforms existing state-of-the-art
static and temporal models in behavior predictions.
Moreover, in our detailed study on temporal modeling,
we show that temporal modeling is superior to static
approaches and modeling over continuous time is further
better than that over discrete time. We also
demonstrate that the ancient behavior data can still
become important and beneficial if modeled well.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2014:LSD,
author = "Yi Lu and James Cheng and Da Yan and Huanhuan Wu",
title = "Large-scale distributed graph computing systems: an
experimental evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "281--292",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the prevalence of graph data in real-world
applications (e.g., social networks, mobile phone
networks, web graphs, etc.) and their ever-increasing
size, many distributed graph computing systems have
been developed in recent years to process and analyze
massive graphs. Most of these systems adopt Pregel's
vertex-centric computing model, while various
techniques have been proposed to address the
limitations in the Pregel framework. However, there is
a lack of comprehensive comparative analysis to
evaluate the performance of various systems and their
techniques, making it difficult for users to choose the
best system for their applications. We conduct
extensive experiments to evaluate the performance of
existing systems on graphs with different
characteristics and on algorithms with different design
logic. We also study the effectiveness of various
techniques adopted in existing systems, and the
scalability of the systems. The results of our study
reveal the strengths and limitations of existing
systems, and provide valuable insights for users,
researchers and system developers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Inoue:2014:FSI,
author = "Hiroshi Inoue and Moriyoshi Ohara and Kenjiro Taura",
title = "Faster set intersection with {SIMD} instructions by
reducing branch mispredictions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "293--304",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Set intersection is one of the most important
operations for many applications such as Web search
engines or database management systems. This paper
describes our new algorithm to efficiently find set
intersections with sorted arrays on modern processors
with SIMD instructions and high branch misprediction
penalties. Our algorithm efficiently exploits SIMD
instructions and can drastically reduce branch
mispredictions. Our algorithm extends a merge-based
algorithm by reading multiple elements, instead of just
one element, from each of two input arrays and compares
all of the pairs of elements from the two arrays to
find the elements with the same values. The key insight
for our improvement is that we can reduce the number of
costly hard-to-predict conditional branches by
advancing a pointer by more than one element at a time.
Although this algorithm increases the total number of
comparisons, we can execute these comparisons more
efficiently using the SIMD instructions and gain the
benefits of the reduced branch misprediction overhead.
Our algorithm is suitable to replace existing standard
library functions, such as {\tt std::set\_intersection}
in C++, thus accelerating many applications, because
the algorithm is simple and requires no preprocessing
to generate additional data structures. We implemented
our algorithm on Xeon and POWER7+. The experimental
results show our algorithm outperforms the {\tt
std::set\_intersection} implementation delivered with
gcc by up to 5.2x using SIMD instructions and by up to
2.1x even without using SIMD instructions for 32-bit
and 64-bit integer datasets. Our SIMD algorithm also
outperformed an existing algorithm that can leverage
SIMD instructions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{El-Kishky:2014:STP,
author = "Ahmed El-Kishky and Yanglei Song and Chi Wang and
Clare R. Voss and Jiawei Han",
title = "Scalable topical phrase mining from text corpora",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "305--316",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While most topic modeling algorithms model text
corpora with unigrams, human interpretation often
relies on inherent grouping of terms into phrases. As
such, we consider the problem of discovering topical
phrases of mixed lengths. Existing work either performs
post processing to the results of unigram-based topic
models, or utilizes complex $n$-gram-discovery topic
models. These methods generally produce low-quality
topical phrases or suffer from poor scalability on even
moderately-sized datasets. We propose a different
approach that is both computationally efficient and
effective. Our solution combines a novel phrase mining
framework to segment a document into single and
multi-word phrases, and a new topic model that operates
on the induced document partition. Our approach
discovers high quality topical phrases with negligible
extra cost to the bag-of-words topic model in a variety
of datasets including research publication titles,
abstracts, reviews, and news articles.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tao:2014:ETK,
author = "Wenbo Tao and Minghe Yu and Guoliang Li",
title = "Efficient top-$k$ {SimRank}-based similarity join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "3",
pages = "317--328",
month = nov,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:34 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SimRank is a popular and widely-adopted similarity
measure to evaluate the similarity between nodes in a
graph. It is time and space consuming to compute the
SimRank similarities for all pairs of nodes, especially
for large graphs. In real-world applications, users are
only interested in the most similar pairs. To address
this problem, in this paper we study the top-$k$
SimRank-based similarity join problem, which finds $k$
most similar pairs of nodes with the largest SimRank
similarities among all possible pairs. To the best of
our knowledge, this is the first attempt to address
this problem. We encode each node as a vector by
summarizing its neighbors and transform the calculation
of the SimRank similarity between two nodes to
computing the dot product between the corresponding
vectors. We devise an efficient two-step framework to
compute top-$k$ similar pairs using the vectors. For
large graphs, exact algorithms cannot meet the
high-performance requirement, and we also devise an
approximate algorithm which can efficiently identify
top-$k$ similar pairs under user-specified accuracy
requirement. Experiments on both real and synthetic
datasets show our method achieves high performance and
good scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2014:CQC,
author = "Jiong He and Shuhao Zhang and Bingsheng He",
title = "In-cache query co-processing on coupled {CPU--GPU}
architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "329--340",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, there have been some emerging processor
designs that the CPU and the GPU (Graphics Processing
Unit) are integrated in a single chip and share Last
Level Cache (LLC). However, the main memory bandwidth
of such coupled CPU-GPU architectures can be much lower
than that of a discrete GPU. As a result, current GPU
query co-processing paradigms can severely suffer from
memory stalls. In this paper, we propose a novel
in-cache query co-processing paradigm for main memory
On-Line Analytical Processing (OLAP) databases on
coupled CPU-GPU architectures. Specifically, we adapt
CPU-assisted prefetching to minimize cache misses in
GPU query co-processing and CPU-assisted decompression
to improve query execution performance. Furthermore, we
develop a cost model guided adaptation mechanism for
distributing the workload of prefetching,
decompression, and query execution between CPU and GPU.
We implement a system prototype and evaluate it on two
recent AMD APUs A8 and A10. The experimental results
show that (1) in-cache query co-processing can
effectively improve the performance of the
state-of-the-art GPU co-processing paradigm by up to
30\% and 33\% on A8 and A10, respectively, and (2) our
workload distribution adaption mechanism can
significantly improve the query performance by up to
36\% and 40\% on A8 and A10, respectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fujiwara:2014:SMR,
author = "Yasuhiro Fujiwara and Go Irie and Shari Kuroyama and
Makoto Onizuka",
title = "Scaling {Manifold Ranking} based image retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "341--352",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Manifold Ranking is a graph-based ranking algorithm
being successfully applied to retrieve images from
multimedia databases. Given a query image, Manifold
Ranking computes the ranking scores of images in the
database by exploiting the relationships among them
expressed in the form of a graph. Since Manifold
Ranking effectively utilizes the global structure of
the graph, it is significantly better at finding
intuitive results compared with current approaches.
Fundamentally, Manifold Ranking requires an inverse
matrix to compute ranking scores and so needs $ O(n^3)
$ time, where $n$ is the number of images. Manifold
Ranking, unfortunately, does not scale to support
databases with large numbers of images. Our solution,
Mogul, is based on two ideas: (1) It efficiently
computes ranking scores by sparse matrices, and (2) It
skips unnecessary score computations by estimating
upper bounding scores. These two ideas reduce the time
complexity of Mogul to $ O(n)$ from $ O(n^3)$ of the
inverse matrix approach. Experiments show that Mogul is
much faster and gives significantly better retrieval
quality than a state-of-the-art approximation
approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Barber:2014:MEH,
author = "R. Barber and G. Lohman and I. Pandis and V. Raman and
R. Sidle and G. Attaluri and N. Chainani and S.
Lightstone and D. Sharpe",
title = "Memory-efficient hash joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "353--364",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present new hash tables for joins, and a hash join
based on them, that consumes far less memory and is
usually faster than recently published in-memory joins.
Our hash join is not restricted to outer tables that
fit wholly in memory. Key to this hash join is a new
concise hash table (CHT), a linear probing hash table
that has 100\% fill factor, and uses a sparse bitmap
with embedded population counts to almost entirely
avoid collisions. This bitmap also serves as a Bloom
filter for use in multi-table joins. We study the
random access characteristics of hash joins, and renew
the case for non-partitioned hash joins. We introduce a
variant of partitioned joins in which only the build is
partitioned, but the probe is not, as this is more
efficient for large outer tables than traditional
partitioned joins. This also avoids partitioning costs
during the probe, while at the same time allowing
parallel build without latching overheads.
Additionally, we present a variant of CHT, called a
concise array table (CAT), that can be used when the
key domain is moderately dense. CAT is collision-free
and avoids storing join keys in the hash table. We
perform a detailed comparison of CHT and CAT against
leading in-memory hash joins. Our experiments show that
we can reduce the memory usage by one to three orders
of magnitude, while also being competitive in
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alexe:2014:PAI,
author = "Bogdan Alexe and Mary Roth and Wang-Chiew Tan",
title = "Preference-aware integration of temporal data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "365--376",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A complete description of an entity is rarely
contained in a single data source, but rather, it is
often distributed across different data sources.
Applications based on personal electronic health
records, sentiment analysis, and financial records all
illustrate that significant value can be derived from
integrated, consistent, and queryable profiles of
entities from different sources. Even more so, such
integrated profiles are considerably enhanced if
temporal information from different sources is
carefully accounted for. We develop a simple and yet
versatile operator, called prawn, that is typically
called as a final step of an entity integration
workflow. Prawn is capable of consistently integrating
and resolving temporal conflicts in data that may
contain multiple dimensions of time based on a set of
preference rules specified by a user (hence the name
prawn for preference-aware union). In the event that
not all conflicts can be resolved through preferences,
one can enumerate each possible consistent
interpretation of the result returned by prawn at a
given time point through a polynomial-delay algorithm.
In addition to providing algorithms for implementing
prawn, we study and establish several desirable
properties of prawn. First, prawn produces the same
temporally integrated outcome, modulo representation of
time, regardless of the order in which data sources are
integrated. Second, prawn can be customized to
integrate temporal data for different applications by
specifying application-specific preference rules.
Third, we show experimentally that our implementation
of prawn is feasible on both ``small'' and ``big'' data
platforms in that it is efficient in both storage and
execution time. Finally, we demonstrate a fundamental
advantage of prawn: we illustrate that standard query
languages can be immediately used to pose useful
temporal queries over the integrated and resolved
entity repository.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2014:MSD,
author = "Chang Zhou and Jun Gao and Binbin Sun and Jeffrey Xu
Yu",
title = "{MOCgraph}: scalable distributed graph processing
using message online computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "377--388",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing distributed graph processing frameworks,
e.g., Pregel, Giraph, GPS and GraphLab, mainly exploit
main memory to support flexible graph operations for
efficiency. Due to the complexity of graph analytics,
huge memory space is required especially for those
graph analytics that spawn large intermediate results.
Existing frameworks may terminate abnormally or degrade
performance seriously when the memory is exhausted or
the external storage has to be used. In this paper, we
propose MOCgraph, a scalable distributed graph
processing framework to reduce the memory footprint and
improve the scalability, based on message online
computing. MOCgraph consumes incoming messages in a
streaming manner, so as to handle larger graphs or more
complex analytics with the same memory capacity.
MOCgraph also exploits message online computing with
external storage to provide an efficient out-of-core
support. We implement MOCgraph on top of Apache Giraph,
and test it against several representative graph
algorithms on large graph datasets. Experiments
illustrate that MOCgraph is efficient and
memory-saving, especially for graph analytics with
large intermediate results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2014:NAL,
author = "Jian Huang and Karsten Schwan and Moinuddin K.
Qureshi",
title = "{NVRAM-aware} logging in transaction systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "389--400",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Emerging byte-addressable, non-volatile memory
technologies (NVRAM) like phase-change memory can
increase the capacity of future memory systems by
orders of magnitude. Compared to systems that rely on
disk storage, NVRAM-based systems promise significant
improvements in performance for key applications like
online transaction processing (OLTP). Unfortunately,
NVRAM systems suffer from two drawbacks: their
asymmetric read-write performance and the notable
higher cost of the new memory technologies compared to
disk. This paper investigates the cost-effective use of
NVRAM in transaction systems. It shows that using NVRAM
only for the logging subsystem (NV-Logging) provides
much higher transactions per dollar than simply
replacing all disk storage with NVRAM. Specifically,
for NV-Logging, we show that the software overheads
associated with centralized log buffers cause
performance bottlenecks and limit scaling. The
per-transaction logging methods described in the paper
help avoid these overheads, enabling concurrent logging
for multiple transactions. Experimental results with a
faithful emulation of future NVRAM-based servers using
the TPCC, TATP, and TPCB benchmarks show that
                 NV-Logging improves throughput by 1.42--2.72x over
the costlier option of replacing all disk storage with
                 NVRAM. Results also show that NV-Logging performs
                 1.21--6.71x better than when logs are placed into the
PMFS NVRAM-optimized file system. Compared to
state-of-the-art distributed logging, NV-Logging
delivers 20.4\% throughput improvements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2014:THP,
author = "Badrish Chandramouli and Jonathan Goldstein and Mike
Barnett and Robert DeLine and Danyel Fisher and John C.
Platt and James F. Terwilliger and John Wernsing",
 title =        "{Trill}: a high-performance incremental query processor
for diverse analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "401--412",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper introduces Trill --- a new query processor
for analytics. Trill fulfills a combination of three
requirements for a query processor to serve the diverse
big data analytics space: (1) Query Model: Trill is
based on a tempo-relational model that enables it to
handle streaming and relational queries with early
results, across the latency spectrum from real-time to
offline; (2) Fabric and Language Integration: Trill is
architected as a high-level language library that
supports rich data-types and user libraries, and
integrates well with existing distribution fabrics and
applications; and (3) Performance: Trill's throughput
is high across the latency spectrum. For streaming
data, Trill's throughput is 2--4 orders of magnitude
higher than comparable streaming engines. For offline
relational queries, Trill's throughput is comparable to
a major modern commercial columnar DBMS. Trill uses a
streaming batched-columnar data representation with a
new dynamic compilation-based system architecture that
addresses all these requirements. In this paper, we
describe Trill's new design and architecture, and
report experimental results that demonstrate Trill's
high performance across diverse analytics scenarios. We
also describe how Trill's ability to support diverse
analytics has resulted in its adoption across many
usage scenarios at Microsoft.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2014:EPM,
author = "Chunyao Song and Tingjian Ge and Cindy Chen and Jie
Wang",
title = "Event pattern matching over graph streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "413--424",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A graph is a fundamental and general data structure
underlying all data applications. Many applications
today call for the management and query capabilities
directly on graphs. Real time graph streams, as seen in
road networks, social and communication networks, and
web requests, are such applications. Event pattern
matching requires the awareness of graph structures,
which is different from traditional complex event
processing. It also requires a focus on the dynamicity
of the graph, time order constraints in patterns, and
online query processing, which deviates significantly
from previous work on subgraph matching as well. We
study the semantics and efficient online algorithms for
this important and intriguing problem, and evaluate our
approaches with extensive experiments over real world
datasets in four different domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2014:CAA,
author = "Qi Li and Yaliang Li and Jing Gao and Lu Su and Bo
Zhao and Murat Demirbas and Wei Fan and Jiawei Han",
title = "A confidence-aware approach for truth discovery on
long-tail data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "425--436",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In many real world applications, the same item may be
described by multiple sources. As a consequence,
conflicts among these sources are inevitable, which
leads to an important task: how to identify which piece
of information is trustworthy, i.e., the truth
discovery task. Intuitively, if the piece of
information is from a reliable source, then it is more
trustworthy, and the source that provides trustworthy
information is more reliable. Based on this principle,
truth discovery approaches have been proposed to infer
source reliability degrees and the most trustworthy
information (i.e., the truth) simultaneously. However,
existing approaches overlook the ubiquitous long-tail
phenomenon in the tasks, i.e., most sources only
provide a few claims and only a few sources make plenty
of claims, which causes the source reliability
estimation for small sources to be unreasonable. To
tackle this challenge, we propose a confidence-aware
truth discovery (CATD) method to automatically detect
truths from conflicting data with long-tail phenomenon.
The proposed method not only estimates source
reliability, but also considers the confidence interval
of the estimation, so that it can effectively reflect
real source reliability for sources with various levels
of participation. Experiments on four real world tasks
as well as simulated multi-source long-tail datasets
demonstrate that the proposed method outperforms
existing state-of-the-art truth discovery approaches by
                 successfully discounting the effect of small sources.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shen:2014:FFR,
author = "Yanyan Shen and Gang Chen and H. V. Jagadish and Wei
Lu and Beng Chin Ooi and Bogdan Marius Tudor",
title = "Fast failure recovery in distributed graph processing
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "437--448",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed graph processing systems increasingly
require many compute nodes to cope with the
requirements imposed by contemporary graph-based Big
Data applications. However, increasing the number of
compute nodes increases the chance of node failures.
Therefore, provisioning an efficient failure recovery
strategy is critical for distributed graph processing
systems. This paper proposes a novel recovery mechanism
for distributed graph processing systems that
parallelizes the recovery process. The key idea is to
partition the part of the graph that is lost during a
failure among a subset of the remaining nodes. To do
so, we augment the existing checkpoint-based and
log-based recovery schemes with a partitioning
mechanism that is sensitive to the total computation
and communication cost of the recovery process. Our
implementation on top of the widely used Giraph system
outperforms checkpoint-based recovery by up to 30x on a
cluster of 40 compute nodes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Then:2014:MME,
author = "Manuel Then and Moritz Kaufmann and Fernando Chirigati
and Tuan-Anh Hoang-Vu and Kien Pham and Alfons Kemper
and Thomas Neumann and Huy T. Vo",
title = "The more the merrier: efficient multi-source graph
traversal",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "4",
pages = "449--460",
month = dec,
year = "2014",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph analytics on social networks, Web data, and
communication networks has been widely used in a
plethora of applications. Many graph analytics
algorithms are based on breadth-first search (BFS)
graph traversal, which is not only time-consuming for
large datasets but also involves much redundant
computation when executed multiple times from different
start vertices. In this paper, we propose Multi-Source
BFS (MS-BFS), an algorithm that is designed to run
multiple concurrent BFSs over the same graph on a
single CPU core while scaling up as the number of cores
increases. MS-BFS leverages the properties of
small-world networks, which apply to many real-world
graphs, and enables efficient graph traversal that: (i)
shares common computation across concurrent BFSs; (ii)
greatly reduces the number of random memory accesses;
and (iii) does not incur synchronization costs. We
demonstrate how a real graph analytics
application---all-vertices closeness centrality---can
be efficiently solved with MS-BFS. Furthermore, we
present an extensive experimental evaluation with both
synthetic and real datasets, including Twitter and
Wikipedia, showing that MS-BFS provides almost linear
scalability with respect to the number of cores and
excellent scalability for increasing graph sizes,
outperforming state-of-the-art BFS algorithms by more
than one order of magnitude when running a large number
of BFSs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wandelt:2015:MCS,
author = "Sebastian Wandelt and Ulf Leser",
title = "{MRCSI}: compressing and searching string collections
with multiple references",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "461--472",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficiently storing and searching collections of
similar strings, such as large populations of genomes
or long change histories of documents from Wikis, is a
timely and challenging problem. Several recent
proposals could drastically reduce space requirements
by exploiting the similarity between strings in
so-called reference-based compression. However, these
indexes are usually not searchable any more, i.e., in
these methods search efficiency is sacrificed for
storage efficiency. We propose Multi-Reference
Compressed Search Indexes (MRCSI) as a framework for
efficiently compressing dissimilar string collections.
In contrast to previous works which can use only a
single reference for compression, MRCSI (a) uses
multiple references for achieving increased compression
rates, where the reference set need not be specified by
the user but is determined automatically, and (b)
supports efficient approximate string searching with
edit distance constraints. We prove that finding the
smallest MRCSI is NP-hard. We then propose three
heuristics for computing MRCSIs achieving increasing
compression ratios. Compared to state-of-the-art
competitors, our methods target an interesting and
novel sweet-spot between high compression ratio versus
search efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2015:YFC,
author = "Rui Ding and Qiang Wang and Yingnong Dang and Qiang Fu
and Haidong Zhang and Dongmei Zhang",
title = "{YADING}: fast clustering of large-scale time series
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "473--484",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fast and scalable analysis techniques are becoming
increasingly important in the era of big data, because
they are the enabling techniques to create real-time
and interactive experiences in data analysis. Time
series are widely available in diverse application
areas. Due to the large number of time series instances
(e.g., millions) and the high dimensionality of each
time series instance (e.g., thousands), it is
challenging to conduct clustering on large-scale time
series, and it is even more challenging to do so in
real-time to support interactive exploration. In this
paper, we propose a novel end-to-end time series
clustering algorithm, YADING, which automatically
clusters large-scale time series with fast performance
and quality results. Specifically, YADING consists of
three steps: sampling the input dataset, conducting
clustering on the sampled dataset, and assigning the
rest of the input data to the clusters generated on the
sampled dataset. In particular, we provide theoretical
proof on the lower and upper bounds of the sample size,
which not only guarantees YADING's high performance,
but also ensures the distribution consistency between
the input dataset and the sampled dataset. We also
select $ L_1 $ norm as similarity measure and the
multi-density approach as the clustering method. With
theoretical bound, this selection ensures YADING's
robustness to time series variations due to phase
perturbation and random noise. Evaluation results have
demonstrated that on typical-scale (100,000 time series
each with 1,000 dimensions) datasets, YADING is about
40 times faster than the state-of-the-art,
sampling-based clustering algorithm DENCLUE 2.0, and
about 1,000 times faster than DBSCAN and CLARANS.
YADING has also been used by product teams at Microsoft
to analyze service performance. Two of such use cases
are shared in this paper.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2015:HWS,
author = "Ting Wu and Lei Chen and Pan Hui and Chen Jason Zhang
and Weikai Li",
title = "Hear the whole story: towards the diversity of opinion
in crowdsourcing markets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "485--496",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The recent surge in popularity of crowdsourcing has
brought with it a new opportunity for engaging human
intelligence in the process of data analysis.
Crowdsourcing provides a fundamental mechanism for
enabling online workers to participate in tasks that
are either too difficult to be solved solely by a
computer or too expensive to employ experts to perform.
In the field of social science, four elements are
required to form a wise crowd --- Diversity of Opinion,
Independence, Decentralization and Aggregation.
However, while the other three elements are already
studied and implemented in current crowdsourcing
                 platforms, the ``Diversity of Opinion'' has not been
functionally enabled. In this paper, we address the
algorithmic optimizations towards the diversity of
opinion of crowdsourcing marketplaces. From a
computational perspective, in order to build a wise
                 crowd, we need to quantitatively model the
diversity, and take it into consideration for
constructing the crowd. In a crowdsourcing marketplace,
we usually encounter two basic paradigms for worker
selection: building a crowd to wait for tasks to come
and selecting workers for a given task. Therefore, we
propose our Similarity-driven Model (S-Model) and
Task-driven Model (T-Model) for both of the paradigms.
Under both of the models, we propose efficient and
effective algorithms to enlist a budgeted number of
workers, which have the optimal diversity. We have
verified our solutions with extensive experiments on
both synthetic datasets and real data sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chatzistergiou:2015:RUR,
author = "Andreas Chatzistergiou and Marcelo Cintra and Stratis
D. Viglas",
title = "{REWIND}: recovery write-ahead system for in-memory
non-volatile data-structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "497--508",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent non-volatile memory (NVM) technologies, such as
PCM, STT-MRAM and ReRAM, can act as both main memory
and storage. This has led to research into NVM
programming models, where persistent data structures
remain in memory and are accessed directly through CPU
loads and stores. Existing mechanisms for transactional
updates are not appropriate in such a setting as they
are optimized for block-based storage. We present
REWIND, a user-mode library approach to managing
transactional updates directly from user code written
in an imperative general-purpose language. REWIND
relies on a custom persistent in-memory data structure
for the log that supports recoverable operations on
itself. The scheme also employs a combination of
non-temporal updates, persistent memory fences, and
lightweight logging. Experimental results on synthetic
transactional workloads and TPC-C show the overhead of
REWIND compared to its non-recoverable equivalent to be
within a factor of only 1.5 and 1.39 respectively.
Moreover, REWIND outperforms state-of-the-art
approaches for data structure recoverability as well as
general purpose and NVM-aware DBMS-based recovery
schemes by up to two orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:ICS,
author = "Rong-Hua Li and Lu Qin and Jeffrey Xu Yu and Rui Mao",
title = "Influential community search in large networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "509--520",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Community search is a problem of finding densely
connected subgraphs that satisfy the query conditions
in a network, which has attracted much attention in
recent years. However, all the previous studies on
community search do not consider the influence of a
community. In this paper, we introduce a novel
community model called $k$-influential community based
on the concept of $k$-core, which can capture the
influence of a community. Based on the new community
model, we propose a linear-time online search algorithm
to find the top-$r$ $k$-influential communities in a
network. To further speed up the influential community
search algorithm, we devise a linear-space index
structure which supports efficient search of the
top-$r$ $k$-influential communities in optimal time. We
also propose an efficient algorithm to maintain the
index when the network is frequently updated. We
conduct extensive experiments on 7 real-world large
networks, and the results demonstrate the efficiency
and effectiveness of the proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2015:RSV,
author = "Albert Kim and Eric Blais and Aditya Parameswaran and
Piotr Indyk and Sam Madden and Ronitt Rubinfeld",
title = "Rapid sampling for visualizations with ordering
guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "521--532",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Visualizations are frequently used as a means to
understand trends and gather insights from datasets,
but often take a long time to generate. In this paper,
we focus on the problem of rapidly generating
approximate visualizations while preserving crucial
visual properties of interest to analysts. Our primary
focus will be on sampling algorithms that preserve the
visual property of ordering; our techniques will also
apply to some other visual properties. For instance,
our algorithms can be used to generate an approximate
visualization of a bar chart very rapidly, where the
comparisons between any two bars are correct. We
formally show that our sampling algorithms are
generally applicable and provably optimal in theory, in
that they do not take more samples than necessary to
generate the visualizations with ordering guarantees.
They also work well in practice, correctly ordering
output groups while taking orders of magnitude fewer
samples and much less time than conventional sampling
schemes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chang:2015:OEE,
author = "Lijun Chang and Xuemin Lin and Wenjie Zhang and
Jeffrey Xu Yu and Ying Zhang and Lu Qin",
title = "Optimal enumeration: efficient top-$k$ tree matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "533--544",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Driven by many real applications, graph pattern
matching has attracted a great deal of attention
recently. Consider that a twig-pattern matching may
result in an extremely large number of matches in a
graph; this may not only confuse users by providing too
many results but also lead to high computational costs.
In this paper, we study the problem of top-$k$ tree
pattern matching; that is, given a rooted tree $T$,
compute its top-$k$ matches in a directed graph $G$
based on the twig-pattern matching semantics. We
firstly present a novel and optimal enumeration
paradigm based on the principle of Lawler's procedure.
We show that our enumeration algorithm runs in $ O(n_T
+ \log k)$ time in each round where $ n_T$ is the
number of nodes in $T$. Considering that the time
complexity to output a match of $T$ is $ O(n_T)$ and $
n_T \geq \log k$ in practice, our enumeration technique
is optimal. Moreover, the cost of generating top-$1$
match of $T$ in our algorithm is $ O(m_R)$ where $ m_R$
is the number of edges in the transitive closure of a
data graph $G$ involving all relevant nodes to $T$. $
O(m_R)$ is also optimal in the worst case without
pre-knowledge of $G$. Consequently, our algorithm is
optimal with the running time $ O(m_R + k(n_T + \log
k))$ in contrast to the time complexity $ O(m_R \log k
+ k n_T (\log k + d_T))$ of the existing technique
where $ d_T$ is the maximal node degree in $T$.
Secondly, a novel priority based access technique is
proposed, which greatly reduces the number of edges
accessed and results in a significant performance
improvement. Finally, we apply our techniques to the
general form of top-$k$ graph pattern matching problem
(i.e., query is a graph) to improve the existing
techniques. Comprehensive empirical studies demonstrate
that our techniques may improve the existing techniques
by orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lazerson:2015:MDS,
author = "Arnon Lazerson and Izchak Sharfman and Daniel Keren
and Assaf Schuster and Minos Garofalakis and Vasilis
Samoladas",
title = "Monitoring distributed streams using convex
decompositions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "545--556",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Emerging large-scale monitoring applications rely on
continuous tracking of complex data-analysis queries
over collections of massive, physically-distributed
data streams. Thus, in addition to the space- and
time-efficiency requirements of conventional stream
processing (at each remote monitor site), effective
solutions also need to guarantee communication
efficiency (over the underlying communication network).
The complexity of the monitored query adds to the
difficulty of the problem --- this is especially true
for non-linear queries (e.g., joins), where no obvious
solutions exist for distributing the monitored
condition across sites. The recently proposed geometric
method, based on the notion of covering spheres, offers
a generic methodology for splitting an arbitrary
(non-linear) global condition into a collection of
local site constraints, and has been applied to massive
distributed stream-monitoring tasks, achieving
state-of-the-art performance. In this paper, we present
a far more general geometric approach, based on the
convex decomposition of an appropriate subset of the
domain of the monitoring query, and formally prove that
it is always guaranteed to perform at least as good as
the covering spheres method. We analyze our approach
and demonstrate its effectiveness for the important
case of sketch-based approximate tracking for norm,
range-aggregate, and join-aggregate queries, which have
numerous applications in streaming data analysis.
Experimental results on real-life data streams verify
the superiority of our approach in practical settings,
showing that it substantially outperforms the covering
spheres method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:UGD,
author = "Kun Li and Daisy Zhe Wang and Alin Dobra and
Christopher Dudley",
title = "{UDA}-{GIST}: an in-database framework to unify
data-parallel and state-parallel analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "557--568",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enterprise applications need sophisticated in-database
analytics in addition to traditional online analytical
processing from a database. To meet customers' pressing
demands, database vendors have been pushing advanced
analytical techniques into databases. Most major DBMSes
offer User-Defined Aggregate (UDA), a data-driven
operator, to implement many of the analytical
techniques in parallel. However, UDAs can not be used
to implement statistical algorithms such as Markov
chain Monte Carlo (MCMC), where most of the work is
performed by iterative transitions over a large state
that can not be naively partitioned due to data
dependency. Typically, this type of statistical
algorithm requires pre-processing to set up the large
state in the first place and demands post-processing
after the statistical inference. This paper presents
General Iterative State Transition (GIST), a new
database operator for parallel iterative state
transitions over large states. GIST receives a state
constructed by a UDA, and then performs rounds of
transitions on the state until it converges. A final
UDA performs post-processing and result extraction. We
argue that the combination of UDA and GIST (UDA-GIST)
unifies data-parallel and state-parallel processing in
a single system, thus significantly extending the
analytical capabilities of DBMSes. We exemplify the
framework through two high-profile applications:
cross-document coreference and image denoising. We show
that the in-database framework allows us to tackle a 27
times larger problem than solved by the
state-of-the-art for the first application and achieves
43 times speedup over the state-of-the-art for the
second application.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2015:EPP,
author = "Weiren Yu and Julie A. McCann",
title = "Efficient partial-pairs {SimRank} search on large
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "569--580",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The assessment of node-to-node similarities based on
graph topology arises in a myriad of applications,
e.g., web search. SimRank is a notable measure of this
type, with the intuition that ``two nodes are similar
if their in-neighbors are similar''. While most
existing work retrieving SimRank only considers
all-pairs SimRank $ s(*, *) $ and single-source SimRank
$ s(*, j) $ (scores between every node and query $j$),
there are appealing applications for partial-pairs
SimRank, e.g., similarity join. Given two node subsets
$A$ and $B$ in a graph, partial-pairs SimRank
assessment aims to retrieve only $ \{ s(a, b) \}_{
\forall a \in A, \forall b \in B}$. However,
the best-known solution appears not self-contained
since it hinges on the premise that the SimRank scores
with node-pairs in an $h$-go cover set must be given
beforehand. This paper focuses on efficient assessment
of partial-pairs SimRank in a self-contained manner.
(1) We devise a novel ``seed germination'' model that
computes partial-pairs SimRank in $ O(k | E | \{ \min |
A |, | B | \})$ time and $ O(| E | + k | V |)$ memory
for $k$ iterations on a graph of $ | V |$ nodes and $ |
E |$ edges. (2) We further eliminate unnecessary edge
access to improve the time of partial-pairs SimRank to
$ O(m \{ \min | A |, | B | \})$, where $ m \geq \{ \min
k | E |, \Delta^{2 k} \} $, and $ \Delta $ is the
maximum degree. (3) We show that our partial-pairs
SimRank model also can handle the computations of
all-pairs and single-source SimRanks. (4) We
empirically verify that our algorithms are (a) $ 38
\times $ faster than the best-known competitors, and
(b) memory-efficient, allowing scores to be assessed
accurately on graphs with tens of millions of links.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gatterbauer:2015:LSP,
author = "Wolfgang Gatterbauer and Stephan G{\"u}nnemann and
Danai Koutra and Christos Faloutsos",
title = "Linearized and single-pass belief propagation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "581--592",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "How can we tell when accounts are fake or real in a
social network? And how can we tell which accounts
belong to liberal, conservative or centrist users?
Often, we can answer such questions and label nodes in
a network based on the labels of their neighbors and
appropriate assumptions of homophily (``birds of a
feather flock together'') or heterophily (``opposites
attract''). One of the most widely used methods for
this kind of inference is Belief Propagation (BP) which
iteratively propagates the information from a few nodes
with explicit labels throughout a network until
convergence. A well-known problem with BP, however, is
that there are no known exact guarantees of convergence
in graphs with loops. This paper introduces Linearized
Belief Propagation (LinBP), a linearization of BP that
allows a closed-form solution via intuitive matrix
equations and, thus, comes with exact convergence
guarantees. It handles homophily, heterophily, and more
general cases that arise in multi-class settings. Plus,
it allows a compact implementation in SQL. The paper
also introduces Single-pass Belief Propagation (SBP), a
localized (or ``myopic'') version of LinBP that
propagates information across every edge at most once
and for which the final class assignments depend only
on the nearest labeled neighbors. In addition, SBP
allows fast incremental updates in dynamic networks.
Our runtime experiments show that LinBP and SBP are
orders of magnitude faster than standard BP, while
leading to almost identical node labels.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Do:2015:MRM,
author = "Loc Do and Hady W. Lauw and Ke Wang",
title = "Mining revenue-maximizing bundling configuration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "593--604",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With greater prevalence of social media, there is an
increasing amount of user-generated data revealing
consumer preferences for various products and services.
Businesses seek to harness this wealth of data to
improve their marketing strategies. Bundling, or
selling two or more items for one price is a
highly-practiced marketing strategy. In this paper, we
address the bundle configuration problem from the
data-driven perspective. Given a set of items in a
seller's inventory, we seek to determine which items
should belong to which bundle so as to maximize the
total revenue, by mining consumer preferences data. We
show that this problem is NP-hard when bundles are
allowed to contain more than two items. Therefore, we
describe an optimal solution for bundle sizes up to two
items, and propose two heuristic solutions for bundles
of any larger size. We investigate the effectiveness
and the efficiency of the proposed algorithms through
experimentations on real-life rating-based preferences
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2015:RKN,
author = "Shiyu Yang and Muhammad Aamir Cheema and Xuemin Lin
and Wei Wang",
title = "Reverse $k$ nearest neighbors query processing:
experiments and analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "605--616",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a set of users, a set of facilities and a query
facility $q$, a reverse $k$ nearest neighbors (R $k$
NN) query returns every user $u$ for which the query is
one of its $k$ closest facilities. R $k$ NN queries
have been extensively studied under a variety of
settings and many sophisticated algorithms have been
proposed to answer these queries. However, the existing
experimental studies suffer from a few limitations. For
example, some studies estimate the I/O cost by charging
a fixed penalty per I/O and we show that this may be
misleading. Also, the existing studies either use an
extremely small buffer or no buffer at all which puts
some algorithms at serious disadvantage. We show that
the performance of these algorithms is significantly
improved even when a small buffer (containing 100
pages) is used. Finally, in each of the existing
studies, the proposed algorithm is mainly compared only
with its predecessor assuming that it was the best
algorithm at the time which is not necessarily true as
shown in our experimental study. Motivated by these
limitations, we present a comprehensive experimental
study that addresses these limitations and compares
some of the most notable algorithms under a wide
variety of settings. Furthermore, we also present a
carefully developed filtering strategy that
significantly improves TPL which is one of the most
popular R $k$ NN algorithms. Specifically, the
optimized version is up to 20 times faster than the
original version and reduces its I/O cost up to two
times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2015:EVR,
author = "Xuguang Ren and Junhu Wang",
title = "Exploiting vertex relationships in speeding up
subgraph isomorphism over large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "617--628",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Subgraph Isomorphism is a fundamental problem in graph
data processing. Most existing subgraph isomorphism
algorithms are based on a backtracking framework which
computes the solutions by incrementally matching all
query vertices to candidate data vertices. However, we
observe that extensive duplicate computation exists in
these algorithms, and such duplicate computation can be
avoided by exploiting relationships between data
vertices. Motivated by this, we propose a novel
approach, BoostIso, to reduce duplicate computation.
Our extensive experiments with real datasets show that,
after integrating our approach, most existing subgraph
isomorphism algorithms can be speeded up significantly,
especially for some graphs with intensive vertex
relationships, where the improvement can be up to
several orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gatterbauer:2015:ALI,
author = "Wolfgang Gatterbauer and Dan Suciu",
title = "Approximate lifted inference with probabilistic
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "629--640",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper proposes a new approach for approximate
evaluation of \#P-hard queries with probabilistic
databases. In our approach, every query is evaluated
entirely in the database engine by evaluating a fixed
number of query plans, each providing an upper bound on
the true probability, then taking their minimum. We
provide an algorithm that takes into account important
schema information to enumerate only the minimal
necessary plans among all possible plans. Importantly,
this algorithm is a strict generalization of all known
results of PTIME self-join-free conjunctive queries: A
query is safe if and only if our algorithm returns one
single plan. We also apply three relational query
optimization techniques to evaluate all minimal safe
plans very fast. We give a detailed experimental
evaluation of our approach and, in the process, provide
a new way of thinking about the value of probabilistic
methods over non-probabilistic methods for ranking
query answers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vesdapunt:2015:ECA,
author = "Norases Vesdapunt and Kedar Bellare and Nilesh Dalvi",
title = "Errata for {``Crowdsourcing algorithms for entity
resolution''}: {(PVLDB {\bf 7}(12): 1071--1082)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "5",
pages = "641--641",
month = jan,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Feb 9 18:24:35 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We discovered that there was a duplicate figure in our
paper. We accidentally put Figure 13(b) for Figure
12(b). We have provided the correct Figure 12(b) above
(See Figure 1). Figure 1 plots the recall of various
strategies as a function of the number of questions
asked for Places dataset. There was no error in the
discussion in our paper (See Section 6.2.1 in our paper
for more details).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jha:2015:IMM,
author = "Saurabh Jha and Bingsheng He and Mian Lu and Xuntao
Cheng and Huynh Phung Huynh",
title = "Improving main memory hash joins on {Intel Xeon Phi}
processors: an experimental approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "6",
pages = "642--653",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 10 17:42:37 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern processor technologies have driven new designs
and implementations in main-memory hash joins.
Recently, Intel Many Integrated Core (MIC)
co-processors (commonly known as Xeon Phi) embrace
emerging x86 single-chip many-core techniques. Compared
with contemporary multi-core CPUs, Xeon Phi has quite
different architectural features: wider SIMD
instructions, many cores and hardware contexts, as well
as lower-frequency in-order cores. In this paper, we
experimentally revisit the state-of-the-art hash join
algorithms on Xeon Phi co-processors. In particular, we
study two camps of hash join algorithms:
hardware-conscious ones that advocate careful tailoring
of the join algorithms to underlying hardware
architectures and hardware-oblivious ones that omit
such careful tailoring. For each camp, we study the
impact of architectural features and software
optimizations on Xeon Phi in comparison with results on
multi-core CPUs. Our experiments show two major
findings on Xeon Phi, which are quantitatively
different from those on multi-core CPUs. First, the
impact of architectural features and software
optimizations has quite different behavior on Xeon Phi
in comparison with those on the CPU, which calls for
new optimization and tuning on Xeon Phi. Second,
hardware oblivious algorithms can outperform hardware
conscious algorithms on a wide parameter window. These
two findings further shed light on the design and
implementation of query processing on new-generation
single-chip many-core technologies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hammoud:2015:DDR,
author = "Mohammad Hammoud and Dania Abed Rabbou and Reza Nouri
and Seyed-Mehdi-Reza Beheshti and Sherif Sakr",
title = "{DREAM}: distributed {RDF} engine with adaptive query
planner and minimal communication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "6",
pages = "654--665",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 10 17:42:37 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Resource Description Framework (RDF) and SPARQL
query language are gaining wide popularity and
acceptance. In this paper, we present DREAM, a
distributed and adaptive RDF system. As opposed to
existing RDF systems, DREAM avoids partitioning RDF
datasets and partitions only SPARQL queries. By not
partitioning datasets, DREAM offers a general paradigm
for different types of pattern matching queries, and
entirely averts intermediate data shuffling (only
auxiliary data are shuffled). Besides, by partitioning
queries, DREAM presents an adaptive scheme, which
automatically runs queries on various numbers of
machines depending on their complexities. Hence, in
essence DREAM combines the advantages of the
state-of-the-art centralized and distributed RDF
systems, whereby data communication is avoided and
cluster resources are aggregated. Likewise, it
precludes their disadvantages, wherein system resources
are limited and communication overhead is typically
hindering. DREAM achieves all its goals via employing a
novel graph-based, rule-oriented query planner and a
new cost model. We implemented DREAM and conducted
comprehensive experiments on a private cluster and on
the Amazon EC2 platform. Results show that DREAM can
significantly outperform three related popular RDF
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2015:OTA,
author = "Shuo Chen and Ju Fan and Guoliang Li and Jianhua Feng
and Kian-lee Tan and Jinhui Tang",
title = "Online topic-aware influence maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "6",
pages = "666--677",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 10 17:42:37 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Influence maximization, whose objective is to select
$k$ users (called seeds) from a social network such
that the number of users influenced by the seeds
(called influence spread) is maximized, has attracted
significant attention due to its widespread
applications, such as viral marketing and rumor
control. However, in real-world social networks, users
have their own interests (which can be represented as
topics) and are more likely to be influenced by their
friends (or friends' friends) with similar topics. We
can increase the influence spread by taking into
consideration topics. To address this problem, we study
topic-aware influence maximization, which, given a
topic-aware influence maximization (TIM) query, finds
$k$ seeds from a social network such that the
topic-aware influence spread of the $k$ seeds is
maximized. Our goal is to enable online TIM queries.
Since the topic-aware influence maximization problem is
NP-hard, we focus on devising efficient algorithms to
achieve instant performance while keeping a high
influence spread. We utilize a maximum influence
arborescence (MIA) model to approximate the computation
of influence spread. To efficiently find $k$ seeds
under the MIA model, we first propose a best-effort
algorithm with $ 1 - 1 / e$ approximation ratio, which
estimates an upper bound of the topic-aware influence
of each user and utilizes the bound to prune large
numbers of users with small influence. We devise
effective techniques to estimate tighter upper bounds.
We then propose a faster topic-sample-based algorithm
with $ \epsilon \cdot (1 - 1 / e)$ approximation ratio
for any $ \epsilon \in (0, 1]$, which materializes the
influence spread of some topic-distribution samples and
utilizes the materialized information to avoid
computing the actual influence of users with small
influences. Experimental results show that our methods
significantly outperform baseline approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nazi:2015:WWF,
author = "Azade Nazi and Zhuojie Zhou and Saravanan
Thirumuruganathan and Nan Zhang and Gautam Das",
title = "Walk, not wait: faster sampling over online social
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "6",
pages = "678--689",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 10 17:42:37 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we introduce a novel, general purpose,
technique for faster sampling of nodes over an online
social network. Specifically, unlike traditional random
walks which wait for the convergence of sampling
distribution to a predetermined target distribution ---
a waiting process that incurs a high query cost --- we
develop WALK-ESTIMATE, which starts with a much shorter
random walk, and then proactively estimates the sampling
probability for the node taken before using
acceptance--rejection sampling to adjust the sampling
probability to the predetermined target distribution.
We present a novel backward random walk technique which
provides provably unbiased estimations for the sampling
probability, and demonstrate the superiority of
WALK-ESTIMATE over traditional random walks through
theoretical analysis and extensive experiments over
real world online social networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Benedikt:2015:QAP,
author = "Michael Benedikt and Julien Leblay and Efthymia
Tsamoura",
title = "Querying with access patterns and integrity
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "6",
pages = "690--701",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 10 17:42:37 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional query processing involves a search for
plans formed by applying algebraic operators on top of
primitives representing access to relations in the
input query. But many querying scenarios involve two
interacting issues that complicate the search. On the
one hand, the search space may be limited by access
restrictions associated with the interfaces to
datasources, which require certain parameters to be
given as inputs. On the other hand, the search space
may be extended through the presence of integrity
constraints that relate sources to each other, allowing
for plans that do not match the structure of the user
query. In this paper we present the first optimization
approach that attacks both these difficulties within a
single framework, presenting a system in which
classical cost-based join optimization is extended to
support both access-restrictions and constraints.
Instead of iteratively exploring subqueries of the
input query, our optimizer explores a space of proofs
that witness the answering of the query, where each
proof has a direct correspondence with a query plan.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tangwongsan:2015:GIS,
author = "Kanat Tangwongsan and Martin Hirzel and Scott
Schneider and Kun-Lung Wu",
title = "General incremental sliding-window aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "702--713",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream processing is gaining importance as more data
becomes available in the form of continuous streams and
companies compete to promptly extract insights from
them. In such applications, sliding-window aggregation
is a central operator, and incremental aggregation
helps avoid the performance penalty of re-aggregating
from scratch for each window change. This paper
presents Reactive Aggregator (RA), a new framework for
incremental sliding-window aggregation. RA is general
in that it does not require aggregation functions to be
invertible or commutative, and it does not require
windows to be FIFO. We implemented RA as a drop-in
replacement for the Aggregate operator of a commercial
streaming engine. Given m updates on a window of size
$n$, RA has an algorithmic complexity of $ O(m + m \log
(n / m))$, rivaling the best prior algorithms for any
$m$. Furthermore, RA's implementation minimizes
overheads from allocation and pointer traversals by
using a single flat array.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lei:2015:SER,
author = "Chuan Lei and Zhongfang Zhuang and Elke A.
Rundensteiner and Mohamed Eltabakh",
title = "Shared execution of recurring workloads in
{MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "714--725",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing complexity of data-intensive
MapReduce workloads, Hadoop must often accommodate
hundreds or even thousands of recurring analytics
queries that periodically execute over frequently
updated datasets, e.g., latest stock transactions, new
log files, or recent news feeds. For many applications,
such recurring queries come with user-specified
service-level agreements (SLAs), commonly expressed as
the maximum allowed latency for producing results
before their merits decay. The recurring nature of
these emerging workloads combined with their SLA
constraints make it challenging to share and optimize
their execution. While some recent efforts on multi-job
optimization in MapReduce have emerged, they focus on
only sharing work among ad-hoc jobs on static datasets.
Unfortunately, these sharing techniques neither take
the recurring nature of the queries into account nor
guarantee the satisfaction of the SLA requirements. In
this work, we propose the first scalable multi-query
sharing engine tailored for recurring workloads in the
MapReduce infrastructure, called ``Helix''. Helix
deploys new sliced window-alignment techniques to
create sharing opportunities among recurring queries
without introducing additional I/O overheads or
unnecessary data scans. And then, Helix introduces a
cost/benefit model for creating a sharing plan among
the recurring queries, and a scheduling strategy for
executing them to maximize the SLA satisfaction. Our
experimental results over real-world datasets confirm
that Helix significantly outperforms the state-of-the-art
techniques by an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Narasayya:2015:SBP,
author = "Vivek Narasayya and Ishai Menache and Mohit Singh and
Feng Li and Manoj Syamala and Surajit Chaudhuri",
title = "Sharing buffer pool memory in multi-tenant relational
database-as-a-service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "726--737",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Relational database-as-a-service (DaaS) providers need
to rely on multi-tenancy and resource sharing among
tenants, since statically reserving resources for a
tenant is not cost effective. A major consequence of
resource sharing is that the performance of one tenant
can be adversely affected by resource demands of other
co-located tenants. One such resource that is essential
for good performance of a tenant's workload is buffer
pool memory. In this paper, we study the problem of how
to effectively share buffer pool memory in multi-tenant
relational DaaS. We first develop an SLA framework that
defines and enforces accountability of the service
provider to the tenant even when buffer pool memory is
not statically reserved on behalf of the tenant. Next,
we present a novel buffer pool page replacement
algorithm (MT-LRU) that builds upon theoretical
concepts from weighted online caching, and is designed
for multi-tenant scenarios involving SLAs and
overbooking. MT-LRU generalizes the LRU-K algorithm
which is commonly used in relational database systems.
We have prototyped our techniques inside a commercial
DaaS engine and extensive experiments demonstrate the
effectiveness of our solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2015:AWQ,
author = "Yunjun Gao and Qing Liu and Gang Chen and Baihua Zheng
and Linlin Zhou",
title = "Answering why-not questions on reverse top-$k$
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "738--749",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Why-not questions, which aim to seek clarifications on
the missing tuples for query results, have recently
received considerable attention from the database
community. In this paper, we systematically explore
why-not questions on reverse top-$k$ queries, owing to
its importance in multi-criteria decision making. Given
an initial reverse top-$k$ query and a missing/why-not
weighting vector set W$_m$ that is absent from the
query result, why-not questions on reverse top-$k$
queries explain why W$_m$ does not appear in the query
result and provide suggestions on how to refine the
initial query with minimum penalty to include W$_m$ in
the refined query result. We first formalize why-not
questions on reverse top-$k$ queries and reveal their
semantics, and then propose a unified framework called
WQRTQ to answer why-not questions on both monochromatic
and bichromatic reverse top-$k$ queries. Our framework
offers three solutions, namely, (i) modifying a query
point $q$, (ii) modifying a why-not weighting vector
set W$_m$ and a parameter $k$, and (iii) modifying $q$,
W$_m$, and $k$ simultaneously, to cater for different
application scenarios. Extensive experimental
evaluation using both real and synthetic data sets
verifies the effectiveness and efficiency of the
presented algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadopoulos:2015:PAP,
author = "Dimitrios Papadopoulos and Charalampos Papamanthou and
Roberto Tamassia and Nikos Triandopoulos",
title = "Practical authenticated pattern matching with optimal
proof size",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "750--761",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We address the problem of authenticating pattern
matching queries over textual data that is outsourced
to an untrusted cloud server. By employing
cryptographic accumulators in a novel optimal
integrity-checking tool built directly over a suffix
tree, we design the first authenticated data structure
for verifiable answers to pattern matching queries
featuring fast generation of constant-size proofs. We
present two main applications of our new construction
to authenticate: (i) pattern matching queries over text
documents, and (ii) exact path queries over XML
documents. Answers to queries are verified by proofs of
size at most 500 bytes for text pattern matching, and
at most 243 bytes for exact path XML search,
independently of the document or answer size. By
design, our authentication schemes can also be
parallelized to offer extra efficiency during data
outsourcing. We provide a detailed experimental
evaluation of our schemes showing that for both
applications the times required to compute and verify a
proof are very small --- e.g., it takes less than $ 10
\mu $ s to generate a proof for a pattern (mis)match of
$ 10^2 $ characters in a text of $ 10^6 $ characters,
once the query has been evaluated.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Loghin:2015:PSB,
author = "Dumitrel Loghin and Bogdan Marius Tudor and Hao Zhang
and Beng Chin Ooi and Yong Meng Teo",
title = "A performance study of big data on small nodes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "762--773",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The continuous increase in volume, variety and
velocity of Big Data exposes datacenter resource
scaling to an energy utilization problem.
Traditionally, datacenters employ x86-64 (big) server
nodes with power usage of tens to hundreds of Watts.
But lately, low-power (small) systems originally
developed for mobile devices have seen significant
improvements in performance. These improvements could
lead to the adoption of such small systems in servers,
as announced by major industry players. In this
context, we systematically conduct a performance study
of Big Data execution on small nodes in comparison with
traditional big nodes, and present insights that would
be useful for future development. We run Hadoop
MapReduce, MySQL and in-memory Shark workloads on
clusters of ARM big.LITTLE boards and Intel Xeon
server systems. We evaluate execution time, energy
usage and total cost of running the workloads on
self-hosted ARM and Xeon nodes. Our study shows that
there is no one size fits all rule for judging the
efficiency of executing Big Data workloads on small and
big nodes. But small memory size, low memory and I/O
bandwidths, and software immaturity concur in canceling
the lower-power advantage of ARM servers. We show that
I/O-intensive MapReduce workloads are more
energy-efficient to run on Xeon nodes. In contrast,
database query processing is always more
energy-efficient on ARM servers, at the cost of
slightly lower throughput. With minor software
modifications, CPU-intensive MapReduce workloads are
almost four times cheaper to execute on ARM servers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papenbrock:2015:DCB,
author = "Thorsten Papenbrock and Sebastian Kruse and
Jorge-Arnulfo Quian{\'e}-Ruiz and Felix Naumann",
title = "Divide \& conquer-based inclusion dependency
discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "774--785",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The discovery of all inclusion dependencies (INDs) in
a dataset is an important part of any data profiling
effort. Apart from the detection of foreign key
relationships, INDs can help to perform data
integration, query optimization, integrity checking, or
schema (re-)design. However, the detection of INDs gets
harder as datasets become larger in terms of number of
tuples as well as attributes. To this end, we propose
Binder, an IND detection system that is capable of
detecting both unary and $n$-ary INDs. It is based on a
divide \& conquer approach, which allows to handle very
large datasets --- an important property on the face of
the ever increasing size of today's data. In contrast
to most related works, we do not rely on existing
database functionality nor assume that inspected
datasets fit into main memory. This renders Binder an
efficient and scalable competitor. Our exhaustive
experimental evaluation shows the high superiority of
Binder over the state-of-the-art in both unary (Spider)
and $n$-ary (Mind) IND discovery. Binder is up to $ 26
\times $ faster than Spider and more than $ 2500 \times
$ faster than Mind.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2015:PBT,
author = "Shimin Chen and Qin Jin",
title = "Persistent {B+}-trees in non-volatile main memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "786--797",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Computer systems in the near future are expected to
have Non-Volatile Main Memory (NVMM), enabled by a new
generation of Non-Volatile Memory (NVM) technologies,
such as Phase Change Memory (PCM), STT-MRAM, and
Memristor. The non-volatility property has the promise
to persist in-memory data structures for instantaneous
failure recovery. However, realizing such promise
requires a careful design to ensure that in-memory data
structures are in known consistent states after
failures. This paper studies persistent in-memory $
B^+$-Trees as $ B^+$-Trees are widely used in database
and data-intensive systems. While traditional
techniques, such as undo-redo logging and shadowing,
support persistent $ B^+$-Trees, we find that they
incur drastic performance overhead because of extensive
NVM writes and CPU cache flush operations. PCM-friendly
$ B^+$-Trees with unsorted leaf nodes help mediate this
issue, but the remaining overhead is still large. In
this paper, we propose write atomic $ B^+$-Trees (w$
B^+$-Trees), a new type of main-memory $ B^+$-Trees,
that aim to reduce such overhead as much as possible. $
w B^+$-Tree nodes employ a small indirect slot array
and/or a bitmap so that most insertions and deletions
do not require the movement of index entries. In this
way, $ w B^+$-Trees can achieve node consistency either
through atomic writes in the nodes or by redo-only
logging. We model fast NVM using DRAM on a real machine
and model PCM using a cycle-accurate simulator.
Experimental results show that compared with previous
persistent $ B^+$-Tree solutions, $ w B^+$-Trees
achieve up to $ 8.8 \times $ speedups on DRAM-like fast
NVM and up to $ 27.1 \times $ speedups on PCM for
insertions and deletions while maintaining good search
performance. Moreover, we replaced Memcached's internal
hash index with tree indices. Our real machine
Memcached experiments show that $ w B^+$-Trees achieve
up to $ 3.8 \times $ improvements over previous persistent tree
structures with undo-redo logging or shadowing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2015:RLC,
author = "Yubao Wu and Ruoming Jin and Jing Li and Xiang Zhang",
title = "Robust local community detection: on free rider effect
and its elimination",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "798--809",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a large network, local community detection aims
at finding the community that contains a set of query
nodes and also maximizes (minimizes) a goodness metric.
This problem has recently drawn intense research
interest. Various goodness metrics have been proposed.
However, most existing metrics tend to include
irrelevant subgraphs in the detected local community.
We refer to such irrelevant subgraphs as free riders.
We systematically study the existing goodness metrics
and provide theoretical explanations on why they may
cause the free rider effect. We further develop a query
biased node weighting scheme to reduce the free rider
effect. In particular, each node is weighted by its
proximity to the query node. We define a query biased
density metric to integrate the edge and node weights.
The query biased densest subgraph, which has the
largest query biased density, will shift to the
neighborhood of the query nodes after node weighting.
We then formulate the query biased densest connected
subgraph (QDC) problem, study its complexity, and
provide efficient algorithms to solve it. We perform
extensive experiments on a variety of real and
synthetic networks to evaluate the effectiveness and
efficiency of the proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2015:UCC,
author = "Hua Fan and Aditya Ramaraju and Marlon McKenzie and
Wojciech Golab and Bernard Wong",
title = "Understanding the causes of consistency anomalies in
{Apache Cassandra}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "810--813",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A recent paper on benchmarking eventual consistency
showed that when a constant workload is applied against
Cassandra, the staleness of values returned by read
operations exhibits interesting but unexplained
variations when plotted against time. In this paper we
reproduce this phenomenon and investigate in greater
depth the low-level mechanisms that give rise to stale
reads. We show that the staleness spikes exhibited by
Cassandra are strongly correlated with garbage
collection, particularly the ``stop-the-world'' phase
which pauses all application threads in a Java virtual
machine. We show experimentally that the staleness
spikes can be virtually eliminated by delaying read
operations artificially at servers immediately after a
garbage collection pause. In our experiments this
yields more than a 98\% reduction in the number of
consistency anomalies that exceed 5ms, and has
negligible impact on throughput and latency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aslay:2015:VMM,
author = "Cigdem Aslay and Wei Lu and Francesco Bonchi and Amit
Goyal and Laks V. S. Lakshmanan",
title = "Viral marketing meets social advertising: ad
allocation with minimum regret",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "7",
pages = "814--825",
month = feb,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:04:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social advertisement is one of the fastest growing
sectors in the digital advertisement landscape: ads in
the form of promoted posts are shown in the feed of
users of a social networking platform, along with
normal social posts; if a user clicks on a promoted
post, the host (social network owner) is paid a fixed
amount from the advertiser. In this context, allocating
ads to users is typically performed by maximizing
click-through-rate, i.e., the likelihood that the user
will click on the ad. However, this simple strategy
fails to leverage the fact that ads can propagate
virally through the network, from endorsing users to
their followers. In this paper, we study the problem of
allocating ads to users through the viral-marketing
lenses. We show that allocation that takes into account
the propensity of ads for viral propagation can achieve
significantly better performance. However, uncontrolled
virality could be undesirable for the host as it
creates room for exploitation by the advertisers:
hoping to tap uncontrolled virality, an advertiser
might declare a lower budget for its marketing
campaign, aiming at the same large outcome with a
smaller cost. This creates a challenging trade-off: on
the one hand, the host aims at leveraging virality and
the network effect to improve advertising efficacy,
while on the other hand the host wants to avoid giving
away free service due to uncontrolled virality. We
formalize this as the problem of ad allocation with
minimum regret, which we show is NP-hard and
inapproximable w.r.t. any factor. However, we devise an
algorithm that provides approximation guarantees w.r.t.
the total budget of all advertisers. We develop a
scalable version of our approximation algorithm, which
we extensively test on four real-world data sets,
confirming that our algorithm delivers high quality
solutions, is scalable, and significantly outperforms
several natural baselines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2015:ASD,
author = "Lingyang Chu and Shuhui Wang and Siyuan Liu and
Qingming Huang and Jian Pei",
title = "{ALID}: scalable dominant cluster detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "826--837",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Detecting dominant clusters is important in many
analytic applications. The state-of-the-art methods
find dense subgraphs on the affinity graph as dominant
clusters. However, the time and space complexities of
those methods are dominated by the construction of
affinity graph, which is quadratic with respect to the
number of data points, and thus are impractical on
large data sets. To tackle the challenge, in this
paper, we apply Evolutionary Game Theory (EGT) and
develop a scalable algorithm, Approximate Localized
Infection Immunization Dynamics (ALID). The major idea
is to perform Localized Infection Immunization Dynamics
(LID) to find dense subgraphs within local ranges of
the affinity graph. LID is further scaled up with
guaranteed high efficiency and detection quality by an
estimated Region of Interest (ROI) and a Candidate
Infective Vertex Search method (CIVS). ALID only
constructs small local affinity graphs and has time
complexity $ O(C(a^* + \delta) n) $ and space
complexity $ O(a^*(a^* + \delta)) $, where $ a^* $ is
the size of the largest dominant cluster, and $ C \ll n
$ and $ \delta \ll n $ are small constants. We
demonstrate by extensive experiments on both synthetic
data and real world data that ALID achieves the
state-of-the-art detection quality with much lower time
and space cost on a single machine. We also demonstrate
the encouraging parallelization performance of ALID by
implementing the Parallel ALID (PALID) on Apache Spark.
PALID processes 50 million SIFT data points in 2.29
hours, achieving a speedup ratio of 7.51 with 8
executors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shao:2015:ESS,
author = "Yingxia Shao and Bin Cui and Lei Chen and Mingming Liu
and Xing Xie",
title = "An efficient similarity search framework for {SimRank}
over large dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "838--849",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SimRank is an important measure of vertex-pair
similarity according to the structure of graphs. The
similarity search based on SimRank is an important
operation for identifying similar vertices in a graph
and has been employed in many data analysis
applications. Nowadays, graphs in the real world become
much larger and more dynamic. The existing solutions
for similarity search are expensive in terms of time
and space cost. None of them can efficiently support
similarity search over large dynamic graphs. In this
paper, we propose a novel two-stage random-walk
sampling framework (TSF) for SimRank-based similarity
search (e.g., top-$k$ search). In the preprocessing
stage, TSF samples a set of one-way graphs to index raw
random walks in a novel manner within $ O(N R_g)$ time
and space, where $N$ is the number of vertices and $
R_g$ is the number of one-way graphs. The one-way graph
can be efficiently updated in accordance with the graph
modification, thus TSF is well suited to dynamic
graphs. During the query stage, TSF can search similar
vertices fast by naturally pruning unqualified vertices
based on the connectivity of one-way graphs.
Furthermore, with additional $ R_q$ samples, TSF can
estimate the SimRank score with probability $ 1 -
\delta $ if the error of approximation is bounded by
$ \epsilon $. Finally, to guarantee the scalability of
TSF, the one-way graphs can also be compactly stored on
the disk when the memory is limited. Extensive
experiments have demonstrated that TSF can handle
dynamic billion-edge graphs with high performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ahmad:2015:CMD,
author = "Muhammad Yousuf Ahmad and Bettina Kemme",
title = "Compaction management in distributed key--value
datastores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "850--861",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Compactions are a vital maintenance mechanism used by
datastores based on the log-structured merge-tree to
counter the continuous buildup of data files under
update-intensive workloads. While compactions help keep
read latencies in check over the long run, this comes
at the cost of significantly degraded read performance
over the course of the compaction itself. In this
paper, we offer an in-depth analysis of
compaction-related performance overheads and propose
techniques for their mitigation. We offload large,
expensive compactions to a dedicated compaction server
to allow the datastore server to better utilize its
resources towards serving the actual workload.
Moreover, since the newly compacted data is already
cached in the compaction server's main memory, we fetch
this data over the network directly into the datastore
server's local cache, thereby avoiding the performance
penalty of reading it back from the filesystem. In
fact, pre-fetching the compacted data from the remote
cache prior to switching the workload over to it can
eliminate local cache misses altogether. Therefore, we
implement a smarter warmup algorithm that ensures that
all incoming read requests are served from the
datastore server's local cache even as it is warming
up. We have integrated our solution into HBase, and
using the YCSB and TPC-C benchmarks, we show that our
approach significantly mitigates compaction-related
performance problems. We also demonstrate the
scalability of our solution by distributing compactions
across multiple compaction servers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guerraoui:2015:DPD,
author = "Rachid Guerraoui and Anne-Marie Kermarrec and Rhicheek
Patra and Mahsa Taziki",
title = "{D2P}: distance-based differential privacy in
recommenders",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "862--873",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The upsurge in the number of web users over the last
two decades has resulted in a significant growth of
online information. This information growth calls for
recommenders that personalize the information proposed
to each individual user. Nevertheless, personalization
also opens major privacy concerns. This paper presents
D2P, a novel protocol that ensures a strong form of
differential privacy, which we call distance-based
differential privacy, and which is particularly well
suited to recommenders. D2P avoids revealing exact user
profiles by creating altered profiles where each item
is replaced with another one at some distance. We
evaluate D2P analytically and experimentally on
MovieLens and Jester datasets and compare it with other
private and non-private recommenders.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mitliagkas:2015:FFP,
author = "Ioannis Mitliagkas and Michael Borokhovich and
Alexandros G. Dimakis and Constantine Caramanis",
title = "{FrogWild!}: fast {PageRank} approximations on graph
engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "874--885",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose FrogWild, a novel algorithm for fast
approximation of high PageRank vertices, geared towards
reducing network costs of running traditional PageRank
algorithms. Our algorithm can be seen as a quantized
version of power iteration that performs multiple
parallel random walks over a directed graph. One
important innovation is that we introduce a
modification to the GraphLab framework that only
partially synchronizes mirror vertices. This partial
synchronization vastly reduces the network traffic
generated by traditional PageRank algorithms, thus
greatly reducing the per-iteration cost of PageRank. On
the other hand, this partial synchronization also
creates dependencies between the random walks used to
estimate PageRank. Our main theoretical innovation is
the analysis of the correlations introduced by this
partial synchronization process and a bound
establishing that our approximation is close to the
true PageRank vector. We implement our algorithm in
GraphLab and compare it against the default PageRank
implementation. We show that our algorithm is very
fast, performing each iteration in less than one second
on the Twitter graph and can be up to $ 7 \times $
faster compared to the standard GraphLab PageRank
implementation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vattani:2015:OPC,
author = "Andrea Vattani and Flavio Chierichetti and Keegan
Lowenstein",
title = "Optimal probabilistic cache stampede prevention",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "8",
pages = "886--897",
month = apr,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Apr 15 19:02:29 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "When a frequently-accessed cache item expires,
multiple requests to that item can trigger a cache miss
and start regenerating that same item at the same time.
This phenomenon, known as cache stampede, severely
limits the performance of databases and web servers. A
natural countermeasure to this issue is to let the
processes that perform such requests to randomly ask
for a regeneration before the expiration time of the
item. In this paper we give optimal algorithms for
performing such probabilistic early expirations. Our
algorithms are theoretically optimal and have much
better performances than other solutions used in
real-world applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Potti:2015:DNP,
author = "Navneet Potti and Jignesh M. Patel",
title = "{DAQ}: a new paradigm for approximate query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "898--909",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777599",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many modern applications deal with exponentially
increasing data volumes and aid business-critical
decisions in near real-time. Particularly in
exploratory data analysis, the focus is on interactive
querying and some degree of error in estimated results
is tolerable. A common response to this challenge is
approximate query processing, where the user is
presented with a quick confidence interval estimate
based on a sample of the data. In this work, we
highlight some of the problems that are associated with
this probabilistic approach when extended to more
complex queries, both in semantic interpretation and
the lack of a formal algebra. As an alternative, we
propose deterministic approximate querying (DAQ)
schemes, formalize a closed deterministic approximation
algebra, and outline some design principles for DAQ
schemes. We also illustrate the utility of this
approach with an example deterministic online
approximation scheme which uses a bitsliced index
representation and computes the most significant bits
of the result first. Our prototype scheme delivers
speedups over exact aggregation and predicate
evaluation, and outperforms sampling-based schemes for
extreme value aggregations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Anciaux:2015:SSE,
author = "Nicolas Anciaux and Saliha Lallali and Iulian Sandu
Popa and Philippe Pucheral",
title = "A scalable search engine for mass storage smart
objects",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "910--921",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777600",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a new embedded search engine
designed for smart objects. Such devices are generally
equipped with extremely low RAM and large Flash storage
capacity. To tackle these conflicting hardware
constraints, conventional search engines privilege
either insertion or query scalability but cannot meet
both requirements at the same time. Moreover, very few
solutions support document deletions and updates in
this context. In this paper, we introduce three design
principles, namely Write-Once Partitioning, Linear
Pipelining and Background Linear Merging, and show how
they can be combined to produce an embedded search
engine reconciling high insert\slash delete\slash
update rate and query scalability. We have implemented
our search engine on a development board having a
hardware configuration representative for smart objects
and have conducted extensive experiments using two
representative datasets. The experimental results
demonstrate the scalability of the approach and its
superiority compared to state of the art methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:SMD,
author = "Lanjun Wang and Shuo Zhang and Juwei Shi and Limei
Jiao and Oktie Hassanzadeh and Jia Zou and Chen Wang",
title = "Schema management for document stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "922--933",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777601",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Document stores that provide the efficiency of a
schema-less interface are widely used by developers in
mobile and cloud applications. However, the simplicity
developers achieved controversially leads to complexity
for data management due to lack of a schema. In this
paper, we present a schema management framework for
document stores. This framework discovers and persists
schemas of JSON records in a repository, and also
supports queries and schema summarization. The major
technical challenge comes from varied structures of
records caused by the schema-less data model and schema
evolution. In the discovery phase, we apply a canonical
form based method and propose an algorithm based on
equivalent sub-trees to group equivalent schemas
efficiently. Together with the algorithm, we propose a
new data structure, eSiBu-Tree, to store schemas and
support queries. In order to present a single
summarized representation for heterogeneous schemas in
records, we introduce the concept of ``skeleton'', and
propose to use it as a relaxed form of the schema,
which captures a small set of core attributes. Finally,
extensive experiments based on real data sets
demonstrate the efficiency of our proposed schema
discovery algorithms, and practical use cases in
real-world data exploration and integration scenarios
are presented to illustrate the effectiveness of using
skeletons in these applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schuhknecht:2015:SDS,
author = "Felix Martin Schuhknecht and Pankaj Khanchandani and
Jens Dittrich",
title = "On the surprising difficulty of simple things: the
case of radix partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "934--937",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777602",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Partitioning a dataset into ranges is a task that is
common in various applications such as sorting
[1,6,7,8,9] and hashing [3] which are in turn building
blocks for almost any type of query processing.
Especially radix-based partitioning is very popular due
to its simplicity and high performance over
comparison-based versions [6].",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2015:KBT,
author = "Xin Luna Dong and Evgeniy Gabrilovich and Kevin Murphy
and Van Dang and Wilko Horn and Camillo Lugaresi and
Shaohua Sun and Wei Zhang",
title = "Knowledge-based trust: estimating the trustworthiness
of web sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "938--949",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777603",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The quality of web sources has been traditionally
evaluated using exogenous signals such as the hyperlink
structure of the graph. We propose a new approach that
relies on endogenous signals, namely, the correctness
of factual information provided by the source. A source
that has few false facts is considered to be
trustworthy. The facts are automatically extracted from
each source by information extraction methods commonly
used to construct knowledge bases. We propose a way to
distinguish errors made in the extraction process from
factual errors in the web source per se, by using joint
inference in a novel multi-layer probabilistic model.
We call the trustworthiness score we computed
Knowledge-Based Trust (KBT). On synthetic data, we show
that our method can reliably compute the true
trustworthiness levels of the sources. We then apply it
to a database of 2.8B facts extracted from the web, and
thereby estimate the trustworthiness of 119M webpages.
Manual evaluation of a subset of the results confirms
the effectiveness of the method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2015:GUB,
author = "Minyang Han and Khuzaima Daudjee",
title = "{Giraph} unchained: barrierless asynchronous parallel
execution in {Pregel}-like graph processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "950--961",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777604",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The bulk synchronous parallel (BSP) model used by
synchronous graph processing systems allows algorithms
to be easily implemented and reasoned about. However,
BSP can suffer from poor performance due to stale
messages and frequent global synchronization barriers.
Asynchronous computation models have been proposed to
alleviate these overheads but existing asynchronous
systems that implement such models have limited
scalability or retain frequent global barriers, and do
not always support graph mutations or algorithms with
multiple computation phases. We propose barrierless
asynchronous parallel (BAP), a new computation model
that reduces both message staleness and global
synchronization. This enables BAP to overcome the
limitations of existing asynchronous models while
retaining support for graph mutations and algorithms
with multiple computation phases. We present GiraphUC,
which implements our BAP model in the open source
distributed graph processing system Giraph, and
evaluate our system at scale with large real-world
graphs on 64 EC2 machines. We show that GiraphUC
provides across-the-board performance improvements of
up to $ 5 \times $ faster over synchronous systems and
up to an order of magnitude faster than asynchronous
systems. Our results demonstrate that the BAP model
provides efficient and transparent asynchronous
execution of algorithms that are programmed
synchronously.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bogh:2015:WEP,
author = "Kenneth S. B{\o}gh and Sean Chester and Ira Assent",
title = "Work-efficient parallel skyline computation for the
{GPU}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "9",
pages = "962--973",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2777598.2777605",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 15 17:15:24 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The skyline operator returns records in a dataset that
provide optimal trade-offs of multiple dimensions.
State-of-the-art skyline computation involves complex
tree traversals, data-ordering, and conditional
branching to minimize the number of point-to-point
comparisons. Meanwhile, GPGPU computing offers the
potential for parallelizing skyline computation across
thousands of cores. However, attempts to port skyline
algorithms to the GPU have prioritized throughput and
failed to outperform sequential algorithms. In this
paper, we introduce a new skyline algorithm, designed
for the GPU, that uses a global, static partitioning
scheme. With the partitioning, we can permit controlled
branching to exploit transitive relationships and avoid
most point-to-point comparisons. The result is a
non-traditional GPU algorithm, SkyAlign, that
prioritizes work-efficiency and respectable throughput,
rather than maximal throughput, to achieve orders of
magnitude faster performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lai:2015:SSE,
author = "Longbin Lai and Lu Qin and Xuemin Lin and Lijun
Chang",
title = "Scalable subgraph enumeration in {MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "974--985",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794368",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Subgraph enumeration, which aims to find all the
subgraphs of a large data graph that are isomorphic to
a given pattern graph, is a fundamental graph problem
with a wide range of applications. However, existing
sequential algorithms for subgraph enumeration fall
short in handling large graphs due to the involvement
of computationally intensive subgraph isomorphism
operations. Thus, some recent researches focus on
solving the problem using MapReduce. Nevertheless,
existing MapReduce approaches are not scalable to handle
very large graphs since they either produce a huge
number of partial results or consume a large amount of
memory. Motivated by this, in this paper, we propose a
new algorithm TwinTwigJoin based on a left-deep-join
framework in MapReduce, in which the basic join unit is
a TwinTwig (an edge or two incident edges of a node).
We show that in the Erd{\H{o}}s--R{\'e}nyi random-graph
model, TwinTwigJoin is instance optimal in the
left-deep-join framework under reasonable assumptions,
and we devise an algorithm to compute the optimal join
plan. Three optimization strategies are explored to
improve our algorithm. Furthermore, we discuss how our
approach can be adapted in the power-law random-graph
model. We conduct extensive performance studies in
several real graphs, one of which contains billions of
edges. Our approach significantly outperforms existing
solutions in all tests.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Finis:2015:IHD,
author = "Jan Finis and Robert Brunel and Alfons Kemper and
Thomas Neumann and Norman May and Franz Faerber",
title = "Indexing highly dynamic hierarchical data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "986--997",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794369",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Maintaining and querying hierarchical data in a
relational database system is an important task in many
business applications. This task is especially
challenging when considering dynamic use cases with a
high rate of complex, possibly skewed structural
updates. Labeling schemes are widely considered the
indexing technique of choice for hierarchical data, and
many different schemes have been proposed. However,
they cannot handle dynamic use cases well due to
various problems which we investigate in this paper. We
therefore propose our dynamic Order Indexes, which
offer competitive query performance, unprecedented
update efficiency, and robustness for highly dynamic
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:CDS,
author = "Meng Wang and Chaokun Wang and Jeffrey Xu Yu and Jun
Zhang",
title = "Community detection in social networks: an in-depth
benchmarking study with a procedure-oriented
framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "998--1009",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794370",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Revealing the latent community structure, which is
crucial to understanding the features of networks, is
an important problem in network and graph analysis.
During the last decade, many approaches have been
proposed to solve this challenging problem in diverse
ways, i.e. different measures or data structures.
Unfortunately, experimental reports on existing
techniques fell short in validity and integrity since
many comparisons were not based on a unified code base
or merely discussed in theory. We engage in an in-depth
benchmarking study of community detection in social
networks. We formulate a generalized community
detection procedure and propose a procedure-oriented
framework for benchmarking. This framework enables us
to evaluate and compare various approaches to community
detection systematically and thoroughly under identical
experimental conditions. Upon that we can analyze and
diagnose the inherent defect of existing approaches
deeply, and further make effective improvements
correspondingly. We have re-implemented ten
state-of-the-art representative algorithms upon this
framework and make comprehensive evaluations of
multiple aspects, including the efficiency evaluation,
performance evaluations, sensitivity evaluations, etc.
We discuss their merits and faults in depth, and draw a
set of take-away interesting conclusions. In addition,
we present how we can make diagnoses for these
algorithms resulting in significant improvements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kazemi:2015:GGM,
author = "Ehsan Kazemi and S. Hamed Hassani and Matthias
Grossglauser",
title = "Growing a graph matching from a handful of seeds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1010--1021",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794371",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In many graph-mining problems, two networks from
different domains have to be matched. In the absence of
reliable node attributes, graph matching has to rely on
only the link structures of the two networks, which
amounts to a generalization of the classic graph
isomorphism problem. Graph matching has applications in
social-network reconciliation and de-anonymization,
protein-network alignment in biology, and computer
vision. The most scalable graph-matching approaches
use ideas from percolation theory, where a matched node
pair ``infects'' neighbouring pairs as additional
potential matches. This class of matching algorithm
requires an initial seed set of known matches to start
the percolation. The size and correctness of the
matching is very sensitive to the size of the seed set.
In this paper, we give a new graph-matching algorithm
that can operate with a much smaller seed set than
previous approaches, with only a small increase in
matching errors. We characterize a phase transition in
matching performance as a function of the seed set
size, using a random bigraph model and ideas from
bootstrap percolation theory. We also show the
excellent performance in matching several real
large-scale social networks, using only a handful of
seeds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cheng:2015:RDB,
author = "Peng Cheng and Xiang Lian and Zhao Chen and Rui Fu and
Lei Chen and Jinsong Han and Jizhong Zhao",
title = "Reliable diversity-based spatial crowdsourcing by
moving workers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1022--1033",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794372",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the rapid development of mobile devices and the
crowdsourcing platforms, the spatial crowdsourcing has
attracted much attention from the database community,
specifically, spatial crowdsourcing refers to sending a
location-based request to workers according to their
positions. In this paper, we consider an important
spatial crowdsourcing problem, namely reliable
diversity-based spatial crowdsourcing (RDB-SC), in
which spatial tasks (such as taking videos/photos of a
landmark or firework shows, and checking whether or not
parking spaces are available) are time-constrained, and
workers are moving towards some directions. Our RDB-SC
problem is to assign workers to spatial tasks such that
the completion reliability and the spatial/temporal
diversities of spatial tasks are maximized. We prove
that the RDB-SC problem is NP-hard and intractable.
Thus, we propose three effective approximation
approaches, including greedy, sampling, and
divide-and-conquer algorithms. In order to improve the
efficiency, we also design an effective
cost-model-based index, which can dynamically maintain
moving workers and spatial tasks with low cost, and
efficiently facilitate the retrieval of RDB-SC answers.
Through extensive experiments, we demonstrate the
efficiency and effectiveness of our proposed approaches
over both real and synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2015:LHF,
author = "Zhuojie Zhou and Nan Zhang and Gautam Das",
title = "Leveraging history for faster sampling of online
social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1034--1045",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794373",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With a vast amount of data available on online social
networks, how to enable efficient analytics over such
data has been an increasingly important research
problem. Given the sheer size of such social networks,
many existing studies resort to sampling techniques
that draw random nodes from an online social network
through its restrictive web/API interface. While these
studies differ widely in analytics tasks supported and
algorithmic design, almost all of them use the exact
same underlying technique of random walk --- a Markov
Chain Monte Carlo based method which iteratively
transits from one node to its random neighbor. Random
walk fits naturally with this problem because, for most
online social networks, the only query we can issue
through the interface is to retrieve the neighbors of a
given node (i.e., no access to the full graph
topology). A problem with random walks, however, is the
``burn-in'' period which requires a large number of
transitions/queries before the sampling distribution
converges to a stationary value that enables the
drawing of samples in a statistically valid manner. In
this paper, we consider a novel problem of speeding up
the fundamental design of random walks (i.e., reducing
the number of queries it requires) without changing the
stationary distribution it achieves --- thereby
enabling a more efficient ``drop-in'' replacement for
existing sampling-based analytics techniques over
online social networks. Technically, our main idea is
to leverage the history of random walks to construct a
higher-ordered Markov chain. We develop two algorithms,
Circulated Neighbors and Groupby Neighbors Random Walk
(CNRW and GNRW) and rigidly prove that, no matter what
the social network topology is, CNRW and GNRW offer
better efficiency than baseline random walks while
achieving the same stationary distribution. We
demonstrate through extensive experiments on real-world
social networks and synthetic graphs the superiority of
our techniques over the existing ones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2015:TFE,
author = "Yufei Ding and Xipeng Shen and Madanlal Musuvathi and
Todd Mytkowicz",
title = "{TOP}: a framework for enabling algorithmic
optimizations for distance-related problems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1046--1057",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794374",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Computing distances among data points is an essential
part of many important algorithms in data analytics,
graph analysis, and other domains. In each of these
domains, developers have spent significant manual
effort optimizing algorithms, often through novel
applications of the triangular inequality, in order to
minimize the number of distance computations in the
algorithms. In this work, we observe that many
algorithms across these domains can be generalized as
an instance of a generic distance-related abstraction.
Based on this abstraction, we derive seven principles
for correctly applying the triangular inequality to
optimize distance-related algorithms. Guided by the
findings, we develop {Triangular} {OPtimizer} (TOP),
the first software framework that is able to
automatically produce optimized algorithms that either
matches or outperforms manually designed algorithms for
solving distance-related problems. TOP achieves up to
237x speedups and 2.5X on average.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Leis:2015:EPW,
author = "Viktor Leis and Kan Kundhikanjana and Alfons Kemper
and Thomas Neumann",
title = "Efficient processing of window functions in analytical
{SQL} queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1058--1069",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794375",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Window functions, also known as analytic OLAP
functions, have been part of the SQL standard for more
than a decade and are now a widely-used feature. Window
functions allow to elegantly express many useful query
types including time series analysis, ranking,
percentiles, moving averages, and cumulative sums.
Formulating such queries in plain SQL-92 is usually
both cumbersome and inefficient. Despite being
supported by all major database systems, there have
been few publications that describe how to implement an
efficient relational window operator. This work aims at
filling this gap by presenting an efficient and general
algorithm for the window operator. Our algorithm is
optimized for high-performance main-memory database
systems and has excellent performance on modern
multi-core CPUs. We show how to fully parallelize all
phases of the operator in order to effectively scale
for arbitrary input distributions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:RTT,
author = "Yuchen Li and Dongxiang Zhang and Kian-Lee Tan",
title = "Real-time targeted influence maximization for online
advertisements",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1070--1081",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794376",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Advertising in social network has become a
multi-billion-dollar industry. A main challenge is to
identify key influencers who can effectively contribute
to the dissemination of information. Although the
influence maximization problem, which finds a seed set
of k most influential users based on certain
propagation models, has been well studied, it is not
target-aware and cannot be directly applied to online
advertising. In this paper, we propose a new problem,
named Keyword-Based Targeted Influence Maximization
(KB-TIM), to find a seed set that maximizes the
expected influence over users who are relevant to a
given advertisement. To solve the problem, we propose a
sampling technique based on weighted reverse influence
set and achieve an approximation ratio of $ (1 - 1 /
e - \epsilon) $. To meet the instant-speed
requirement, we propose two disk-based solutions that
improve the query processing time by two orders of
magnitude over the state-of-the-art solutions, while
keeping the theoretical bound. Experiments conducted on
two real social networks confirm our theoretical
findings as well as the efficiency. Given an
advertisement with 5 keywords, it takes only 2 seconds
to find the most influential users in a social network
with billions of edges.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papenbrock:2015:FDD,
author = "Thorsten Papenbrock and Jens Ehrlich and Jannik Marten
and Tommy Neubert and Jan-Peer Rudolph and Martin
Sch{\"o}nberg and Jakob Zwiener and Felix Naumann",
title = "Functional dependency discovery: an experimental
evaluation of seven algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1082--1093",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794377",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Functional dependencies are important metadata used
for schema normalization, data cleansing and many other
tasks. The efficient discovery of functional
dependencies in tables is a well-known challenge in
database research and has seen several approaches.
Because no comprehensive comparison between these
algorithms exist at the time, it is hard to choose the
best algorithm for a given dataset. In this
experimental paper, we describe, evaluate, and compare
the seven most cited and most important algorithms, all
solving this same problem. First, we classify the
algorithms into three different categories, explaining
their commonalities. We then describe all algorithms
with their main ideas. The descriptions provide
additional details where the original papers were
ambiguous or incomplete. Our evaluation of careful
re-implementations of all algorithms spans a broad test
space including synthetic and real-world data. We show
that all functional dependency algorithms optimize for
certain data characteristics and provide hints on when
to choose which algorithm. In summary, however, all
current approaches scale surprisingly poorly, showing
potential for future research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kalinin:2015:SEI,
author = "Alexander Kalinin and Ugur Cetintemel and Stan
Zdonik",
title = "{Searchlight}: enabling integrated search and
exploration over large multidimensional data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1094--1105",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794378",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a new system, called Searchlight, that
uniquely integrates constraint solving and data
management techniques. It allows Constraint Programming
(CP) machinery to run efficiently inside a DBMS without
the need to extract, transform and move the data. This
marriage concurrently offers the rich expressiveness
and efficiency of constraint-based search and
optimization provided by modern CP solvers, and the
ability of DBMSs to store and query data at scale,
resulting in an enriched functionality that can
effectively support both data- and search-intensive
applications. As such, Searchlight is the first system
to support generic search, exploration and mining over
large multi-dimensional data collections, going beyond
point algorithms designed for point search and mining
tasks. Searchlight makes the following scientific
contributions: o Constraint solvers as first-class
citizens Instead of treating solver logic as a
black-box, Searchlight provides native support,
incorporating the necessary APIs for its specification
and transparent execution as part of query plans, as
well as novel algorithms for its optimized execution
and parallelization. o Speculative solving Existing
solvers assume that the entire data set is main-memory
resident. Searchlight uses an innovative two stage
Solve-Validate approach that allows it to operate
speculatively yet safely on main-memory synopses,
quickly producing candidate search results that can
later be efficiently validated on real data. o
Computation and I/O load balancing As CP solver logic
can be computationally expensive, executing it on large
search and data spaces requires novel CPU-I/O balancing
approaches when performing search distribution. We
built a prototype implementation of Searchlight on
Google's Or-Tools, an open-source suite of operations
research tools, and the array DBMS SciDB. Extensive
experimental results show that Searchlight often
performs orders of magnitude faster than the next best
approach (SciDB-only or CP-solver-only) in terms of end
response time and time to first result.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rahman:2015:PID,
author = "Md Farhadur Rahman and Weimo Liu and Saravanan
Thirumuruganathan and Nan Zhang and Gautam Das",
title = "Privacy implications of database ranking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "10",
pages = "1106--1117",
month = jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2794367.2794379",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:06 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, there has been much research in the
adoption of Ranked Retrieval model (in addition to the
Boolean retrieval model) in structured databases,
especially those in a client-server environment (e.g.,
web databases). With this model, a search query returns
top- k tuples according to not just exact matches of
selection conditions, but a suitable ranking function.
While much research has gone into the design of ranking
functions and the efficient processing of top- k
queries, this paper studies a novel problem on the
privacy implications of database ranking. The
motivation is a novel yet serious privacy leakage we
found on real-world web databases which is caused by
the ranking function design. Many such databases
feature private attributes --- e.g., a social network
allows users to specify certain attributes as only
visible to him/herself, but not to others. While these
websites generally respect the privacy settings by not
directly displaying private attribute values in search
query answers, many of them nevertheless take into
account such private attributes in the ranking function
design. The conventional belief might be that tuple
ranks alone are not enough to reveal the private
attribute values. Our investigation, however, shows
that this is not the case in reality. To address the
problem, we introduce a taxonomy of the problem space
with two dimensions, (1) the type of query interface
and (2) the capability of adversaries. For each
subspace, we develop a novel technique which either
guarantees the successful inference of private
attributes, or does so for a significant portion of
real-world tuples. We demonstrate the effectiveness and
efficiency of our techniques through theoretical
analysis, extensive experiments over real-world
datasets, as well as successful online attacks over
websites with tens to hundreds of millions of users ---
e.g., Amazon Goodreads and Renren.com.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kohler:2015:PCS,
author = "Henning K{\"o}hler and Sebastian Link and Xiaofang
Zhou",
title = "Possible and certain {SQL} keys",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1118--1129",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809975",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Driven by the dominance of the relational model, the
requirements of modern applications, and the veracity
of data, we revisit the fundamental notion of a key in
relational databases with NULLs. In SQL database
systems primary key columns are NOT NULL by default.
NULL columns may occur in unique constraints which only
guarantee uniqueness for tuples which do not feature
null markers in any of the columns involved, and
therefore serve a different function than primary keys.
We investigate the notions of possible and certain
keys, which are keys that hold in some or all possible
worlds that can originate from an SQL table,
respectively. Possible keys coincide with the unique
constraint of SQL, and thus provide a semantics for
their syntactic definition in the SQL standard. Certain
keys extend primary keys to include NULL columns, and
thus form a sufficient and necessary condition to
identify tuples uniquely, while primary keys are only
sufficient for that purpose. In addition to basic
characterization, axiomatization, and simple discovery
approaches for possible and certain keys, we
investigate the existence and construction of Armstrong
tables, and describe an indexing scheme for enforcing
certain keys. Our experiments show that certain keys
with NULLs do occur in real-world databases, and that
related computational problems can be solved
efficiently. Certain keys are therefore semantically
well-founded and able to maintain data quality in the
form of Codd's entity integrity rule while handling the
requirements of modern applications, that is, higher
volumes of incomplete data from different formats.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tang:2015:SSJ,
author = "Yu Tang and Yilun Cai and Nikos Mamoulis",
title = "Scaling similarity joins over tree-structured data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1130--1141",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809976",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a large collection of tree-structured objects
(e.g., XML documents), the similarity join finds the
pairs of objects that are similar to each other, based
on a similarity threshold and a tree edit distance
measure. The state-of-the-art similarity join methods
compare simpler approximations of the objects (e.g.,
strings), in order to prune pairs that cannot be part
of the similarity join result based on distance bounds
derived by the approximations. In this paper, we
propose a novel similarity join approach, which is
based on the dynamic decomposition of the tree objects
into subgraphs, according to the similarity threshold.
Our technique avoids computing the exact distance
between two tree objects, if the objects do not share
at least one common subgraph. In order to scale up the
join, the computed subgraphs are managed in a two-layer
index. Our experimental results on real and synthetic
data collections show that our approach outperforms the
state-of-the-art methods by up to an order of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rahman:2015:WSE,
author = "Habibur Rahman and Saravanan Thirumuruganathan and
Senjuti Basu Roy and Sihem Amer-Yahia and Gautam Das",
title = "Worker skill estimation in team-based tasks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1142--1153",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809977",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many emerging applications such as collaborative
editing, multi-player games, or fan-subbing require to
form a team of experts to accomplish a task together.
Existing research has investigated how to assign
workers to such team-based tasks to ensure the best
outcome assuming the skills of individual workers to be
known. In this work, we investigate how to estimate
individual worker's skill based on the outcome of the
team-based tasks they have undertaken. We consider two
popular skill aggregation functions and estimate the
skill of the workers, where skill is either a
deterministic value or a probability distribution. We
propose efficient solutions for worker skill estimation
using continuous and discrete optimization techniques.
We present comprehensive experiments and validate the
scalability and effectiveness of our proposed solutions
using multiple real-world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2015:DDP,
author = "Xi He and Graham Cormode and Ashwin Machanavajjhala
and Cecilia M. Procopiuc and Divesh Srivastava",
title = "{DPT}: differentially private trajectory synthesis
using hierarchical reference systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1154--1165",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809978",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "GPS-enabled devices are now ubiquitous, from airplanes
and cars to smartphones and wearable technology. This
has resulted in a wealth of data about the movements of
individuals and populations, which can be analyzed for
useful information to aid in city and traffic planning,
disaster preparedness and so on. However, the places
that people go can disclose extremely sensitive
information about them, and thus their use needs to be
filtered through privacy preserving mechanisms. This
turns out to be a highly challenging task: raw
trajectories are highly detailed, and typically no pair
is alike. Previous attempts fail either to provide
adequate privacy protection, or to remain sufficiently
faithful to the original behavior. This paper presents
DPT, a system to synthesize mobility data based on raw
GPS trajectories of individuals while ensuring strong
privacy protection in the form of $ \epsilon
$-differential privacy. DPT makes a number of novel
modeling and algorithmic contributions including (i)
discretization of raw trajectories using hierarchical
reference systems (at multiple resolutions) to capture
individual movements at differing speeds, (ii) adaptive
mechanisms to select a small set of reference systems
and construct prefix tree counts privately, and (iii)
use of direction-weighted sampling for improved
utility. While there have been prior attempts to solve
the subproblems required to generate synthetic
trajectories, to the best of our knowledge, ours is the
first system that provides an end-to-end solution. We
show the efficacy of our synthetic trajectory
generation system using an extensive empirical
evaluation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:SSA,
author = "Boduo Li and Yanlei Diao and Prashant Shenoy",
title = "Supporting scalable analytics with latency
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1166--1177",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809979",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently there has been a significant interest in
building big data analytics systems that can handle
both ``big data'' and ``fast data''. Our work is
strongly motivated by recent real-world use cases that
point to the need for a general, unified data
processing framework to support analytical queries with
different latency requirements. Toward this goal, we
start with an analysis of existing big data systems to
understand the causes of high latency. We then propose
an extended architecture with mini-batches as
granularity for computation and shuffling, and augment
it with new model-driven resource allocation and
runtime scheduling techniques to meet user latency
requirements while maximizing throughput. Results from
real-world workloads show that our techniques,
implemented in Incremental Hadoop, reduce its latency
from tens of seconds to sub-second, with 2x-5x increase
in throughput. Our system also outperforms
state-of-the-art distributed stream systems, Storm and
Spark Streaming, by 1-2 orders of magnitude when
combining latency and throughput.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shiokawa:2015:SEA,
author = "Hiroaki Shiokawa and Yasuhiro Fujiwara and Makoto
Onizuka",
title = "{SCAN++}: efficient algorithm for finding clusters,
hubs and outliers on large-scale graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1178--1189",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809980",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph clustering is one of the key techniques for
understanding the structures present in graphs. Besides
cluster detection, identifying hubs and outliers is
also a key task, since they have important roles to
play in graph data mining. The structural clustering
algorithm SCAN, proposed by Xu et al., is successfully
                 used in many applications because it not only detects
densely connected nodes as clusters but also identifies
sparsely connected nodes as hubs or outliers. However,
it is difficult to apply SCAN to large-scale graphs due
to its high time complexity. This is because it
evaluates the density for all adjacent nodes included
in the given graphs. In this paper, we propose a novel
                 graph clustering algorithm named SCAN++. In order to
                 reduce time complexity, we introduce a new data structure
of directly two-hop-away reachable node set (DTAR).
DTAR is the set of two-hop-away nodes from a given node
that are likely to be in the same cluster as the given
node. SCAN++ employs two approaches for efficient
clustering by using DTARs without sacrificing
clustering quality. First, it reduces the number of the
density evaluations by computing the density only for
the adjacent nodes such as indicated by DTARs. Second,
by sharing a part of the density evaluations for DTARs,
it offers efficient density evaluations of adjacent
nodes. As a result, SCAN++ detects exactly the same
clusters, hubs, and outliers from large-scale graphs as
SCAN with much shorter computation time. Extensive
experiments on both real-world and synthetic graphs
demonstrate the performance superiority of SCAN++ over
existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Faleiro:2015:RSM,
author = "Jose M. Faleiro and Daniel J. Abadi",
title = "Rethinking serializable multiversion concurrency
control",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1190--1201",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809981",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-versioned database systems have the potential to
significantly increase the amount of concurrency in
transaction processing because they can avoid
read-write conflicts. Unfortunately, the increase in
concurrency usually comes at the cost of transaction
serializability. If a database user requests full
serializability, modern multi-versioned systems
significantly constrain read-write concurrency among
conflicting transactions and employ expensive
synchronization patterns in their design. In
main-memory multi-core settings, these additional
constraints are so burdensome that multi-versioned
systems are often significantly outperformed by
                 single-version systems. We propose Bohm, a new
concurrency control protocol for main-memory
multi-versioned database systems. Bohm guarantees
serializable execution while ensuring that reads never
block writes. In addition, Bohm does not require reads
to perform any bookkeeping whatsoever, thereby avoiding
the overhead of tracking reads via contended writes to
shared memory. This leads to excellent scalability and
performance in multi-core settings. Bohm has all the
above characteristics without performing validation
based concurrency control. Instead, it is pessimistic,
and is therefore not prone to excessive aborts in the
presence of contention. An experimental evaluation
shows that Bohm performs well in both high contention
and low contention settings, and is able to
dramatically outperform state-of-the-art
multi-versioned systems despite maintaining the full
set of serializability guarantees.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brancotte:2015:RAT,
author = "Bryan Brancotte and Bo Yang and Guillaume Blin and
Sarah Cohen-Boulakia and Alain Denise and Sylvie
Hamel",
title = "Rank aggregation with ties: experiments and analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1202--1213",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809982",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of aggregating multiple rankings into one
consensus ranking is an active research topic
especially in the database community. Various studies
have implemented methods for rank aggregation and may
have come up with contradicting conclusions upon which
algorithms work best. Comparing such results is
cumbersome, as the original studies mixed different
approaches and used very different evaluation datasets
and metrics. Additionally, in real applications, the
rankings to be aggregated may not be permutations where
elements are strictly ordered, but they may have ties
where some elements are placed at the same position.
However, most of the studies have not considered ties.
This paper introduces the first large scale study of
algorithms for rank aggregation with ties. More
precisely, (i) we review rank aggregation algorithms
and determine whether or not they can handle ties; (ii)
we propose the first implementation to compute the
exact solution of the Rank Aggregation with ties
problem; (iii) we evaluate algorithms for rank
aggregation with ties on a very large panel of both
real and carefully generated synthetic datasets; (iv)
we provide guidance on the algorithms to be favored
depending on dataset features.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sundaram:2015:GHP,
author = "Narayanan Sundaram and Nadathur Satish and Md Mostofa
Ali Patwary and Subramanya R. Dulloor and Michael J.
Anderson and Satya Gautam Vadlamudi and Dipankar Das
and Pradeep Dubey",
title = "{GraphMat}: high performance graph analytics made
productive",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1214--1225",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809983",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given the growing importance of large-scale graph
analytics, there is a need to improve the performance
of graph analysis frameworks without compromising on
productivity. GraphMat is our solution to bridge this
gap between a user-friendly graph analytics framework
and native, hand-optimized code. GraphMat functions by
taking vertex programs and mapping them to high
performance sparse matrix operations in the backend. We
thus get the productivity benefits of a vertex
programming framework without sacrificing performance.
GraphMat is a single-node multicore graph framework
written in C++ which has enabled us to write a diverse
set of graph algorithms with the same effort compared
to other vertex programming frameworks. GraphMat
performs 1.1-7X faster than high performance frameworks
such as GraphLab, CombBLAS and Galois. GraphMat also
matches the performance of MapGraph, a GPU-based graph
framework, despite running on a CPU platform with
significantly lower compute and bandwidth resources. It
achieves better multicore scalability (13-15X on 24
cores) than other frameworks and is 1.2X off native,
hand-optimized code on a variety of graph algorithms.
Since GraphMat performance depends mainly on a few
scalable and well-understood sparse matrix operations,
GraphMat can naturally benefit from the trend of
increasing parallelism in future hardware.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2015:MKC,
author = "Kai Zhang and Kaibo Wang and Yuan Yuan and Lei Guo and
Rubao Lee and Xiaodong Zhang",
title = "{Mega-KV}: a case for {GPUs} to maximize the
throughput of in-memory key--value stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1226--1237",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809984",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-memory key--value stores play a critical role in
data processing to provide high throughput and low
latency data accesses. In-memory key--value stores have
several unique properties that include (1) data
intensive operations demanding high memory bandwidth
for fast data accesses, (2) high data parallelism and
simple computing operations demanding many slim
parallel computing units, and (3) a large working set.
As data volume continues to increase, our experiments
show that conventional and general-purpose multicore
systems are increasingly mismatched to the special
properties of key--value stores because they do not
provide massive data parallelism and high memory
bandwidth; the powerful but the limited number of
computing cores do not satisfy the demand of the unique
data processing task; and the cache hierarchy may not
well benefit to the large working set. In this paper,
we make a strong case for GPUs to serve as
special-purpose devices to greatly accelerate the
operations of in-memory key--value stores.
Specifically, we present the design and implementation
of Mega-KV, a GPU-based in-memory key--value store
system that achieves high performance and high
throughput. Effectively utilizing the high memory
bandwidth and latency hiding capability of GPUs,
Mega-KV provides fast data accesses and significantly
boosts overall performance. Running on a commodity PC
installed with two CPUs and two GPUs, Mega-KV can
process up to 160+ million key--value operations per
second, which is 1.4-2.8 times as fast as the
state-of-the-art key--value store system on a
conventional CPU-based platform.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2015:TSI,
author = "Jinha Kim and Hyungyu Shin and Wook-Shin Han and
Sungpack Hong and Hassan Chafi",
title = "Taming subgraph isomorphism for {RDF} query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1238--1249",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809985",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "RDF data are used to model knowledge in various areas
such as life sciences, Semantic Web, bioinformatics,
and social graphs. The size of real RDF data reaches
billions of triples. This calls for a framework for
efficiently processing RDF data. The core function of
processing RDF data is subgraph pattern matching. There
have been two completely different directions for
supporting efficient subgraph pattern matching. One
direction is to develop specialized RDF query
processing engines exploiting the properties of RDF
data for the last decade, while the other direction is
to develop efficient subgraph isomorphism algorithms
for general, labeled graphs for over 30 years. Although
both directions have a similar goal (i.e., finding
subgraphs in data graphs for a given query graph), they
have been independently researched without clear
reason. We argue that a subgraph isomorphism algorithm
can be easily modified to handle the graph
homomorphism, which is the RDF pattern matching
semantics, by just removing the injectivity constraint.
In this paper, based on the state-of-the-art subgraph
isomorphism algorithm, we propose an in-memory
solution, Turbo$_{HOM + +}$, which is tamed for the RDF
processing, and we compare it with the representative
RDF processing engines for several RDF benchmarks in a
server machine where billions of triples can be loaded
in memory. In order to speed up Turbo$_{HOM + +}$, we
also provide a simple yet effective transformation and
a series of optimization techniques. Extensive
experiments using several RDF benchmarks show that
Turbo$_{HOM + +}$ consistently and significantly
outperforms the representative RDF engines.
Specifically, Turbo$_{HOM + +}$ outperforms its
competitors by up to five orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jiang:2015:SPI,
author = "Lilong Jiang and Arnab Nandi",
title = "{SnapToQuery}: providing interactive feedback during
exploratory query specification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1250--1261",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809986",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A critical challenge in the data exploration process
is discovering and issuing the ``right'' query,
especially when the space of possible queries is large.
This problem of exploratory query specification is
exacerbated by the use of interactive user interfaces
driven by mouse, touch, or next-generation,
                 three-dimensional, motion capture-based devices, which
are often imprecise due to jitter and sensitivity
issues. In this paper, we propose SnapToQuery, a novel
technique that guides users through the query space by
providing interactive feedback during the query
specification process by ``snapping'' to the user's
likely intended queries. These intended queries can be
derived from prior query logs, or from the data itself,
using methods described in this paper. In order to
provide interactive response times over large datasets,
we propose two data reduction techniques when snapping
to these queries. Performance experiments demonstrate
that our algorithms help maintain an interactive
experience while allowing for accurate guidance. User
studies over three kinds of devices (mouse, touch, and
motion capture) show that SnapToQuery can help users
specify queries quicker and more accurately; resulting
in a query specification time speedup of $ 1.4 \times $
for mouse and touch-based devices and $ 2.2 \times $
for motion capture-based devices.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2015:GFI,
author = "Yang Zhou and Ling Liu and Kisung Lee and Qi Zhang",
title = "{GraphTwist}: fast iterative graph computation with
two-tier optimizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1262--1273",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809987",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale real-world graphs are known to have highly
skewed vertex degree distribution and highly skewed
edge weight distribution. Existing vertex-centric
iterative graph computation models suffer from a number
of serious problems: (1) poor performance of parallel
execution due to inherent workload imbalance at vertex
level; (2) inefficient CPU resource utilization due to
short execution time for low-degree vertices compared
to the cost of in-memory or on-disk vertex access; and
(3) incapability of pruning insignificant vertices or
edges to improve the computational performance. In this
paper, we address the above technical challenges by
designing and implementing a scalable, efficient, and
provably correct two-tier graph parallel processing
system, GraphTwist. At storage and access tier,
GraphTwist maximizes parallel efficiency by employing
three graph parallel abstractions for partitioning a
big graph by slice, strip or dice based partitioning
techniques. At computation tier, GraphTwist presents
two utility-aware pruning strategies: slice pruning and
cut pruning, to further improve the computational
performance while preserving the computational utility
defined by graph applications. Theoretic analysis is
provided to quantitatively prove that iterative graph
computations powered by utility-aware pruning
techniques can achieve a very good approximation with
bounds on the introduced error.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Inoue:2015:SCF,
author = "Hiroshi Inoue and Kenjiro Taura",
title = "{SIMD}- and cache-friendly algorithm for sorting an
array of structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1274--1285",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809988",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper describes our new algorithm for sorting an
array of structures by efficiently exploiting the SIMD
instructions and cache memory of today's processors.
Recently, multiway mergesort implemented with SIMD
instructions has been used as a high-performance
in-memory sorting algorithm for sorting integer values.
For sorting an array of structures with SIMD
instructions, a frequently used approach is to first
pack the key and index for each record into an integer
value, sort the key-index pairs using SIMD
instructions, then rearrange the records based on the
sorted key-index pairs. This approach can efficiently
exploit SIMD instructions because it sorts the
key-index pairs while packed into integer values;
hence, it can use existing high-performance sorting
implementations of the SIMD-based multiway mergesort
for integers. However, this approach has frequent cache
misses in the final rearranging phase due to its random
and scattered memory accesses so that this phase limits
both single-thread performance and scalability with
multiple cores. Our approach is also based on multiway
mergesort, but it can avoid costly random accesses for
rearranging the records while still efficiently
exploiting the SIMD instructions. Our results showed
that our approach exhibited up to 2.1x better
single-thread performance than the key-index approach
implemented with SIMD instructions when sorting 512M
16-byte records on one core. Our approach also yielded
better performance when we used multiple cores.
Compared to an optimized radix sort, our vectorized
                 multiway mergesort achieved better performance when
each record is large. Our vectorized multiway mergesort
also yielded higher scalability with multiple cores
than the radix sort.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2015:EDI,
author = "Shaoxu Song and Aoqian Zhang and Lei Chen and Jianmin
Wang",
title = "Enriching data imputation with extensive similarity
neighbors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1286--1297",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809989",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Incomplete information often occurs along with many
database applications, e.g., in data integration, data
cleaning or data exchange. The idea of data imputation
is to fill the missing data with the values of its
neighbors who share the same information. Such
neighbors could either be identified certainly by
editing rules or statistically by relational dependency
networks. Unfortunately, owing to data sparsity, the
number of neighbors (identified w.r.t. value equality)
is rather limited, especially in the presence of data
values with variances. In this paper, we argue to
extensively enrich similarity neighbors by similarity
rules with tolerance to small variations. More fillings
can thus be acquired that the aforesaid equality
neighbors fail to reveal. To fill the missing values
more, we study the problem of maximizing the missing
data imputation. Our major contributions include (1)
                 the NP-hardness analysis on solving and approximating
the problem, (2) exact algorithms for tackling the
problem, and (3) efficient approximation with
performance guarantees. Experiments on real and
synthetic data sets demonstrate that the filling
accuracy can be improved.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Makreshanski:2015:LSE,
author = "Darko Makreshanski and Justin Levandoski and Ryan
Stutsman",
title = "To lock, swap, or elide: on the interplay of hardware
transactional memory and lock-free indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1298--1309",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809990",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The release of hardware transactional memory (HTM) in
commodity CPUs has major implications on the design and
implementation of main-memory databases, especially on
the architecture of high-performance lock-free indexing
methods at the core of several of these systems. This
paper studies the interplay of HTM and lock-free
indexing methods. First, we evaluate whether HTM will
obviate the need for crafty lock-free index designs by
integrating it in a traditional B-tree architecture.
HTM performs well for simple data sets with small
fixed-length keys and payloads, but its benefits
disappear for more complex scenarios (e.g., larger
variable-length keys and payloads), making it
unattractive as a general solution for achieving high
performance. Second, we explore fundamental differences
between HTM-based and lock-free B-tree designs. While
lock-freedom entails design complexity and extra
mechanism, it has performance advantages in several
scenarios, especially high-contention cases where
readers proceed uncontested (whereas HTM aborts
readers). Finally, we explore the use of HTM as a
method to simplify lock-free design. We find that using
HTM to implement a multi-word compare-and-swap greatly
reduces lock-free programming complexity at the cost of
only a 10--15\% performance degradation. Our study uses
two state-of-the-art index implementations: a
memory-optimized B-tree extended with HTM to provide
multi-threaded concurrency and the Bw-tree lock-free
B-tree used in several Microsoft production
environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shin:2015:IKB,
author = "Jaeho Shin and Sen Wu and Feiran Wang and Christopher
{De Sa} and Ce Zhang and Christopher R{\'e}",
title = "Incremental knowledge base construction using
{DeepDive}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1310--1321",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809991",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Populating a database with unstructured information is
a long-standing problem in industry and research that
encompasses problems of extraction, cleaning, and
integration. Recent names used for this problem include
dealing with dark data and knowledge base construction
(KBC). In this work, we describe DeepDive, a system
that combines database and machine learning ideas to
help develop KBC systems, and we present techniques to
make the KBC process more efficient. We observe that
the KBC process is iterative, and we develop techniques
to incrementally produce inference results for KBC
systems. We propose two methods for incremental
inference, based respectively on sampling and
variational techniques. We also study the tradeoff
space of these methods and develop a simple rule-based
optimizer. DeepDive includes all of these
contributions, and we evaluate DeepDive on five KBC
systems, showing that it can speed up KBC inference
tasks by up to two orders of magnitude with negligible
impact on quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qian:2015:LUP,
author = "Li Qian and Jinyang Gao and H. V. Jagadish",
title = "Learning user preferences by adaptive pairwise
comparison",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "11",
pages = "1322--1333",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2809974.2809992",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 30 16:13:08 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Users make choices among multi-attribute objects in a
data set in a variety of domains including used car
purchase, job search and hotel room booking. Individual
users sometimes have strong preferences between
objects, but these preferences may not be universally
shared by all users. If we can cast these preferences
as derived from a quantitative user-specific preference
function, then we can predict user preferences by
learning their preference function, even though the
preference function itself is not directly observable,
and may be hard to express. In this paper we study the
problem of preference learning with pairwise
comparisons on a set of entities with multiple
attributes. We formalize the problem into two
subproblems, namely preference estimation and
comparison selection. We propose an innovative approach
to estimate the preference, and introduce a binary
search strategy to adaptively select the comparisons.
We introduce the concept of an orthogonal query to
support this adaptive selection, as well as a novel
S-tree index to enable efficient evaluation of
orthogonal queries. We integrate these components into
a system for inferring user preference with adaptive
pairwise comparisons. Our experiments and user study
demonstrate that our adaptive system significantly
outperforms the na{\"\i}ve random selection system on
both real data and synthetic data, with either
simulated or real user feedback. We also show our
preference learning approach is much more effective
than existing approaches, and our S-tree can be
constructed efficiently and perform orthogonal query at
interactive speeds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2015:AEL,
author = "Weimo Liu and Md Farhadur Rahman and Saravanan
Thirumuruganathan and Nan Zhang and Gautam Das",
title = "Aggregate estimations over location based services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1334--1345",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824034",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Location based services (LBS) have become very popular
in recent years. They range from map services (e.g.,
Google Maps) that store geographic locations of points
of interests, to online social networks (e.g., WeChat,
Sina Weibo, FourSquare) that leverage user geographic
locations to enable various recommendation functions.
The public query interfaces of these services may be
abstractly modeled as a kNN interface over a database
of two dimensional points on a plane: given an
arbitrary query point, the system returns the k points
in the database that are nearest to the query point. In
this paper we consider the problem of obtaining
approximate estimates of SUM and COUNT aggregates by
only querying such databases via their restrictive
public interfaces. We distinguish between interfaces
that return location information of the returned tuples
(e.g., Google Maps), and interfaces that do not return
location information (e.g., Sina Weibo). For both types
of interfaces, we develop aggregate estimation
algorithms that are based on novel techniques for
precisely computing or approximately estimating the
Voronoi cell of tuples. We discuss a comprehensive set
of real-world experiments for testing our algorithms,
including experiments on Google Maps, WeChat, and Sina
Weibo.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhattacherjee:2015:PDV,
author = "Souvik Bhattacherjee and Amit Chavan and Silu Huang
and Amol Deshpande and Aditya Parameswaran",
title = "Principles of dataset versioning: exploring the
recreation\slash storage tradeoff",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1346--1357",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824035",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The relative ease of collaborative data science and
analysis has led to a proliferation of many thousands
or millions of versions of the same datasets in many
scientific and commercial domains, acquired or
constructed at various stages of data analysis across
many users, and often over long periods of time.
Managing, storing, and recreating these dataset
versions is a non-trivial task. The fundamental
challenge here is the storage-recreation trade-off: the
more storage we use, the faster it is to recreate or
retrieve versions, while the less storage we use, the
slower it is to recreate or retrieve versions. Despite
the fundamental nature of this problem, there has been
a surprisingly little amount of work on it. In this
paper, we study this trade-off in a principled manner:
we formulate six problems under various settings,
trading off these quantities in various ways,
demonstrate that most of the problems are intractable,
and propose a suite of inexpensive heuristics drawing
from techniques in delay-constrained scheduling, and
spanning tree literature, to solve these problems. We
have built a prototype version management system, that
aims to serve as a foundation to our DataHub system
for facilitating collaborative data science. We
demonstrate, via extensive experiments, that our
proposed heuristics provide efficient solutions in
practical dataset versioning scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2015:SJJ,
author = "Yeye He and Kris Ganjam and Xu Chu",
title = "{SEMA--JOIN}: joining semantically-related tables
using big table corpora",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1358--1369",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824036",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Join is a powerful operator that combines records from
two or more tables, which is of fundamental importance
in the field of relational database. However,
traditional join processing mostly relies on string
equality comparisons. Given the growing demand for
ad-hoc data analysis, we have seen an increasing number
of scenarios where the desired join relationship is not
equi-join. For example, in a spreadsheet environment, a
user may want to join one table with a subject column
country-name, with another table with a subject column
country-code. Traditional equi-join cannot handle such
joins automatically, and the user typically has to
manually find an intermediate mapping table in order to
perform the desired join. We develop a SEMA-JOIN
approach that is a first step toward allowing users to
perform semantic join automatically, with a click of
the button. Our main idea is to utilize a data-driven
method that leverages a big table corpus with over 100
million tables to determine statistical correlation
between cell values at both row-level and column-level.
We use the intuition that the correct join mapping is
the one that maximizes aggregate pairwise correlation,
to formulate the join prediction problem as an
optimization problem. We develop a linear program
relaxation and a rounding argument to obtain a
2-approximation algorithm in polynomial time. Our
evaluation using both public tables from the Web and
proprietary Enterprise tables from a large company
shows that the proposed approach can perform automatic
semantic joins with high precision for a variety of
common join scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Krishnan:2015:SVC,
author = "Sanjay Krishnan and Jiannan Wang and Michael J.
Franklin and Ken Goldberg and Tim Kraska",
title = "Stale view cleaning: getting fresh answers from stale
materialized views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1370--1381",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824037",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Materialized views (MVs), stored pre-computed results,
are widely used to facilitate fast queries on large
datasets. When new records arrive at a high rate, it is
infeasible to continuously update (maintain) MVs and a
common solution is to defer maintenance by batching
updates together. Between batches the MVs become
increasingly stale with incorrect, missing, and
superfluous rows leading to increasingly inaccurate
query results. We propose Stale View Cleaning (SVC)
which addresses this problem from a data cleaning
perspective. In SVC, we efficiently clean a sample of
rows from a stale MV, and use the clean sample to
estimate aggregate query results. While approximate,
the estimated query results reflect the most recent
data. As sampling can be sensitive to long-tailed
distributions, we further explore an outlier indexing
technique to give increased accuracy when the data
distributions are skewed. SVC complements existing
deferred maintenance approaches by giving accurate and
bounded query answers between maintenance. We evaluate
our method on a generated dataset from the TPC-D
benchmark and a real video distribution application.
Experiments confirm our theoretical results: (1)
cleaning an MV sample is more efficient than full view
maintenance, (2) the estimated results are more
accurate than using the stale MV, and (3) SVC is
applicable for a wide variety of MVs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nagarkar:2015:CSH,
author = "Parth Nagarkar and K. Sel{\c{c}}uk Candan and Aneesha
Bhat",
title = "Compressed spatial hierarchical bitmap {(cSHB)}
indexes for efficiently processing spatial range query
workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1382--1393",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824038",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In most spatial data management applications, objects
are represented in terms of their coordinates in a
2-dimensional space and search queries in this space
are processed using spatial index structures. On the
other hand, bitmap-based indexing, especially thanks to
the compression opportunities bitmaps provide, has been
shown to be highly effective for query processing
workloads including selection and aggregation
operations. In this paper, we show that bitmap-based
indexing can also be highly effective for managing
spatial data sets. More specifically, we propose a
novel compressed spatial hierarchical bitmap (cSHB)
index structure to support spatial range queries. We
consider query workloads involving multiple range
queries over spatial data and introduce and consider
the problem of bitmap selection for identifying the
appropriate subset of the bitmap files for processing
the given spatial range query workload. We develop cost
models for compressed domain range query processing and
present query planning algorithms that not only select
index nodes for query processing, but also associate
appropriate bitwise logical operations to identify the
data objects satisfying the range queries in the given
workload. Experiment results confirm the efficiency and
effectiveness of the proposed compressed spatial
hierarchical bitmap (cSHB) index structure and the
range query planning algorithms in supporting spatial
range query workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2015:SPD,
author = "Daniel Deutch and Amir Gilad and Yuval Moskovitch",
title = "Selective provenance for datalog programs using
top-$k$ queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1394--1405",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824039",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Highly expressive declarative languages, such as
datalog, are now commonly used to model the operational
logic of data-intensive applications. The typical
complexity of such datalog programs, and the large
volume of data that they process, call for result
explanation. Results may be explained through the
tracking and presentation of data provenance, and here
we focus on a detailed form of provenance
(how-provenance), defining it as the set of derivation
trees of a given fact. While informative, the size of
such full provenance information is typically too large
and complex (even when compactly represented) to allow
displaying it to the user. To this end, we propose a
novel top-$k$ query language for querying datalog
provenance, supporting selection criteria based on tree
patterns and ranking based on the rules and database
facts used in derivation. We propose an efficient novel
algorithm based on (1) instrumenting the datalog
program so that, upon evaluation, it generates only
relevant provenance, and (2) efficient top-$k$
(relevant) provenance generation, combined with
bottom-up datalog evaluation. The algorithm computes in
polynomial data complexity a compact representation of
the top-$k$ trees which may be explicitly constructed
in linear time with respect to their size. We further
experimentally study the algorithm performance, showing
its scalability even for complex datalog programs where
full provenance tracking is infeasible.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2015:PPS,
author = "Yoonjae Park and Jun-Ki Min and Kyuseok Shim",
title = "Processing of probabilistic skyline queries using
{MapReduce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1406--1417",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824040",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There has been an increased growth in a number of
applications that naturally generate large volumes of
uncertain data. By the advent of such applications, the
support of advanced analysis queries such as the
skyline and its variant operators for big uncertain
data has become important. In this paper, we propose
the effective parallel algorithms using MapReduce to
process the probabilistic skyline queries for uncertain
data modeled by both discrete and continuous models. We
present three filtering methods to identify
probabilistic non-skyline objects in advance. We next
develop a single MapReduce phase algorithm PS-QP-MR by
utilizing space partitioning based on a variant of
quadtrees to distribute the instances of objects
effectively and the enhanced algorithm PS-QPF-MR by
applying the three filtering methods additionally. We
also propose the workload balancing technique to
balance the workload of reduce functions based on the
number of machines available. Finally, we present the
brute-force algorithms PS-BR-MR and PS-BRF-MR with
partitioning randomly and applying the filtering
methods. In our experiments, we demonstrate the
efficiency and scalability of PS-QPF-MR compared to the
other algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2015:BVS,
author = "Xiaofei Zhang and Hong Cheng and Lei Chen",
title = "Bonding vertex sets over distributed graph: a
betweenness aware approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1418--1429",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824041",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given two sets of vertices in a graph, it is often of
a great interest to find out how these vertices are
connected, especially to identify the vertices of high
prominence defined on the topological structure. In
this work, we formally define a Vertex Set Bonding
query (shorted as VSB), which returns a minimum set of
vertices with the maximum importance w.r.t total
betweenness and shortest path reachability in
connecting two sets of input vertices. We find that
such a kind of query is representative and could be
widely applied in many real world scenarios, e.g.,
logistic planning, social community bonding and etc.
Challenges are that many of such applications are
constructed on graphs that are too large to fit in
single server, and the VSB query evaluation turns to be
NP-hard. To cope with the scalability issue and return
the near optimal result in almost real time, we propose
a generic solution framework on a shared nothing
distributed environment. With the development of two
novel techniques, guided graph exploration and
betweenness ranking on exploration, we are able to
efficiently evaluate queries for error bounded results
with bounded space cost. We demonstrate the
effectiveness of our solution with extensive
experiments over both real and synthetic large graphs
on the Google's Cloud platform. Comparing to the
exploration only baseline method, our method achieves
several times of speedup.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amsterdamer:2015:NLI,
author = "Yael Amsterdamer and Anna Kukliansky and Tova Milo",
title = "A natural language interface for querying general and
individual knowledge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1430--1441",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824042",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many real-life scenarios require the joint analysis of
general knowledge, which includes facts about the
world, with individual knowledge, which relates to the
opinions or habits of individuals. Recently developed
crowd mining platforms, which were designed for such
tasks, are a major step towards the solution. However,
these platforms require users to specify their
information needs in a formal, declarative language,
which may be too complicated for na{\"\i}ve users. To
make the joint analysis of general and individual
knowledge accessible to the public, it is desirable to
provide an interface that translates the user
questions, posed in natural language (NL), into the
formal query languages that crowd mining platforms
support. While the translation of NL questions to
queries over conventional databases has been studied in
previous work, a setting with mixed individual and
general knowledge raises unique challenges. In
particular, to support the distinct query constructs
associated with these two types of knowledge, the NL
question must be partitioned and translated using
different means; yet eventually all the translated
parts should be seamlessly combined to a well-formed
query. To account for these challenges, we design and
implement a modular translation framework that employs
new solutions along with state-of-the art NL parsing
tools. The results of our experimental study, involving
real user questions on various topics, demonstrate that
our framework provides a high-quality translation for
many questions that are not handled by previous
translation tools.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Psaroudakis:2015:SCM,
author = "Iraklis Psaroudakis and Tobias Scheuer and Norman May
and Abdelkader Sellami and Anastasia Ailamaki",
title = "Scaling up concurrent main-memory column-store scans:
towards adaptive {NUMA}-aware data and task placement",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1442--1453",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824043",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Main-memory column-stores are called to efficiently
use modern non-uniform memory access (NUMA)
architectures to service concurrent clients on big
data. The efficient usage of NUMA architectures depends
on the data placement and scheduling strategy of the
column-store. Most column-stores choose a static
strategy that involves partitioning all data across the
NUMA architecture, and employing a stealing-based task
scheduler. In this paper, we implement different
strategies for data placement and task scheduling for
the case of concurrent scans. We compare these
strategies with an extensive sensitivity analysis. Our
most significant findings include that unnecessary
partitioning can hurt throughput by up to 70\%, and
that stealing memory-intensive tasks can hurt
throughput by up to 58\%. Based on our analysis, we
envision a design that adapts the data placement and
task scheduling strategy to the workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Oh:2015:SOP,
author = "Gihwan Oh and Sangchul Kim and Sang-Won Lee and Bongki
Moon",
title = "{SQLite} optimization with phase change memory for
mobile applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1454--1465",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824044",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given its pervasive use in smart mobile platforms,
there is a compelling need to optimize the performance
of sluggish SQLite databases. Popular mobile
applications such as messenger, email and social
network services rely on SQLite for their data
management need. Those mobile applications tend to
execute relatively short transactions in the autocommit
mode for transactional consistency in databases. This
often has adverse effect on the flash memory storage in
mobile devices because the small random updates cause
high write amplification and high write latency. In
order to address this problem, we propose a new
optimization strategy, called per-page logging (PPL),
for mobile data management, and have implemented the
key functions in SQLite/PPL. The hardware component of
SQLite/PPL includes phase change memory (PCM) with a
byte-addressable, persistent memory abstraction. By
capturing an update in a physiological log record and
adding it to the PCM log sector, SQLite/PPL can replace
a multitude of successive page writes made to the same
logical page with much smaller log writes done to PCM
much more efficiently. We have observed that SQLite/PPL
would potentially improve the performance of mobile
applications by an order of magnitude while supporting
transactional atomicity and durability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Crotty:2015:ACU,
author = "Andrew Crotty and Alex Galakatos and Kayhan Dursun and
Tim Kraska and Carsten Binnig and Ugur Cetintemel and
Stan Zdonik",
title = "An architecture for compiling {UDF}-centric
workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1466--1477",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824045",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analytics has recently grown to include
increasingly sophisticated techniques, such as machine
learning and advanced statistics. Users frequently
express these complex analytics tasks as workflows of
user-defined functions (UDFs) that specify each
algorithmic step. However, given typical hardware
configurations and dataset sizes, the core challenge of
complex analytics is no longer sheer data volume but
rather the computation itself, and the next generation
of analytics frameworks must focus on optimizing for
this computation bottleneck. While query compilation
has gained widespread popularity as a way to tackle the
computation bottleneck for traditional SQL workloads,
relatively little work addresses UDF-centric workflows
in the domain of complex analytics. In this paper, we
describe a novel architecture for automatically
compiling workflows of UDFs. We also propose several
optimizations that consider properties of the data,
UDFs, and hardware together in order to generate
different code on a case-by-case basis. To evaluate our
approach, we implemented these techniques in Tupleware,
a new high-performance distributed analytics system,
and our benchmarks show performance improvements of up
to three orders of magnitude compared to alternative
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Margo:2015:SDG,
author = "Daniel Margo and Margo Seltzer",
title = "A scalable distributed graph partitioner",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1478--1489",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824046",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present Scalable Host-tree Embeddings for Efficient
Partitioning (Sheep), a distributed graph partitioning
algorithm capable of handling graphs that far exceed
main memory. Sheep produces high quality edge
partitions an order of magnitude faster than both state
of the art offline (e.g., METIS) and streaming
partitioners (e.g., Fennel). Sheep's partitions are
independent of the input graph distribution, which
means that graph elements can be assigned to processing
nodes arbitrarily without affecting the partition
quality. Sheep transforms the input graph into a
strictly smaller elimination tree via a distributed
map-reduce operation. By partitioning this tree, Sheep
finds an upper-bounded communication volume
partitioning of the original graph. We describe the
Sheep algorithm and analyze its space-time
requirements, partition quality, and intuitive
characteristics and limitations. We compare Sheep to
contemporary partitioners and demonstrate that Sheep
creates competitive partitions, scales to larger
graphs, and has better runtime.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sharov:2015:TMY,
author = "Artyom Sharov and Alexander Shraer and Arif Merchant
and Murray Stokely",
title = "Take me to your leader!: online optimization of
distributed storage configurations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1490--1501",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824047",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The configuration of a distributed storage system
typically includes, among other parameters, the set of
servers and their roles in the replication protocol.
Although mechanisms for changing the configuration at
runtime exist, it is usually left to system
administrators to manually determine the ``best''
configuration and periodically reconfigure the system,
often by trial and error. This paper describes a new
workload-driven optimization framework that dynamically
determines the optimal configuration at run-time. We
focus on optimizing leader and quorum based replication
schemes and divide the framework into three
optimization tiers, dynamically optimizing different
configuration aspects: (1) leader placement, (2) roles
of different servers in the replication protocol, and
(3) replica locations. We showcase our optimization
framework by applying it to a large-scale distributed
storage system used internally in Google and
demonstrate that most client applications significantly
benefit from using our framework, reducing average
operation latency by up to 94\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2015:ARG,
author = "Wenfei Fan and Xin Wang and Yinghui Wu and Jingbo Xu",
title = "Association rules with graph patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1502--1513",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824048",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose graph-pattern association rules (GPARs) for
social media marketing. Extending association rules for
item-sets, GPARs help us discover regularities between
entities in social graphs, and identify potential
customers by exploring social influence. We study the
problem of discovering top-$k$ diversified GPARs. While
this problem is NP-hard, we develop a parallel
algorithm with accuracy bound. We also study the
problem of identifying potential customers with GPARs.
While it is also NP-hard, we provide a parallel
scalable algorithm that guarantees a polynomial speedup
over sequential algorithms with the increase of
processors. Using real-life and synthetic graphs, we
experimentally verify the scalability and effectiveness
of the algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kimmett:2015:FJM,
author = "Ben Kimmett and Venkatesh Srinivasan and Alex Thomo",
title = "Fuzzy joins in {MapReduce}: an experimental study",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1514--1517",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824049",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We report experimental results for the MapReduce
algorithms proposed by Afrati, Das Sarma, Menestrina,
Parameswaran and Ullman in ICDE'12 to compute fuzzy
joins of binary strings using Hamming Distance. Their
algorithms come with complete theoretical analysis,
however, no experimental evaluation is provided. They
argue that there is a tradeoff between communication
cost and processing cost, and that there is a skyline
of the proposed algorithms; i.e. none dominates
another. We observe via experiments that, from a
practical point of view, some algorithms are almost
always preferable to others. We provide detailed
experimental results and insights that show the
different facets of each algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cho:2015:PEP,
author = "Minsik Cho and Daniel Brand and Rajesh Bordawekar and
Ulrich Finkler and Vincent Kulandaisamy and Ruchir
Puri",
title = "{PARADIS}: an efficient parallel algorithm for
in-place radix sort",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1518--1529",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824050",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-place radix sort is a popular distribution-based
sorting algorithm for short numeric or string keys due
to its linear run-time and constant memory complexity.
However, efficient parallelization of in-place radix
sort is very challenging for two reasons. First, the
initial phase of permuting elements into buckets
suffers read-write dependency inherent in its in-place
nature. Secondly, load balancing of the recursive
application of the algorithm to the resulting buckets
is difficult when the buckets are of very different
sizes, which happens for skewed distributions of the
input data. In this paper, we present a novel parallel
in-place radix sort algorithm, PARADIS, which addresses
both problems: (a) ``speculative permutation'' solves
the first problem by assigning multiple non-continuous
array stripes to each processor. The resulting
shared-nothing scheme achieves full parallelization.
Since our speculative permutation is not complete, it
is followed by a ``repair'' phase, which can again be
done in parallel without any data sharing among the
processors. (b) ``distribution-adaptive load
balancing'' solves the second problem. We dynamically
allocate processors in the context of radix sort, so as
to minimize the overall completion time. Our
experimental results show that PARADIS offers excellent
performance/scalability on a wide range of input data
sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vengerov:2015:JSE,
author = "David Vengerov and Andre Cavalheiro Menck and Mohamed
Zait and Sunil P. Chakkappen",
title = "Join size estimation subject to filter conditions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1530--1541",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824051",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present a new algorithm for
estimating the size of equality join of multiple
database tables. The proposed algorithm, Correlated
Sampling, constructs a small space synopsis for each
table, which can then be used to provide a quick
estimate of the join size of this table with other
tables subject to dynamically specified predicate
filter conditions, possibly specified over multiple
columns (attributes) of each table. This algorithm
makes a single pass over the data and is thus suitable
for streaming scenarios. We compare this algorithm
analytically to two other previously known sampling
approaches (independent Bernoulli Sampling and
End-Biased Sampling) and to a novel sketch-based
approach. We also compare these four algorithms
experimentally and show that results fully correspond
to our analytical predictions based on derived
expressions for the estimator variances, with
Correlated Sampling giving the best estimates in a
large range of situations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:AFT,
author = "Jingjing Wang and Magdalena Balazinska and Daniel
Halperin",
title = "Asynchronous and fault-tolerant recursive datalog
evaluation in shared-nothing engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1542--1553",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824052",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a new approach for data analytics with
iterations. Users express their analysis in Datalog
with bag-monotonic aggregate operators, which enables
the expression of computations from a broad variety of
application domains. Queries are translated into query
plans that can execute in shared-nothing engines, are
incremental, and support a variety of iterative models
(synchronous, asynchronous, different processing
priorities) and failure-handling techniques. The plans
require only small extensions to an existing
shared-nothing engine, making the approach easily
implementable. We implement the approach in the Myria
big-data management system and use our implementation
to empirically study the performance characteristics of
different combinations of iterative models, failure
handling methods, and applications. Our evaluation uses
workloads from a variety of application domains. We
find that no single method outperforms others but
rather that application properties must drive the
selection of the iterative query execution model.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mouratidis:2015:MRQ,
author = "Kyriakos Mouratidis and Jilian Zhang and HweeHwa
Pang",
title = "Maximum rank query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1554--1565",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824053",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The top-$k$ query is a common means to shortlist a
number of options from a set of alternatives, based on
the user's preferences. Typically, these preferences
are expressed as a vector of query weights, defined
over the options' attributes. The query vector
implicitly associates each alternative with a numeric
score, and thus imposes a ranking among them. The
top-$k$ result includes the $k$ options with the highest
scores. In this context, we define the maximum rank
query (MaxRank). Given a focal option in a set of
alternatives, the MaxRank problem is to compute the
highest rank this option may achieve under any possible
user preference, and furthermore, to report all the
regions in the query vector's domain where that rank is
achieved. MaxRank finds application in market impact
analysis, customer profiling, targeted advertising,
etc. We propose a methodology for MaxRank processing
and evaluate it with experiments on real and benchmark
synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Katsarou:2015:PSI,
author = "Foteini Katsarou and Nikos Ntarmos and Peter
Triantafillou",
title = "Performance and scalability of indexed subgraph query
processing methods",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1566--1577",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824054",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph data management systems have become very popular
as graphs are the natural data model for many
applications. One of the main problems addressed by
these systems is subgraph query processing; i.e., given
a query graph, return all graphs that contain the
query. The naive method for processing such queries is
to perform a subgraph isomorphism test against each
graph in the dataset. This obviously does not scale, as
subgraph isomorphism is NP-Complete. Thus, many
indexing methods have been proposed to reduce the
number of candidate graphs that have to underpass the
subgraph isomorphism test. In this paper, we identify a
set of key factors-parameters, that influence the
performance of related methods: namely, the number of
nodes per graph, the graph density, the number of
distinct labels, the number of graphs in the dataset,
and the query graph size. We then conduct comprehensive
and systematic experiments that analyze the sensitivity
of the various methods on the values of the key
parameters. Our aims are twofold: first to derive
conclusions about the algorithms' relative performance,
and, second, to stress-test all algorithms, deriving
insights as to their scalability, and highlight how
both performance and scalability depend on the above
factors. We choose six well-established indexing
methods, namely Grapes, CT-Index, GraphGrepSX, gIndex,
Tree+ $ \Delta $, and gCode, as representative
approaches of the overall design space, including the
most recent and best performing methods. We report on
their index construction time and index size, and on
query processing performance in terms of time and false
positive ratio. We employ both real and synthetic
datasets. Specifically, four real datasets of different
characteristics are used: AIDS, PDBS, PCM, and PPI. In
addition, we generate a large number of synthetic graph
datasets, empowering us to systematically study the
algorithms' performance and scalability versus the
aforementioned key parameters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2015:LDA,
author = "Ying Yang and Niccol{\`o} Meneghetti and Ronny Fehling
and Zhen Hua Liu and Oliver Kennedy",
title = "Lenses: an on-demand approach to {ETL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1578--1589",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824055",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Three mentalities have emerged in analytics. One view
holds that reliable analytics is impossible without
high-quality data, and relies on heavy-duty ETL
processes and upfront data curation to provide it. The
second view takes a more ad-hoc approach, collecting
data into a data lake, and placing responsibility for
data quality on the analyst querying it. A third,
on-demand approach has emerged over the past decade in
the form of numerous systems like Paygo or HLog, which
allow for incremental curation of the data and help
analysts to make principled trade-offs between data
quality and effort. Though quite useful in isolation,
these systems target only specific quality problems
(e.g., Paygo targets only schema matching and entity
resolution). In this paper, we explore the design of a
general, extensible infrastructure for on-demand
curation that is based on probabilistic query
processing. We illustrate its generality through
examples and show how such an infrastructure can be
used to gracefully make existing ETL workflows
``on-demand''. Finally, we present a user interface for
On-Demand ETL and address ensuing challenges, including
that of efficiently ranking potential data curation
tasks. Our experimental results show that On-Demand ETL
is feasible and that our greedy ranking strategy for
curation tasks, called CPI, is effective.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2015:KG,
author = "Wenfei Fan and Zhe Fan and Chao Tian and Xin Luna
Dong",
title = "Keys for graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1590--1601",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824056",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Keys for graphs aim to uniquely identify entities
represented by vertices in a graph. We propose a class
of keys that are recursively defined in terms of graph
patterns, and are interpreted with subgraph
isomorphism. Extending conventional keys for relations
and XML, these keys find applications in object
identification, knowledge fusion and social network
reconciliation. As an application, we study the entity
matching problem that, given a graph $G$ and a set $
\Sigma $ of keys, is to find all pairs of entities
(vertices) in $G$ that are identified by keys in $
\Sigma $. We show that the problem is intractable, and
cannot be parallelized in logarithmic rounds.
Nonetheless, we provide two parallel scalable
algorithms for entity matching, in MapReduce and a
vertex-centric asynchronous model. Using real-life and
synthetic data, we experimentally verify the
effectiveness and scalability of the algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eldawy:2015:SPT,
author = "Ahmed Eldawy and Louai Alarabi and Mohamed F. Mokbel",
title = "Spatial partitioning techniques in {SpatialHadoop}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1602--1605",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824057",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SpatialHadoop is an extended MapReduce framework that
supports global indexing that spatially partitions the
data across machines providing orders of magnitude
speedup, compared to traditional Hadoop. In this paper,
we describe seven alternative partitioning techniques
and experimentally study their effect on the quality of
the generated index and the performance of range and
spatial join queries. We found that using a 1\% sample
is enough to produce high quality partitions. Also, we
found that the total area of partitions is a reasonable
measure of the quality of indexes when running spatial
join. This study will assist researchers in choosing a
good spatial partitioning technique in distributed
environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Manabe:2015:ELH,
author = "Tomohiro Manabe and Keishi Tajima",
title = "Extracting logical hierarchical structure of {HTML}
documents based on headings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1606--1617",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824058",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose a method for extracting logical
hierarchical structure of HTML documents. Because
mark-up structure in HTML documents does not
necessarily coincide with logical hierarchical
structure, it is not trivial how to extract logical
structure of HTML documents. Human readers, however,
easily understand their logical structure. The key
information used by them is headings in the documents.
Human readers exploit the following properties of
headings: (1) headings appear at the beginning of the
corresponding blocks, (2) headings are given prominent
visual styles, (3) headings of the same level share the
same visual style, and (4) headings of higher levels
are given more prominent visual styles. Our method also
exploits these properties for extracting hierarchical
headings and their associated blocks. Our experiment
shows that our method outperforms existing methods. In
addition, our method extracts not only hierarchical
blocks but also their associated headings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Naidan:2015:PSM,
author = "Bilegsaikhan Naidan and Leonid Boytsov and Eric
Nyberg",
title = "Permutation search methods are efficient, yet faster
search is possible",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1618--1629",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824059",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We survey permutation-based methods for approximate
$k$-nearest neighbor search. In these methods, every data
point is represented by a ranked list of pivots sorted
by the distance to this point. Such ranked lists are
called permutations. The underpinning assumption is
that, for both metric and non-metric spaces, the
distance between permutations is a good proxy for the
distance between original points. Thus, it should be
possible to efficiently retrieve most true nearest
neighbors by examining only a tiny subset of data
points whose permutations are similar to the
permutation of a query. We further test this assumption
by carrying out an extensive experimental evaluation
where permutation methods are pitted against
state-of-the art benchmarks (the multi-probe LSH, the
VP-tree, and proximity-graph based retrieval) on a
variety of realistically large data set from the image
and textual domain. The focus is on the high-accuracy
retrieval methods for generic spaces. Additionally, we
assume that both data and indices are stored in main
memory. We find permutation methods to be reasonably
efficient and describe a setup where these methods are
most useful. To ease reproducibility, we make our
software and data sets publicly available.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mukherjee:2015:DAO,
author = "Niloy Mukherjee and Shasank Chavan and Maria Colgan
and Dinesh Das and Mike Gleeson and Sanket Hase and
Allison Holloway and Hui Jin and Jesse Kamp and Kartik
Kulkarni and Tirthankar Lahiri and Juan Loaiza and Neil
Macnaughton and Vineet Marwah and Atrayee Mullick and
Andy Witkowski and Jiaqi Yan and Mohamed Zait",
title = "Distributed architecture of {Oracle} database
in-memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1630--1641",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824061",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over the last few years, the information technology
industry has witnessed revolutions in multiple
dimensions. Increasing ubiquitous sources of data have
posed two connected challenges to data management
solutions --- processing unprecedented volumes of data,
and providing ad-hoc real-time analysis in mainstream
production data stores without compromising regular
transactional workload performance. In parallel,
computer hardware systems are scaling out elastically,
scaling up in the number of processors and cores, and
increasing main memory capacity extensively. The data
processing challenges combined with the rapid
advancement of hardware systems has necessitated the
evolution of a new breed of main-memory databases
optimized for mixed OLTAP environments and designed to
scale. The Oracle RDBMS In-memory Option (DBIM) is an
industry-first distributed dual format architecture
that allows a database object to be stored in columnar
format in main memory highly optimized to break
performance barriers in analytic query workloads,
simultaneously maintaining transactional consistency
with the corresponding OLTP optimized row-major format
persisted in storage and accessed through database
buffer cache. In this paper, we present the
distributed, highly-available, and fault-tolerant
architecture of the Oracle DBIM that enables the RDBMS
to transparently scale out in a database cluster, both
in terms of memory capacity and query processing
throughput. We believe that the architecture is unique
among all mainstream in-memory databases. It allows
complete application-transparent, extremely scalable
and automated distribution of Oracle RDBMS objects
in-memory across a cluster, as well as across multiple
NUMA nodes within a single server. It seamlessly
provides distribution awareness to the Oracle SQL
execution framework through affinitized fault-tolerant
parallel execution within and across servers without
explicit optimizer plan changes or query rewrites.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haas:2015:AMC,
author = "Daniel Haas and Jason Ansel and Lydia Gu and Adam
Marcus",
title = "{Argonaut}: macrotask crowdsourcing for complex data
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1642--1653",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824062",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourced workflows are used in research and
industry to solve a variety of tasks. The databases
community has used crowd workers in query
operators/optimization and for tasks such as entity
resolution. Such research utilizes microtasks where
crowd workers are asked to answer simple yes/no or
multiple choice questions with little training.
Typically, microtasks are used with voting algorithms
to combine redundant responses from multiple crowd
workers to achieve result quality. Microtasks are
powerful, but fail in cases where larger context (e.g.,
domain knowledge) or significant time investment is
needed to solve a problem, for example in
large-document structured data extraction. In this
paper, we consider context-heavy data processing tasks
that may require many hours of work, and refer to such
tasks as macrotasks. Leveraging the infrastructure and
worker pools of existing crowdsourcing platforms, we
automate macrotask scheduling, evaluation, and pay
scales. A key challenge in macrotask-powered work,
however, is evaluating the quality of a worker's
output, since ground truth is seldom available and
redundancy-based quality control schemes are
impractical. We present Argonaut, a framework that
improves macrotask-powered work quality using a
hierarchical review. Argonaut uses a predictive model
of worker quality to select trusted workers to perform
review, and a separate predictive model of task quality
to decide which tasks to review. Finally, Argonaut can
identify the ideal trade-off between a single phase of
review and multiple phases of review given a
constrained review budget in order to maximize overall
output quality. We evaluate an industrial use of
Argonaut to power a structured data extraction pipeline
that has utilized over half a million hours of crowd
worker input to complete millions of macrotasks. We
show that Argonaut can capture up to 118\% more errors
than random spot-check reviews in review
budget-constrained environments with up to two review
layers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:BRL,
author = "Guozhang Wang and Joel Koshy and Sriram Subramanian
and Kartik Paramasivam and Mammad Zadeh and Neha
Narkhede and Jun Rao and Jay Kreps and Joe Stein",
title = "Building a replicated logging system with {Apache
Kafka}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1654--1655",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824063",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Apache Kafka is a scalable publish-subscribe messaging
system with its core architecture as a distributed
commit log. It was originally built at LinkedIn as its
centralized event pipelining platform for online data
integration tasks. Over the past years developing and
operating Kafka, we extend its log-structured
architecture as a replicated logging backbone for much
wider application scopes in the distributed
environment. In this abstract, we will talk about our
design and engineering experience to replicate Kafka
logs for various distributed data-driven systems at
LinkedIn, including source-of-truth data storage and
stream processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Loro:2015:ISH,
author = "Alessandra Loro and Anja Gruenheid and Donald Kossmann
and Damien Profeta and Philippe Beaudequin",
title = "Indexing and selecting hierarchical business logic",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1656--1667",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824064",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Business rule management is the task of storing and
maintaining company-specific decision rules and
business logic that is queried frequently by
application users. These rules can impede efficient
query processing when they require the business rule
engine to resolve semantic hierarchies. To address this
problem, this work discusses hierarchical indexes that
are performance and storage-conscious. In the first
part of this work, we develop a tree-based hierarchical
structure that represents client-defined semantic
hierarchies as well as two variants of this structure
that improve performance and main memory allocation.
The second part of our work focuses on selecting the
top rules out of those retrieved from the index. We
formally define a priority score-based decision scheme
that allows for a conflict-free rule system and
efficient rule ranking. Additionally, we introduce a
weight-based lazy merging technique for rule selection.
All of these techniques are evaluated with real world
and synthetic data sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shukla:2015:SAI,
author = "Dharma Shukla and Shireesh Thota and Karthik Raman and
Madhan Gajendran and Ankur Shah and Sergii Ziuzin and
Krishnan Sundaram and Miguel Gonzalez Guajardo and Anna
Wawrzyniak and Samer Boshra and Renato Ferreira and
Mohamed Nassar and Michael Koltachev and Ji Huang and
Sudipta Sengupta and Justin Levandoski and David
Lomet",
title = "Schema-agnostic indexing with {Azure DocumentDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1668--1679",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824065",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Azure DocumentDB is Microsoft's multi-tenant
distributed database service for managing JSON
documents at Internet scale. DocumentDB is now
generally available to Azure developers. In this paper,
we describe the DocumentDB indexing subsystem.
DocumentDB indexing enables automatic indexing of
documents without requiring a schema or secondary
indices. Uniquely, DocumentDB provides real-time
consistent queries in the face of very high rates of
document updates. As a multi-tenant service, DocumentDB
is designed to operate within extremely frugal resource
budgets while providing predictable performance and
robust resource isolation to its tenants. This paper
describes the DocumentDB capabilities, including
document representation, query language, document
indexing approach, core index support, and early
production experiences.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Boutin:2015:JRI,
author = "Eric Boutin and Paul Brett and Xiaoyu Chen and Jaliya
Ekanayake and Tao Guan and Anna Korsun and Zhicheng Yin
and Nan Zhang and Jingren Zhou",
title = "{JetScope}: reliable and interactive analytics at
cloud scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1680--1691",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824066",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Interactive, reliable, and rich data analytics at
cloud scale is a key capability to support low latency
data exploration and experimentation over terabytes of
data for a wide range of business scenarios. Besides
the challenges in massive scalability and low latency
distributed query processing, it is imperative to
achieve all these requirements with effective fault
tolerance and efficient recovery, as failures and
fluctuations are the norm in such a distributed
environment. We present a cloud scale interactive query
processing system, called JetScope, developed at
Microsoft. The system has a SQL-like declarative
scripting language and delivers massive scalability and
high performance through advanced optimizations. In
order to achieve low latency, the system leverages
various access methods, optimizes delivering first
rows, and maximizes network and scheduling efficiency.
The system also provides a fine-grained fault tolerance
mechanism which is able to efficiently detect and
mitigate failures without significantly impacting the
query latency and user experience. JetScope has been
deployed to hundreds of servers in production at
Microsoft, serving a few million queries every day.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hu:2015:DPT,
author = "Xueyang Hu and Mingxuan Yuan and Jianguo Yao and Yu
Deng and Lei Chen and Qiang Yang and Haibing Guan and
Jia Zeng",
title = "Differential privacy in telco big data platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1692--1703",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824067",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy (DP) has been widely explored in
academia recently but less so in industry possibly due
to its strong privacy guarantee. This paper makes the
first attempt to implement three basic DP architectures
in the deployed telecommunication (telco) big data
platform for data mining applications. We find that all
DP architectures have less than 5\% loss of prediction
accuracy when the weak privacy guarantee is adopted
(e.g., privacy budget parameter $ \epsilon \geq 3$).
However, when the strong privacy guarantee is assumed
(e.g., privacy budget parameter $ \epsilon \leq 0.1
$), all DP architectures lead to 15\%--30\%
accuracy loss, which implies that real-world industrial
data mining systems cannot work well under such a
strong privacy guarantee recommended by previous
research works. Among the three basic DP architectures,
the Hybridized DM (Data Mining) and DB (Database)
architecture performs the best because of its
complicated privacy protection design for the specific
data mining algorithm. Through extensive experiments on
big data, we also observe that the accuracy loss
increases by increasing the variety of features, but
decreases by increasing the volume of training data.
Therefore, to make DP practically usable in large-scale
industrial systems, our observations suggest that we
may explore three possible research directions in
future: (1) Relaxing the privacy guarantee (e.g.,
increasing privacy budget $ \epsilon $) and studying
its effectiveness on specific industrial applications;
(2) Designing specific privacy scheme for specific data
mining algorithms; and (3) Using large volume of data
but with low variety for training the classification
models.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{El-Helw:2015:OCT,
author = "Amr El-Helw and Venkatesh Raghavan and Mohamed A.
Soliman and George Caragea and Zhongxian Gu and
Michalis Petropoulos",
title = "Optimization of common table expressions in {MPP}
database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1704--1715",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824068",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big Data analytics often include complex queries with
similar or identical expressions, usually referred to
as Common Table Expressions (CTEs). CTEs may be
explicitly defined by users to simplify query
formulations, or implicitly included in queries
generated by business intelligence tools, financial
applications and decision support systems. In Massively
Parallel Processing (MPP) database systems, CTEs pose
new challenges due to the distributed nature of query
processing, the overwhelming volume of underlying data
and the scalability criteria that systems are required
to meet. In these settings, the effective optimization
and efficient execution of CTEs are crucial for the
timely processing of analytical queries over Big Data.
In this paper, we present a comprehensive framework for
the representation, optimization and execution of CTEs
in the context of Orca --- Pivotal's query optimizer
for Big Data. We demonstrate experimentally the
benefits of our techniques using industry standard
decision support benchmark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goel:2015:TSR,
author = "Anil K. Goel and Jeffrey Pound and Nathan Auch and
Peter Bumbulis and Scott MacLean and Franz F{\"a}rber
and Francis Gropengiesser and Christian Mathis and
Thomas Bodner and Wolfgang Lehner",
title = "Towards scalable real-time analytics: an architecture
for scale-out of {OLxP} workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1716--1727",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824069",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present an overview of our work on the SAP HANA
Scale-out Extension, a novel distributed database
architecture designed to support large scale analytics
over real-time data. This platform permits high
performance OLAP with massive scale-out capabilities,
while concurrently allowing OLTP workloads. This dual
capability enables analytics over real-time changing
data and allows fine grained user-specified service
level agreements (SLAs) on data freshness. We advocate
the decoupling of core database components such as
query processing, concurrency control, and persistence,
a design choice made possible by advances in
high-throughput low-latency networks and storage
devices. We provide full ACID guarantees and build on a
logical timestamp mechanism to provide MVCC-based
snapshot isolation, while not requiring synchronous
updates of replicas. Instead, we use asynchronous
update propagation guaranteeing consistency with
timestamp validation. We provide a view into the design
and development of a large scale data management
platform for real-time analytics, driven by the needs
of modern enterprise customers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dasu:2015:FMF,
author = "Tamraparni Dasu and Vladislav Shkapenyuk and Divesh
Srivastava and Deborah F. Swayne",
title = "{FIT} to monitor feed quality",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1728--1739",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824070",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While there has been significant focus on collecting
and managing data feeds, it is only now that attention
is turning to their quality. In this paper, we propose
a principled approach to online data quality monitoring
in a dynamic feed environment. Our goal is to alert
quickly when feed behavior deviates from expectations.
We make contributions in two distinct directions.
First, we propose novel enhancements to permit a
publish-subscribe approach to incorporate data quality
modules into the DFMS architecture. Second, we propose
novel temporal extensions to standard statistical
techniques to adapt them to online feed monitoring for
outlier detection and alert generation at multiple
scales along three dimensions: aggregation at multiple
time intervals to detect at varying levels of
sensitivity; multiple lengths of data history for
varying the speed at which models adapt to change; and
multiple levels of monitoring delay to address lagged
data arrival. FIT, or Feed Inspection Tool, is the
result of a successful implementation of our approach.
We present several case studies outlining the effective
deployment of FIT in real applications along with user
testimonials.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Larson:2015:RTA,
author = "Per-{\AA}ke Larson and Adrian Birka and Eric N. Hanson
and Weiyun Huang and Michal Nowakiewicz and Vassilis
Papadimos",
title = "Real-time analytical processing with {SQL Server}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1740--1751",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824071",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over the last two releases SQL Server has integrated
two specialized engines into the core system: the
Apollo column store engine for analytical workloads and
the Hekaton in-memory engine for high-performance OLTP
workloads. There is an increasing demand for real-time
analytics, that is, for running analytical queries and
reporting on the same system as transaction processing
so as to have access to the freshest data. SQL Server
2016 will include enhancements to column store indexes
and in-memory tables that significantly improve
performance on such hybrid workloads. This paper
describes four such enhancements: column store indexes
on in-memory tables, making secondary column store
indexes on disk-based tables updatable, allowing B-tree
indexes on primary column store indexes, and further
speeding up the column store scan operator.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2015:EEO,
author = "You Wu and Boulos Harb and Jun Yang and Cong Yu",
title = "Efficient evaluation of object-centric exploration
queries for visualization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1752--1763",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824072",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The most effective way to explore data is through
visualizing the results of exploration queries. For
example, an exploration query could be an aggregate of
some measures over time intervals, and a pattern or
abnormality can be discovered through a time series
plot of the query results. In this paper, we examine a
special kind of exploration query, namely
object-centric exploration query. Common examples
include claims made about athletes in sports databases,
such as ``it is newsworthy that LeBron James has scored
35 or more points in nine consecutive games.'' We focus
on one common type of visualization, i.e., 2d scatter
plot with heatmap. Namely, we consider exploration
queries whose results can be plotted on a
two-dimensional space, possibly with colors indicating
object densities in regions. While we model results as
pairs of numbers, the types of the queries are limited
only by the users' imagination. In the LeBron James
example above, the two dimensions are minimum points
scored per game and number of consecutive games,
respectively. It is easy to find other equally
interesting dimensions, such as minimum rebounds per
game or number of playoff games. We formalize this
problem and propose an efficient, interactive-speed
algorithm that takes a user-provided exploration query
(which can be a blackbox function) and produces an
approximate visualization that preserves the two most
important visual properties: the outliers and the
overall distribution of all result points.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiao:2015:GUD,
author = "Lin Qiao and Yinan Li and Sahil Takiar and Ziyang Liu
and Narasimha Veeramreddy and Min Tu and Ying Dai and
Issac Buenrostro and Kapil Surlaker and Shirshanka Das
and Chavdar Botev",
title = "{Gobblin}: unifying data ingestion for {Hadoop}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1764--1769",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824073",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data ingestion is an essential part of companies and
organizations that collect and analyze large volumes of
data. This paper describes Gobblin, a generic data
ingestion framework for Hadoop and one of LinkedIn's
latest open source products. At LinkedIn we need to
ingest data from various sources such as relational
stores, NoSQL stores, streaming systems, REST
endpoints, filesystems, etc. into our Hadoop clusters.
Maintaining independent pipelines for each source can
lead to various operational problems. Gobblin aims to
solve this issue by providing a centralized data
ingestion framework that makes it easy to support
ingesting data from a variety of sources. Gobblin
distinguishes itself from similar frameworks by
focusing on three core principles: generality,
extensibility, and operability. Gobblin supports a
mixture of data sources out-of-the-box and can be
easily extended for more. This enables an organization
to use a single framework to handle different data
ingestion needs, making it easy and inexpensive to
operate. Moreover, with an end-to-end metrics
collection and reporting module, Gobblin makes it
simple and efficient to identify issues in
production.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2015:QOO,
author = "Dinesh Das and Jiaqi Yan and Mohamed Zait and
Satyanarayana R. Valluri and Nirav Vyas and Ramarajan
Krishnamachari and Prashant Gaharwar and Jesse Kamp and
Niloy Mukherjee",
title = "Query optimization in {Oracle 12c} database
in-memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1770--1781",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824074",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional on-disk row major tables have been the
dominant storage mechanism in relational databases for
decades. Over the last decade, however, with explosive
growth in data volume and demand for faster analytics,
has come the recognition that a different data
representation is needed. There is widespread agreement
that in-memory column-oriented databases are best
suited to meet the realities of this new world. Oracle
12c Database In-memory, the industry's first
dual-format database, allows existing row major on-disk
tables to have complementary in-memory columnar
representations. The new storage format brings new data
processing techniques and query execution algorithms
and thus new challenges for the query optimizer.
Execution plans that are optimal for one format may be
sub-optimal for the other. In this paper, we describe
the changes made in the query optimizer to generate
execution plans optimized for the specific format ---
row major or columnar --- that will be scanned during
query execution. With enhancements in several areas ---
statistics, cost model, query transformation, access
path and join optimization, parallelism, and
cluster-awareness --- the query optimizer plays a
significant role in unlocking the full promise and
performance of Oracle Database In-Memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Green:2015:LPL,
author = "Todd J. Green and Dan Olteanu and Geoffrey Washburn",
title = "Live programming in the {LogicBlox} system: a
{MetaLogiQL} approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1782--1791",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824075",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The emerging category of self-service enterprise
applications motivates support for ``live programming''
in the database, where the user's iterative data
exploration triggers changes to installed application
code and its output in real time. This paper discusses
the technical challenges in supporting live programming
in the database and presents the solution implemented
in the LogicBlox commercial system. The workhorse
architectural component is a ``meta-engine'' that
incrementally maintains metadata representing
application code, guides its compilation into an
internal representation in the database kernel, and
orchestrates maintenance of materialized views based on
those changes. Our approach mirrors LogicBlox's
declarative programming model and describes the
maintenance of application code using declarative
meta-rules; the meta-engine is essentially a
``bootstrap'' version of the database engine proper.
Beyond live programming, the meta-engine turns out
effective for a range of static analysis and
optimization tasks. Outside of the database context, we
speculate that our design may even provide a novel
means of building incremental compilers for
general-purpose programming languages.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akidau:2015:DMP,
author = "Tyler Akidau and Robert Bradshaw and Craig Chambers
and Slava Chernyak and Rafael J.
Fern{\'a}ndez-Moctezuma and Reuven Lax and Sam McVeety
and Daniel Mills and Frances Perry and Eric Schmidt and
Sam Whittle",
title = "The dataflow model: a practical approach to balancing
correctness, latency, and cost in massive-scale,
unbounded, out-of-order data processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1792--1803",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824076",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Unbounded, unordered, global-scale datasets are
increasingly common in day-to-day business (e.g. Web
logs, mobile usage statistics, and sensor networks). At
the same time, consumers of these datasets have evolved
sophisticated requirements, such as event-time ordering
and windowing by features of the data themselves, in
addition to an insatiable hunger for faster answers.
Meanwhile, practicality dictates that one can never
fully optimize along all dimensions of correctness,
latency, and cost for these types of input. As a
result, data processing practitioners are left with the
quandary of how to reconcile the tensions between these
seemingly competing propositions, often resulting in
disparate implementations and systems. We propose that
a fundamental shift of approach is necessary to deal
with these evolved requirements in modern data
processing. We as a field must stop trying to groom
unbounded datasets into finite pools of information
that eventually become complete, and instead live and
breathe under the assumption that we will never know if
or when we have seen all of our data, only that new
data will arrive, old data may be retracted, and the
only way to make this problem tractable is via
principled abstractions that allow the practitioner the
choice of appropriate tradeoffs along the axes of
interest: correctness, latency, and cost. In this
paper, we present one such approach, the Dataflow
Model, along with a detailed examination of the
semantics it enables, an overview of the core
principles that guided its design, and a validation of
the model itself via the real-world experiences that
led to its development.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ching:2015:OTE,
author = "Avery Ching and Sergey Edunov and Maja Kabiljo and
Dionysios Logothetis and Sambavi Muthukrishnan",
title = "One trillion edges: graph processing at
{Facebook}-scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1804--1815",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824077",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analyzing large graphs provides valuable insights for
social networking and web companies in content ranking
and recommendations. While numerous graph processing
systems have been developed and evaluated on available
benchmark graphs of up to 6.6B edges, they often face
significant difficulties in scaling to much larger
graphs. Industry graphs can be two orders of magnitude
larger --- hundreds of billions or up to one trillion
edges. In addition to scalability challenges, real
world applications often require much more complex
graph processing workflows than previously evaluated.
In this paper, we describe the usability, performance,
and scalability improvements we made to Apache Giraph,
an open-source graph processing system, in order to use
it on Facebook-scale graphs of up to one trillion
edges. We also describe several key extensions to the
original Pregel model that make it possible to develop
a broader range of production graph applications and
workflows as well as improve code reuse. Finally, we
report on real-world operations as well as performance
characteristics of several large-scale production
applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pelkonen:2015:GFS,
author = "Tuomas Pelkonen and Scott Franklin and Justin Teller
and Paul Cavallaro and Qi Huang and Justin Meza and
Kaushik Veeraraghavan",
title = "{Gorilla}: a fast, scalable, in-memory time series
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1816--1827",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824078",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale internet services aim to remain highly
available and responsive in the presence of unexpected
failures. Providing this service often requires
monitoring and analyzing tens of millions of
measurements per second across a large number of
systems, and one particularly effective solution is to
store and query such measurements in a time series
database (TSDB). A key challenge in the design of TSDBs
is how to strike the right balance between efficiency,
scalability, and reliability. In this paper we
introduce Gorilla, Facebook's in-memory TSDB. Our
insight is that users of monitoring systems do not
place much emphasis on individual data points but
rather on aggregate analysis, and recent data points
are of much higher value than older points to quickly
detect and diagnose the root cause of an ongoing
problem. Gorilla optimizes for remaining highly
available for writes and reads, even in the face of
failures, at the expense of possibly dropping small
amounts of data on the write path. To improve query
efficiency, we aggressively leverage compression
techniques such as delta-of-delta timestamps and XOR'd
floating point values to reduce Gorilla's storage
footprint by 10x. This allows us to store Gorilla's
data in memory, reducing query latency by 73x and
improving query throughput by 14x when compared to a
traditional database (HBase)-backed time series data.
This performance improvement has unlocked new
monitoring and debugging tools, such as time series
correlation search and more dense visualization tools.
Gorilla also gracefully handles failures from a
single-node to entire regions with little to no
operational overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Potharaju:2015:CLC,
author = "Rahul Potharaju and Joseph Chan and Luhui Hu and
Cristina Nita-Rotaru and Mingshi Wang and Liyuan Zhang
and Navendu Jain",
title = "{ConfSeer}: leveraging customer support knowledge
bases for automated misconfiguration detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1828--1839",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824079",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce ConfSeer, an automated system that
detects potential configuration issues or deviations
from identified best practices by leveraging a
knowledge base (KB) of technical solutions. The
intuition is that these KB articles describe the
configuration problems and their fixes so if the system
can accurately understand them, it can automatically
pinpoint both the errors and their resolution.
Unfortunately, finding an accurate match is difficult
because (a) the KB articles are written in natural
language text, and (b) configuration files typically
contain a large number of parameters with a high value
range. Thus, expert-driven manual troubleshooting is
not scalable. While there are several state-of-the-art
techniques proposed for individual tasks such as
keyword matching, concept determination and entity
resolution, none offer a practical end-to-end solution
to detect problems in machine configurations. In this
paper, we describe our experiences building ConfSeer
using a novel combination of ideas from natural
language processing, information retrieval and
interactive learning. ConfSeer powers the
recommendation engine behind Microsoft Operations
Management Suite that proposes fixes for software
configuration errors. The system has been running in
production for about a year to proactively find
misconfigurations on tens of thousands of servers. Our
evaluation of ConfSeer against an expert-defined
rule-based commercial system, an expert survey and web
search engines shows that it achieves 80\%--97.5\%
accuracy and incurs low runtime overheads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Armbrust:2015:SSR,
author = "Michael Armbrust and Tathagata Das and Aaron Davidson
and Ali Ghodsi and Andrew Or and Josh Rosen and Ion
Stoica and Patrick Wendell and Reynold Xin and Matei
Zaharia",
title = "Scaling spark in the real world: performance and
usability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1840--1843",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824080",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Apache Spark is one of the most widely used open
source processing engines for big data, with rich
language-integrated APIs and a wide range of libraries.
Over the past two years, our group has worked to deploy
Spark to a wide range of organizations through
consulting relationships as well as our hosted service,
Databricks. We describe the main challenges and
requirements that appeared in taking Spark to a wide
set of users, and usability and performance
improvements we have made to the engine in response.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sahli:2015:SLS,
author = "Majed Sahli and Essam Mansour and Panos Kalnis",
title = "{StarDB}: a large-scale {DBMS} for strings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1844--1847",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824082",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Strings and applications using them are proliferating
in science and business. Currently, strings are stored
in file systems and processed using ad-hoc procedural
code. Existing techniques are not flexible and cannot
efficiently handle complex queries or large datasets.
In this paper, we demonstrate StarDB, a distributed
database system for analytics on strings. StarDB hides
data and system complexities and allows users to focus
on analytics. It uses a comprehensive set of parallel
string operations and provides a declarative query
language to solve complex queries. StarDB automatically
tunes itself and runs with over 90\% efficiency on
supercomputers, public clouds, clusters, and
workstations. We test StarDB using real datasets that
are 2 orders of magnitude larger than the datasets
reported by previous works.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Harbi:2015:ESQ,
author = "Razen Harbi and Ibrahim Abdelaziz and Panos Kalnis and
Nikos Mamoulis",
title = "Evaluating {SPARQL} queries on massive {RDF}
datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1848--1851",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824083",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed RDF systems partition data across multiple
computer nodes. Partitioning is typically based on
heuristics that minimize inter-node communication and
it is performed in an initial, data pre-processing
phase. Therefore, the resulting partitions are static
and do not adapt to changes in the query workload; as a
result, existing systems are unable to consistently
avoid communication for queries that are not favored by
the initial data partitioning. Furthermore, for very
large RDF knowledge bases, the partitioning phase
becomes prohibitively expensive, leading to high
startup costs. In this paper, we propose AdHash, a
distributed RDF system which addresses the shortcomings
of previous work. First, AdHash initially applies
lightweight hash partitioning, which drastically
minimizes the startup cost, while favoring the parallel
processing of join patterns on subjects, without any
data communication. Using a locality-aware planner,
queries that cannot be processed in parallel are
evaluated with minimal communication. Second, AdHash
monitors the data access patterns and adapts
dynamically to the query load by incrementally
redistributing and replicating frequently accessed
data. As a result, the communication cost for future
queries is drastically reduced or even eliminated. Our
experiments with synthetic and real data verify that
AdHash (i) starts faster than all existing systems,
(ii) processes thousands of queries before other
systems become online, and (iii) gracefully adapts to
the query load, being able to evaluate queries on
billion-scale RDF data in sub-seconds. In this
demonstration, audience can use a graphical interface
of AdHash to verify its performance superiority
compared to state-of-the-art distributed RDF systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kou:2015:TBR,
author = "Ngai Meng Kou and Leong Hou U. and Nikos Mamoulis and
Yuhong Li and Ye Li and Zhiguo Gong",
title = "A topic-based reviewer assignment system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1852--1855",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824084",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Peer reviewing is a widely accepted mechanism for
assessing the quality of submitted articles to
scientific conferences or journals. Conference
management systems (CMS) are used by conference
organizers to invite appropriate reviewers and assign
them to submitted papers. Typical CMS rely on paper
bids entered by the reviewers and apply simple matching
algorithms to compute the paper assignment. In this
paper, we demonstrate our Reviewer Assignment System
(RAS), which has advanced features compared to broadly
used CMSs. First, RAS automatically extracts the
profiles of reviewers and submissions in the form of
topic vectors. These profiles can be used to
automatically assign reviewers to papers without
relying on a bidding process, which can be tedious and
error-prone. Second, besides supporting classic
assignment models (e.g., stable marriage and optimal
assignment), RAS includes a recently published
assignment model by our research group, which
maximizes, for each paper, the coverage of its topics
by the profiles of its reviewers. The features of the
demonstration include (1) automatic extraction of paper
and reviewer profiles, (2) assignment computation by
different models, and (3) visualization of the results
by different models, in order to assess their
effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liroz-Gistau:2015:FHE,
author = "Miguel Liroz-Gistau and Reza Akbarinia and Patrick
Valduriez",
title = "{FP--Hadoop}: efficient execution of parallel jobs
over skewed data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1856--1859",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824085",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big data parallel frameworks, such as MapReduce or
Spark have been praised for their high scalability and
performance, but show poor performance in the case of
data skew. There are important cases where a high
percentage of processing in the reduce side ends up
being done by only one node. In this demonstration, we
illustrate the use of FP-Hadoop, a system that
efficiently deals with data skew in MapReduce jobs. In
FP-Hadoop, there is a new phase, called intermediate
reduce (IR), in which blocks of intermediate values,
constructed dynamically, are processed by intermediate
reduce workers in parallel, by using a scheduling
strategy. Within the IR phase, even if all intermediate
values belong to only one key, the main part of the
reducing work can be done in parallel using the
computing resources of all available workers. We
implemented a prototype of FP-Hadoop, and conducted
extensive experiments over synthetic and real datasets.
We achieve excellent performance gains compared to
native Hadoop, e.g. more than 10 times in reduce time
and 5 times in total execution time. During our
demonstration, we give the users the possibility to
execute and compare job executions in FP-Hadoop and
Hadoop. They can retrieve general information about the
job and the tasks and a summary of the phases. They can
also visually compare different configurations to
explore the difference between the approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papenbrock:2015:DPM,
author = "Thorsten Papenbrock and Tanja Bergmann and Moritz
Finke and Jakob Zwiener and Felix Naumann",
title = "Data profiling with {Metanome}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1860--1863",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824086",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data profiling is the discipline of discovering
metadata about given datasets. The metadata itself
serve a variety of use cases, such as data integration,
data cleansing, or query optimization. Due to the
importance of data profiling in practice, many tools
have emerged that support data scientists and IT
professionals in this task. These tools provide good
support for profiling statistics that are easy to
compute, but they are usually lacking automatic and
efficient discovery of complex statistics, such as
inclusion dependencies, unique column combinations, or
functional dependencies. We present Metanome, an
extensible profiling platform that incorporates many
state-of-the-art profiling algorithms. While Metanome
is able to calculate simple profiling statistics in
relational data, its focus lies on the automatic
discovery of complex metadata. Metanome's goal is to
provide novel profiling algorithms from research,
perform comparative evaluations, and to support
developers in building and testing new algorithms. In
addition, Metanome is able to rank profiling results
according to various metrics and to visualize the, at
times, large metadata sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kumar:2015:DSO,
author = "Arun Kumar and Mona Jalal and Boqun Yan and Jeffrey
Naughton and Jignesh M. Patel",
title = "Demonstration of {Santoku}: optimizing machine
learning over normalized data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1864--1867",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824087",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Advanced analytics is a booming area in the data
management industry and a hot research topic. Almost
all toolkits that implement machine learning (ML)
algorithms assume that the input is a single table, but
most relational datasets are not stored as single
tables due to normalization. Thus, analysts often join
tables to obtain a denormalized table. Also, analysts
typically ignore any functional dependencies among
features because ML toolkits do not support them. In
both cases, time is wasted in learning over data with
redundancy. We demonstrate Santoku, a toolkit to help
analysts improve the performance of ML over normalized
data. Santoku applies the idea of factorized learning
and automatically decides whether to denormalize or
push ML computations through joins. Santoku also
exploits database dependencies to provide automatic
insights that could help analysts with exploratory
feature selection. It is usable as a library in R,
which is a popular environment for advanced analytics.
We demonstrate the benefits of Santoku in improving ML
performance and helping analysts with feature
selection.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Seah:2015:PCP,
author = "Boon Siew Seah and Sourav S. Bhowmick and Aixin Sun",
title = "{PRISM}: concept-preserving summarization of top-$k$
social image search results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1868--1871",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824088",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most existing tag-based social image search engines
present search results as a ranked list of images,
which cannot be consumed by users in a natural and
intuitive manner. In this demonstration, we present a
novel concept-preserving image search results
summarization system called PRISM. PRISM exploits both
visual features and tags of the search results to
generate high quality summary, which not only breaks
the results into visually and semantically coherent
clusters but it also maximizes the coverage of the
original top-$k$ search results. It first constructs a
visual similarity graph where the nodes are images in
the top-$k$ search results and the edges represent
visual similarities between pairs of images. This graph
is optimally decomposed and compressed into a set of
concept-preserving subgraphs based on a set of
summarization criteria. One or more exemplar images
from each subgraph is selected to form the exemplar
summary of the result set. We demonstrate various
innovative features of PRISM and the promise of
superior quality summary construction of social image
search results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Muller:2015:PST,
author = "Tobias M{\"u}ller and Torsten Grust",
title = "Provenance for {SQL} through abstract interpretation:
value-less, but worthwhile",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1872--1875",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824089",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate the derivation of fine-grained where-
and why-provenance for a rich dialect of SQL that
includes recursion, (correlated) subqueries, windows,
grouping/aggregation, and the RDBMS's library of
built-in functions. The approach relies on ideas that
originate in the programming language
community---program slicing and abstract
interpretation, in particular. A two-stage process
first records a query's control flow decisions and
locations of data access before it derives provenance
without consultation of the actual data values
(rendering the method largely ``value-less''). We will
bring an interactive demonstrator that uses this
provenance information to make input/output
dependencies in real-world SQL queries tangible.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2015:SSQ,
author = "Zhian He and Wai Kit Wong and Ben Kao and David Wai
Lok Cheung and Rongbin Li and Siu Ming Yiu and Eric
Lo",
title = "{SDB}: a secure query processing system with data
interoperability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1876--1879",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824090",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We address security issues in a cloud database system
which employs the DBaaS model --- a data owner (DO)
exports data to a cloud database service provider (SP).
To provide data security, sensitive data is encrypted
by the DO before it is uploaded to the SP. Compared to
existing secure query processing systems like CryptDB
[7] and MONOMI [8], in which data operations (e.g.,
comparison or addition) are supported by specialized
encryption schemes, our demo system, SDB, is
implemented based on a set of data-interoperable secure
operators, i.e., the output of an operator can be used
as input of another operator. As a result, SDB can
support a wide range of complex queries (e.g., all
TPC-H queries) efficiently. In this demonstration, we
show how our SDB prototype supports secure query
processing on complex workload like TPC-H. We also
demonstrate how our system protects sensitive
information from malicious attackers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abdelaziz:2015:SVC,
author = "Ibrahim Abdelaziz and Razen Harbi and Semih Salihoglu
and Panos Kalnis and Nikos Mamoulis",
title = "{SPARTex}: a vertex-centric framework for {RDF} data
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1880--1883",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824091",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A growing number of applications require combining
SPARQL queries with generic graph search on RDF data.
However, the lack of procedural capabilities in SPARQL
makes it inappropriate for graph analytics. Moreover,
RDF engines focus on SPARQL query evaluation whereas
graph management frameworks perform only generic graph
computations. In this work, we bridge the gap by
introducing SPARTex, an RDF analytics framework based
on the vertex-centric computation model. In SPARTex,
user-defined vertex centric programs can be invoked
from SPARQL as stored procedures. SPARTex allows the
execution of a pipeline of graph algorithms without the
need for multiple reads/writes of input data and
intermediate results. We use a cost-based optimizer for
minimizing the communication cost. SPARTex evaluates
queries that combine SPARQL and generic graph
computations orders of magnitude faster than existing
RDF engines. We demonstrate a real system prototype of
SPARTex running on a local cluster using real and
synthetic datasets. SPARTex has a real-time graphical
user interface that allows the participants to write
regular SPARQL queries, use our proposed SPARQL
extension to declaratively invoke graph algorithms or
combine/pipeline both SPARQL querying and generic graph
analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2015:IDG,
author = "Lu Chen and Yunjun Gao and Zhihao Xing and Christian
S. Jensen and Gang Chen",
title = "{I2RS}: a distributed geo-textual image retrieval and
recommendation system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1884--1887",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824092",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive amounts of geo-tagged and textually annotated
images are provided by online photo services such as
Flickr and Zommr. However, most existing image
retrieval engines only consider text annotations. We
present I2RS, a system that allows users to view
geo-textual images on Google Maps, find hot topics
within a specific geographic region and time period,
retrieve images similar to a query image, and receive
recommended images that they might be interested in.
I2RS is a distributed geo-textual image retrieval and
recommendation system that employs SPB-trees to index
geo-textual images, and that utilizes metric similarity
queries, including top-$m$ spatio-temporal range and $k$
nearest neighbor queries, to support geo-textual image
retrieval and recommendation. The system adopts the
browser-server model, whereas the server is deployed in
a distributed environment that enables efficiency and
scalability to huge amounts of data and requests. A
rich set of 100 million geo-textual images crawled from
Flickr is used to demonstrate that I2RS can return
high-quality answers in an interactive way and support
efficient updates for high image arrival rates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bursztyn:2015:RBQ,
author = "Damian Bursztyn and Fran{\c{c}}ois Goasdou{\'e} and
Ioana Manolescu",
title = "Reformulation-based query answering in {RDF}:
alternatives and performance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1888--1891",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824093",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Answering queries over Semantic Web data, i.e., RDF
graphs, must account for both explicit data and
implicit data, entailed by the explicit data and the
semantic constraints holding on them. Two main query
answering techniques have been devised, namely
Saturation-based (Sat) which precomputes and adds to
the graph all implicit information, and
Reformulation-based (Ref) which reformulates the query based on the
graph constraints, so that evaluating the reformulated
query directly against the explicit data (i.e., without
considering the constraints) produces the query answer.
While Sat is well known, Ref has received less
attention so far. In particular, reformulated queries
often perform poorly if the query is complex. Our
demonstration showcases a large set of Ref techniques,
including but not limited to one we proposed recently.
The audience will be able to 1: test them against
different datasets, constraints and queries, as well as
different well-established systems, 2: analyze and
understand the performance challenges they raise, and
3: alter the scenarios to visualize the impact on
performance. In particular, we show how a cost-based
Ref approach allows avoiding reformulation performance
pitfalls.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bux:2015:SSS,
author = "Marc Bux and J{\"o}rgen Brandt and Carsten Lipka and
Kamal Hakimzadeh and Jim Dowling and Ulf Leser",
title = "{SAASFEE}: scalable scientific workflow execution
engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1892--1895",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824094",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Across many fields of science, primary data sets like
sensor read-outs, time series, and genomic sequences
are analyzed by complex chains of specialized tools and
scripts exchanging intermediate results in
domain-specific file formats. Scientific workflow
management systems (SWfMSs) support the development and
execution of these tool chains by providing workflow
specification languages, graphical editors,
fault-tolerant execution engines, etc. However, many
SWfMSs are not prepared to handle large data sets
because of inadequate support for distributed
computing. On the other hand, most SWfMSs that do
support distributed computing only allow static task
execution orders. We present SAASFEE, a SWfMS which
runs arbitrarily complex workflows on Hadoop YARN.
Workflows are specified in Cuneiform, a functional
workflow language focusing on parallelization and easy
integration of existing software. Cuneiform workflows
are executed on Hi-WAY, a higher-level scheduler for
running workflows on YARN. Distinct features of SAASFEE
are the ability to execute iterative workflows, an
adaptive task scheduler, re-executable provenance
traces, and compatibility to selected other workflow
systems. In the demonstration, we present all
components of SAASFEE using real-life workflows from
the field of genomics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eldawy:2015:DHE,
author = "Ahmed Eldawy and Mohamed F. Mokbel and Christopher
Jonathan",
title = "A demonstration of {HadoopViz}: an extensible
{MapReduce} system for visualizing big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1896--1899",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824095",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demonstration presents HadoopViz; an extensible
MapReduce-based system for visualizing Big Spatial
Data. HadoopViz has two main unique features that
distinguish it from other techniques. (1) It provides
an extensible interface that allows users to visualize
various types of data by defining five abstract
functions, without delving into the details of the
MapReduce algorithms. We show how it is used to create
four types of visualizations, namely, scatter plot,
road network, frequency heat map, and temperature heat
map. (2) HadoopViz is capable of generating big images
with giga-pixel resolution by employing a three-phase
approach of partitioning, rasterizing, and merging.
HadoopViz generates single and multi-level images,
where the latter allows users to zoom in/out to get
more/less details. Both types of images are generated
with a very high resolution using the extensible and
scalable framework of HadoopViz.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bergman:2015:QQO,
author = "Moria Bergman and Tova Milo and Slava Novgorodov and
Wang-Chiew Tan",
title = "{QOCO}: a query oriented data cleaning system with
oracles",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1900--1903",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824096",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As key decisions are often made based on information
contained in a database, it is important for the
database to be as complete and correct as possible. For
this reason, many data cleaning tools have been
developed to automatically resolve inconsistencies in
databases. However, data cleaning tools provide only
best-effort results and usually cannot eradicate all
errors that may exist in a database. Even more
importantly, existing data cleaning tools do not
typically address the problem of determining what
information is missing from a database. To tackle these
problems, we present QOCO, a novel query oriented
cleaning system that leverages materialized views that
are defined by user queries as a trigger for
identifying the remaining incorrect/missing
information. Given a user query, QOCO interacts with
domain experts (which we model as oracle crowds) to
identify potentially wrong or missing answers in the
result of the user query, as well as determine and
correct the wrong data that is the cause for the
error(s). We will demonstrate QOCO over a World Cup
Games database, and illustrate the interaction between
QOCO and the oracles. Our demo audience will play the
role of oracles, and we show how QOCO's underlying
operations and optimization mechanisms can effectively
prune the search space and minimize the number of
questions that need to be posed to accelerate the
cleaning process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ying:2015:TFS,
author = "Shanshan Ying and Flip Korn and Barna Saha and Divesh
Srivastava",
title = "{TreeScope}: finding structural anomalies in
semi-structured data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1904--1907",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824097",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Semi-structured data are prevalent on the web, with
formats such as XML and JSON soaring in popularity due
to their generality, flexibility and easy
customization. However, these very same features make
semi-structured data prone to a range of data quality
errors, from errors in content to errors in structure.
While the former has been well studied, little
attention has been paid to structural errors. In this
demonstration, we present TreeScope, which analyzes
semi-structured data sets with the goal of
automatically identifying structural anomalies from the
data. Our techniques learn robust structural models
that have high support, to identify potential errors in
the structure. Identified structural anomalies are then
concisely summarized to provide plausible explanations
of the potential errors. The goal of this demonstration
is to enable an interactive exploration of the process
of identifying and summarizing structural anomalies in
semi-structured data sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elmore:2015:DBP,
author = "A. Elmore and J. Duggan and M. Stonebraker and M.
Balazinska and U. Cetintemel and V. Gadepally and J.
Heer and B. Howe and J. Kepner and T. Kraska and S.
Madden and D. Maier and T. Mattson and S. Papadopoulos
and J. Parkhurst and N. Tatbul and M. Vartak and S.
Zdonik",
title = "A demonstration of the {BigDAWG} polystore system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1908--1911",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824098",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents BigDAWG, a reference
implementation of a new architecture for ``Big Data''
applications. Such applications not only call for
large-scale analytics, but also for real-time streaming
support, smaller analytics at interactive speeds, data
visualization, and cross-storage-system queries. Guided
by the principle that ``one size does not fit all'', we
build on top of a variety of storage engines, each
designed for a specialized use case. To illustrate the
promise of this approach, we demonstrate its
effectiveness on a hospital application using data from
an intensive care unit (ICU). This complex application
serves the needs of doctors and researchers and
provides real-time support for streams of patient data.
It showcases novel approaches for querying across
multiple storage engines, data visualization, and
scalable real-time analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zoumpatianos:2015:RID,
author = "Kostas Zoumpatianos and Stratos Idreos and Themis
Palpanas",
title = "{RINSE}: interactive data series exploration with
{ADS+}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1912--1915",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824099",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Numerous applications continuously produce big amounts
of data series, and in several time critical scenarios
analysts need to be able to query these data as soon as
they become available. An adaptive index data
structure, ADS+, which is specifically tailored to
solve the problem of indexing and querying very large
data series collections has been recently proposed as a
solution to this problem. The main idea is that instead
of building the complete index over the complete data
set up-front and querying only later, we interactively
and adaptively build parts of the index, only for the
parts of the data on which the users pose queries. The
net effect is that instead of waiting for extended
periods of time for the index creation, users can
immediately start exploring the data series. In this
work, we present a demonstration of ADS+; we introduce
RINSE, a system that allows users to experience the
benefits of the ADS+ adaptive index through an
intuitive web interface. Users can explore large
datasets and find patterns of interest, using nearest
neighbor search. They can draw queries (data series)
using a mouse, or touch screen, or they can select from
a predefined list of data series. RINSE can scale to
large data sizes, while drastically reducing the data
to query delay: by the time state-of-the-art indexing
techniques finish indexing 1 billion data series (and
before answering even a single query), adaptive data
                 series indexing can already answer $3 \times 10^5$
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhardwaj:2015:CDA,
author = "Anant Bhardwaj and Amol Deshpande and Aaron J. Elmore
and David Karger and Sam Madden and Aditya Parameswaran
and Harihar Subramanyam and Eugene Wu and Rebecca
Zhang",
title = "Collaborative data analytics with {DataHub}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1916--1919",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824100",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While there have been many solutions proposed for
storing and analyzing large volumes of data, all of
these solutions have limited support for collaborative
data analytics, especially given the many individuals
and teams are simultaneously analyzing, modifying and
exchanging datasets, employing a number of
heterogeneous tools or languages for data analysis, and
writing scripts to clean, preprocess, or query data. We
demonstrate DataHub, a unified platform with the
ability to load, store, query, collaboratively analyze,
interactively visualize, interface with external
applications, and share datasets. We will demonstrate
the following aspects of the DataHub platform: (a)
flexible data storage, sharing, and native versioning
capabilities: multiple conference attendees can
concurrently update the database and browse the
different versions and inspect conflicts; (b) an app
ecosystem that hosts apps for various data-processing
activities: conference attendees will be able to
effortlessly ingest, query, and visualize data using
our existing apps; (c) thrift-based data serialization
permits data analysis in any combination of 20+
languages, with DataHub as the common data store:
conference attendees will be able to analyze datasets
in R, Python, and Matlab, while the inputs and the
results are still stored in DataHub. In particular,
conference attendees will be able to use the DataHub
                 notebook --- an IPython-based notebook for analyzing
data and storing the results of data analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shin:2015:MDD,
author = "Jaeho Shin and Christopher R{\'e} and Michael
Cafarella",
title = "{Mindtagger}: a demonstration of data labeling in
knowledge base construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1920--1923",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824101",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "End-to-end knowledge base construction systems using
statistical inference are enabling more people to
automatically extract high-quality domain-specific
information from unstructured data. As a result of
deploying DeepDive framework across several domains, we
found new challenges in debugging and improving such
end-to-end systems to construct high-quality knowledge
bases. DeepDive has an iterative development cycle in
which users improve the data. To help our users, we
needed to develop principles for analyzing the system's
error as well as provide tooling for inspecting and
labeling various data products of the system. We
created guidelines for error analysis modeled after our
colleagues' best practices, in which data labeling
plays a critical role in every step of the analysis. To
enable more productive and systematic data labeling, we
created Mindtagger, a versatile tool that can be
configured to support a wide range of tasks. In this
demonstration, we show in detail what data labeling
tasks are modeled in our error analysis guidelines and
how each of them is performed using Mindtagger.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koutra:2015:PIL,
author = "Danai Koutra and Di Jin and Yuanchi Ning and Christos
Faloutsos",
title = "{Perseus}: an interactive large-scale graph mining and
visualization tool",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1924--1927",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824102",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a large graph with several millions or billions
of nodes and edges, such as a social network, how can
we explore it efficiently and find out what is in the
                 data? In this demo we present Perseus, a large-scale
system that enables the comprehensive analysis of large
graphs by supporting the coupled summarization of graph
properties and structures, guiding attention to
outliers, and allowing the user to interactively
explore normal and anomalous node behaviors.
                 Specifically, Perseus provides for the following
operations: (1) It automatically extracts graph
invariants (e.g., degree, PageRank, real eigenvectors)
by performing scalable, offline batch processing on
Hadoop; (2) It interactively visualizes univariate and
bivariate distributions for those invariants; (3) It
summarizes the properties of the nodes that the user
selects; (4) It efficiently visualizes the induced
subgraph of a selected node and its neighbors, by
incrementally revealing its neighbors. In our
demonstration, we invite the audience to interact with
                 Perseus to explore a variety of multi-million-edge
social networks including a Wikipedia vote network, a
friendship/foeship network in Slashdot, and a trust
network based on the consumer review website
Epinions.com.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Joglekar:2015:SDN,
author = "Manas Joglekar and Hector Garcia-Molina and Aditya
Parameswaran",
title = "Smart drill-down: a new data exploration operator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1928--1931",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824103",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a data exploration system equipped with
smart drill-down, a novel operator for interactively
exploring a relational table to discover and summarize
``interesting'' groups of tuples. Each such group of
tuples is represented by a rule. For instance, the rule
(a, b, *, 1000) tells us that there are a thousand
tuples with value a in the first column and b in the
second column (and any value in the third column).
Smart drill-down presents an analyst with a list of
rules that together describe interesting aspects of the
table. The analyst can tailor the definition of
interesting, and can interactively apply smart
drill-down on an existing rule to explore that part of
the table. In the demonstration, conference attendees
will be able to use the data exploration system
equipped with smart drill-down, and will be able to
contrast smart drill-down to traditional drill-down,
for various interestingness measures, and resource
constraints.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dyreson:2015:VED,
author = "Curtis E. Dyreson and Sourav S. Bhowmick and Ryan
Grapp",
title = "Virtual {eXist-db}: liberating hierarchical queries
from the shackles of access path dependence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1932--1935",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824104",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "XQuery programs can be hard to write and port to new
data collections because the path expressions in a
query are dependent on the hierarchy of the data. We
propose to demonstrate a system to liberate query
writers from this dependence. A plug-and-play query
contains a specification of what data the query needs
in order to evaluate. We implemented virtual eXist-db
to support plug-and-play XQuery queries. Our system
adds a virtualDoc function that lets a programmer
sketch the hierarchy needed by the query, which may
well be different than what the data has, and logically
(not physically) transforms the data (with information
loss guarantees) to the hierarchy specified by the
virtualDoc. The demonstration will consist of a
sequence of XQuery queries using a virtual hierarchy,
including queries suggested by the audience. We will
also demonstrate a GUI tool to construct a virtual
hierarchy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cortez:2015:ADS,
author = "Eli Cortez and Philip A. Bernstein and Yeye He and Lev
Novik",
title = "Annotating database schemas to help enterprise
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1936--1939",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824105",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In large enterprises, data discovery is a common
problem faced by users who need to find relevant
information in relational databases. In this scenario,
schema annotation is a useful tool to enrich a database
schema with descriptive keywords. In this paper, we
demonstrate Barcelos, a system that automatically
annotates corporate databases. Unlike existing
annotation approaches that use Web oriented knowledge
bases, Barcelos mines enterprise spreadsheets to find
candidate annotations. Our experimental evaluation
shows that Barcelos produces high quality annotations;
the top-5 have an average precision of 87\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jayaram:2015:VAS,
author = "Nandish Jayaram and Sidharth Goyal and Chengkai Li",
title = "{VIIQ}: auto-suggestion enabled visual interface for
interactive graph query formulation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1940--1943",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824106",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present VIIQ (pronounced as wick), an interactive
and iterative visual query formulation interface that
helps users construct query graphs specifying their
exact query intent. Heterogeneous graphs are
increasingly used to represent complex relationships in
schemaless data, which are usually queried using query
graphs. Existing graph query systems offer little help
to users in easily choosing the exact labels of the
edges and vertices in the query graph. VIIQ helps users
easily specify their exact query intent by providing a
visual interface that lets them graphically add various
query graph components, backed by an edge suggestion
mechanism that suggests edges relevant to the user's
query intent. In this demo we present: (1) a detailed
description of the various features and user-friendly
graphical interface of VIIQ, (2) a brief description of
the edge suggestion algorithm, and (3) a demonstration
scenario that we intend to show the audience.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2015:FSS,
author = "Qingyuan Liu and Eduard C. Dragut and Arjun Mukherjee
and Weiyi Meng",
title = "{FLORIN}: a system to support (near) real-time
applications on user generated content on daily news",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1944--1947",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824107",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we propose a system, FLORIN, which
provides support for near real-time applications on
user generated content on daily news. FLORIN
continuously crawls news outlets for articles and user
comments accompanying them. It attaches the articles
and comments to daily event stories. It identifies the
opinionated content in user comments and performs named
entity recognition on news articles. All these pieces
of information are organized hierarchically and
exportable to other applications. Multiple applications
can be built on this data. We have implemented a
sentiment analysis system that runs on top of it.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:VVI,
author = "Yunyao Li and Elmer Kim and Marc A. Touchette and
Ramiya Venkatachalam and Hao Wang",
title = "{VINERy}: a visual {IDE} for information extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1948--1951",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824108",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Information Extraction (IE) is the key technology
enabling analytics over unstructured and
semi-structured data. Not surprisingly, it is becoming
a critical building block for a wide range of emerging
applications. To satisfy the rising demands for
information extraction in real-world applications, it
is crucial to lower the barrier to entry for IE
development and enable users with general computer
science background to develop higher quality
                 extractors. In this demonstration, we present
VINERy, an intuitive yet expressive visual IDE for
information extraction. We show how it supports the
full cycle of IE development without requiring a single
line of code and enables a wide range of users to
develop high quality IE extractors with minimal
efforts. The extractors visually built in VINERY are
automatically translated into semantically equivalent
extractors in a state-of-the-art declarative language
for IE. We also demonstrate how the auto-generated
extractors can then be imported into a conventional
Eclipse-based IDE for further enhancement. The results
of our user studies indicate that VINERY is a
significant step forward in facilitating extractor
development for both expert and novice IE developers.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2015:KRD,
author = "Xu Chu and John Morcos and Ihab F. Ilyas and Mourad
Ouzzani and Paolo Papotti and Nan Tang and Yin Ye",
title = "{KATARA}: reliable data cleaning with knowledge bases
and crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1952--1955",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824109",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data cleaning with guaranteed reliability is hard to
achieve without accessing external sources, since the
truth is not necessarily discoverable from the data at
hand. Furthermore, even in the presence of external
sources, mainly knowledge bases and humans, effectively
leveraging them still faces many challenges, such as
aligning heterogeneous data sources and decomposing a
complex task into simpler units that can be consumed by
                 humans. We present Katara, a novel end-to-end data
cleaning system powered by knowledge bases and
crowdsourcing. Given a table, a kb, and a crowd, Katara
(i) interprets the table semantics w.r.t. the given kb;
(ii) identifies correct and wrong data; and (iii)
                 generates top-$k$ possible repairs for the wrong data.
Users will have the opportunity to experience the
following features of Katara: (1) Easy specification:
Users can define a Katara job with a browser-based
specification; (2) Pattern validation: Users can help
the system to resolve the ambiguity of different table
patterns (i.e., table semantics) discovered by Katara;
(3) Data annotation: Users can play the role of
internal crowd workers, helping Katara annotate data.
Moreover, Katara will visualize the annotated data as
correct data validated by the kb, correct data jointly
validated by the kb and the crowd, or erroneous tuples
along with their possible repairs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alvanaki:2015:GNB,
author = "Foteini Alvanaki and Romulo Goncalves and Milena
Ivanova and Martin Kersten and Kostis Kyzirakos",
title = "{GIS} navigation boosted by column stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1956--1959",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824110",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Earth observation sciences, astronomy, and seismology
have large data sets which have inherently rich spatial
and geospatial information. In combination with large
collections of semantically rich objects which have a
large number of thematic properties, they form a new
source of knowledge for urban planning, smart cities
and natural resource management. Modeling and storing
these properties indicating the relationships between
them is best handled in a relational database.
Furthermore, the scalability requirements posed by the
latest 26-attribute light detection and ranging (LIDAR)
data sets are a challenge for file-based solutions. In
this demo we show how to query a 640 billion point data
set using a column store enriched with GIS
functionality. Through a lightweight and cache
conscious secondary index called Imprints, spatial
queries performance on a flat table storage is
comparable to traditional file-based solutions. All the
results are visualised in real time using QGIS.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arocena:2015:GCY,
author = "Patricia C. Arocena and Radu Ciucanu and Boris Glavic
and Ren{\'e}e J. Miller",
title = "Gain control over your integration evaluations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1960--1963",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824111",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Integration systems are typically evaluated using a
few real-world scenarios (e.g., bibliographical or
biological datasets) or using synthetic scenarios
(e.g., based on star-schemas or other patterns for
schemas and constraints). Reusing such evaluations is a
cumbersome task because their focus is usually limited
to showcasing a specific feature of an approach. This
makes it difficult to compare integration solutions,
understand their generality, and understand their
performance for different application scenarios. Based
on this observation, we demonstrate some of the
requirements for developing integration benchmarks. We
argue that the major abstractions used for integration
problems have converged in the last decade which
enables the application of robust empirical methods to
integration problems (from schema evolution, to data
exchange, to answering queries using views and many
more). Specifically, we demonstrate that schema
mappings are the main abstraction that now drives most
integration solutions and show how a metadata generator
can be used to create more credible evaluations of the
performance and scalability of data integration
systems. We will use the demonstration to evangelize
for more robust, shared empirical evaluations of data
integration systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Diao:2015:AAU,
author = "Yanlei Diao and Kyriaki Dimitriadou and Zhan Li and
Wenzhao Liu and Olga Papaemmanouil and Kemi Peng and
Liping Peng",
title = "{AIDE}: an automatic user navigation system for
interactive data exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1964--1967",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824112",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts often engage in data exploration tasks
to discover interesting data patterns, without knowing
exactly what they are looking for. Such exploration
tasks can be very labor-intensive because they often
require the user to review many results of ad-hoc
queries and adjust the predicates of subsequent queries
to balance the tradeoff between collecting all
interesting information and reducing the size of
returned data. In this demonstration we introduce AIDE,
a system that automates these exploration tasks. AIDE
steers the user towards interesting data areas based on
her relevance feedback on database samples, aiming to
achieve the goal of identifying all database objects
that match the user interest with high efficiency. In
our demonstration, conference attendees will see AIDE
in action for a variety of exploration tasks on
real-world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aly:2015:DAA,
author = "Ahmed M. Aly and Ahmed S. Abdelhamid and Ahmed R.
Mahmood and Walid G. Aref and Mohamed S. Hassan and
Hazem Elmeleegy and Mourad Ouzzani",
title = "A demonstration of {AQWA}: adaptive
query-workload-aware partitioning of big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1968--1971",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824113",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ubiquity of location-aware devices, e.g.,
smartphones and GPS devices, has led to a plethora of
location-based services in which huge amounts of
geotagged information need to be efficiently processed
by large-scale computing clusters. This demo presents
AQWA, an adaptive and query-workload-aware data
partitioning mechanism for processing large-scale
spatial data. Unlike existing cluster-based systems,
e.g., SpatialHadoop, that apply static partitioning of
spatial data, AQWA has the ability to react to changes
in the query-workload and data distribution. A key
feature of AQWA is that it does not assume prior
knowledge of the query-workload or data distribution.
Instead, AQWA reacts to changes in both the data and
the query-workload by incrementally updating the
partitioning of the data. We demonstrate two prototypes
of AQWA deployed over Hadoop and Spark. In both
                 prototypes, we process spatial range and
                 $k$-nearest-neighbor ($k$NN, for short) queries over
large-scale spatial datasets, and we exploit the
performance of AQWA under different query-workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dittrich:2015:JID,
author = "Jens Dittrich and Patrick Bender",
title = "Janiform intra-document analytics for reproducible
research",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1972--1975",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824114",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Peer-reviewed publication of research papers is a
cornerstone of science. However, one of the many issues
of our publication culture is that our publications
only publish a summary of the final result of a long
project. This means that we put well-polished graphs
describing (some) of our experimental results into our
publications. However, the algorithms, input datasets,
benchmarks, raw result datasets, as well as scripts
that were used to produce the graphs in the first place
are rarely published and typically not available to
other researchers. Often they are only available when
personally asking the authors. In many cases, however,
they are not available at all. This means from a long
workflow that led to producing a graph for a research
paper, we only publish the final result rather than the
entire workflow. This is unfortunate and has been
criticized in various scientific communities. In this
demo we argue that one part of the problem is our dated
view on what a ``document'' and hence ``a publication''
is, should, and can be. As a remedy, we introduce
portable database files (PDbF). These files are
janiform, i.e. they are at the same time a standard
static pdf as well as a highly dynamic (offline)
HTML-document. PDbFs allow you to access the raw data
behind a graph, perform OLAP-style analysis, and
reproduce your own graphs from the raw data --- all of
this within a portable document. We demo a tool
                 allowing you to create PDbFs smoothly from within
                 {\LaTeX}. This tool allows you to preserve the workflow
of raw measurement data to its final graphical output
through all processing steps. Notice that this pdf
already showcases our technology: rename this file to
``.html'' and see what happens (currently we support
the desktop versions of Firefox, Chrome, and Safari).
But please: do not try to rename this file to ``.ova''
and mount it in VirtualBox.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schubert:2015:FCU,
author = "Erich Schubert and Alexander Koos and Tobias Emrich
and Andreas Z{\"u}fle and Klaus Arthur Schmid and
Arthur Zimek",
title = "A framework for clustering uncertain data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1976--1979",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824115",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The challenges associated with handling uncertain
data, in particular with querying and mining, are
finding increasing attention in the research community.
Here we focus on clustering uncertain data and describe
a general framework for this purpose that also allows
to visualize and understand the impact of
uncertainty---using different uncertainty models---on
the data mining results. Our framework constitutes
release 0.7 of ELKI (http://elki.dbs.ifi.lmu.de/) and
thus comes along with a plethora of implementations of
algorithms, distance measures, indexing techniques,
evaluation measures and visualization components.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bidoit:2015:EWA,
author = "Nicole Bidoit and Melanie Herschel and Katerina
Tzompanaki",
title = "{EFQ}: why-not answer polynomials in action",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1980--1983",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824116",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "One important issue in modern database applications is
supporting the user with efficient tools to debug and
fix queries because such tasks are both time and skill
demanding. One particular problem is known as Why-Not
question and focusses on the reasons for missing tuples
from query results. The EFQ platform demonstrated here
has been designed in this context to efficiently
leverage Why-Not Answers polynomials, a novel approach
that provides the user with complete explanations to
Why-Not questions and allows for automatic, relevant
query refinements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:EDD,
author = "Xiaolan Wang and Mary Feng and Yue Wang and Xin Luna
Dong and Alexandra Meliou",
title = "Error diagnosis and data profiling with {DataXRay}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1984--1987",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824117",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of identifying and repairing data errors
has been an area of persistent focus in data management
research. However, while traditional data cleaning
techniques can be effective at identifying several data
discrepancies, they disregard the fact that many errors
are systematic, inherent to the process that produces
the data, and thus will keep occurring unless the root
cause is identified and corrected. In this
demonstration, we will present a large-scale diagnostic
framework called DataXRay. Like a medical X-ray that
aids the diagnosis of medical conditions by revealing
problems underneath the surface, DataXRay reveals
hidden connections and common properties among data
errors. Thus, in contrast to traditional cleaning
methods, which treat the symptoms, our system
investigates the underlying conditions that cause the
errors. The core of DataXRay combines an intuitive and
principled cost model derived by Bayesian analysis, and
an efficient, highly-parallelizable diagnostic
algorithm that discovers common properties among
erroneous data elements in a top-down fashion. Our
system has a simple interface that allows users to load
different datasets, to interactively adjust key
diagnostic parameters, to explore the derived
diagnoses, and to compare with solutions produced by
alternative algorithms. Through this demonstration,
participants will understand (1) the characteristics of
good diagnoses, (2) how and why errors occur in
real-world datasets, and (3) the distinctions with
other related problems and approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pham:2015:SRD,
author = "Quan Pham and Severin Thaler and Tanu Malik and Ian
Foster and Boris Glavic",
title = "Sharing and reproducing database applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1988--1991",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824118",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Sharing and repeating scientific applications is
crucial for verifying claims, reproducing experimental
results (e.g., to repeat a computational experiment
described in a publication), and promoting reuse of
complex applications. The predominant methods of
sharing and making applications repeatable are building
a companion web site and/or provisioning a virtual
machine image (VMI). Recently, application
virtualization (AV), has emerged as a light-weight
alternative for sharing and efficient repeatability. AV
approaches such as Linux Containers create a
chroot-like environment [4], while approaches such as
CDE [1] trace system calls during application execution
to copy all binaries, data, and software dependencies
into a self-contained package.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wylot:2015:DTT,
author = "Marcin Wylot and Philippe Cudr{\'e}-Mauroux and Paul
Groth",
title = "A demonstration of {TripleProv}: tracking and querying
provenance over {Web} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1992--1995",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824119",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The proliferation of heterogeneous Linked Data on the
Web poses new challenges to database systems. In
particular, the capacity to store, track, and query
provenance data is becoming a pivotal feature of modern
triple stores. In this demonstration, we present
TripleProv: a new system extending a native RDF store
to efficiently handle the storage, tracking and
querying of provenance in RDF data. In the following,
we give an overview of our approach providing a
reliable and understandable specification of the way
results were derived from the data and how particular
pieces of data were combined to answer the query.
Subsequently, we present techniques enabling to tailor
queries with provenance data. Finally, we describe our
demonstration and how the attendees will be able to
interact with our system during the conference.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ortona:2015:WJW,
author = "Stefano Ortona and Giorgio Orsi and Marcello
Buoncristiano and Tim Furche",
title = "{WADaR}: joint wrapper and data repair",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "1996--1999",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824120",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Web scraping (or wrapping) is a popular means for
acquiring data from the web. Recent advancements have
made scalable wrapper-generation possible and enabled
data acquisition processes involving thousands of
sources. This makes wrapper analysis and maintenance
both needed and challenging as no scalable tools exists
that support these tasks. We demonstrate WADaR, a
scalable and highly automated tool for joint wrapper
and data repair. WADaR uses off-the-shelf entity
recognisers to locate target entities in
wrapper-generated data. Markov chains are used to
determine structural repairs, that are then encoded
into suitable repairs for both the data and
corresponding wrappers. We show that WADaR is able to
increase the quality of wrapper-generated relations
between 15\% and 60\%, and to fully repair the
corresponding wrapper without any knowledge of the
original website in more than 50\% of the cases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bendre:2015:DUD,
author = "Mangesh Bendre and Bofan Sun and Ding Zhang and Xinyan
Zhou and Kevin Chen-Chuan Chang and Aditya
Parameswaran",
title = "{DataSpread}: unifying databases and spreadsheets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2000--2003",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824121",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Spreadsheet software is often the tool of choice for
ad-hoc tabular data management, processing, and
visualization, especially on tiny data sets. On the
other hand, relational database systems offer
significant power, expressivity, and efficiency over
spreadsheet software for data management, while lacking
in the ease of use and ad-hoc analysis capabilities. We
demonstrate DataSpread, a data exploration tool that
holistically unifies databases and spreadsheets. It
continues to offer a Microsoft Excel-based spreadsheet
front-end, while in parallel managing all the data in a
back-end database, specifically, PostgreSQL. DataSpread
retains all the advantages of spreadsheets, including
ease of use, ad-hoc analysis and visualization
capabilities, and a schema-free nature, while also
adding the advantages of traditional relational
databases, such as scalability and the ability to use
arbitrary SQL to import, filter, or join external or
internal tables and have the results appear in the
spreadsheet. DataSpread needs to reason about and
reconcile differences in the notions of schema,
addressing of cells and tuples, and the current
``pane'' (which exists in spreadsheets but not in
traditional databases), and support data modifications
at both the front-end and the back-end. Our
demonstration will center on our first and early
prototype of the DataSpread, and will give the
attendees a sense for the enormous data exploration
capabilities offered by unifying spreadsheets and
databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haas:2015:WNS,
author = "Daniel Haas and Sanjay Krishnan and Jiannan Wang and
Michael J. Franklin and Eugene Wu",
title = "Wisteria: nurturing scalable data cleaning
infrastructure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2004--2007",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824122",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analysts report spending upwards of 80\% of their time
on problems in data cleaning. The data cleaning process
is inherently iterative, with evolving cleaning
workflows that start with basic exploratory data
analysis on small samples of dirty data, then refine
analysis with more sophisticated/expensive cleaning
operators (e.g., crowdsourcing), and finally apply the
insights to a full dataset. While an analyst often
knows at a logical level what operations need to be
done, they often have to manage a large search space of
physical operators and parameters. We present Wisteria,
a system designed to support the iterative development
and optimization of data cleaning workflows, especially
ones that utilize the crowd. Wisteria separates logical
operations from physical implementations, and driven by
analyst feedback, suggests optimizations and/or
replacements to the analyst's choice of physical
implementation. We highlight research challenges in
sampling, in-flight operator replacement, and
crowdsourcing. We overview the system architecture and
these techniques, then provide a demonstration designed
to showcase how Wisteria can improve iterative data
analysis and cleaning. The code is available at:
http://www.sampleclean.org.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{S:2015:CDA,
author = "Ashoke S. and Jayant R. Haritsa",
title = "{CODD}: a dataless approach to big data testing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2008--2011",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824123",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The construction and development of the so-called Big
Data systems has occupied centerstage in the data
management community in recent years. However, there
has been comparatively little attention paid to the
testing of such systems, an essential pre-requisite for
successful deployment. This is surprising given that
traditional testing techniques, which typically involve
construction of representative databases and regression
query suites, are completely impractical at Big Data
scale --- simply due to the time and space overheads
involved in their execution. For instance, consider the
situation where a database engineer wishes to evaluate
the query optimizer's behavior on a futuristic Big Data
setup featuring ``yottabyte'' ($ 10^{24} $ bytes) sized
relational tables. Obviously, just generating this
data, let alone storing it, is practically infeasible
even on the best of systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cebiric:2015:QOS,
author = "Sejla Cebiri{\'c} and Fran{\c{c}}ois Goasdou{\'e} and
Ioana Manolescu",
title = "Query-oriented summarization of {RDF} graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2012--2015",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824124",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Resource Description Framework (RDF) is a
graph-based data model promoted by the W3C as the
standard for Semantic Web applications. Its associated
query language is SPARQL. RDF graphs are often large
and varied, produced in a variety of contexts, e.g.,
scientific applications, social or online media,
government data etc. They are heterogeneous, i.e.,
resources described in an RDF graph may have very
different sets of properties. An RDF resource may have:
no types, one or several types (which may or may not be
related to each other). RDF Schema (RDFS) information
may optionally be attached to an RDF graph, to enhance
the description of its resources. Such statements also
entail that in an RDF graph, some data is implicit.
According to the W3C RDF and SPARQL specification, the
semantics of an RDF graph comprises both its explicit
and implicit data; in particular, SPARQL query answers
must be computed reflecting both the explicit and
implicit data. These features make RDF graphs complex,
both structurally and conceptually. It is intrinsically
hard to get familiar with a new RDF dataset, especially
if an RDF schema is sparse or not available at all.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chodpathumwan:2015:UDT,
author = "Yodsawalai Chodpathumwan and Amirhossein Aleyasen and
Arash Termehchy and Yizhou Sun",
title = "{Universal-DB}: towards representation independent
graph analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2016--2019",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824125",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph analytics algorithms leverage quantifiable
structural properties of the data to predict
interesting concepts and relationships. The same
information, however, can be represented using many
different structures and the structural properties
observed over particular representations do not
necessarily hold for alternative structures. Because
these algorithms tend to be highly effective over some
choices of structure, such as that of the databases
used to validate them, but not so effective with
others, graph analytics has largely remained the
province of experts who can find the desired forms for
these algorithms. We argue that in order to make graph
analytics usable, we should develop systems that are
effective over a wide range of choices of structural
organizations. We demonstrate Universal-DB an entity
similarity and proximity search system that returns the
same answers for a query over a wide range of choices
to represent the input database.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mahmood:2015:TDS,
author = "Ahmed R. Mahmood and Ahmed M. Aly and Thamir Qadah and
El Kindi Rezig and Anas Daghistani and Amgad Madkour
and Ahmed S. Abdelhamid and Mohamed S. Hassan and Walid
G. Aref and Saleh Basalamah",
title = "{Tornado}: a distributed spatio-textual stream
processing system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2020--2023",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824126",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The widespread use of location-aware devices together
with the increased popularity of micro-blogging
applications (e.g., Twitter) led to the creation of
large streams of spatio-textual data. In order to serve
real-time applications, the processing of these
large-scale spatio-textual streams needs to be
distributed. However, existing distributed stream
processing systems (e.g., Spark and Storm) are not
optimized for spatial/textual content. In this
demonstration, we introduce Tornado, a distributed
in-memory spatio-textual stream processing server that
extends Storm. To efficiently process spatio-textual
streams, Tornado introduces a spatio-textual indexing
layer to the architecture of Storm. The indexing layer
is adaptive, i.e., dynamically re-distributes the
processing across the system according to changes in
the data distribution and/or query workload. In
addition to keywords, higher-level textual concepts are
identified and are semantically matched against
spatio-textual queries. Tornado provides data
deduplication and fusion to eliminate redundant textual
data. We demonstrate a prototype of Tornado running
against real Twitter streams, where the users can
register continuous or snapshot spatio-textual queries
using a map-assisted query-interface.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Crotty:2015:VIA,
author = "Andrew Crotty and Alex Galakatos and Emanuel Zgraggen
and Carsten Binnig and Tim Kraska",
title = "{Vizdom}: interactive analytics through pen and
touch",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2024--2027",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824127",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning (ML) and advanced statistics are
important tools for drawing insights from large
datasets. However, these techniques often require human
intervention to steer computation towards meaningful
results. In this demo, we present Vizdom, a new system
for interactive analytics through pen and touch.
Vizdom's frontend allows users to visually compose
complex workflows of ML and statistics operators on an
interactive whiteboard, and the back-end leverages
recent advances in workflow compilation techniques to
run these computations at interactive speeds.
Additionally, we are exploring approximation techniques
for quickly visualizing partial results that
incrementally refine over time. This demo will show
Vizdom's capabilities by allowing users to
interactively build complex analytics workflows using
real-world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Consens:2015:SCE,
author = "Mariano P. Consens and Valeria Fionda and Shahan
Khatchadourian and Giuseppe Pirr{\`o}",
title = "{S+EPPs}: construct and explore bisimulation
summaries, plus optimize navigational queries; all on
existing {SPARQL} systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2028--2031",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824128",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate S+EPPs, a system that provides fast
construction of bisimulation summaries using graph
analytics platforms, and then enhances existing SPARQL
engines to support summary-based exploration and
navigational query optimization. The construction
component adds a novel optimization to a parallel
bisimulation algorithm implemented on a multi-core
graph processing framework. We show that for several
large, disk resident, real world graphs, full summary
construction can be completed in roughly the same time
as the data load. The query translation component
supports Extended Property Paths (EPPs), an enhancement
of SPARQL 1.1 property paths that can express a
significantly larger class of navigational queries.
EPPs are implemented via rewritings into a widely used
SPARQL subset. The optimization component can
(transparently to users) translate EPPs defined on
instance graphs into EPPs that take advantage of
bisimulation summaries. S+EPPs combines the query and
optimization translations to enable summary-based
optimization of graph traversal queries on top of
off-the-shelf SPARQL processors. The demonstration
showcases the construction of bisimulation summaries of
graphs (ranging from millions to billions of edges),
together with the exploration benefits and the
navigational query speedups obtained by leveraging
summaries stored alongside the original datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xirogiannopoulos:2015:GEI,
author = "Konstantinos Xirogiannopoulos and Udayan Khurana and
Amol Deshpande",
title = "{GraphGen}: exploring interesting graphs in relational
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2032--2035",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824129",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analyzing interconnection structures among the data
through the use of graph algorithms and graph analytics
has been shown to provide tremendous value in many
application domains. However, graphs are not the
primary choice for how most data is currently stored,
and users who want to employ graph analytics are forced
to extract data from their data stores, construct the
requisite graphs, and then use a specialized engine to
write and execute their graph analysis tasks. This
cumbersome and costly process not only raises barriers
in using graph analytics, but also makes it hard to
explore and identify hidden or implicit graphs in the
data. Here we demonstrate a system, called GraphGen,
that enables users to declaratively specify graph
extraction tasks over relational databases, visually
explore the extracted graphs, and write and execute
graph algorithms over them, either directly or using
existing graph libraries like the widely used NetworkX
Python library. We also demonstrate how unifying the
extraction tasks and the graph algorithms enables
significant optimizations that would not be possible
otherwise.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yoon:2015:DPF,
author = "Dong Young Yoon and Barzan Mozafari and Douglas P.
Brown",
title = "{DBSeer}: pain-free database administration through
workload intelligence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2036--2039",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824130",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The pressing need for achieving and maintaining high
performance in database systems has made database
administration one of the most stressful jobs in
information technology. On the other hand, the
increasing complexity of database systems has made
qualified database administrators (DBAs) a scarce
resource. DBAs are now responsible for an array of
demanding tasks; they need to (i) provision and tune
their database according to their application
requirements, (ii) constantly monitor their database
for any performance failures or slowdowns, (iii)
diagnose the root cause of the performance problem in
an accurate and timely fashion, and (iv) take prompt
actions that can restore acceptable database
performance. However, much of the research in the past
years has focused on improving the raw performance of
the database systems, rather than improving their
manageability. Besides sophisticated consoles for
monitoring performance and a few auto-tuning wizards,
DBAs are not provided with any help other than their
own many years of experience. Typically, their only
resort is trial-and-error, which is a tedious, ad-hoc
and often sub-optimal solution. In this demonstration,
we present DBSeer, a workload intelligence framework
that exploits advanced machine learning and causality
techniques to aid DBAs in their various
responsibilities. DBSeer analyzes large volumes of
statistics and telemetry data collected from various
log files to provide the DBA with a suite of rich
functionalities including performance prediction,
performance diagnosis, bottleneck explanation, workload
insight, optimal admission control, and what-if
analysis. In this demo, we showcase various features of
DBSeer by predicting and analyzing the performance of a
live database system. We will also reproduce a number of
realistic performance problems in the system, and allow
the audience to use DBSeer to quickly diagnose and
resolve their root cause.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kejariwal:2015:RTA,
author = "Arun Kejariwal and Sanjeev Kulkarni and Karthik
Ramasamy",
title = "Real time analytics: algorithms and systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2040--2041",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824132",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract =     "Velocity is one of the 4 Vs commonly used to
characterize Big Data [5]. In this regard, Forrester
remarked the following in Q3 2014 [8]: ``The high
velocity, white-water flow of data from innumerable
real-time data sources such as market data, Internet of
Things, mobile, sensors, click-stream, and even
transactions remain largely unnavigated by most firms.
The opportunity to leverage streaming analytics has
never been greater.'' Example use cases of streaming
analytics include, but not limited to: (a)
visualization of business metrics in real-time (b)
facilitating highly personalized experiences (c)
expediting response during emergencies. Streaming
analytics is extensively used in a wide variety of
domains such as healthcare, e-commerce, financial
services, telecommunications, energy and utilities,
manufacturing, government and transportation. In this
tutorial, we shall present an in-depth overview of
streaming analytics --- applications, algorithms and
platforms --- landscape. We shall walk through how the
field has evolved over the last decade and then discuss
the current challenges --- the impact of the other
three Vs, viz., Volume, Variety and Veracity, on
Big Data streaming analytics. The tutorial is intended
for both researchers and practitioners in the industry.
We shall also present state-of-the-affairs of streaming
analytics at Twitter.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khan:2015:UGM,
author = "Arijit Khan and Lei Chen",
title = "On uncertain graphs modeling and queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2042--2043",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824133",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale, highly-interconnected networks pervade
both our society and the natural world around us.
Uncertainty, on the other hand, is inherent in the
underlying data due to a variety of reasons, such as
noisy measurements, lack of precise information needs,
inference and prediction models, or explicit
manipulation, e.g., for privacy purposes. Therefore,
uncertain, or probabilistic, graphs are increasingly
used to represent noisy linked data in many emerging
application scenarios, and they have recently become a
hot topic in the database research community. While
many classical graph algorithms such as reachability
and shortest path queries become \#P-complete, and
hence, more expensive in uncertain graphs; various
complex queries are also emerging over uncertain
networks, such as pattern matching, information
diffusion, and influence maximization queries. In this
tutorial, we discuss the sources of uncertain graphs
and their applications, uncertainty modeling, as well
as the complexities and algorithmic advances on
uncertain graphs processing in the context of both
classical and emerging graph queries. We emphasize the
current challenges and highlight some future research
directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2015:TMI,
author = "Xin Luna Dong and Wang-Chiew Tan",
title = "A time machine for information: looking back to look
forward",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2044--2045",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824134",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the abundant availability of information one can
mine from the Web today, there is increasing interest
to develop a complete understanding of the history of
an entity (i.e., a person, a company, a music genre, a
country, etc.) (see, for example, [7, 9, 10, 11]) and
to depict trends over time [5, 12, 13]. This, however,
remains a largely difficult and manual task despite
more than a couple of decades of research in the areas
of temporal databases and data integration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Das:2015:SAS,
author = "Mahashweta Das and Gautam Das",
title = "Structured analytics in social media",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2046--2047",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824135",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The rise of social media has turned the Web into an
online community where people connect, communicate, and
collaborate with each other. Structured analytics in
social media is the process of discovering the
structure of the relationships emerging from this
social media use. It focuses on identifying the users
involved, the activities they undertake, the actions
they perform, and the items (e.g., movies, restaurants,
blogs, etc.) they create and interact with. There are
two key challenges facing these tasks: how to organize
and model social media content, which is often
unstructured in its raw form, in order to employ
structured analytics on it; and how to employ analytics
algorithms to capture both explicit link-based
relationships and implicit behavior-based
relationships. In this tutorial, we systemize and
summarize the research so far in analyzing social
interactions between users and items in the Web from
data mining and database perspectives. We start with a
general overview of the topic, including discourse to
various exciting and practical applications. Then, we
discuss the state-of-art for modeling the data,
formalizing the mining task, developing the algorithmic
solutions, and evaluating on real datasets. We also
emphasize open problems and challenges for future
research in the area of structured analytics and social
media.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2015:TDC,
author = "Jing Gao and Qi Li and Bo Zhao and Wei Fan and Jiawei
Han",
title = "Truth discovery and crowdsourcing aggregation: a
unified perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2048--2049",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824136",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the era of Big Data, data entries, even describing
the same objects or events, can come from a variety of
sources, where a data source can be a web page, a
database or a person. Consequently, conflicts among
sources become inevitable. To resolve the conflicts and
achieve high quality data, truth discovery and
crowdsourcing aggregation have been studied
intensively. However, although these two topics have a
lot in common, they are studied separately and are
applied to different domains. To answer the need of a
systematic introduction and comparison of the two
topics, we present an organized picture on truth
discovery and crowdsourcing aggregation in this
tutorial. They are compared on both theory and
application levels, and their related areas as well as
open questions are discussed.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abadi:2015:SHS,
author = "Daniel Abadi and Shivnath Babu and Fatma {\"O}zcan and
Ippokratis Pandis",
title = "{SQL-on-Hadoop} systems: tutorial",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2050--2051",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enterprises are increasingly using Apache Hadoop, more
specifically HDFS, as a central repository for all
their data; data coming from various sources, including
operational systems, social media and the web, sensors
and smart devices, as well as their applications. At
the same time many enterprise data management tools
(e.g. from SAP ERP and SAS to Tableau) rely on SQL and
many enterprise users are familiar and comfortable with
SQL. As a result, SQL processing over Hadoop data has
gained significant traction over the recent years, and
the number of systems that provide such capability has
increased significantly. In this tutorial we use the
term SQL-on-Hadoop to refer to systems that provide
some level of declarative SQL(-like) processing over
HDFS and noSQL data sources, using architectures that
include computational or storage engines compatible
with Apache Hadoop.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Loaiza:2015:EDH,
author = "Juan Loaiza",
title = "Engineering database hardware and software together",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2052--2052",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Since its inception, Oracle's database software
primarily ran on customer configured off-the-shelf
hardware. A decade ago, the architecture of
conventional systems started to become a bottleneck and
Oracle developed the Oracle Exadata Database Machine to
optimize the full hardware and software stack for
database workloads. Exadata is based on a scale-out
architecture of database servers and storage servers
that optimizes both OLTP and Analytic workloads while
hosting hundreds of databases simultaneously on the
same system. By using database specific protocols for
storage and networking we bypass limitations imposed by
conventional network and storage layers. Exadata is now
deployed at thousands of Enterprises including 4 of the
5 largest banks, telecoms, and retailers for varied
workloads such as interbank funds transfers,
e-commerce, ERP, Cloud SaaS applications, and petabyte
data warehouses. Five years ago, Oracle initiated a
project to extend our database stack beyond software
and systems and into the architecture of the
microprocessor itself. The goal of this effort is to
dramatically improve the performance, reliability and
cost effectiveness of a new generation of database
machines. The new SPARC M7 processor is the first step.
The M7 is an extraordinarily fast conventional
processor with 32-cores per socket and an extremely
high bandwidth memory system. Added to its conventional
processing capabilities are 32 custom on-chip database
co-processors that run database searches at full memory
bandwidth rates, and decompress data in real-time to
increase memory bandwidth and capacity. Further, the M7
implements innovative fine-grained memory protection to
secure sensitive business data. In the presentation we
will describe how Oracle's engineering teams integrate
software and hardware at all levels to achieve
breakthrough performance, reliability, and security for
the database and rest of the modern data processing
stack.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Balazinska:2015:BDR,
author = "Magdalena Balazinska",
title = "Big data research: will industry solve all the
problems?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2053--2056",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The need for effective tools for big data data
management and analytics continues to grow. While the
ecosystem of tools is expanding many research problems
remain open: they include challenges around efficient
processing, flexible analytics, ease of use, and
operation as a service. Many new systems and much
innovation, however, come from industry (or from
academic projects that quickly became big players in
industry). An important question for our community is
whether industry will solve all the problems or whether
there is a place for academic research in big data and
what is that place. In this paper, we address this
question by looking back at our research on the Nuage,
CQMS, Myria, and Data Pricing projects, and the SciDB
collaboration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Walter:2015:BPB,
author = "Todd Walter",
title = "Big plateaus of {Big Data} on the big island",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2057--2057",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In ancient texts, 40 was a magic number. It meant a
``lot'' or ``a long time.'' 40 years represented the
time it took for a new generation to arise. A look back
at 40 years of VLDB suggests that this applies to
database researchers as well --- the young researchers
of the early VLDBs are now the old folks of the
database world, and a new generation is creating
afresh. Over this period many plateaus of ``Big Data''
have challenged the database community and been
conquered. But there is still no free lunch ---
database research is really the science of trade-offs,
many of which are no different today than 40 years ago.
And of course the evolution of hardware technology
continues to swing the trade-off pendulum while
enabling new plateaus to be reached. Todd will take a
look back at customer big data plateaus of the past. He
will look at where we are today, then use his crystal
ball and the lessons of the past to extrapolate the
next several plateaus --- how they will be the same and
how will they be different. Along the way we will have
a little fun with some VLDB and Teradata history.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ailamaki:2015:DHB,
author = "Anastasia Ailamaki",
title = "Databases and hardware: the beginning and sequel of a
beautiful friendship",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "12",
pages = "2058--2061",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2824032.2824142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 16 18:23:11 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fast query and transaction processing is the goal of
40 years of database research and the reason of
existence for many new database system architectures.
In data management, system performance means acceptable
response time and throughput on critical-path
operations, ideally with scalability guarantees.
Performance is improved with top-of-the line research
on data processing algorithms; efficiency, however, is
contingent on seamless collaboration between the
database software and hardware and storage devices. In
1980, the goal was to minimize disk accesses; in 2000,
memory replaced disks in terms of access costs.
Nowadays performance is synonymous to scalability;
scalability, in turn, translates into sustainable and
predictable use of hardware resources in the face of
embarrassing parallelism and deep storage hierarchies
while minimizing energy needs --- a challenging goal in
multiple dimensions. We discuss work done in the past
four decades to tighten the interaction between the
database software and underlying hardware and show
that, as application and microarchitecture roadmaps
evolve, the effort of maintaining smooth collaboration
blossoms into a multitude of interesting research
avenues with direct technological impact.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aly:2015:AAQ,
author = "Ahmed M. Aly and Ahmed R. Mahmood and Mohamed S.
Hassan and Walid G. Aref and Mourad Ouzzani and Hazem
Elmeleegy and Thamir Qadah",
title = "{AQWA}: adaptive query workload aware partitioning of
big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2062--2073",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831361",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The unprecedented spread of location-aware devices has
resulted in a plethora of location-based services in
which huge amounts of spatial data need to be
efficiently processed by large-scale computing
clusters. Existing cluster-based systems for processing
spatial data employ static data-partitioning structures
that cannot adapt to data changes, and that are
insensitive to the query workload. Hence, these systems
are incapable of consistently providing good
performance. To close this gap, we present AQWA, an
adaptive and query-workload-aware mechanism for
partitioning large-scale spatial data. AQWA does not
assume prior knowledge of the data distribution or the
query workload. Instead, as data is consumed and
queries are processed, the data partitions are
incrementally updated. With extensive experiments using
real spatial data from Twitter, and various workloads
of range and k-nearest-neighbor queries, we
demonstrate that AQWA can achieve an order of magnitude
enhancement in query performance compared to the
state-of-the-art systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khayyat:2015:LFS,
author = "Zuhair Khayyat and William Lucia and Meghna Singh and
Mourad Ouzzani and Paolo Papotti and Jorge-Arnulfo
Quian{\'e}-Ruiz and Nan Tang and Panos Kalnis",
title = "Lightning fast and space efficient inequality joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2074--2085",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831362",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See erratum \cite{Khayyat:2017:ELF}.",
abstract = "Inequality joins, which join relational tables on
inequality conditions, are used in various
applications. While there have been a wide range of
optimization methods for joins in database systems,
from algorithms such as sort-merge join and band join,
to various indices such as B$^+$-tree, R$^*$-tree and
Bitmap, inequality joins have received little attention
and queries containing such joins are usually very
slow. In this paper, we introduce fast inequality join
algorithms. We put columns to be joined in sorted
arrays and we use permutation arrays to encode
positions of tuples in one sorted array w.r.t. the
other sorted array. In contrast to sort-merge join, we
use space efficient bit-arrays that enable
optimizations, such as Bloom filter indices, for fast
computation of the join results. We have implemented a
centralized version of these algorithms on top of
PostgreSQL, and a distributed version on top of Spark
SQL. We have compared against well known optimization
techniques for inequality joins and show that our
solution is more scalable and several orders of
magnitude faster.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2015:FPO,
author = "Jinfei Liu and Li Xiong and Jian Pei and Jun Luo and
Haoyu Zhang",
title = "Finding {Pareto} optimal groups: group-based skyline",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2086--2097",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831363",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Skyline computation, aiming at identifying a set of
skyline points that are not dominated by any other
point, is particularly useful for multi-criteria data
analysis and decision making. Traditional skyline
computation, however, is inadequate to answer queries
that need to analyze not only individual points but
also groups of points. To address this gap, we
generalize the original skyline definition to the novel
group-based skyline (G-Skyline), which represents
Pareto optimal groups that are not dominated by other
groups. In order to compute G-Skyline groups consisting
of k points efficiently, we present a novel structure
that represents the points in a directed skyline graph
and captures the dominance relationships among the
points based on the first k skyline layers. We propose
efficient algorithms to compute the first k skyline
layers. We then present two heuristic algorithms to
efficiently compute the G-Skyline groups: the
point-wise algorithm and the unit group-wise algorithm,
using various pruning strategies. The experimental
results on the real NBA dataset and the synthetic
datasets show that G-Skyline is interesting and useful,
and our algorithms are efficient and scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Faulkner:2015:RQN,
author = "Taylor Kessler Faulkner and Will Brackenbury and
Ashwin Lall",
title = "$k$-regret queries with nonlinear utilities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2098--2109",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831364",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In exploring representative databases, a primary issue
has been finding accurate models of user preferences.
Given this, our work generalizes the method of regret
minimization as proposed by Nanongkai et al. to include
nonlinear utility functions. Regret minimization is an
approach for selecting k representative points from a
database such that every user's ideal point in the
entire database is similar to one of the k points. This
approach combines benefits of the methods top-k and
skyline; it controls the size of the output but does
not require knowledge of users' preferences. Prior work
with k-regret queries assumes users' preferences to be
modeled by linear utility functions. In this paper, we
derive upper and lower bounds for nonlinear utility
functions, as these functions can better fit
occurrences such as diminishing marginal returns,
propensity for risk, and substitutability of
preferences. To model these phenomena, we analyze a
broad subset of convex, concave, and constant
elasticity of substitution functions. We also run
simulations on real and synthetic data to prove the
efficacy of our bounds in practice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shi:2015:CTM,
author = "Juwei Shi and Yunjie Qiu and Umar Farooq Minhas and
Limei Jiao and Chen Wang and Berthold Reinwald and
Fatma {\"O}zcan",
title = "Clash of the titans: {MapReduce} vs. {Spark} for large
scale data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2110--2121",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831365",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MapReduce and Spark are two very popular open source
cluster computing frameworks for large scale data
analytics. These frameworks hide the complexity of task
parallelism and fault-tolerance, by exposing a simple
programming API to users. In this paper, we evaluate
the major architectural components in MapReduce and
Spark frameworks including: shuffle, execution model,
and caching, by using a set of important analytic
workloads. To conduct a detailed analysis, we developed
two profiling tools: (1) We correlate the task
execution plan with the resource utilization for both
MapReduce and Spark, and visually present this
correlation; (2) We provide a break-down of the task
execution time for in-depth analysis. Through detailed
experiments, we quantify the performance differences
between MapReduce and Spark. Furthermore, we attribute
these performance differences to different components
which are architected differently in the two
frameworks. We further expose the source of these
performance differences by using a set of
micro-benchmark experiments. Overall, our experiments
show that Spark is about 2.5x, 5x, and 5x faster than
MapReduce, for Word Count, k-means, and PageRank,
respectively. The main causes of these speedups are the
efficiency of the hash-based aggregation component for
combine, as well as reduced CPU and disk overheads due
to RDD caching in Spark. An exception to this is the
Sort workload, for which MapReduce is 2x faster than
Spark. We show that MapReduce's execution model is more
efficient for shuffling data than Spark, thus making
Sort run faster on MapReduce.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2015:TMI,
author = "Yu Liu and Jiaheng Lu and Hua Yang and Xiaokui Xiao
and Zhewei Wei",
title = "Towards maximum independent sets on massive graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2122--2133",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831366",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Maximum independent set (MIS) is a fundamental problem
in graph theory and it has important applications in
many areas such as social network analysis, graphical
information systems and coding theory. The problem is
NP-hard, and there has been numerous studies on its
approximate solutions. While successful to a certain
degree, the existing methods require memory space at
least linear in the size of the input graph. This has
become a serious concern in view of the massive volume
of today's fast-growing graphs. In this paper, we study
the MIS problem under the semi-external setting, which
assumes that the main memory can accommodate all
vertices of the graph but not all edges. We present a
greedy algorithm and a general vertex-swap framework,
which swaps vertices to incrementally increase the size
of independent sets. Our solutions require only few
sequential scans of graphs on the disk file, thus
enabling in-memory computation without costly random
disk accesses. Experiments on large-scale datasets show
that our solutions are able to compute a large
independent set for a massive graph with 59 million
vertices and 151 million edges using a commodity
machine, with a memory cost of 469MB and a time cost of
three minutes, while yielding an approximation ratio
that is around 99\% of the theoretical optimum.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meehan:2015:SSM,
author = "John Meehan and Nesime Tatbul and Stan Zdonik and
Cansu Aslantas and Ugur Cetintemel and Jiang Du and Tim
Kraska and Samuel Madden and David Maier and Andrew
Pavlo and Michael Stonebraker and Kristin Tufte and Hao
Wang",
title = "{S-Store}: streaming meets transaction processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2134--2145",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831367",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream processing addresses the needs of real-time
applications. Transaction processing addresses the
coordination and safety of short atomic computations.
Heretofore, these two modes of operation existed in
separate, stove-piped systems. In this work, we attempt
to fuse the two computational paradigms in a single
system called S-Store. In this way, S-Store can
simultaneously accommodate OLTP and streaming
applications. We present a simple transaction model for
streams that integrates seamlessly with a traditional
OLTP system, and provides both ACID and stream-oriented
guarantees. We chose to build S-Store as an extension
of H-Store --- an open-source, in-memory, distributed
OLTP database system. By implementing S-Store in this
way, we can make use of the transaction processing
facilities that H-Store already provides, and we can
concentrate on the additional features that are needed
to support streaming. Similar implementations could be
done using other main-memory OLTP platforms. We show
that we can actually achieve higher throughput for
streaming workloads in S-Store than an equivalent
deployment in H-Store alone. We also show how this can
be achieved within H-Store with the addition of a
modest amount of new functionality. Furthermore, we
compare S-Store to two state-of-the-art streaming
systems, Esper and Apache Storm, and show how S-Store
can sometimes exceed their performance while at the
same time providing stronger correctness guarantees.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Levandoski:2015:MVR,
author = "Justin Levandoski and David Lomet and Sudipta Sengupta
and Ryan Stutsman and Rui Wang",
title = "Multi-version range concurrency control in
{Deuteronomy}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2146--2157",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831368",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Deuteronomy transactional key value store executes
millions of serializable transactions/second by
exploiting multi-version timestamp order concurrency
control. However, it has not supported range
operations, only individual record operations (e.g.,
create, read, update, delete). In this paper, we
enhance our multi-version timestamp order technique to
handle range concurrency and prevent phantoms.
Importantly, we maintain high performance while
respecting the clean separation of duties required by
Deuteronomy, where a transaction component performs
purely logical concurrency control (including range
support), while a data component performs data storage
and management duties. Like the rest of the Deuteronomy
stack, our range technique manages concurrency
information in a latch-free manner. With our range
enhancement, Deuteronomy can reach scan speeds of
nearly 250 million records/s (more than 27 GB/s) on
modern hardware, while providing serializable isolation
complete with phantom prevention.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:QEI,
author = "Hao Li and Chee-Yong Chan and David Maier",
title = "Query from examples: an iterative, data-driven
approach to query construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2158--2169",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831369",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we propose a new approach, called Query
from Examples (QFE), to help non-expert database users
construct SQL queries. Our approach, which is designed
for users who might be unfamiliar with SQL, only
requires that the user is able to determine whether a
given output table is the result of his or her intended
query on a given input database. To kick-start the
construction of a target query Q, the user first
provides a pair of inputs: a sample database D and an
output table R which is the result of Q on D. As there
will be many candidate queries that transform D to R,
QFE winnows this collection by presenting the user with
new database-result pairs that distinguish these
candidates. Unlike previous approaches that use
synthetic data for such pairs, QFE strives to make
these distinguishing pairs as close to the original (
D,R) pair as possible. By doing so, it seeks to
minimize the effort needed by a user to determine if a
new database-result pair is consistent with his or her
desired query. We demonstrate the effectiveness and
efficiency of our approach using real datasets from
SQLShare, a cloud-based platform designed to help
scientists utilize RDBMS technology for data
analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Galhotra:2015:TCR,
author = "Sainyam Galhotra and Amitabha Bagchi and Srikanta
Bedathur and Maya Ramanath and Vidit Jain",
title = "Tracking the conductance of rapidly evolving
topic-subgraphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2170--2181",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831370",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Monitoring the formation and evolution of communities
in large online social networks such as Twitter is an
important problem that has generated considerable
interest in both industry and academia. Fundamentally,
the problem can be cast as studying evolving subgraphs
(each subgraph corresponding to a topical community) on
an underlying social graph --- with users as nodes and
the connection between them as edges. A key metric of
interest in this setting is tracking the changes to the
conductance of subgraphs induced by edge activations.
This metric quantifies how well or poorly connected a
subgraph is to the rest of the graph relative to its
internal connections. Conductance has been demonstrated
to be of great use in many applications, such as
identifying bursty topics, tracking the spread of
rumors, and so on. However, tracking this simple metric
presents a considerable scalability challenge --- the
underlying social network is large, the number of
communities that are active at any moment is large, the
rate at which these communities evolve is high, and
moreover, we need to track conductance in real-time. We
address these challenges in this paper. We propose an
in-memory approximation called BloomGraphs to store and
update these (possibly overlapping) evolving subgraphs.
As the name suggests, we use Bloom filters to represent
an approximation of the underlying graph. This
representation is compact and computationally efficient
to maintain in the presence of updates. This is
especially important when we need to simultaneously
maintain thousands of evolving subgraphs. BloomGraphs
are used in computing and tracking conductance of these
subgraphs as edge-activations arrive. BloomGraphs have
several desirable properties in the context of this
application, including a small memory footprint and
efficient updateability. We also demonstrate
mathematically that the error incurred in computing
conductance is one-sided and that in the case of
evolving subgraphs the change in approximate
conductance has the same sign as the change in exact
conductance in most cases. We validate the
effectiveness of BloomGraphs through extensive
experimentation on large Twitter graphs and other
social networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vartak:2015:SED,
author = "Manasi Vartak and Sajjadur Rahman and Samuel Madden
and Aditya Parameswaran and Neoklis Polyzotis",
title = "{SeeDB}: efficient data-driven visualization
recommendations to support visual analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2182--2193",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831371",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts often build visualizations as the first
step in their analytical workflow. However, when
working with high-dimensional datasets, identifying
visualizations that show relevant or desired trends in
data can be laborious. We propose SeeDB, a
visualization recommendation engine to facilitate fast
visual analysis: given a subset of data to be studied,
SeeDB intelligently explores the space of
visualizations, evaluates promising visualizations for
trends, and recommends those it deems most ``useful''
or ``interesting''. The two major obstacles in
recommending interesting visualizations are (a) scale:
evaluating a large number of candidate visualizations
while responding within interactive time scales, and
(b) utility: identifying an appropriate metric for
assessing interestingness of visualizations. For the
former, SeeDB introduces pruning optimizations to
quickly identify high-utility visualizations and
sharing optimizations to maximize sharing of
computation across visualizations. For the latter, as a
first step, we adopt a deviation-based metric for
visualization utility, while indicating how we may be
able to generalize it to other factors influencing
utility. We implement SeeDB as a middleware layer that
can run on top of any DBMS. Our experiments show that
our framework can identify interesting visualizations
with high accuracy. Our optimizations lead to multiple
orders of magnitude speedup on relational row and
column stores and provide recommendations at
interactive time scales. Finally, we demonstrate via a
user study the effectiveness of our deviation-based
utility metric and the value of recommendations in
supporting visual analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiu:2015:DLS,
author = "Disheng Qiu and Luciano Barbosa and Xin Luna Dong and
Yanyan Shen and Divesh Srivastava",
title = "{Dexter}: large-scale discovery and extraction of
product specifications on the web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "8",
number = "13",
pages = "2194--2205",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.14778/2831360.2831372",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Sep 30 17:17:35 MDT 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The web is a rich resource of structured data. There
has been an increasing interest in using web structured
data for many applications such as data integration,
web search and question answering. In this paper, we
present Dexter, a system to find product sites on the
web, and detect and extract product specifications from
them. Since product specifications exist in multiple
product sites, our focused crawler relies on search
queries and backlinks to discover product sites. To
perform the detection, and handle the high diversity of
specifications in terms of content, size and format,
our system uses supervised learning to classify HTML
fragments (e.g., tables and lists) present in web pages
as specifications or not. To perform large-scale
extraction of the attribute-value pairs from the HTML
fragments identified by the specification detector,
Dexter adopts two lightweight strategies: a
domain-independent and unsupervised wrapper method,
which relies on the observation that these HTML
fragments have very similar structure; and a
combination of this strategy with a previous approach,
which infers extraction patterns by annotations
generated by automatic but noisy annotators. The
results show that our crawler strategy to locate
product specification pages is effective: (1) it
discovered 1.46M product specification pages from
3,005 sites and 9 different categories; (2) the
specification detector obtains high values of F-measure
(close to 0.9) over a heterogeneous set of product
specifications; and (3) our efficient wrapper methods
for attribute-value extraction get very high values of
precision (0.92) and recall (0.95) and obtain better
results than a state-of-the-art, supervised rule-based
wrapper.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2015:QAL,
author = "Qiang Huang and Jianlin Feng and Yikai Zhang and Qiong
Fang and Wilfred Ng",
title = "Query-aware locality-sensitive hashing for approximate
nearest neighbor search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "1",
pages = "1--12",
month = sep,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Locality-Sensitive Hashing (LSH) and its variants are
the well-known indexing schemes for the $c$-Approximate
Nearest Neighbor ($c$-ANN) search problem in
high-dimensional Euclidean space. Traditionally, LSH
functions are constructed in a query-oblivious manner
in the sense that buckets are partitioned before any
query arrives. However, objects closer to a query may
be partitioned into different buckets, which is
undesirable. Due to the use of query-oblivious bucket
partition, the state-of-the-art LSH schemes for
external memory, namely C2LSH and LSB-Forest, only work
with approximation ratio of integer $ c \geq 2$. In
this paper, we introduce a novel concept of query-aware
bucket partition which uses a given query as the
``anchor'' for bucket partition. Accordingly, a
query-aware LSH function is a random projection coupled
with query-aware bucket partition, which removes random
shift required by traditional query-oblivious LSH
functions. Notably, query-aware bucket partition can be
easily implemented so that query performance is
guaranteed. We propose a novel query-aware LSH scheme
named QALSH for $c$-ANN search over external memory.
Our theoretical studies show that QALSH enjoys a
guarantee on query quality. The use of query-aware LSH
function enables QALSH to work with any approximation
ratio $ c > 1$. Extensive experiments show that QALSH
outperforms C2LSH and LSB-Forest, especially in
high-dimensional space. Specifically, by using a ratio
$ c < 2$, QALSH can achieve much better query
quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khaouid:2015:KCD,
author = "Wissam Khaouid and Marina Barsky and Venkatesh
Srinivasan and Alex Thomo",
title = "{$K$}-core decomposition of large networks on a single
{PC}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "1",
pages = "13--23",
month = sep,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Studying the topology of a network is critical to
inferring underlying dynamics such as tolerance to
failure, group behavior and spreading patterns.
$k$-core decomposition is a well-established metric
which partitions a graph into layers from external to
more central vertices. In this paper we aim to explore
whether $k$-core decomposition of large networks can be
computed using a consumer-grade PC. We feature
implementations of the ``vertex-centric'' distributed
protocol introduced by Montresor, De Pellegrini and
Miorandi on GraphChi and Webgraph. Also, we present an
accurate implementation of the Batagelj and Zaversnik
algorithm for $k$-core decomposition in Webgraph. With
our implementations, we show that we can efficiently
handle networks of billions of edges using a single
consumer-level machine within reasonable time and can
produce excellent approximations in only a fraction of
the execution time. To the best of our knowledge, our
biggest graphs are considerably larger than the graphs
considered in the literature. Next, we present an
optimized implementation of an external-memory
algorithm (EMcore) by Cheng, Ke, Chu, and {\"O}zsu. We
show that this algorithm also performs well for large
datasets, however, it cannot predict whether a given
memory budget is sufficient for a new dataset. We
present a thorough analysis of all algorithms
concluding that it is viable to compute $k$-core
decomposition for large networks in a consumer-grade
PC.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2015:WCP,
author = "Zhenguo Li and Yixiang Fang and Qin Liu and Jiefeng
Cheng and Reynold Cheng and John C. S. Lui",
title = "Walking in the cloud: parallel {SimRank} at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "1",
pages = "24--35",
month = sep,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:24 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite its popularity, SimRank is computationally
costly, in both time and space. In particular, its
recursive nature poses a great challenge in using
modern distributed computing power, and also prevents
querying similarities individually. Existing solutions
suffer greatly from these practical issues. In this
paper, we break such dependency for maximum efficiency
possible. Our method consists of offline and online
phases. In offline phase, a length-$n$ indexing vector
is derived by solving a linear system in parallel. At
online query time, the similarities are computed
instantly from the index vector. Throughout, the Monte
Carlo method is used to maximally reduce time and
space. Our algorithm, called CloudWalker, is highly
parallelizable, with only linear time and space.
Remarkably, it responses to both single-pair and
single-source queries in constant time. CloudWalker is
orders of magnitude more efficient and scalable than
existing solutions for large-scale problems.
Implemented on Spark with 10 machines and tested on the
web-scale clue-web graph with 1 billion nodes and 43
billion edges, it takes 110 hours for offline indexing,
64 seconds for a single-pair query, and 188 seconds for
a single-source query. To the best of our knowledge,
our work is the first to report results on clue-web,
which is 10x larger than the largest graph ever
reported for SimRank computation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arocena:2015:MBE,
author = "Patricia C. Arocena and Boris Glavic and Giansalvatore
Mecca and Ren{\'e}e J. Miller and Paolo Papotti and
Donatello Santoro",
title = "Messing up with {BART}: error generation for
evaluating data-cleaning algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "2",
pages = "36--47",
month = oct,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of introducing errors into clean
databases for the purpose of benchmarking data-cleaning
algorithms. Our goal is to provide users with the
highest possible level of control over the
error-generation process, and at the same time develop
solutions that scale to large databases. We show in the
paper that the error-generation problem is surprisingly
challenging, and in fact, NP-complete. To provide a
scalable solution, we develop a correct and efficient
greedy algorithm that sacrifices completeness, but
succeeds under very reasonable assumptions. To scale to
millions of tuples, the algorithm relies on several
non-trivial optimizations, including a new symmetry
property of data quality constraints. The trade-off
between control and scalability is the main technical
contribution of the paper.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hayashi:2015:FDB,
author = "Takanori Hayashi and Takuya Akiba and Yuichi Yoshida",
title = "Fully dynamic betweenness centrality maintenance on
massive networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "2",
pages = "48--59",
month = oct,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Measuring the relative importance of each vertex in a
network is one of the most fundamental building blocks
in network analysis. Among several importance measures,
betweenness centrality, in particular, plays key roles
in many real applications. Considerable effort has been
made for developing algorithms for static settings.
However, real networks today are highly dynamic and are
evolving rapidly, and scalable dynamic methods that can
instantly reflect graph changes into centrality values
are required. In this paper, we present the first fully
dynamic method for managing betweenness centrality of
all vertices in a large dynamic network. Its main data
structure is the weighted hyperedge representation of
shortest paths called hypergraph sketch. We carefully
design dynamic update procedure with theoretical
accuracy guarantee. To accelerate updates, we further
propose two auxiliary data structures called two-ball
index and special-purpose reachability index.
Experimental results using real networks demonstrate
its high scalability and efficiency. In particular, it
can reflect a graph change in less than a millisecond
on average for a large-scale web graph with 106M
vertices and 3.7B edges, which is several orders of
magnitude larger than the limits of previous dynamic
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2015:CCC,
author = "Wei Lu and Wei Chen and Laks V. S. Lakshmanan",
title = "From competition to complementarity: comparative
influence diffusion and maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "2",
pages = "60--71",
month = oct,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Influence maximization is a well-studied problem that
asks for a small set of influential users from a social
network, such that by targeting them as early adopters,
the expected total adoption through influence cascades
over the network is maximized. However, almost all
prior work focuses on cascades of a single propagating
entity or purely-competitive entities. In this work, we
propose the Comparative Independent Cascade (Com-IC)
model that covers the full spectrum of entity
interactions from competition to complementarity. In
Com-IC, users' adoption decisions depend not only on
edge-level information propagation, but also on a
node-level automaton whose behavior is governed by a
set of model parameters, enabling our model to capture
not only competition, but also complementarity, to any
possible degree. We study two natural optimization
problems, Self Influence Maximization and Complementary
Influence Maximization, in a novel setting with
complementary entities. Both problems are NP-hard, and
we devise efficient and effective approximation
algorithms via non-trivial techniques based on
reverse-reachable sets and a novel ``sandwich
approximation'' strategy. The applicability of both
techniques extends beyond our model and problems. Our
experiments show that the proposed algorithms
consistently outperform intuitive baselines on four
real-world social networks, often by a significant
margin. In addition, we learn model parameters from
real user action logs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kloudas:2015:POD,
author = "Konstantinos Kloudas and Margarida Mamede and Nuno
Pregui{\c{c}}a and Rodrigo Rodrigues",
title = "{Pixida}: optimizing data parallel jobs in wide-area
data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "2",
pages = "72--83",
month = oct,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the era of global-scale services, big data
analytical queries are often required to process
datasets that span multiple data centers (DCs). In this
setting, cross-DC bandwidth is often the scarcest, most
volatile, and/or most expensive resource. However,
current widely deployed big data analytics frameworks
make no attempt to minimize the traffic traversing
these links. In this paper, we present Pixida, a
scheduler that aims to minimize data movement across
resource constrained links. To achieve this, we
introduce a new abstraction called Silo, which is key
to modeling Pixida's scheduling goals as a graph
partitioning problem. Furthermore, we show that
existing graph partitioning problem formulations do not
map to how big data jobs work, causing their solutions
to miss opportunities for avoiding data movement. To
address this, we formulate a new graph partitioning
problem and propose a novel algorithm to solve it. We
integrated Pixida in Spark and our experiments show
that, when compared to existing schedulers, Pixida
achieves a significant traffic reduction of up to $
\approx 9 \times $ on the aforementioned links.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2015:SOS,
author = "Lu Wang and Robert Christensen and Feifei Li and Ke
Yi",
title = "Spatial online sampling and aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "84--95",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The massive adoption of smart phones and other mobile
devices has generated humongous amount of spatial and
spatio-temporal data. The importance of spatial
analytics and aggregation is ever-increasing. An
important challenge is to support interactive
exploration over such data. However, spatial analytics
and aggregation using all data points that satisfy a
query condition is expensive, especially over large
data sets, and could not meet the needs of interactive
exploration. To that end, we present novel indexing
structures that support spatial online sampling and
aggregation on large spatial and spatio-temporal data
sets. In spatial online sampling, random samples from
the set of spatial (or spatio-temporal) points that
satisfy a query condition are generated incrementally
in an online fashion. With more and more samples,
various spatial analytics and aggregations can be
performed in an online, interactive fashion, with
estimators that have better accuracy over time. Our
design works well for both memory-based and
disk-resident data sets, and scales well towards
different query and sample sizes. More importantly, our
structures are dynamic, hence, they are able to deal
with insertions and deletions efficiently. Extensive
experiments on large real data sets demonstrate the
improvements achieved by our indexing structures
compared to other baseline methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Richter:2015:SDA,
author = "Stefan Richter and Victor Alvarez and Jens Dittrich",
title = "A seven-dimensional analysis of hashing methods and
its implications on query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "96--107",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hashing is a solved problem. It allows us to get
constant time access for lookups. Hashing is also
simple. It is safe to use an arbitrary method as a
black box and expect good performance, and
optimizations to hashing can only improve it by a
negligible delta. Why are all of the previous
statements plain wrong? That is what this paper is
about. In this paper we thoroughly study hashing for
integer keys and carefully analyze the most common
hashing methods in a five-dimensional requirements
space: (1) data-distribution, (2) load factor, (3)
dataset size, (4) read/write-ratio, and (5)
un/successful-ratio. Each point in that design space
may potentially suggest a different hashing scheme, and
additionally also a different hash function. We show
that a right or wrong decision in picking the right
hashing scheme and hash function combination may lead
to significant difference in performance. To
substantiate this claim, we carefully analyze two
additional dimensions: (6) five representative hashing
schemes (which includes an improved variant of Robin
Hood hashing), (7) four important classes of hash
functions widely used today. That is, we consider 20
different combinations in total. Finally, we also
provide a glimpse about the effect of table memory
layout and the use of SIMD instructions. Our study
clearly indicates that picking the right combination
may have considerable impact on insert and lookup
performance, as well as memory footprint. A major
conclusion of our work is that hashing should be
considered a white box before blindly using it in
applications, such as query processing. Finally, we
also provide a strong guideline about when to use which
hashing method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arocena:2015:IIM,
author = "Patricia C. Arocena and Boris Glavic and Radu Ciucanu
and Ren{\'e}e J. Miller",
title = "The {iBench} integration metadata generator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "108--119",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given the maturity of the data integration field it is
surprising that rigorous empirical evaluations of
research ideas are so scarce. We identify a major
roadblock for empirical work --- the lack of
comprehensive metadata generators that can be used to
create benchmarks for different integration tasks. This
makes it difficult to compare integration solutions,
understand their generality, and understand their
performance. We present iBench, the first metadata
generator that can be used to evaluate a wide-range of
integration tasks (data exchange, mapping creation,
mapping composition, schema evolution, among many
others). iBench permits control over the size and
characteristics of the metadata it generates (schemas,
constraints, and mappings). Our evaluation demonstrates
that iBench can efficiently generate very large,
complex, yet realistic scenarios with different
characteristics. We also present an evaluation of three
mapping creation systems using iBench and show that the
intricate control that iBench provides over metadata
scenarios can reveal new and important empirical
insights. iBench is an open-source, extensible tool
that we are providing to the community. We believe it
will raise the bar for empirical evaluation and
comparison of data integration systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Altwaijry:2015:QFI,
author = "Hotham Altwaijry and Sharad Mehrotra and Dmitri V.
Kalashnikov",
title = "{QuERy}: a framework for integrating entity resolution
with query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "120--131",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper explores an analysis-aware data cleaning
architecture for a large class of SPJ SQL queries. In
particular, we propose QuERy, a novel framework for
integrating entity resolution (ER) with query
processing. The aim of QuERy is to correctly and
efficiently answer complex queries issued on top of
dirty data. The comprehensive empirical evaluation of
the proposed solution demonstrates its significant
advantage in terms of efficiency over the traditional
techniques for the given problem settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2015:POM,
author = "Taesung Lee and Jin-woo Park and Sanghoon Lee and
Seung-Won Hwang and Sameh Elnikety and Yuxiong He",
title = "Processing and optimizing main memory spatial-keyword
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "132--143",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Important cloud services rely on spatial-keyword
queries, containing a spatial predicate and arbitrary
boolean keyword queries. In particular, we study the
processing of such queries in main memory to support
short response times. In contrast, current
state-of-the-art spatial-keyword indexes and relational
engines are designed for different assumptions. Rather
than building a new spatial-keyword index, we employ a
cost-based optimizer to process these queries using a
spatial index and a keyword index. We address several
technical challenges to achieve this goal. We introduce
three operators as the building blocks to construct
plans for main memory query processing. We then develop
a cost model for the operators and query plans. We
introduce five optimization techniques that efficiently
reduce the search space and produce a query plan with
low cost. The optimization techniques are
computationally efficient, and they identify a query
plan with a formal approximation guarantee under the
common independence assumption. Furthermore, we extend
the framework to exploit interesting orders. We
implement the query optimizer to empirically validate
our proposed approach using real-life datasets. The
evaluation shows that the optimizations provide
significant reduction in the average and tail latency
of query processing: 7- to 11-fold reduction over using
a single index in terms of 99th percentile response
time. In addition, this approach outperforms existing
spatial-keyword indexes, and DBMS query optimizers for
both average and high-percentile response times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2015:NSH,
author = "Yongjoo Park and Michael Cafarella and Barzan
Mozafari",
title = "Neighbor-sensitive hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "144--155",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Approximate $k$ NN ($k$-nearest neighbor) techniques
using binary hash functions are among the most commonly
used approaches for overcoming the prohibitive cost of
performing exact $k$ NN queries. However, the success
of these techniques largely depends on their hash
functions' ability to distinguish $k$ NN items; that
is, the $k$ NN items retrieved based on data items'
hashcodes, should include as many true $k$ NN items as
possible. A widely-adopted principle for this process
is to ensure that similar items are assigned to the
same hashcode so that the items with the hashcodes
similar to a query's hashcode are likely to be true
neighbors. In this work, we abandon this
heavily-utilized principle and pursue the opposite
direction for generating more effective hash functions
for $k$ NN tasks. That is, we aim to increase the
distance between similar items in the hashcode space,
instead of reducing it. Our contribution begins by
providing theoretical analysis on why this
revolutionary and seemingly counter-intuitive approach
leads to a more accurate identification of $k$ NN
items. Our analysis is followed by a proposal for a
hashing algorithm that embeds this novel principle. Our
empirical studies confirm that a hashing algorithm
based on this counter-intuitive idea significantly
improves the efficiency and accuracy of
state-of-the-art techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2015:CMB,
author = "Botong Huang and Nicholas W. D. Jarrett and Shivnath
Babu and Sayan Mukherjee and Jun Yang",
title = "{C{\"u}m{\"u}l{\"o}n}: matrix-based data analytics in
the cloud with spot instances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "156--167",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We describe C{\"u}m{\"u}l{\"o}n, a system aimed at
helping users develop and deploy matrix-based data
analysis programs in a public cloud. A key feature of
C{\"u}m{\"u}l{\"o}n is its end-to-end support for the
so-called spot instances---machines whose market price
fluctuates over time but is usually much lower than the
regular fixed price. A user sets a bid price when
acquiring spot instances, and loses them as soon as the
market price exceeds the bid price. While spot
instances can potentially save cost, they are difficult
to use effectively, and run the risk of not finishing
work while costing more. C{\"u}m{\"u}l{\"o}n provides a
highly elastic computation and storage engine on top of
spot instances, and offers automatic cost-based
optimization of execution, deployment, and bidding
strategies. C{\"u}m{\"u}l{\"o}n further quantifies how
the uncertainty in the market price translates into the
cost uncertainty of its recommendations, and allows
users to specify their risk tolerance as an
optimization constraint.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kaul:2015:NLU,
author = "Manohar Kaul and Raymond Chi-Wing Wong and Christian
S. Jensen",
title = "New lower and upper bounds for shortest distance
queries on terrains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "168--179",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The increasing availability of massive and accurate
laser data enables the processing of spatial queries on
terrains. As shortest-path computation, an integral
element of query processing, is inherently expensive on
terrains, a key approach to enabling efficient query
processing is to reduce the need for exact
shortest-path computation in query processing. We
develop new lower and upper bounds on terrain shortest
distances that are provably tighter than any existing
bounds. Unlike existing bounds, the new bounds do not
rely on the quality of the triangulation. We show how
use of the new bounds speeds up query processing by
reducing the need for exact distance computations.
Speedups of nearly an order of magnitude are
demonstrated empirically for well-known spatial
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Freire:2015:CRR,
author = "Cibele Freire and Wolfgang Gatterbauer and Neil
Immerman and Alexandra Meliou",
title = "The complexity of resilience and responsibility for
self-join-free conjunctive queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "180--191",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Several research thrusts in the area of data
management have focused on understanding how changes in
the data affect the output of a view or standing query.
Example applications are explaining query results,
propagating updates through views, and anonymizing
datasets. An important aspect of this analysis is the
problem of deleting a minimum number of tuples from the
input tables to make a given Boolean query false, which
we refer to as ``the resilience of a query.'' In this
paper, we study the complexity of resilience for
self-join-free conjunctive queries with arbitrary
functional dependencies. The cornerstone of our work is
the novel concept of triads, a simple structural
property of a query that leads to the several dichotomy
results we show in this paper. The concepts of triads
and resilience bridge the connections between the
problems of deletion propagation and causal
responsibility, and allow us to substantially advance
the known complexity results in these topics.
Specifically, we show a dichotomy for the complexity of
resilience, which identifies previously unknown
tractable families for deletion propagation with source
side-effects, and we extend this result to account for
functional dependencies. Further, we identify a mistake
in a previous dichotomy for causal responsibility, and
offer a revised characterization based purely on the
structural form of the query (presence or absence of
triads). Finally, we extend the dichotomy for causal
responsibility in two ways: (a) we account for
functional dependencies in the input tables, and (b) we
compute responsibility for sets of tuples specified via
wildcards.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2015:SAD,
author = "Hao Huang and Shiva Prasad Kasiviswanathan",
title = "Streaming anomaly detection using randomized matrix
sketching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "192--203",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data is continuously being generated from sources such
as machines, network traffic, application logs, etc.
Timely and accurate detection of anomalies in massive
data streams has important applications such as in
preventing machine failures, intrusion detection, and
dynamic load balancing. In this paper, we introduce a
novel (unsupervised) anomaly detection framework which
can be used to detect anomalies in a streaming fashion
by making only one pass over the data while utilizing
limited storage. We adapt ideas from matrix sketching
to maintain, in a streaming model, a set of few
orthogonal vectors that form a good approximate basis
for all the observed data. Using this constructed
orthogonal basis, anomalies in new incoming data are
detected based on a simple reconstruction error test.
We theoretically prove that our algorithm compares
favorably with an offline approach based on expensive
global singular value decomposition (SVD) updates.
Additionally, we apply ideas from randomized low-rank
matrix approximations to further speed up the algorithm.
The experimental results show the effectiveness and
efficiency of our approach over other popular scalable
anomaly detection approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Leis:2015:HGQ,
author = "Viktor Leis and Andrey Gubichev and Atanas Mirchev and
Peter Boncz and Alfons Kemper and Thomas Neumann",
title = "How good are query optimizers, really?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "204--215",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding a good join order is crucial for query
performance. In this paper, we introduce the Join Order
Benchmark (JOB) and experimentally revisit the main
components in the classic query optimizer architecture
using a complex, real-world data set and realistic
multi-join queries. We investigate the quality of
industrial-strength cardinality estimators and find
that all estimators routinely produce large errors. We
further show that while estimates are essential for
finding a good join order, query performance is
unsatisfactory if the query engine relies too heavily
on these estimates. Using another set of experiments
that measure the impact of the cost model, we find that
it has much less influence on query performance than
the cardinality estimates. Finally, we investigate plan
enumeration techniques comparing exhaustive dynamic
programming with heuristic algorithms and find that
exhaustive enumeration improves performance despite the
sub-optimal cardinality estimates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Interlandi:2015:TDP,
author = "Matteo Interlandi and Kshitij Shah and Sai Deep Tetali
and Muhammad Ali Gulzar and Seunghyun Yoo and Miryung
Kim and Todd Millstein and Tyson Condie",
title = "{Titian}: data provenance support in {Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "3",
pages = "216--227",
month = nov,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 2 14:26:50 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Debugging data processing logic in Data-Intensive
Scalable Computing (DISC) systems is a difficult and
time consuming effort. Today's DISC systems offer very
little tooling for debugging programs, and as a result
programmers spend countless hours collecting evidence
(e.g., from log files) and performing trial-and-error
debugging. To aid this effort, we built Titian, a
library that enables data provenance---tracking data
through transformations---in Apache Spark. Data
scientists using the Titian Spark extension will be
able to quickly identify the input data at the root
cause of a potential bug or outlier result. Titian is
built directly into the Spark platform and offers data
provenance support at interactive
speeds---orders-of-magnitude faster than alternative
solutions---while minimally impacting Spark job
performance; observed overheads for capturing data
lineage rarely exceed 30\% above the baseline job
execution time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rodiger:2015:HSQ,
author = "Wolf R{\"o}diger and Tobias M{\"u}hlbauer and Alfons
Kemper and Thomas Neumann",
title = "High-speed query processing over high-speed networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "228--239",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern database clusters entail two levels of
networks: connecting CPUs and NUMA regions inside a
single server in the small and multiple servers in the
large. The huge performance gap between these two types
of networks used to slow down distributed query
processing to such an extent that a cluster of machines
actually performed worse than a single many-core
server. The increased main-memory capacity of the
cluster remained the sole benefit of such a scale-out.
The economic viability of high-speed interconnects such
as InfiniBand has narrowed this performance gap
considerably. However, InfiniBand's higher network
bandwidth alone does not improve query performance as
expected when the distributed query engine is left
unchanged. The scalability of distributed query
processing is impaired by TCP overheads, switch
contention due to uncoordinated communication, and load
imbalances resulting from the inflexibility of the
classic exchange operator model. This paper presents
the blueprint for a distributed query engine that
addresses these problems by considering both levels of
networks holistically. It consists of two parts: First,
hybrid parallelism that distinguishes local and
distributed parallelism for better scalability in both
the number of cores as well as servers. Second, a novel
communication multiplexer tailored for analytical
database workloads using remote direct memory access
(RDMA) and low-latency network scheduling for
high-speed communication with almost no CPU overhead.
An extensive evaluation within the HyPer database
system using the TPC-H benchmark shows that our
holistic approach indeed enables high-speed query
processing over high-speed networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zong:2015:BQD,
author = "Bo Zong and Xusheng Xiao and Zhichun Li and Zhenyu Wu
and Zhiyun Qian and Xifeng Yan and Ambuj K. Singh and
Guofei Jiang",
title = "Behavior query discovery in system-generated temporal
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "240--251",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Computer system monitoring generates huge amounts of
logs that record the interaction of system entities.
How to query such data to better understand system
behaviors and identify potential system risks and
malicious behaviors becomes a challenging task for
system administrators due to the dynamics and
heterogeneity of the data. System monitoring data are
essentially heterogeneous temporal graphs with nodes
being system entities and edges being their
interactions over time. Given the complexity of such
graphs, it becomes time-consuming for system
administrators to manually formulate useful queries in
order to examine abnormal activities, attacks, and
vulnerabilities in computer systems. In this work, we
investigate how to query temporal graphs and treat
query formulation as a discriminative temporal graph
pattern mining problem. We introduce TGMiner to mine
discriminative patterns from system logs, and these
patterns can be taken as templates for building more
complex queries. TGMiner leverages temporal information
in graphs to prune graph patterns that share similar
growth trend without compromising pattern quality.
Experimental results on real system data show that
TGMiner is 6--32 times faster than baseline methods. The
discovered patterns were verified by system experts;
they achieved high precision (97\%) and recall
(91\%).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kocberber:2015:AMA,
author = "Onur Kocberber and Babak Falsafi and Boris Grot",
title = "Asynchronous memory access chaining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "252--263",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-memory databases rely on pointer-intensive data
structures to quickly locate data in memory. A single
lookup operation in such data structures often exhibits
long-latency memory stalls due to dependent pointer
dereferences. Hiding the memory latency by launching
additional memory accesses for other lookups is an
effective way of improving performance of
pointer-chasing codes (e.g., hash table probes, tree
traversals). The ability to exploit such inter-lookup
parallelism is beyond the reach of modern out-of-order
cores due to the limited size of their instruction
window. Instead, recent work has proposed software
prefetching techniques that exploit inter-lookup
parallelism by arranging a set of independent lookups
into a group or a pipeline, and navigate their
respective pointer chains in a synchronized fashion.
While these techniques work well for highly regular
access patterns, they break down in the face of
irregularity across lookups. Such irregularity includes
variable-length pointer chains, early exit, and
read/write dependencies. This work introduces
Asynchronous Memory Access Chaining (AMAC), a new
approach for exploiting inter-lookup parallelism to
hide the memory access latency. AMAC achieves high
dynamism in dealing with irregularity across lookups by
maintaining the state of each lookup separately from
that of other lookups. This feature enables AMAC to
initiate a new lookup as soon as any of the in-flight
lookups complete. In contrast, the static arrangement
of lookups into a group or pipeline in existing
techniques precludes such adaptivity. Our results show
that AMAC matches or outperforms state-of-the-art
prefetching techniques on regular access patterns,
while delivering up to 2.3x higher performance under
irregular data structure lookups. AMAC fully utilizes
the available microarchitectural resources, generating
the maximum number of memory accesses allowed by
hardware in both single- and multi-threaded execution
modes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haney:2015:DPA,
author = "Samuel Haney and Ashwin Machanavajjhala and Bolin
Ding",
title = "Design of policy-aware differentially private
algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "264--275",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of designing error optimal differentially
private algorithms is well studied. Recent work
applying differential privacy to real world settings
have used variants of differential privacy that
appropriately modify the notion of neighboring
databases. The problem of designing error optimal
algorithms for such variants of differential privacy is
open. In this paper, we show a novel transformational
equivalence result that can turn the problem of query
answering under differential privacy with a modified
notion of neighbors to one of query answering under
standard differential privacy, for a large class of
neighbor definitions. We utilize the Blowfish privacy
framework that generalizes differential privacy.
Blowfish uses a policy graph to instantiate different
notions of neighboring databases. We show that the
error incurred when answering a workload W on a
database x under a Blowfish policy graph G is identical
to the error required to answer a transformed workload
f$_G$ (W) on database g$_G$ (x) under standard
differential privacy, where f$_G$ and g$_G$ are linear
transformations based on G. Using this result, we
develop error efficient algorithms for releasing
histograms and multidimensional range queries under
different Blowfish policies. We believe the tools we
develop will be useful for finding mechanisms to answer
many other classes of queries with low error under
other policy graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2015:ACC,
author = "Xin Huang and Laks V. S. Lakshmanan and Jeffrey Xu Yu
and Hong Cheng",
title = "Approximate closest community search in networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "276--287",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, there has been significant interest in the
study of the community search problem in social and
information networks: given one or more query nodes,
find densely connected communities containing the query
nodes. However, most existing studies do not address
the ``free rider'' issue, that is, nodes far away from
query nodes and irrelevant to them are included in the
detected community. Some state-of-the-art models have
attempted to address this issue, but not only are their
formulated problems NP-hard, they do not admit any
approximations without restrictive assumptions, which
may not always hold in practice. In this paper, given
an undirected graph G and a set of query nodes Q, we
study community search using the $k$-truss based
community model. We formulate our problem of finding a
closest truss community (CTC), as finding a connected
$k$-truss subgraph with the largest $k$ that contains Q, and
has the minimum diameter among such subgraphs. We prove
this problem is NP-hard. Furthermore, it is NP-hard to
approximate the problem within a factor $ (2 -
\epsilon) $, for any $ \epsilon > 0 $. However, we
develop a greedy algorithmic framework, which first
finds a CTC containing Q, and then iteratively removes
the furthest nodes from Q, from the graph. The method
achieves 2-approximation to the optimal solution. To
further improve the efficiency, we make use of a
compact truss index and develop efficient algorithms
for $k$-truss identification and maintenance as nodes
get eliminated. In addition, using bulk deletion
optimization and local exploration strategies, we
propose two more efficient algorithms. One of them
trades some approximation quality for efficiency while
the other is a very efficient heuristic. Extensive
experiments on 6 real-world networks show the
effectiveness and efficiency of our community model and
search algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Andre:2015:CLE,
author = "Fabien Andr{\'e} and Anne-Marie Kermarrec and Nicolas
{Le Scouarnec}",
title = "Cache locality is not enough: high-performance nearest
neighbor search with product quantization fast scan",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "288--299",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nearest Neighbor (NN) search in high dimension is an
important feature in many applications (e.g., image
retrieval, multimedia databases). Product Quantization
(PQ) is a widely used solution which offers high
performance, i.e., low response time while preserving a
high accuracy. PQ represents high-dimensional vectors
(e.g., image descriptors) by compact codes. Hence, very
large databases can be stored in memory, allowing NN
queries without resorting to slow I/O operations. PQ
computes distances to neighbors using cache-resident
lookup tables, thus its performance remains limited by
(i) the many cache accesses that the algorithm
requires, and (ii) its inability to leverage SIMD
instructions available on modern CPUs. In this paper,
we advocate that cache locality is not sufficient for
efficiency. To address these limitations, we design a
novel algorithm, PQ Fast Scan, that transforms the
cache-resident lookup tables into small tables, sized
to fit SIMD registers. This transformation allows (i)
in-register lookups in place of cache accesses and (ii)
an efficient SIMD implementation. PQ Fast Scan has the
exact same accuracy as PQ, while having 4 to 6 times
lower response time (e.g., for 25 million vectors, scan
time is reduced from 74ms to 13ms).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Prokoshyna:2015:CQL,
author = "Nataliya Prokoshyna and Jaros{\l}aw Szlichta and Fei
Chiang and Ren{\'e}e J. Miller and Divesh Srivastava",
title = "Combining quantitative and logical data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "300--311",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Quantitative data cleaning relies on the use of
statistical methods to identify and repair data quality
problems while logical data cleaning tackles the same
problems using various forms of logical reasoning over
declarative dependencies. Each of these approaches has
its strengths: the logical approach is able to capture
subtle data quality problems using sophisticated
dependencies, while the quantitative approach excels at
ensuring that the repaired data has desired statistical
properties. We propose a novel framework within which
these two approaches can be used synergistically to
combine their respective strengths. We instantiate our
framework using (i) metric functional dependencies, a
type of dependency that generalizes functional
dependencies (FDs) to identify inconsistencies in
domains where only large differences in metric data are
considered to be a data quality problem, and (ii)
repairs that modify the inconsistent data so as to
minimize statistical distortion, measured using the
Earth Mover's Distance. We show that the problem of
computing a statistical distortion minimal repair is
NP-hard. Given this complexity, we present an efficient
algorithm for finding a minimal repair that has a small
statistical distortion using EMD computation over
semantically related attributes. To identify
semantically related attributes, we present a sound and
complete axiomatization and an efficient algorithm for
testing implication of metric FDs. While the complexity
of inference for some other FD extensions is
co-NP-complete, we show that the inference problem for metric
FDs remains linear, as in traditional FDs. We prove
that every instance that can be generated by our repair
algorithm is set-minimal (with no unnecessary changes).
Our experimental evaluation demonstrates that our
techniques obtain a considerably lower statistical
distortion than existing repair techniques, while
achieving similar levels of efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadakis:2015:SAV,
author = "George Papadakis and George Alexiou and George
Papastefanatos and Georgia Koutrika",
title = "Schema-agnostic vs schema-based configurations for
blocking methods on homogeneous data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "312--323",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity Resolution constitutes a core task for data
integration that, due to its quadratic complexity,
typically scales to large datasets through blocking
methods. These can be configured in two ways. The
schema-based configuration relies on schema information
in order to select signatures of high distinctiveness
and low noise, while the schema-agnostic one treats
every token from all attribute values as a signature.
The latter approach has significant potential, as it
requires no fine-tuning by human experts and it applies
to heterogeneous data. Yet, there is no systematic
study on its relative performance with respect to the
schema-based configuration. This work covers this gap
by comparing analytically the two configurations in
terms of effectiveness, time efficiency and
scalability. We apply them to 9 established blocking
methods and to 11 benchmarks of structured data. We
provide valuable insights into the internal
functionality of the blocking methods with the help of
a novel taxonomy. Our studies reveal that the
schema-agnostic configuration offers unsupervised and
robust definition of blocking keys under versatile
settings, trading a higher computational cost for a
consistently higher recall than the schema-based one.
It also enables the use of state-of-the-art blocking
methods without schema knowledge.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Epasto:2015:ENC,
author = "Alessandro Epasto and Silvio Lattanzi and Vahab
Mirrokni and Ismail Oner Sebe and Ahmed Taei and Sunita
Verma",
title = "Ego-net community mining applied to friend
suggestion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "324--335",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present a study of the community
structure of ego-networks---the graphs representing the
connections among the neighbors of a node---for several
online social networks. Toward this goal, we design a
new technique to efficiently build and cluster all the
ego-nets of a graph in parallel (note that even just
building the ego-nets efficiently is challenging on
large networks). Our experimental findings are quite
compelling: at a microscopic level it is easy to detect
high quality communities. Leveraging on this fact we,
then, develop new features for friend suggestion based
on co-occurrences of two nodes in different ego-nets'
communities. Our new features can be computed
efficiently on very large scale graphs by just
analyzing the neighborhood of each node. Furthermore,
we prove formally on a stylized model, and by
experimental analysis that this new similarity measure
outperforms the classic local features employed for
friend suggestions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abedjan:2015:TRD,
author = "Ziawasch Abedjan and Cuneyt G. Akcora and Mourad
Ouzzani and Paolo Papotti and Michael Stonebraker",
title = "Temporal rules discovery for web data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "336--347",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Declarative rules, such as functional dependencies,
are widely used for cleaning data. Several systems take
them as input for detecting errors and computing a
``clean'' version of the data. To support domain
experts in specifying these rules, several tools have
been proposed to profile the data and mine rules.
However, existing discovery techniques have
traditionally ignored the time dimension. Recurrent
events, such as persons reported in locations, have a
duration in which they are valid, and this duration
should be part of the rules or the cleaning process
would simply fail. In this work, we study the rule
discovery problem for temporal web data. Such a
discovery process is challenging because of the nature
of web data; extracted facts are (i) sparse over time,
(ii) reported with delays, and (iii) often reported
with errors over the values because of inaccurate
sources or non-robust extractors. We handle these
challenges with a new discovery approach that is more
robust to noise. Our solution uses machine learning
methods, such as association measures and outlier
detection, for the discovery of the rules, together
with an aggressive repair of the data in the mining
step itself. Our experimental evaluation over
real-world data from Recorded Future, an intelligence
company that monitors over 700K Web sources, shows that
temporal rules improve the quality of the data with an
increase of the average precision in the cleaning
process from 0.37 to 0.84, and a 40\% relative increase
in the average F-measure.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Roy:2015:EQA,
author = "Sudeepa Roy and Laurel Orr and Dan Suciu",
title = "Explaining query answers with explanation-ready
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "348--359",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increased generation and availability of big
data in different domains, there is an imminent
requirement for data analysis tools that are able to
``explain'' the trends and anomalies obtained from this
data to a range of users with different backgrounds.
Wu-Madden (PVLDB 2013) and Roy-Suciu (SIGMOD 2014)
recently proposed solutions that can explain
interesting or unexpected answers to simple aggregate
queries in terms of predicates on attributes. In this
paper, we propose a generic framework that can support
much richer, insightful explanations by preparing the
database offline, so that top explanations can be found
interactively at query time. The main idea in such
explanation-ready databases is to pre-compute the
effects of potential explanations (called interventions),
and efficiently re-evaluate the original query
taking into account these effects. We formalize this
notion and define an explanation-query that can
evaluate all possible explanations simultaneously
without having to run an iterative process, develop
algorithms and optimizations, and evaluate our approach
with experiments on real data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deng:2015:EPB,
author = "Dong Deng and Guoliang Li and He Wen and Jianhua
Feng",
title = "An efficient partition based method for exact set
similarity joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "360--371",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the exact set similarity join problem, which,
given two collections of sets, finds out all the
similar set pairs from the collections. Existing
methods generally utilize the prefix filter based
framework. They generate a prefix for each set and
prune all the pairs whose prefixes are disjoint.
However the pruning power is limited, because if two
dissimilar sets share a common element in their
prefixes, they cannot be pruned. To address this
problem, we propose a partition-based framework. We
design a partition scheme to partition the sets into
several subsets and guarantee that two sets are similar
only if they share a common subset. To improve the
pruning power, we propose a mixture of the subsets and
their 1-deletion neighborhoods (the subset of a set by
eliminating one element). As there are multiple
allocation strategies to generate the mixture, we
evaluate different allocations and design a
dynamic-programming algorithm to select the optimal
one. However the time complexity of generating the
optimal one is $ O(s^3) $ for a set with size $s$. To
speed up the allocation selection, we develop a greedy
algorithm with an approximation ratio of 2. To further
reduce the complexity, we design an adaptive grouping
mechanism, and the two techniques can reduce the
complexity to $ O(s \log s)$. Experimental results on
three real-world datasets show our method achieves high
performance and outperforms state-of-the-art methods by
2-5 times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haas:2015:CSC,
author = "Daniel Haas and Jiannan Wang and Eugene Wu and Michael
J. Franklin",
title = "{CLAMShell}: speeding up crowds for low-latency data
labeling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "4",
pages = "372--383",
month = dec,
year = "2015",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Dec 19 17:42:25 MST 2015",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data labeling is a necessary but often slow process
that impedes the development of interactive systems for
modern data analysis. Despite rising demand for manual
data labeling, there is a surprising lack of work
addressing its high and unpredictable latency. In this
paper, we introduce CLAMShell, a system that speeds up
crowds in order to achieve consistently low-latency
data labeling. We offer a taxonomy of the sources of
labeling latency and study several large crowd-sourced
labeling deployments to understand their empirical
latency profiles. Driven by these insights, we
comprehensively tackle each source of latency, both by
developing novel techniques such as straggler
mitigation and pool maintenance and by optimizing
existing methods such as crowd retainer pools and
active learning. We evaluate CLAMShell in simulation
and on live workers on Amazon's Mechanical Turk,
demonstrating that our techniques can provide an order
of magnitude speedup and variance reduction over
existing crowdsourced labeling strategies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Firmani:2016:OER,
author = "Donatella Firmani and Barna Saha and Divesh
Srivastava",
title = "Online entity resolution using an oracle",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "384--395",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity resolution (ER) is the task of identifying all
records in a database that refer to the same underlying
entity. This is an expensive task, and can take a
significant amount of money and time; the end-user may
want to take decisions during the process, rather than
waiting for the task to be completed. We formalize an
online version of the entity resolution task, and use
an oracle which correctly labels matching and
non-matching pairs through queries. In this setting, we
design algorithms that seek to maximize progressive
recall, and develop a novel analysis framework for
prior proposals on entity resolution with an oracle,
beyond their worst case guarantees. Finally, we provide
both theoretical and experimental analysis of the
proposed algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Calautti:2016:EEG,
author = "Marco Calautti and Sergio Greco and Cristian Molinaro
and Irina Trubitsyna",
title = "Exploiting equality generating dependencies in
checking chase termination",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "396--407",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The chase is a well-known algorithm with a wide range
of applications in data exchange, data cleaning, data
integration, query optimization, and ontological
reasoning. Since the chase evaluation might not
terminate and it is undecidable whether it terminates,
the problem of defining (decidable) sufficient
conditions ensuring termination has received a great
deal of interest in recent years. In this regard,
several termination criteria have been proposed. One of
the main weaknesses of current approaches is the
limited analysis they perform on equality generating
dependencies (EGDs). In this paper, we propose
sufficient conditions ensuring that a set of
dependencies has at least one terminating chase
sequence. We propose novel criteria which are able to
perform a more accurate analysis of EGDs. Specifically,
we propose a new stratification criterion and an
adornment algorithm. The latter can both be used as a
termination criterion and be combined with current
techniques to make them more effective, in that
strictly more sets of dependencies are identified. Our
techniques identify sets of dependencies that are not
recognized by any of the current criteria.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2016:SBF,
author = "Tong Yang and Alex X. Liu and Muhammad Shahzad and
Yuankun Zhong and Qiaobin Fu and Zi Li and Gaogang Xie
and Xiaoming Li",
title = "A shifting {Bloom} filter framework for set queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "408--419",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Set queries are fundamental operations in computer
systems and applications. This paper addresses the
fundamental problem of designing a probabilistic data
structure that can quickly process set queries using a
small amount of memory. We propose a Shifting Bloom
Filter (ShBF) framework for representing and querying
sets. We demonstrate the effectiveness of ShBF using
three types of popular set queries: membership,
association, and multiplicity queries. The key novelty
of ShBF is on encoding the auxiliary information of a
set element in a location offset. In contrast, prior BF
based set data structures allocate additional memory to
store auxiliary information. We conducted experiments
using real-world network traces, and results show that
ShBF significantly advances the state-of-the-art on all
three types of set queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2016:HTM,
author = "Fan Yang and Jinfeng Li and James Cheng",
title = "{Husky}: towards a more efficient and expressive
distributed computing framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "420--431",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding efficient, expressive and yet intuitive
programming models for data-parallel computing system
is an important and open problem. Systems like Hadoop
and Spark have been widely adopted for massive data
processing, as coarse-grained primitives like map and
reduce are succinct and easy to master. However,
sometimes over-simplified API hinders programmers from
more fine-grained control and designing more efficient
algorithms. Developers may have to resort to
sophisticated domain-specific languages (DSLs), or even
low-level layers like MPI, but this raises development
cost---learning many mutually exclusive systems
prolongs the development schedule, and the use of
low-level tools may result in bug-prone programming.
This motivated us to start the Husky open-source
project, which is an attempt to strike a better balance
between high performance and low development cost.
Husky is developed mainly for in-memory large scale
data mining, and also serves as a general research
platform for designing efficient distributed
algorithms. We show that many existing frameworks can
be easily implemented and bridged together inside
Husky, and Husky is able to achieve similar or even
better performance compared with domain-specific
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2016:RDT,
author = "Zeyu Li and Hongzhi Wang and Wei Shao and Jianzhong Li
and Hong Gao",
title = "Repairing data through regular expressions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "432--443",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Since regular expressions are often used to detect
errors in sequences such as strings or date, it is
natural to use them for data repair. Motivated by this,
we propose a data repair method based on regular
expression to make the input sequence data obey the
given regular expression with minimal revision cost.
The proposed method contains two steps, sequence repair
and token value repair. For sequence repair, we propose
the Regular-expression-based Structural Repair (RSR in
short) algorithm. RSR algorithm is a dynamic
programming algorithm that utilizes Nondeterministic
Finite Automata (NFA) to calculate the edit distance
between a prefix of the input string and a partial
pattern regular expression with time complexity of $ O
(n m^2) $ and space complexity of $ O(m n) $ where $m$
is the edge number of NFA and $n$ is the input string
length. We also develop an optimization strategy to
achieve higher performance for long strings. For token
value repair, we combine the edit-distance-based method
and associate rules by a unified argument for the
selection of the proper method. Experimental results on
both real and synthetic data show that the proposed
method could repair the data effectively and
efficiently.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2016:LLC,
author = "Cong Yan and Alvin Cheung",
title = "Leveraging lock contention to improve {OLTP}
application performance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "5",
pages = "444--455",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Jan 11 17:54:24 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Locking is one of the predominant costs in transaction
processing. While much work has focused on designing
efficient concurrency control mechanisms, not much has
been done on understanding how transaction applications
issue queries and leveraging application semantics to
improve application performance. This paper presents
Quro, a query-aware compiler that automatically reorders
queries in transaction code to improve performance.
Observing that certain queries within a transaction are
more contentious than others as they require locking
the same tuples as other concurrently executing
transactions, Quro automatically changes the
application such that contentious queries are issued as
late as possible. We have evaluated Quro on various
transaction benchmarks, and our results show that
Quro-generated implementations can increase transaction
throughput by up to 6.53x, while reduce transaction
latency by up to 85\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Choudhury:2016:MBR,
author = "Farhana M. Choudhury and J. Shane Culpepper and Timos
Sellis and Xin Cao",
title = "Maximizing bichromatic reverse spatial and textual $k$
nearest neighbor queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "6",
pages = "456--467",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:09:59 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of maximizing bichromatic reverse $k$
nearest neighbor queries (BR $k$ NN) has been
extensively studied in spatial databases. In this work,
we present a related query for spatial-textual
databases that finds an optimal location, and a set of
keywords that maximizes the size of bichromatic reverse
spatial textual $k$ nearest neighbors (MaxBRST $k$ NN).
Such a query has many practical applications including
social media advertisements where a limited number of
relevant advertisements are displayed to each user. The
problem is to find the location and the text contents
to include in an advertisement so that it will be
displayed to the maximum number of users. The
increasing availability of spatial-textual collections
allows us to answer these queries for both spatial
proximity and textual similarity. This paper is the
first to consider the MaxBRST $k$ NN query. We show
that the problem is NP-hard and present both
approximate and exact solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Subercaze:2016:IFM,
author = "Julien Subercaze and Christophe Gravier and Jules
Chevalier and Fr{\'e}d{\'e}rique Laforest",
title = "{Inferray}: fast in-memory {RDF} inference",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "6",
pages = "468--479",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:09:59 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The advent of semantic data on the Web requires
efficient reasoning systems to infer RDF and OWL data.
The linked nature and the huge volume of data entail
efficiency and scalability challenges when designing
productive inference systems. This paper presents
Inferray, an implementation of RDFS, $ \rho $df, and
RDFS-Plus inference with improved performance over
existing solutions. The main features of Inferray are
(1) a storage layout based on vertical partitioning
that guarantees sequential access and efficient
sort-merge join inference; (2) efficient sorting of
pairs of 64-bit integers using ad-hoc optimizations on
MSD radix and a custom counting sort; (3) a dedicated
temporary storage to perform efficient graph closure
computation. Our measurements on synthetic and
real-world datasets show improvements over competitors
on RDFS-Plus, and up to several orders of magnitude for
transitivity closure.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Makreshanski:2016:MES,
author = "Darko Makreshanski and Georgios Giannikis and Gustavo
Alonso and Donald Kossmann",
title = "{MQJoin}: efficient shared execution of main-memory
joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "6",
pages = "480--491",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:09:59 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database architectures typically process queries
one-at-a-time, executing concurrent queries in
independent execution contexts. Often, such a design
leads to unpredictable performance and poor
scalability. One approach to circumvent the problem is
to take advantage of sharing opportunities across
concurrently running queries. In this paper we propose
Many-Query Join (MQJoin), a novel method for sharing
the execution of a join that can efficiently deal with
hundreds of concurrent queries. This is achieved by
minimizing redundant work and making efficient use of
main-memory bandwidth and multi-core architectures.
Compared to existing proposals, MQJoin is able to
efficiently handle larger workloads regardless of the
schema by exploiting more sharing opportunities. We
also compared MQJoin to two commercial main-memory
column-store databases. For a TPC-H based workload, we
show that MQJoin provides 2--5x higher throughput with
significantly more stable response times.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abeywickrama:2016:NNR,
author = "Tenindra Abeywickrama and Muhammad Aamir Cheema and
David Taniar",
title = "$k$-nearest neighbors on road networks: a journey in
experimentation and in-memory implementation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "6",
pages = "492--503",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:09:59 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A $k$ nearest neighbor ($k$ NN) query on road networks
retrieves the $k$ closest points of interest (POIs) by
their network distances from a given location. Today,
in the era of ubiquitous mobile computing, this is a
highly pertinent query. While Euclidean distance has
been used as a heuristic to search for the closest POIs
by their road network distance, its efficacy has not
been thoroughly investigated. The most recent methods
have shown significant improvement in query
performance. Earlier studies, which proposed disk-based
indexes, were compared to the current state-of-the-art
in main memory. However, recent studies have shown that
main memory comparisons can be challenging and require
careful adaptation. This paper presents an extensive
experimental investigation in main memory to settle
these and several other issues. We use efficient and
fair memory-resident implementations of each method to
reproduce past experiments and conduct additional
comparisons for several overlooked evaluations. Notably
we revisit a previously discarded technique (IER)
showing that, through a simple improvement, it is often
the best performing technique.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2016:BRF,
author = "Yuan Yuan and Kaibo Wang and Rubao Lee and Xiaoning
Ding and Jing Xing and Spyros Blanas and Xiaodong
Zhang",
title = "{BCC}: reducing false aborts in optimistic concurrency
control with low cost for in-memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "6",
pages = "504--515",
month = jan,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:09:59 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Optimistic Concurrency Control (OCC) method has
been commonly used for in-memory databases to ensure
transaction serializability --- a transaction will be
aborted if its read set has been changed during
execution. This simple criterion to abort transactions
causes a large proportion of false positives, leading
to excessive transaction aborts. Transactions aborted
false-positively (i.e. false aborts) waste system
resources and can significantly degrade system
throughput (as much as 3.68x based on our experiments)
when data contention is intensive. Modern in-memory
databases run on systems with increasingly parallel
hardware and handle workloads with growing concurrency.
They must efficiently deal with data contention in the
presence of greater concurrency by minimizing false
aborts. This paper presents a new concurrency control
method named Balanced Concurrency Control (BCC) which
aborts transactions more carefully than OCC does. BCC
detects data dependency patterns which can more
reliably indicate unserializable transactions than the
criterion used in OCC. The paper studies the design
options and implementation techniques that can
effectively detect data contention by identifying
dependency patterns with low overhead. To test the
performance of BCC, we have implemented it in Silo and
compared its performance against that of the vanilla
Silo system with OCC and two-phase locking (2PL). Our
extensive experiments with TPC-W-like, TPC-C-like and
YCSB workloads demonstrate that when data contention is
intensive, BCC can increase transaction throughput by
more than 3x versus OCC and more than 2x versus 2PL;
meanwhile, BCC has comparable performance with OCC for
workloads with low data contention.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2016:EEG,
author = "Long Yuan and Lu Qin and Xuemin Lin and Lijun Chang
and Wenjie Zhang",
title = "{I/O} efficient {ECC} graph decomposition via graph
reduction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "516--527",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of computing $k$-edge connected components
($k$-ECCs) of a graph $G$ for a specific $k$ is a
fundamental graph problem and has been investigated
recently. In this paper, we study the problem of ECC
decomposition, which computes the $k$-ECCs of a graph $G$
for all $k$ values. ECC decomposition can be widely
applied in a variety of applications such as
graph-topology analysis, community detection, Steiner
component search, and graph visualization. A
straightforward solution for ECC decomposition is to
apply the existing $k$-ECC computation algorithm to
compute the $k$-ECCs for all $k$ values. However, this
solution is not applicable to large graphs for two
challenging reasons. First, all existing $k$-ECC
computation algorithms are highly memory intensive due
to the complex data structures used in the algorithms.
Second, the number of possible $k$ values can be very
large, resulting in a high computational cost when each
$k$ value is independently considered. In this paper,
we address the above challenges, and study I/O
efficient ECC decomposition via graph reduction. We
introduce two elegant graph reduction operators which
aim to reduce the size of the graph loaded in memory
while preserving the connectivity information of a
certain set of edges to be computed for a specific $k$.
We also propose three novel I/O efficient algorithms,
Bottom-Up, Top-Down, and Hybrid, that explore the $k$
values in different orders to reduce the redundant
computations between different $k$ values. We analyze
the I/O and memory costs for all proposed algorithms.
In our experiments, we evaluate our algorithms using
seven real large datasets with various graph
properties, one of which contains 1.95 billion edges.
The experimental results show that our proposed
algorithms are scalable and efficient.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Binnig:2016:ESN,
author = "Carsten Binnig and Andrew Crotty and Alex Galakatos
and Tim Kraska and Erfan Zamanian",
title = "The end of slow networks: it's time for a redesign",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "528--539",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The next generation of high-performance networks with
remote direct memory access (RDMA) capabilities
requires a fundamental rethinking of the design of
distributed in-memory DBMSs. These systems are commonly
built under the assumption that the network is the
primary bottleneck and should be avoided at all costs,
but this assumption no longer holds. For instance, with
InfiniBand FDR $ 4 \times $, the bandwidth available to
transfer data across the network is in the same
ballpark as the bandwidth of one memory channel.
Moreover, RDMA transfer latencies continue to rapidly
improve as well. In this paper, we first argue that
traditional distributed DBMS architectures cannot take
full advantage of high-performance networks and suggest
a new architecture to address this problem. Then, we
discuss initial results from a prototype implementation
of our proposed architecture for OLTP and OLAP, showing
remarkable performance improvements over existing
designs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2016:LLE,
author = "Jiewen Huang and Daniel J. Abadi",
title = "{Leopard}: lightweight edge-oriented partitioning and
replication for dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "540--551",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper introduces a dynamic graph partitioning
algorithm, designed for large, constantly changing
graphs. We propose a partitioning framework that
adjusts on the fly as the graph structure changes. We
also introduce a replication algorithm that is tightly
integrated with the partitioning algorithm, which
further reduces the number of edges cut by the
partitioning algorithm. Even though the proposed
approach is handicapped by only taking into
consideration local parts of the graph when reassigning
vertices, extensive evaluation shows that the proposed
approach maintains a quality partitioning over time,
which is comparable at any point in time to performing
a full partitioning from scratch using a state-of-the-art
static graph partitioning algorithm such as METIS.
Furthermore, when vertex replication is turned on,
edge-cut can improve by an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gribkoff:2016:SDP,
author = "Eric Gribkoff and Dan Suciu",
title = "{SlimShot}: in-database probabilistic inference for
knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "552--563",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Increasingly large Knowledge Bases are being created,
by crawling the Web or other corpora of documents, and
by extracting facts and relations using machine
learning techniques. To manage the uncertainty in the
data, these KBs rely on probabilistic engines based on
Markov Logic Networks (MLN), for which probabilistic
inference remains a major challenge. Today's state of
the art systems use variants of MCMC, which have no
theoretical error guarantees, and, as we show, suffer
from poor performance in practice. In this paper we
describe SlimShot (Scalable Lifted Inference and Monte
Carlo Sampling Hybrid Optimization Technique), a
probabilistic inference engine for knowledge bases.
SlimShot converts the MLN to a tuple-independent
probabilistic database, then uses a simple Monte
Carlo-based inference, with three key enhancements: (1)
it combines sampling with safe query evaluation, (2) it
estimates a conditional probability by jointly
computing the numerator and denominator, and (3) it
adjusts the proposal distribution based on the sample
cardinality. In combination, these three techniques
allow us to give formal error guarantees, and we
demonstrate empirically that SlimShot outperforms
today's state of the art probabilistic inference
engines used in knowledge bases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2016:GPQ,
author = "Da Yan and James Cheng and M. Tamer {\"O}zsu and Fan
Yang and Yi Lu and John C. S. Lui and Qizhen Zhang and
Wilfred Ng",
title = "A general-purpose query-centric framework for querying
big graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "564--575",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Pioneered by Google's Pregel, many distributed systems
have been developed for large-scale graph analytics.
These systems employ a user-friendly ``think like a
vertex'' programming model, and exhibit good
scalability for tasks where the majority of graph
vertices participate in computation. However, the
design of these systems can seriously under-utilize the
resources in a cluster for processing light-workload
graph queries, where only a small fraction of vertices
need to be accessed. In this work, we develop a new
open-source system, called Quegel, for querying big
graphs. Quegel treats queries as first-class citizens
in its design: users only need to specify the
Pregel-like algorithm for a generic query, and Quegel
processes light-workload graph queries on demand, using
a novel superstep-sharing execution model to
effectively utilize the cluster resources. Quegel
further provides a convenient interface for
constructing graph indexes, which significantly improve
query performance but are not supported by existing
graph-parallel systems. Our experiments verified that
Quegel is highly efficient in answering various types
of graph queries and is up to orders of magnitude
faster than existing systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brucato:2016:SPQ,
author = "Matteo Brucato and Juan Felipe Beltran and Azza
Abouzied and Alexandra Meliou",
title = "Scalable package queries in relational database
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "576--587",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional database queries follow a simple model:
they define constraints that each tuple in the result
must satisfy. This model is computationally efficient,
as the database system can evaluate the query
conditions on each tuple individually. However, many
practical, real-world problems require a collection of
result tuples to satisfy constraints collectively,
rather than individually. In this paper, we present
package queries, a new query model that extends
traditional database queries to handle complex
constraints and preferences over answer sets. We
develop a full-fledged package query system,
implemented on top of a traditional database engine.
Our work makes several contributions. First, we design
PaQL, a SQL-based query language that supports the
declarative specification of package queries. We prove
that PaQL is at least as expressive as integer linear
programming, and therefore, evaluation of package
queries is in general NP-hard. Second, we present a
fundamental evaluation strategy that combines the
capabilities of databases and constraint optimization
solvers to derive solutions to package queries. The
core of our approach is a set of translation rules that
transform a package query to an integer linear program.
Third, we introduce an offline data partitioning
strategy allowing query evaluation to scale to large
data sizes. Fourth, we introduce SketchRefine, a
scalable algorithm for package evaluation, with strong
approximation guarantees ($ (1 \pm \epsilon)^6$-factor
approximation). Finally, we present extensive
experiments over real-world and benchmark data. The
results demonstrate that SketchRefine is effective at
deriving high-quality package results, and achieves
runtime performance that is an order of magnitude
faster than directly using ILP solvers over large
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:STK,
author = "Xiang Wang and Ying Zhang and Wenjie Zhang and Xuemin
Lin and Zengfeng Huang",
title = "{Skype}: top-$k$ spatial-keyword publish\slash
subscribe over sliding window",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "588--599",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As the prevalence of social media and GPS-enabled
devices, a massive amount of geo-textual data has been
generated in a stream fashion, leading to a variety of
applications such as location-based recommendation and
information dissemination. In this paper, we
investigate a novel real-time top-$k$ monitoring
problem over sliding window of streaming data; that is,
we continuously maintain the top-$k$ most relevant
geo-textual messages (e.g., geo-tagged tweets) for a
large number of spatial-keyword subscriptions (e.g.,
registered users interested in local events)
simultaneously. To provide the most recent information
under controllable memory cost, sliding window model is
employed on the streaming geo-textual data. To the best
of our knowledge, this is the first work to study
top-$k$ spatial-keyword publish/subscribe over sliding
window. A novel system, called Skype (Top-$k$
Spatial-keyword Publish/Subscribe), is proposed in this
paper. In Skype, to continuously maintain top-$k$
results for massive subscriptions, we devise a novel
indexing structure upon subscriptions such that each
incoming message can be immediately delivered on its
arrival. Moreover, to reduce the expensive top-$k$
re-evaluation cost triggered by message expiration, we
develop a novel cost-based $k$-skyband technique to
reduce the number of re-evaluations in a cost-effective
way. Extensive experiments verify the great efficiency
and effectiveness of our proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Asudeh:2016:DSW,
author = "Abolfazl Asudeh and Saravanan Thirumuruganathan and
Nan Zhang and Gautam Das",
title = "Discovering the skyline of web databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "7",
pages = "600--611",
month = mar,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 19 10:10:00 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many web databases are ``hidden'' behind proprietary
search interfaces that enforce the top-$k$ output
constraint, i.e., each query returns at most $k$ of all
matching tuples, preferentially selected and returned
according to a proprietary ranking function. In this
paper, we initiate research into the novel problem of
skyline discovery over top-$k$ hidden web databases.
Since skyline tuples provide critical insights into the
database and include the top-ranked tuple for every
possible ranking function following the monotonic order
of attribute values, skyline discovery from a hidden
web database can enable a wide variety of innovative
third-party applications over one or multiple web
databases. Our research in the paper shows that the
critical factor affecting the cost of skyline discovery
is the type of search interface controls provided by
the website. As such, we develop efficient algorithms
for three most popular types, i.e., one-ended range,
free range and point predicates, and then combine them
to support web databases that feature a mixture of
these types. Rigorous theoretical analysis and
extensive real-world online and offline experiments
demonstrate the effectiveness of our proposed
techniques and their superiority over baseline
solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2016:CTK,
author = "Xiaohang Zhang and Guoliang Li and Jianhua Feng",
title = "Crowdsourced top-$k$ algorithms: an experimental
evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "8",
pages = "612--623",
month = apr,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2921558.2921559",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:07:35 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourced top-$k$ computation has attracted
significant attention recently, thanks to emerging
crowdsourcing platforms, e.g., Amazon Mechanical Turk
and CrowdFlower. Crowdsourced top-$k$ algorithms ask
the crowd to compare the objects and infer the top-$k$
objects based on the crowdsourced comparison results.
The crowd may return incorrect answers, but traditional
top-$k$ algorithms cannot tolerate the errors from the
crowd. To address this problem, the database and
machine-learning communities have independently studied
the crowdsourced top-$k$ problem. The database
community proposes the heuristic-based solutions while
the machine-learning community proposes the
learning-based methods (e.g., maximum likelihood
estimation). However, these two types of techniques
have not been compared systematically under the same
experimental framework. Thus it is rather difficult for
a practitioner to decide which algorithm should be
adopted. Furthermore, the experimental evaluation of
existing studies has several weaknesses. Some methods
assume the crowd returns high-quality results and some
algorithms are only tested on simulated experiments. To
alleviate these limitations, in this paper we present a
comprehensive comparison of crowdsourced top-$k$
algorithms. Using various synthetic and real datasets,
we evaluate each algorithm in terms of result quality
and efficiency on real crowdsourcing platforms. We
reveal the characteristics of different techniques and
provide guidelines on selecting appropriate algorithms
for various scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Maddox:2016:DRD,
author = "Michael Maddox and David Goehring and Aaron J. Elmore
and Samuel Madden and Aditya Parameswaran and Amol
Deshpande",
title = "{Decibel}: the relational dataset branching system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "624--635",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As scientific endeavors and data analysis become
increasingly collaborative, there is a need for data
management systems that natively support the versioning
or branching of datasets to enable concurrent analysis,
cleaning, integration, manipulation, or curation of
data across teams of individuals. Common practice for
sharing and collaborating on datasets involves creating
or storing multiple copies of the dataset, one for each
stage of analysis, with no provenance information
tracking the relationships between these datasets. This
results not only in wasted storage, but also makes it
challenging to track and integrate modifications made
by different users to the same dataset. In this paper,
we introduce the Relational Dataset Branching System,
Decibel, a new relational storage system with built-in
version control designed to address these
short-comings. We present our initial design for
Decibel and provide a thorough evaluation of three
versioned storage engine designs that focus on
efficient query processing with minimal storage
overhead. We also develop an exhaustive benchmark to
enable the rigorous testing of these and future
versioned storage engine designs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mann:2016:EES,
author = "Willi Mann and Nikolaus Augsten and Panagiotis
Bouros",
title = "An empirical evaluation of set similarity join
techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "636--647",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Set similarity joins compute all pairs of similar sets
from two collections of sets. We conduct extensive
experiments on seven state-of-the-art algorithms for
set similarity joins. These algorithms adopt a
filter-verification approach. Our analysis shows that
verification has not received enough attention in
previous works. In practice, efficient verification
inspects only a small, constant number of set elements
and is faster than some of the more sophisticated
filter techniques. Although we can identify three
winners, we find that most algorithms show very similar
performance. The key technique is the prefix filter,
and AllPairs, the first algorithm adopting this
technique, is still a relevant competitor. We repeat
experiments from previous work and discuss diverging
results. All our claims are supported by a detailed
analysis of the factors that determine the overall
runtime.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Trummer:2016:MQO,
author = "Immanuel Trummer and Christoph Koch",
title = "Multiple query optimization on the {D-Wave 2X}
adiabatic quantum computer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "648--659",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The D-Wave adiabatic quantum annealer solves hard
combinatorial optimization problems leveraging quantum
physics. The newest version features over 1000 qubits
and was released in August 2015. We were given access
to such a machine, currently hosted at NASA Ames
Research Center in California, to explore the potential
for hard optimization problems that arise in the
context of databases. In this paper, we tackle the
problem of multiple query optimization (MQO). We show
how an MQO problem instance can be transformed into a
mathematical formula that complies with the restrictive
input format accepted by the quantum annealer. This
formula is translated into weights on and between
qubits such that the configuration minimizing the input
formula can be found via a process called adiabatic
quantum annealing. We analyze the asymptotic growth
rate of the number of required qubits in the MQO
problem dimensions as the number of qubits is currently
the main factor restricting applicability. We
experimentally compare the performance of the quantum
annealer against other MQO algorithms executed on a
traditional computer. While the problem sizes that can
be treated are currently limited, we already find a
class of problem instances where the quantum annealer
is three orders of magnitude faster than other
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Trummer:2016:PQO,
author = "Immanuel Trummer and Christoph Koch",
title = "Parallelizing query optimization on shared-nothing
architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "660--671",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data processing systems offer an ever increasing
degree of parallelism on the levels of cores, CPUs, and
processing nodes. Query optimization must exploit high
degrees of parallelism in order not to gradually become
the bottleneck of query evaluation. We show how to
parallelize query optimization at a massive scale. We
present algorithms for parallel query optimization in
left-deep and bushy plan spaces. At optimization start,
we divide the plan space for a given query into
partitions of equal size that are explored in parallel
by worker nodes. At the end of optimization, each
worker returns the optimal plan in its partition to the
master which determines the globally optimal plan from
the partition-optimal plans. No synchronization or data
exchange is required during the actual optimization
phase. The amount of data sent over the network, at the
start and at the end of optimization, as well as the
complexity of serial steps within our algorithms
increase only linearly in the number of workers and in
the query size. The time and space complexity of
optimization within one partition decreases uniformly
in the number of workers. We parallelize single- and
multi-objective query optimization over a cluster with
100 nodes in our experiments, using more than 250
concurrent worker threads (Spark executors). Despite
high network latency and task assignment overheads,
parallelization yields speedups of up to one order of
magnitude for large queries whose optimization takes
minutes on a single node.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kalavri:2016:SPA,
author = "Vasiliki Kalavri and Tiago Simas and Dionysios
Logothetis",
title = "The shortest path is not always a straight line:
leveraging semi-metricity in graph analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "672--683",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we leverage the concept of the metric
backbone to improve the efficiency of large-scale graph
analytics. The metric backbone is the minimum subgraph
that preserves the shortest paths of a weighted graph.
We use the metric backbone in place of the original
graph to compute various graph metrics exactly or with
good approximation. By computing on a smaller graph, we
improve the performance of graph analytics applications
on two different systems, a batch graph processing
system and a graph database. Further, we provide an
algorithm for the computation of the metric backbone on
large graphs. While one can compute the metric backbone
by solving the all-pairs-shortest-paths problem, this
approach incurs prohibitive time and space complexity
for big graphs. Instead, we propose a heuristic that
makes computing the metric backbone practical even for
large graphs. Additionally, we analyze several real
datasets of different sizes and domains and we show
that we can approximate the metric backbone by removing
only first-order semi-metric edges; edges for which a
shorter two-hop path exists. We provide a distributed
implementation of our algorithm and apply it in large
scale scenarios. We evaluate our algorithm using a
variety of real graphs, including a Facebook social
network subgraph of $ \approx $50 billion edges. We
measure the impact of using the metric backbone on
runtime performance in two graph management systems. We
achieve query speedups of up to 6.7x in the Neo4j
commercial graph database and job speedups of up to 6x
in the Giraph graph processing system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadakis:2016:CAA,
author = "George Papadakis and Jonathan Svirsky and Avigdor Gal
and Themis Palpanas",
title = "Comparative analysis of approximate blocking
techniques for entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "684--695",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity Resolution is a core task for merging data
collections. Due to its quadratic complexity, it
typically scales to large volumes of data through
blocking: similar entities are clustered into blocks
and pair-wise comparisons are executed only between
co-occurring entities, at the cost of some missed
matches. There are numerous blocking methods, and the
aim of this work is to offer a comprehensive empirical
survey, extending the dimensions of comparison beyond
what is commonly available in the literature. We
consider 17 state-of-the-art blocking methods and use 6
popular real datasets to examine the robustness of
their internal configurations and their relative
balance between effectiveness and time efficiency. We
also investigate their scalability over a corpus of 7
established synthetic datasets that range from 10,000
to 2 million entities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2016:EED,
author = "Yiran Zhao and Shen Li and Shaohan Hu and Hongwei Wang
and Shuochao Yao and Huajie Shao and Tarek Abdelzaher",
title = "An experimental evaluation of datacenter workloads on
low-power embedded micro servers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "9",
pages = "696--707",
month = may,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu May 26 16:06:05 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents a comprehensive evaluation of an
ultra-low power cluster, built upon the Intel Edison
based micro servers. The improved performance and high
energy efficiency of micro servers have driven both
academia and industry to explore the possibility of
replacing conventional brawny servers with a larger
swarm of embedded micro servers. Existing attempts
mostly focus on mobile-class micro servers, whose
capacities are similar to mobile phones. We, on the
other hand, target on sensor-class micro servers, which
are originally intended for uses in wearable
technologies, sensor networks, and Internet-of-Things.
Although sensor-class micro servers have much less
capacity, they are touted for minimal power consumption
($ < 1 $ Watt), which opens new possibilities of achieving
higher energy efficiency in datacenter workloads. Our
systematic evaluation of the Edison cluster and
comparisons to conventional brawny clusters involve
careful workload choosing and laborious parameter
tuning, which ensures maximum server utilization and
thus fair comparisons. Results show that the Edison
cluster achieves up to 3.5x improvement on
work-done-per-joule for web service applications and
data-intensive MapReduce jobs. In terms of scalability,
the Edison cluster scales linearly on the throughput of
web service workloads, and also shows satisfactory
scalability for MapReduce workloads despite
coordination overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2016:CTT,
author = "Shaoxu Song and Yue Cao and Jianmin Wang",
title = "Cleaning timestamps with temporal constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "708--719",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977798",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Timestamps are often found to be dirty in various
scenarios, e.g., in distributed systems with clock
synchronization problems or unreliable RFID readers.
Without cleaning the imprecise timestamps,
temporal-related applications such as provenance
analysis or pattern queries are not reliable. To
evaluate the correctness of timestamps, temporal
constraints could be employed, which declare the
distance restrictions between timestamps. Guided by
such constraints on timestamps, in this paper, we study
a novel problem of repairing inconsistent timestamps
that do not conform to the required temporal
constraints. Following the same line of data repairing,
the timestamp repairing problem is to minimally modify
the timestamps towards satisfaction of temporal
constraints. This problem is practically challenging,
given the huge space of possible timestamps. We tackle
the problem by identifying a concise set of promising
candidates, where an optimal repair solution can always
be found. Repair algorithms with efficient pruning are
then devised over the identified candidates.
Experiments on real datasets demonstrate the
superiority of our proposal compared to the
state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tan:2016:TRS,
author = "Zilong Tan and Shivnath Babu",
title = "{Tempo}: robust and self-tuning resource management in
multi-tenant parallel databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "720--731",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977799",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-tenant database systems have a component called
the Resource Manager, or RM that is responsible for
allocating resources to tenants. RMs today do not
provide direct support for performance objectives such
as: ``Average job response time of tenant A must be
less than two minutes'', or ``No more than 5\% of
tenant B's jobs can miss the deadline of 1 hour.''
Thus, DBAs have to tinker with the RM's low-level
configuration settings to meet such objectives. We
propose a framework called Tempo that brings
simplicity, self-tuning, and robustness to existing
RMs. Tempo provides a simple interface for DBAs to
specify performance objectives declaratively, and
optimizes the RM configuration settings to meet these
objectives. Tempo has a solid theoretical foundation
which gives key robustness guarantees. We report
experiments done on Tempo using production traces of
data-processing workloads from companies such as
Facebook and Cloudera. These experiments demonstrate
significant improvements in meeting desired performance
objectives over RM configuration settings specified by
human experts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Daenen:2016:PEM,
author = "Jonny Daenen and Frank Neven and Tony Tan and Stijn
Vansummeren",
title = "Parallel evaluation of multi-semi-joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "732--743",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977800",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While services such as Amazon AWS make computing power
abundantly available, adding more computing nodes can
incur high costs in, for instance, pay-as-you-go plans
while not always significantly improving the net
running time (aka wall-clock time) of queries. In this
work, we provide algorithms for parallel evaluation of
SGF queries in MapReduce that optimize total time,
while retaining low net time. Not only can SGF queries
specify all semi-join reducers, but also more
expressive queries involving disjunction and negation.
Since SGF queries can be seen as Boolean combinations
of (potentially nested) semi-joins, we introduce a
novel multi-semi-join (MSJ) MapReduce operator that
enables the evaluation of a set of semi-joins in one
job. We use this operator to obtain parallel query
plans for SGF queries that outvalue sequential plans
w.r.t. net time and provide additional optimizations
aimed at minimizing total time without severely
affecting net time. Even though the latter
optimizations are NP-hard, we present effective greedy
algorithms. Our experiments, conducted using our own
implementation Gumbo on top of Hadoop, confirm the
usefulness of parallel query plans, and the
effectiveness and scalability of our optimizations, all
with a significant improvement over Pig and Hive.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2016:WCE,
author = "Jianfei Chen and Kaiwei Li and Jun Zhu and Wenguang
Chen",
title = "{WarpLDA}: a cache efficient {O(1)} algorithm for
latent {Dirichlet} allocation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "744--755",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977801",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Developing efficient and scalable algorithms for
Latent Dirichlet Allocation (LDA) is of wide interest
for many applications. Previous work has developed an $
O(1) $ Metropolis--Hastings (MH) sampling method for
each token. However, its performance is far from being
optimal due to frequent cache misses caused by random
accesses to the parameter matrices. In this paper, we
first carefully analyze the memory access behavior of
existing algorithms for LDA by cache locality at
document level. We then develop WarpLDA, which achieves
$ O(1) $ time complexity per-token and fits the
randomly accessed memory per document in the L3 cache.
Our empirical results in a wide range of testing
conditions demonstrate that WarpLDA is consistently
5--15x faster than the state-of-the-art MH-based
LightLDA, and is faster than the state-of-the-art
sparsity aware F+LDA in most settings. Our WarpLDA
learns a million topics from 639 million documents
in only five hours at an unprecedented throughput of 11
billion tokens per second.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eich:2016:FPG,
author = "Marius Eich and Pit Fender and Guido Moerkotte",
title = "Faster plan generation through consideration of
functional dependencies and keys",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "756--767",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977802",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It has been a recognized fact for many years that
query execution can benefit from pushing group-by
operators down in the operator tree and applying them
before a join. This so-called eager aggregation reduces
the size(s) of the join argument(s), making join
evaluation faster. Lately, the idea enjoyed a revival
when it was applied to outer joins for the first time
and incorporated in a state-of-the-art plan generator.
However, this recent approach is highly dependent on
the use of heuristics because of the exponential growth
of the search space that goes along with eager
aggregation. Finding an optimal solution for larger
queries calls for effective optimality preserving
pruning mechanisms to reduce the search space size as
far as possible. By a more thorough investigation of
functional dependencies and keys, we provide a set of
new pruning criteria and evaluate their effectiveness
with respect to the runtime and memory consumption of
the resulting plan generator.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schuhknecht:2016:RIR,
author = "Felix Martin Schuhknecht and Jens Dittrich and Ankur
Sharma",
title = "{RUMA} has it: rewired user-space memory access is
possible!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "768--779",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977803",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Memory management is one of the most boring topics in
database research. It plays a minor role in tasks like
free-space management or efficient space usage. Here
and there we also realize its impact on database
performance when worrying about NUMA-aware memory
allocation, data compacting, snapshotting, and
defragmentation. But, overall, let's face it: the
entire topic sounds as exciting as ``garbage collection''
or ``debugging a program for memory leaks''. What if
there were a technique that would promote memory
management from a third class helper thingie to a first
class citizen in algorithm and systems design? What if
that technique turned the role of memory management in
a database system (and any other data processing
system) upside-down? What if that technique could be
identified as a key for re-designing various core
algorithms with the effect of outperforming existing
state-of-the-art methods considerably? Then we would
write this paper. We introduce RUMA: Rewired User-space
Memory Access. It allows for physiological data
management, i.e. we allow developers to freely rewire
the mappings from virtual to physical memory (in user
space) while at the same time exploiting the virtual
memory support offered by hardware and operating
system. We show that fundamental database building
blocks such as array operations, partitioning, sorting,
and snapshotting benefit strongly from RUMA.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marcus:2016:WLB,
author = "Ryan Marcus and Olga Papaemmanouil",
title = "{WiSeDB}: a learning-based workload management advisor
for cloud databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "780--791",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977804",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Workload management for cloud databases deals with the
tasks of resource provisioning, query placement, and
query scheduling in a manner that meets the
application's performance goals while minimizing the
cost of using cloud resources. Existing solutions have
approached these three challenges in isolation while
aiming to optimize a single performance metric. In this
paper, we introduce WiSeDB, a learning-based framework
for generating holistic workload management solutions
customized to application-defined performance goals and
workload characteristics. Our approach relies on
supervised learning to train cost-effective decision
tree models for guiding query placement, scheduling,
and resource provisioning decisions. Applications can
use these models for both batch and online scheduling
of incoming workloads. A unique feature of our system
is that it can adapt its offline model to
stricter/looser performance goals with minimal
re-training. This allows us to present to the
application alternative workload management strategies
that address the typical performance vs. cost trade-off
of cloud services. Experimental results show that our
approach has very low training overhead while offering
low cost strategies for a variety of performance
metrics and workload characteristics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeFrancisciMorales:2016:SSS,
author = "Gianmarco {De Francisci Morales} and Aristides
Gionis",
title = "Streaming similarity self-join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "792--803",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977805",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce and study the problem of computing the
similarity self-join in a streaming context (SSSJ),
where the input is an unbounded stream of items
arriving continuously. The goal is to find all pairs of
items in the stream whose similarity is greater than a
given threshold. The simplest formulation of the
problem requires unbounded memory, and thus, it is
intractable. To make the problem feasible, we introduce
the notion of time-dependent similarity: the similarity
of two items decreases with the difference in their
arrival time. By leveraging the properties of this
time-dependent similarity function, we design two
algorithmic frameworks to solve the SSSJ problem. The
first one, MiniBatch (MB), uses existing index-based
filtering techniques for the static version of the
problem, and combines them in a pipeline. The second
framework, Streaming (STR), adds time filtering to the
existing indexes, and integrates new time-based bounds
deeply in the working of the algorithms. We also
introduce a new indexing technique (L2), which is based
on an existing state-of-the-art indexing technique
(L2AP), but is optimized for the streaming case.
Extensive experiments show that the STR algorithm, when
instantiated with the L2 index, is the most scalable
option across a wide array of datasets and
parameters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schatzle:2016:SRQ,
author = "Alexander Sch{\"a}tzle and Martin Przyjaciel-Zablocki
and Simon Skilevic and Georg Lausen",
title = "{S2RDF}: {RDF} querying with {SPARQL} on {Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "804--815",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977806",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "RDF has become very popular for semantic data
publishing due to its flexible and universal graph-like
data model. Thus, the ever-increasing size of RDF data
collections raises the need for scalable distributed
approaches. We endorse the usage of existing
infrastructures for Big Data processing like Hadoop for
this purpose. Yet, SPARQL query performance is a major
challenge as Hadoop is not intentionally designed for
RDF processing. Existing approaches often favor certain
query pattern shapes while performance drops
significantly for other shapes. In this paper, we
introduce a novel relational partitioning schema for
RDF data called ExtVP that uses a semi-join based
preprocessing, akin to the concept of Join Indices in
relational databases, to efficiently minimize query
input size regardless of its pattern shape and
diameter. Our prototype system S2RDF is built on top of
Spark and uses SQL to execute SPARQL queries over
ExtVP. We demonstrate its superior performance in
comparison to state-of-the-art SPARQL-on-Hadoop
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Singh:2016:BSS,
author = "Rishabh Singh",
title = "{BlinkFill}: semi-supervised programming by example
for syntactic string transformations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "816--827",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977807",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The recent Programming By Example (PBE) techniques
such as FlashFill have shown great promise for enabling
end-users to perform data transformation tasks using
input-output examples. Since examples are inherently an
under-specification, there are typically a large number
of hypotheses conforming to the examples, and the PBE
techniques suffer from scalability issues for finding
the intended program amongst the large space. We
present a semi-supervised learning technique to
significantly reduce this ambiguity by using the
logical information present in the input data to guide
the synthesis algorithm. We develop a data structure
InputDataGraph to succinctly represent a large set of
logical patterns that are shared across the input data,
and use this graph to efficiently learn substring
expressions in a new PBE system BlinkFill. We evaluate
BlinkFill on 207 real-world benchmarks and show that
BlinkFill is significantly faster (on average 41x) and
requires fewer input-output examples (1.27 vs 1.53) to
learn the desired transformations in comparison to
FlashFill.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deng:2016:MEM,
author = "Dong Deng and Guoliang Li and He Wen and H. V.
Jagadish and Jianhua Feng",
title = "{META}: an efficient matching-based method for
error-tolerant autocompletion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "10",
pages = "828--839",
month = jun,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2977797.2977808",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Autocompletion has been widely adopted in many
computing systems because it can instantly provide
users with results as users type in queries. Since the
typing task is tedious and prone to error, especially
on mobile devices, a recent trend is to tolerate errors
in autocompletion. Existing error-tolerant
autocompletion methods build a trie to index the data,
utilize the trie index to compute the trie nodes that
are similar to the query, called active nodes, and
identify the leaf descendants of active nodes as the
results. However, these methods have two limitations.
First, they involve many redundant computations to
identify the active nodes. Second, they do not support
top-$k$ queries. To address these problems, we propose a
matching-based framework, which computes the answers
based on matching characters between queries and data.
We design a compact tree index to maintain active nodes
in order to avoid the redundant computations. We devise
an incremental method to efficiently answer top-$k$
queries. Experimental results on real datasets show
that our method outperforms state-of-the-art approaches
by 1--2 orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2016:SSS,
author = "Weiguo Zheng and Lei Zou and Wei Peng and Xifeng Yan
and Shaoxu Song and Dongyan Zhao",
title = "Semantic {SPARQL} similarity search over {RDF}
knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "11",
pages = "840--851",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2983200.2983201",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "RDF knowledge graphs have attracted increasing
attention in recent years. However, due to the schema-free
nature of RDF data, it is very difficult for users to
have full knowledge of the underlying schema.
Furthermore, the same kind of information can be
represented in diverse graph fragments. Hence, it is a
huge challenge to formulate complex SPARQL expressions
by taking the union of all possible structures. In this
paper, we propose an effective framework to access the
RDF repository even if users have no full knowledge of
the underlying schema. Specifically, given a SPARQL
query, the system could return as many answers that
match the query based on the semantic similarity as
possible. Interestingly, we propose a systematic method
to mine diverse semantically equivalent structure
patterns. More importantly, incorporating both
structural and semantic similarities we are the first
to propose a novel similarity measure, semantic graph
edit distance. In order to improve the efficiency
performance, we apply the semantic summary graph to
summarize the knowledge graph, which supports both
high-level pruning and drill-down pruning. We also
devise an effective lower bound based on the TA-style
access to each of the candidate sets. Extensive
experiments over real datasets confirm the
effectiveness and efficiency of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dubey:2016:WHP,
author = "Ayush Dubey and Greg D. Hill and Robert Escriva and
Emin G{\"u}n Sirer",
title = "{Weaver}: a high-performance, transactional graph
database based on refinable timestamps",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "11",
pages = "852--863",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2983200.2983202",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph databases have become a common infrastructure
component. Yet existing systems either operate on
offline snapshots, provide weak consistency guarantees,
or use expensive concurrency control techniques that
limit performance. In this paper, we introduce a new
distributed graph database, called Weaver, which
enables efficient, transactional graph analyses as well
as strictly serializable ACID transactions on dynamic
graphs. The key insight that allows Weaver to combine
strict serializability with horizontal scalability and
high performance is a novel request ordering mechanism
called refinable timestamps. This technique couples
coarse-grained vector timestamps with a fine-grained
timeline oracle to pay the overhead of strong
consistency only when needed. Experiments show that
Weaver enables a Bitcoin blockchain explorer that is 8x
faster than Blockchain.info, and achieves 10.9x higher
throughput than the Titan graph database on social
network workloads and 4x lower latency than GraphLab on
offline graph traversal workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2016:DDD,
author = "Xu Chu and Ihab F. Ilyas and Paraschos Koutris",
title = "Distributed data deduplication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "11",
pages = "864--875",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2983200.2983203",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data deduplication refers to the process of
identifying tuples in a relation that refer to the same
real world entity. The complexity of the problem is
inherently quadratic with respect to the number of
tuples, since a similarity value must be computed for
every pair of tuples. To avoid comparing tuple pairs
that are obviously non-duplicates, blocking techniques
are used to divide the tuples into blocks and only
tuples within the same block are compared. However,
even with the use of blocking, data deduplication
remains a costly problem for large datasets. In this
paper, we show how to further speed up data
deduplication by leveraging parallelism in a
shared-nothing computing environment. Our main
contribution is a distribution strategy, called
Dis-Dedup, that minimizes the maximum workload across
all worker nodes and provides strong theoretical
guarantees. We demonstrate the effectiveness of our
proposed strategy by performing extensive experiments
on both synthetic datasets with varying block size
distributions, as well as real world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arenas:2016:FAC,
author = "Marcelo Arenas and Francisco Maturana and Cristian
Riveros and Domagoj Vrgoc",
title = "A framework for annotating {CSV}-like data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "11",
pages = "876--887",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2983200.2983204",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we propose a simple and expressive
framework for adding metadata to CSV documents and
their noisy variants. The framework is based on
annotating parts of the document that can be later used
to read, query, or exchange the data. The core of our
framework is a language based on extended regular
expressions that are used for selecting data. These
expressions are then combined using a set of rules in
order to annotate the data. We study the computational
complexity of implementing our framework and present an
efficient evaluation algorithm that runs in time
proportional to its output and linear in its input. As
a proof of concept, we test an implementation of our
framework against a large number of real world datasets
and show that it can be efficiently used in practice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Asudeh:2016:QRS,
author = "Abolfazl Asudeh and Nan Zhang and Gautam Das",
title = "Query reranking as a service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "11",
pages = "888--899",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2983200.2983205",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ranked retrieval model has rapidly become the de
facto way for search query processing in client-server
databases, especially those on the web. Despite the
extensive efforts in the database community on
designing better ranking functions/mechanisms, many
such databases in practice still fail to address the
diverse and sometimes contradicting preferences of
users on tuple ranking, perhaps (at least partially)
due to the lack of expertise and/or motivation for the
database owner to design truly effective ranking
functions. This paper takes a different route on
addressing the issue by defining a novel query
reranking problem, i.e., we aim to design a third-party
service that uses nothing but the public search
interface of a client-server database to enable the
on-the-fly processing of queries with any
user-specified ranking functions (with or without
selection conditions), no matter if the ranking
function is supported by the database or not. We
analyze the worst-case complexity of the problem and
introduce a number of ideas, e.g., on-the-fly indexing,
domination detection and virtual tuple pruning, to
reduce the average-case cost of the query reranking
algorithm. We also present extensive experimental
results on real-world datasets, in both offline and
live online systems, that demonstrate the effectiveness
of our proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ma:2016:GSF,
author = "Hongbin Ma and Bin Shao and Yanghua Xiao and Liang
Jeff Chen and Haixun Wang",
title = "{G-SQL}: fast query processing via graph exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "900--911",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994510",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A lot of real-life data are of graph nature. However,
it was not until recently that businesses began to
exploit data's connectedness for business insights. On
the other hand, RDBMSs are a mature technology for data
management, but they are not for graph processing. Take
graph traversal, a common graph operation for example,
it heavily relies on a graph primitive that accesses a
given node's neighborhood. We need to join tables
following foreign keys to access the nodes in the
neighborhood if an RDBMS is used to manage graph data.
Graph exploration is a fundamental building block of
many graph algorithms. But this simple operation is
costly due to a large volume of I/O caused by the
massive amount of table joins. In this paper, we
present G-SQL, our effort toward the integration of a
RDBMS and a native in-memory graph processing engine.
G-SQL leverages the fast graph exploration capability
provided by the graph engine to answer multi-way join
queries. Meanwhile, it uses RDBMSs to provide mature
data management functionalities, such as reliable data
storage and additional data access methods.
Specifically, G-SQL is a SQL dialect augmented with
graph exploration functionalities and it dispatches
query tasks to the in-memory graph engine and its
underlying RDBMS. The G-SQL runtime coordinates the two
query processors via a unified cost model to ensure the
entire query is processed efficiently. Experimental
results show that our approach greatly expands
capabilities of RDBMSs and delivers exceptional
performance for SQL-graph hybrid queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2016:MOD,
author = "Mingxing Zhang and Yongwei Wu and Kang Chen and Teng
Ma and Weimin Zheng",
title = "Measuring and optimizing distributed array programs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "912--923",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994511",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowadays, there is a rising trend of building
array-based distributed computing frameworks, which are
suitable for implementing many machine learning and
data mining algorithms. However, most of these
frameworks only execute each primitive in an isolated
manner and in the exact order defined by programmers,
which implies a huge space for optimization. In this
paper, we propose a novel array-based programming
model, named Kasen, which distinguishes itself from
models in the existing literature by defining a strict
computation and communication model. This model makes
it easy to analyze programs' behavior and measure their
performance, with which we design a corresponding
optimizer that can automatically apply high-level
optimizations to the original programs written by
programmers. According to our evaluation, the optimizer
of Kasen can achieve a significant reduction on memory
read/write, buffer allocation and network traffic,
which leads to a speedup up to 5.82x.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jo:2016:YHP,
author = "Insoon Jo and Duck-Ho Bae and Andre S. Yoon and
Jeong-Uk Kang and Sangyeun Cho and Daniel D. G. Lee and
Jaeheon Jeong",
title = "{YourSQL}: a high-performance database system
leveraging in-storage computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "924--935",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994512",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents YourSQL, a database system that
accelerates data-intensive queries with the help of
additional in-storage computing capabilities. YourSQL
realizes very early filtering of data by offloading
data scanning of a query to user-programmable
solid-state drives. We implement our system on a recent
branch of MariaDB (a variant of MySQL). In order to
quantify the performance gains of YourSQL, we evaluate
SQL queries with varying complexities. Our result shows
that YourSQL reduces the execution time of the whole
TPC-H queries by $ 3.6 \times $, compared to a vanilla
system. Moreover, the average speed-up of the five
TPC-H queries with the largest performance gains
reaches over $ 15 \times $. Thanks to this significant
reduction of execution time, we observe sizable energy
savings. Our study demonstrates that the YourSQL
approach, combining the power of early filtering with
end-to-end datapath optimization, can accelerate
large-scale analytic queries with lower energy
consumption.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2016:LBM,
author = "Lu Lu and Xuanhua Shi and Yongluan Zhou and Xiong
Zhang and Hai Jin and Cheng Pei and Ligang He and
Yuanzhen Geng",
title = "Lifetime-based memory management for distributed data
processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "936--947",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994513",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-memory caching of intermediate data and eager
combining of data in shuffle buffers have been shown to
be very effective in minimizing the re-computation and
I/O cost in distributed data processing systems like
Spark and Flink. However, it has also been widely
reported that these techniques would create a large
amount of long-living data objects in the heap, which
may quickly saturate the garbage collector, especially
when handling a large dataset, and hence would limit
the scalability of the system. To eliminate this
problem, we propose a lifetime-based memory management
framework, which, by automatically analyzing the
user-defined functions and data types, obtains the
expected lifetime of the data objects, and then
allocates and releases memory space accordingly to
minimize the garbage collection overhead. In
particular, we present Deca, a concrete implementation
of our proposal on top of Spark, which transparently
decomposes and groups objects with similar lifetimes
into byte arrays and releases their space altogether
when their lifetimes come to an end. An extensive
experimental study using both synthetic and real
datasets shows that, in comparing to Spark, Deca is
able to (1) reduce the garbage collection time by up to
99.9\%, (2) to achieve up to 22.7x speed up in terms of
execution time in cases without data spilling and 41.6x
speedup in cases with data spilling, and (3) to consume
up to 46.6\% less memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Krishnan:2016:AID,
author = "Sanjay Krishnan and Jiannan Wang and Eugene Wu and
Michael J. Franklin and Ken Goldberg",
title = "{ActiveClean}: interactive data cleaning for
statistical modeling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "948--959",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994514",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analysts often clean dirty data iteratively--cleaning
some data, executing the analysis, and then cleaning
more data based on the results. We explore the
iterative cleaning process in the context of
statistical model training, which is an increasingly
popular form of data analytics. We propose ActiveClean,
which allows for progressive and iterative cleaning in
statistical modeling problems while preserving
convergence guarantees. ActiveClean supports an
important class of models called convex loss models
(e.g., linear regression and SVMs), and prioritizes
cleaning those records likely to affect the results. We
evaluate ActiveClean on five real-world datasets: UCI
Adult, UCI EEG, MNIST, IMDB, and Dollars For Docs with
both real and synthetic errors. The results show that
our proposed optimizations can improve model accuracy
by up to 2.5x for the same amount of data cleaned.
Furthermore, for a fixed cleaning budget and on all real
dirty datasets, ActiveClean returns more accurate
models than uniform sampling and Active Learning.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Elgohary:2016:CLA,
author = "Ahmed Elgohary and Matthias Boehm and Peter J. Haas
and Frederick R. Reiss and Berthold Reinwald",
title = "Compressed linear algebra for large-scale machine
learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "960--971",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994515",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale machine learning (ML) algorithms are often
iterative, using repeated read-only data access and
I/O-bound matrix-vector multiplications to converge to
an optimal model. It is crucial for performance to fit
the data into single-node or distributed main memory.
General-purpose, heavy- and lightweight compression
techniques struggle to achieve both good compression
ratios and fast decompression speed to enable
block-wise uncompressed operations. Hence, we initiate
work on compressed linear algebra (CLA), in which
lightweight database compression techniques are applied
to matrices and then linear algebra operations such as
matrix-vector multiplication are executed directly on
the compressed representations. We contribute effective
column compression schemes, cache-conscious operations,
and an efficient sampling-based compression algorithm.
Our experiments show that CLA achieves in-memory
operations performance close to the uncompressed case
and good compression ratios that allow us to fit larger
datasets into available memory. We thereby obtain
significant end-to-end performance improvements up to
26x or reduced memory requirements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karpathiotakis:2016:FQH,
author = "Manos Karpathiotakis and Ioannis Alagiannis and
Anastasia Ailamaki",
title = "Fast queries over heterogeneous data through engine
customization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "972--983",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994516",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Industry and academia are continuously becoming more
data-driven and data-intensive, relying on the analysis
of a wide variety of heterogeneous datasets to gain
insights. The different data models and formats pose a
significant challenge on performing analysis over a
combination of diverse datasets. Serving all queries
using a single, general-purpose query engine is slow.
On the other hand, using a specialized engine for each
heterogeneous dataset increases complexity: queries
touching a combination of datasets require an
integration layer over the different engines. This
paper presents a system design that natively supports
heterogeneous data formats and also minimizes query
execution times. For multi-format support, the design
uses an expressive query algebra which enables
operations over various data models. For minimal
execution times, it uses a code generation mechanism to
mimic the system and storage most appropriate to answer
a query fast. We validate our design by building
Proteus, a query engine which natively supports queries
over CSV, JSON, and relational binary data, and which
specializes itself to each query, dataset, and workload
via code generation. Proteus outperforms
state-of-the-art open-source and commercial systems on
both synthetic and real-world workloads without being
tied to a single data model or format, all while
exposing users to a single query interface.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bhowmick:2016:DDV,
author = "Sourav S. Bhowmick and Byron Choi and Curtis Dyreson",
title = "Data-driven visual graph query interface construction
and maintenance: challenges and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "984--992",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994517",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Visual query interfaces make it easy for scientists
and other nonexpert users to query a data collection.
Heretofore, visual query interfaces have been
statically-constructed, independent of the data. In
this paper we outline a vision of a different kind of
interface, one that is built (in part) from the data.
In our data-driven approach, the visual interface is
dynamically constructed and maintained. A data-driven
approach has many benefits such as reducing the cost in
constructing and maintaining an interface, superior
support for query formulation, and increased
portability of the interface. We focus on graph
databases, but our approach is applicable to several
other kinds of databases such as JSON and XML.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abedjan:2016:DDE,
author = "Ziawasch Abedjan and Xu Chu and Dong Deng and Raul
Castro Fernandez and Ihab F. Ilyas and Mourad Ouzzani
and Paolo Papotti and Michael Stonebraker and Nan
Tang",
title = "Detecting data errors: where are we and what needs to
be done?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "993--1004",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994518",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data cleaning has played a critical role in ensuring
data quality for enterprise applications. Naturally,
there has been extensive research in this area, and
many data cleaning algorithms have been translated into
tools to detect and to possibly repair certain classes
of errors such as outliers, duplicates, missing values,
and violations of integrity constraints. Since
different types of errors may coexist in the same data
set, we often need to run more than one kind of tool.
In this paper, we investigate two pragmatic questions:
(1) are these tools robust enough to capture most
errors in real-world data sets? and (2) what is the
best strategy to holistically run multiple tools to
optimize the detection effort? To answer these two
questions, we obtained multiple data cleaning tools
that utilize a variety of error detection techniques.
We also collected five real-world data sets, for which
we could obtain both the raw data and the ground truth
on existing errors. In this paper, we report our
experimental findings on the errors detected by the
tools we tested. First, we show that the coverage of
each tool is well below 100\%. Second, we show that the
order in which multiple tools are run makes a big
difference. Hence, we propose a holistic multi-tool
strategy that orders the invocations of the available
tools to maximize their benefit, while minimizing human
effort in verifying results. Third, since this holistic
approach still does not lead to acceptable error
coverage, we discuss two simple strategies that have
the potential to improve the situation, namely domain
specific tools and data enrichment. We close this paper
by reasoning about the errors that are not detectable
by any of the tools we tested.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2016:ESH,
author = "Hai Liu and Dongqing Xiao and Pankaj Didwania and
Mohamed Y. Eltabakh",
title = "Exploiting soft and hard correlations in big data
query optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1005--1016",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994519",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big data infrastructures are increasingly supporting
datasets that are relatively structured. These datasets
are full of correlations among their attributes, which
if managed in systematic ways would enable optimization
opportunities that otherwise will be missed. Unlike
relational databases in which discovering and
exploiting the correlations in query optimization have
been extensively studied, in big data infrastructures,
such important data properties and their utilization
have been mostly abandoned. The key reason is that
domain experts may know many correlations but with a
degree of uncertainty (fuzziness or softness). Since
the data is big, it is very challenging to validate
such correlations, judge their worthiness, and put
strategies for utilizing them in query optimization.
Existing techniques for exploiting soft correlations in
RDBMSs, e.g., BHUNT, CORDS, and CM, are heavily
tailored towards optimizing factors inherent in
relational databases, e.g., predicate selectivity and
random I/O accesses of secondary indexes, which are
issues not applicable to big data infrastructures,
e.g., Hadoop. In this paper, we propose the EXORD
system to fill in this gap by exploiting the data's
correlations in big data query optimization. EXORD
supports two types of correlations; hard
correlations---which are guaranteed to hold for all
data records, and soft correlations---which are
expected to hold for most, but not all, data records.
We introduce a new three-phase approach for (1)
Validating and judging the worthiness of soft
correlations, (2) Selecting and preparing the soft
correlations for deployment by specially handling the
violating data records, and (3) Deploying and
exploiting the correlations in query optimization. We
propose a novel cost-benefit model for adaptively
selecting the most beneficial soft correlations w.r.t. a
given query workload while minimizing the introduced
overhead. We show the complexity of this problem
(NP-Hard), and propose a heuristic to efficiently solve
it in a polynomial time. EXORD can be integrated with
various state-of-art big data query optimization
techniques, e.g., indexing and partitioning. EXORD
prototype is implemented as an extension to the Hive
engine on top of Hadoop. The experimental evaluation
shows the potential of EXORD in achieving more than 10x
speedup while introducing minimal storage overheads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kahng:2016:IBN,
author = "Minsuk Kahng and Shamkant B. Navathe and John T.
Stasko and Duen Horng Polo Chau",
title = "Interactive browsing and navigation in relational
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1017--1028",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994520",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Although researchers have devoted considerable
attention to helping database users formulate queries,
many users still find it challenging to specify queries
that involve joining tables. To help users construct
join queries for exploring relational databases, we
propose ETable, a novel presentation data model that
provides users with a presentation-level interactive
view. This view compactly presents one-to-many and
many-to-many relationships within a single enriched
table by allowing a cell to contain a set of entity
references. Users can directly interact with this
enriched table to incrementally construct complex
queries and navigate databases on a conceptual
entity-relationship level. In a user study,
participants performed a range of database querying
tasks faster with ETable than with a commercial
graphical query builder. Subjective feedback about
ETable was also positive. All participants found that
ETable was easier to learn and helpful for exploring
databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Borovica-Gajic:2016:CDA,
author = "Renata Borovica-Gaji{\'c} and Raja Appuswamy and
Anastasia Ailamaki",
title = "Cheap data analytics using cold storage devices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1029--1040",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994521",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enterprise databases use storage tiering to lower
capital and operational expenses. In such a setting,
data waterfalls from an SSD-based high-performance tier
when it is ``hot'' (frequently accessed) to a
disk-based capacity tier and finally to a tape-based
archival tier when ``cold'' (rarely accessed). To
address the unprecedented growth in the amount of cold
data, hardware vendors introduced new devices named
Cold Storage Devices (CSD) explicitly targeted at cold
data workloads. With access latencies in tens of
seconds and cost/GB as low as \$0.01/GB/month, CSD
provide a middle ground between the low-latency (ms),
high-cost, HDD-based capacity tier, and high-latency
(min to h), low-cost, tape-based, archival tier. Driven
by the price/performance aspect of CSD, this paper
makes a case for using CSD as a replacement for both
capacity and archival tiers of enterprise databases.
Although CSD offer major cost savings, we show that
current database systems can suffer from severe
performance drop when CSD are used as a replacement for
HDD due to the mismatch between design assumptions made
by the query execution engine and actual storage
characteristics of the CSD. We then build a CSD-driven
query execution framework, called Skipper, that
modifies both the database execution engine and CSD
scheduling algorithms to be aware of each other. Using
results from our implementation of the architecture
based on PostgreSQL and OpenStack Swift, we show that
Skipper is capable of completely masking the high
latency overhead of CSD, thereby opening up CSD for
wider adoption as a storage tier for cheap data
analytics over cold data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shun:2016:PLG,
author = "Julian Shun and Farbod Roosta-Khorasani and Kimon
Fountoulakis and Michael W. Mahoney",
title = "Parallel local graph clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1041--1052",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994522",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph clustering has many important applications in
computing, but due to growing sizes of graphs, even
traditionally fast clustering methods such as spectral
partitioning can be computationally expensive for
real-world graphs of interest. Motivated partly by
this, so-called local algorithms for graph clustering
have received significant interest due to the fact that
they can find good clusters in a graph with work
proportional to the size of the cluster rather than
that of the entire graph. This feature has proven to be
crucial in making such graph clustering and many of its
downstream applications efficient in practice. While
local clustering algorithms are already faster than
traditional algorithms that touch the entire graph,
they are sequential and there is an opportunity to make
them even more efficient via parallelization. In this
paper, we show how to parallelize many of these
algorithms in the shared-memory multicore setting, and
we analyze the parallel complexity of these algorithms.
We present comprehensive experiments on large-scale
graphs showing that our parallel algorithms achieve
good parallel speedups on a modern multicore machine,
thus significantly speeding up the analysis of local
graph clusters in the very large-scale setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tong:2016:OMM,
author = "Yongxin Tong and Jieying She and Bolin Ding and Lei
Chen and Tianyu Wo and Ke Xu",
title = "Online minimum matching in real-time spatial data:
experiments and analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1053--1064",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994523",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, with the development of mobile Internet and
smartphones, the online minimum bipartite matching in
real-time spatial data (OMBM) problem becomes popular.
Specifically, given a set of service providers with
specific locations and a set of users who dynamically
appear one by one, the OMBM problem is to find a
maximum-cardinality matching with minimum total
distance following that once a user appears, s/he must
be immediately matched to an unmatched service
provider, which cannot be revoked, before subsequent
users arrive. To address this problem, existing studies
mainly focus on analyzing the worst-case competitive
ratios of the proposed online algorithms, but study on
the performance of the algorithms in practice is
absent. In this paper, we present a comprehensive
experimental comparison of the representative
algorithms of the OMBM problem. Particularly, we
observe a surprising result that the simple and
efficient greedy algorithm, which has been considered
as the worst due to its exponential worst-case
competitive ratio, is significantly more effective than
other algorithms. We investigate the results and
further show that the competitive ratio of the worst
case of the greedy algorithm is actually just a
constant, 3.195, in the average-case analysis. We try
to clarify a 25-year misunderstanding towards the
greedy algorithm and justify that the greedy algorithm
is not bad at all. Finally, we provide a uniform
implementation for all the algorithms of the OMBM
problem and clarify their strengths and weaknesses,
which can guide practitioners to select appropriate
algorithms for various scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Brunel:2016:IAH,
author = "Robert Brunel and Norman May and Alfons Kemper",
title = "Index-assisted hierarchical computations in
main-memory {RDBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1065--1076",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994524",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We address the problem of expressing and evaluating
computations on hierarchies represented as database
tables. Engine support for such computations is very
limited today, and so they are usually outsourced into
stored procedures or client code. Recently, data model
and SQL language extensions were proposed to
conveniently represent and work with hierarchies. On
that basis we introduce a concept of structural
grouping to relational algebra, provide concise syntax
to express a class of useful computations, and discuss
algorithms to evaluate them efficiently by exploiting
available indexing schemes. This extends the
versatility of RDBMS towards a great many use cases
dealing with hierarchical data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ohsaka:2016:DIA,
author = "Naoto Ohsaka and Takuya Akiba and Yuichi Yoshida and
Ken-ichi Kawarabayashi",
title = "Dynamic influence analysis in evolving networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1077--1088",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994525",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose the first real-time fully-dynamic index
data structure designed for influence analysis on
evolving networks. With this aim, we carefully redesign
the data structure of the state-of-the-art sketching
method introduced by Borgs et al., and construct
corresponding update algorithms. Using this index, we
present algorithms for two kinds of queries, influence
estimation and influence maximization, which are
strongly motivated by practical applications, such as
viral marketing. We provide a thorough theoretical
analysis, which guarantees the non-degeneracy of the
solution accuracy after an arbitrary number of updates.
Furthermore, we introduce a reachability-tree-based
technique and a skipping method, which greatly reduce
the time consumption required for edge/vertex deletions
and vertex additions, respectively, and counter-based
random number generators, which improve the space
efficiency. Experimental evaluations using real dynamic
networks with tens of millions of edges demonstrate the
efficiency, scalability, and accuracy of our proposed
indexing scheme. Specifically, it can reflect a graph
modification within a time of several orders of
magnitude smaller than that required to reconstruct an
index from scratch, estimate the influence spread of a
vertex set accurately within a millisecond, and select
highly influential vertices at least ten times faster
than state-of-the-art static algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tran:2016:DBO,
author = "Luan Tran and Liyue Fan and Cyrus Shahabi",
title = "Distance-based outlier detection in data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1089--1100",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994526",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Continuous outlier detection in data streams has
important applications in fraud detection, network
security, and public health. The arrival and departure
of data objects in a streaming manner impose new
challenges for outlier detection algorithms, especially
in time and space efficiency. In the past decade,
several studies have been performed to address the
problem of distance-based outlier detection in data
streams (DODDS), which adopts an unsupervised
definition and does not have any distributional
assumptions on data values. Our work is motivated by
the lack of comparative evaluation among the
state-of-the-art algorithms using the same datasets on
the same platform. We systematically evaluate the most
recent algorithms for DODDS under various stream
settings and outlier rates. Our extensive results show
that in most settings, the MCOD algorithm offers the
superior performance among all the algorithms,
including the most recent algorithm Thresh\_LEAP.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mountantonakis:2016:MLC,
author = "Michalis Mountantonakis and Yannis Tzitzikas",
title = "On measuring the lattice of commonalities among
several linked datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1101--1112",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994527",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A big number of datasets has been published according
to the principles of Linked Data and this number keeps
increasing. Although the ultimate objective is linking
and integration, it is not currently evident how
connected the current LOD cloud is. Measurements (and
indexes) that involve more than two datasets are not
available although they are important: (a) for
obtaining complete information about one particular URI
(or set of URIs) with provenance (b) for aiding dataset
discovery and selection, (c) for assessing the
connectivity between any set of datasets for quality
checking and for monitoring their evolution over time,
(d) for constructing visualizations that provide more
informative overviews. Since it would be prohibitively
expensive to perform all these measurements in a
na{\"\i}ve way, in this paper we introduce indexes (and
their construction algorithms) that can speedup such
tasks. In brief, we introduce (i) a namespace-based
prefix index, (ii) a sameAs catalog for computing the
symmetric and transitive closure of the owl:sameAs
relationships encountered in the datasets, (iii) a
semantics-aware element index (that exploits the
aforementioned indexes), and finally (iv) two
lattice-based incremental algorithms for speeding up
the computation of the intersection of URIs of any set
of datasets. We discuss the speedup obtained by the
introduced indexes and algorithms through comparative
results and finally we report measurements about
connectivity of the LOD cloud that have never been
carried out so far.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chang:2016:ORD,
author = "Zhao Chang and Dong Xie and Feifei Li",
title = "Oblivious {RAM}: a dissection and experimental
evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1113--1124",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994528",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many companies choose the cloud as their data and IT
infrastructure platform. The remote access of the data
brings the issue of trust. Despite the use of strong
encryption schemes, adversaries can still learn
valuable information regarding encrypted data by
observing the data access patterns. To that end, one
can hide the access patterns, which may leak sensitive
information, using Oblivious RAMs (ORAMs). Numerous
works have proposed different ORAM constructions, but
they have never been thoroughly compared against and
tested on large databases. There are also no open
source implementation of these schemes. These
limitations make it difficult for researchers and
practitioners to choose and adopt a suitable ORAM for
their applications. To address this issue, we provide a
thorough study over several practical ORAM
constructions, and implement them under the same
library. We perform extensive experiments to provide
insights into their performance characteristics with
respect to efficiency, scalability, and communication
cost.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kastrati:2016:OCP,
author = "Fisnik Kastrati and Guido Moerkotte",
title = "Optimization of conjunctive predicates for main memory
column stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1125--1136",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994529",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Optimization of queries with conjunctive predicates
for main memory databases remains a challenging task.
The traditional way of optimizing this class of queries
relies on predicate ordering based on selectivities or
ranks. However, the optimization of queries with
conjunctive predicates is a much more challenging task,
requiring a holistic approach in view of (1) an
accurate cost model that is aware of CPU architectural
characteristics such as branch (mis)prediction, (2) a
storage layer, allowing for a streamlined query
execution, (3) a common subexpression elimination
technique, minimizing column access costs, and (4) an
optimization algorithm able to pick the optimal plan
even in presence of a small (bounded) estimation error.
In this work, we embrace the holistic approach, and
show its superiority experimentally. Current approaches
typically base their optimization algorithms on at
least one of two assumptions: (1) the predicate
selectivities are assumed to be independent, (2) the
predicate costs are assumed to be constant. Our
approach is not based on these assumptions, as they in
general do not hold.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chothia:2016:EOM,
author = "Zaheer Chothia and John Liagouris and Frank McSherry
and Timothy Roscoe",
title = "Explaining outputs in modern data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1137--1148",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994530",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We report on the design and implementation of a
general framework for interactively explaining the
outputs of modern data-parallel computations, including
iterative data analytics. To produce explanations,
existing works adopt a naive backward tracing approach
which runs into known issues; naive backward tracing
may identify: (i) too much information that is
difficult to process, and (ii) not enough information
to reproduce the output, which hinders the logical
debugging of the program. The contribution of this work
is twofold. First, we provide methods to effectively
reduce the size of explanations based on the first
occurrence of a record in an iterative computation.
Second, we provide a general method for identifying
explanations that are sufficient to reproduce the
target output in arbitrary computations --- a problem
for which no viable solution existed until now. We
implement our approach on differential dataflow, a
modern high-throughput, low-latency dataflow platform.
We add a small (but extensible) set of rules to explain
each of its data-parallel operators, and we implement
these rules as differential dataflow operators
themselves. This choice allows our implementation to
inherit the performance characteristics of differential
dataflow, and results in a system that efficiently
computes and updates explanatory inputs even as the
inputs of the reference computation change. We evaluate
our system with various analytic tasks on real
datasets, and we show that it produces concise
explanations in tens of milliseconds, while remaining
faster --- up to two orders of magnitude --- than even
the best implementations that do not support
explanations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Buneman:2016:RGA,
author = "Peter Buneman and Slawek Staworko",
title = "{RDF} graph alignment with bisimulation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1149--1160",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994531",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We investigate the problem of aligning two RDF
databases, an essential problem in understanding the
evolution of ontologies. Our approaches address three
fundamental challenges: (1) the use of ``blank'' (null)
names, (2) ontology changes in which different names
are used to identify the same entity, and (3) small
changes in the data values as well as small changes in
the graph structure of the RDF database. We propose
approaches inspired by the classical notion of graph
bisimulation and extend them to capture the natural
metrics of edit distance on the data values and the
graph structure. We evaluate our methods on three
evolving curated data sets. Overall, our results show
that the proposed methods perform well and are
scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bursztyn:2016:TRA,
author = "Damian Bursztyn and Fran{\c{c}}ois Goasdou{\'e} and
Ioana Manolescu",
title = "Teaching an {RDBMS} about ontological constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1161--1172",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994532",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the presence of an ontology, query answers must
reflect not only data explicitly present in the
database, but also implicit data, which holds due to
the ontology, even though it is not present in the
database. A large and useful set of ontology languages
enjoys FOL reducibility of query answering: answering a
query can be reduced to evaluating a certain
first-order logic (FOL) formula (obtained from the
query and ontology) against only the explicit facts. We
present a novel query optimization framework for
ontology-based data access settings enjoying FOL
reducibility. Our framework is based on searching
within a set of alternative equivalent FOL queries,
i.e., FOL reformulations, one with minimal evaluation
cost when evaluated through a relational database
system. We apply this framework to the DL-Lite$_R$
Description Logic underpinning the W3C's OWL2 QL
ontology language, and demonstrate through experiments
its performance benefits when two leading SQL systems,
one open-source and one commercial, are used for
evaluating the FOL query reformulations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Simonini:2016:BLS,
author = "Giovanni Simonini and Sonia Bergamaschi and H. V.
Jagadish",
title = "{BLAST}: a loosely schema-aware meta-blocking approach
for entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1173--1184",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994533",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Identifying records that refer to the same entity is a
fundamental step for data integration. Since it is
prohibitively expensive to compare every pair of
records, blocking techniques are typically employed to
reduce the complexity of this task. These techniques
partition records into blocks and limit the comparison
to records co-occurring in a block. Generally, to deal
with highly heterogeneous and noisy data (e.g.
semi-structured data of the Web), these techniques rely
on redundancy to reduce the chance of missing matches.
Meta-blocking is the task of restructuring blocks
generated by redundancy-based blocking techniques,
removing superfluous comparisons. Existing
meta-blocking approaches rely exclusively on
schema-agnostic features. In this paper, we demonstrate
how ``loose'' schema information (i.e., statistics
collected directly from the data) can be exploited to
enhance the quality of the blocks in a holistic loosely
schema-aware (meta-)blocking approach that can be used
to speed up your favorite Entity Resolution algorithm.
We call it Blast (Blocking with Loosely-Aware Schema
Techniques). We show how Blast can automatically
extract this loose information by adopting a LSH-based
step for efficiently scaling to large datasets. We
experimentally demonstrate, on real-world datasets, how
Blast outperforms the state-of-the-art unsupervised
meta-blocking approaches, and, in many cases, also the
supervised one.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2016:LEI,
author = "Erkang Zhu and Fatemeh Nargesian and Ken Q. Pu and
Ren{\'e}e J. Miller",
title = "{LSH} ensemble: {Internet}-scale domain search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1185--1196",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994534",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of domain search where a domain
is a set of distinct values from an unspecified
universe. We use Jaccard set containment score, defined
as $ | Q \cap X | / | Q | $, as the measure of
relevance of a domain $X$ to a query domain $Q$. Our
choice of Jaccard set containment over Jaccard
similarity as a measure of relevance makes our work
particularly suitable for searching Open Data and data
on the web, as Jaccard similarity is known to have poor
performance over sets with large differences in their
domain sizes. We demonstrate that the domains found in
several real-life Open Data and web data repositories
show a power-law distribution over their domain sizes.
We present a new index structure, Locality Sensitive
Hashing (LSH) Ensemble, that solves the domain search
problem using set containment at Internet scale. Our
index structure and search algorithm cope with the data
volume and skew by means of data sketches using Minwise
Hashing and domain partitioning. Our index structure
does not assume a prescribed set of data values. We
construct a cost model that describes the accuracy of
LSH Ensemble with any given partitioning. This allows
us to formulate the data partitioning for LSH Ensemble
as an optimization problem. We prove that there exists
an optimal partitioning for any data distribution.
Furthermore, for datasets following a power-law
distribution, as observed in Open Data and Web data
corpora, we show that the optimal partitioning can be
approximated using equi-depth, making it particularly
efficient to use in practice. We evaluate our algorithm
using real data (Canadian Open Data and WDC Web Tables)
containing over 262 million domains. The experiments
demonstrate that our index consistently outperforms
other leading alternatives in accuracy and performance.
The improvements are most dramatic for data with large
skew in the domain sizes. Even at 262 million domains,
our index sustains query performance with under 3
seconds response time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Konda:2016:MTBa,
author = "Pradap Konda and Sanjib Das and Paul Suganthan G. C.
and AnHai Doan and Adel Ardalan and Jeffrey R. Ballard
and Han Li and Fatemah Panahi and Haojun Zhang and Jeff
Naughton and Shishir Prasad and Ganesh Krishnan and
Rohit Deep and Vijay Raghavendra",
title = "{Magellan}: toward building entity matching management
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1197--1208",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994535",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity matching (EM) has been a long-standing
challenge in data management. Most current EM works
focus only on developing matching algorithms. We argue
that far more efforts should be devoted to building EM
systems. We discuss the limitations of current EM
systems, then present as a solution Magellan, a new
kind of EM systems. Magellan is novel in four important
aspects. (1) It provides how-to guides that tell users
what to do in each EM scenario, step by step. (2) It
provides tools to help users do these steps; the tools
seek to cover the entire EM pipeline, not just matching
and blocking as current EM systems do. (3) Tools are
built on top of the data analysis and Big Data stacks
in Python, allowing Magellan to borrow a rich set of
capabilities in data cleaning, IE, visualization,
learning, etc. (4) Magellan provides a powerful
scripting environment to facilitate interactive
experimentation and quick ``patching'' of the system.
We describe research challenges raised by Magellan,
then present extensive experiments with 44 students and
users at several organizations that show the promise of
the Magellan approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Saha:2016:AOD,
author = "Diptikalyan Saha and Avrilia Floratou and Karthik
Sankaranarayanan and Umar Farooq Minhas and Ashish R.
Mittal and Fatma {\"O}zcan",
title = "{ATHENA}: an ontology-driven system for natural
language querying over relational data stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1209--1220",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994536",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present ATHENA, an ontology-driven
system for natural language querying of complex
relational databases. Natural language interfaces to
databases enable users easy access to data, without the
need to learn a complex query language, such as SQL.
ATHENA uses domain specific ontologies, which describe
the semantic entities, and their relationships in a
domain. We propose a unique two-stage approach, where
the input natural language query (NLQ) is first
translated into an intermediate query language over the
ontology, called OQL, and subsequently translated into
SQL. Our two-stage approach allows us to decouple the
physical layout of the data in the relational store
from the semantics of the query, providing physical
independence. Moreover, ontologies provide richer
semantic information, such as inheritance and
membership relations, that are lost in a relational
schema. By reasoning over the ontologies, our NLQ
engine is able to accurately capture the user intent.
We study the effectiveness of our approach using three
different workloads on top of geographical (GEO),
academic (MAS) and financial (FIN) data. ATHENA
achieves 100\% precision on the GEO and MAS workloads,
and 99\% precision on the FIN workload which operates
on a complex financial ontology. Moreover, ATHENA
attains 87.2\%, 88.3\%, and 88.9\% recall on the GEO,
MAS, and FIN workloads, respectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wesley:2016:ICC,
author = "Richard Wesley and Fei Xu",
title = "Incremental computation of common windowed holistic
aggregates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1221--1232",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994537",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Windowed aggregates are a SQL 2003 feature for
computing aggregates in moving windows. Common examples
include cumulative sums, local maxima and moving
quantiles. With the advent over the last few years of
easy-to-use data analytics tools, these functions are
becoming widely used by more and more analysts, but
some aggregates (such as local maxima) are much easier
to compute than others (such as moving quantiles).
Nevertheless, aggregates that are more difficult to
compute, like quantile and mode (or ``most frequent'')
provide more appropriate statistical summaries in the
common situation when a distribution is not Gaussian
and are an essential part of a data analysis toolkit.
Recent work has described highly efficient windowed
implementations of the most common aggregate function
categories, including distributive$^1$ aggregates such
as cumulative sums and algebraic aggregates such as
moving averages. But little has been published on
either the implementation or the performance of the
more complex holistic windowed aggregates such as
moving quantiles. This paper provides the first
in-depth study of how to efficiently implement the
three most common holistic windowed aggregates (count
distinct, mode and quantile) by reusing the aggregate
state between consecutive frames. Our measurements show
that these incremental algorithms generally achieve
improvements of about 10x over na{\"\i}ve
implementations, and that they can effectively detect
when to reset the internal state during extreme frame
variation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2016:ECS,
author = "Yixiang Fang and Reynold Cheng and Siqiang Luo and
Jiafeng Hu",
title = "Effective community search for large attributed
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "12",
pages = "1233--1244",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/2994509.2994538",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 6 16:21:12 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a graph $G$ and a vertex $ q \in G$, the
community search query returns a subgraph of $G$ that
contains vertices related to $q$. Communities, which
are prevalent in attributed graphs such as social
networks and knowledge bases, can be used in emerging
applications such as product advertisement and setting
up of social events. In this paper, we investigate the
attributed community query (or ACQ), which returns an
attributed community (AC) for an attributed graph. The
AC is a subgraph of $G$, which satisfies both structure
cohesiveness (i.e., its vertices are tightly connected)
and keyword cohesiveness (i.e., its vertices share
common keywords). The AC enables a better understanding
of how and why a community is formed (e.g., members of
an AC have a common interest in music, because they all
have the same keyword ``music''). An AC can be
``personalized''; for example, an ACQ user may specify
that an AC returned should be related to some specific
keywords like ``research'' and ``sports''. To enable
efficient AC search, we develop the CL-tree index
structure and three algorithms based on it. We evaluate
our solutions on four large graphs, namely Flickr,
DBLP, Tencent, and DBpedia. Our results show that ACs
are more effective and efficient than existing
community retrieval approaches. Moreover, an AC
contains more precise and personalized information than
that of existing community search and detection
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lang:2016:TIA,
author = "Willis Lang and Karthik Ramachandra and David J.
DeWitt and Shize Xu and Qun Guo and Ajay Kalhan and
Peter Carlin",
title = "Not for the timid: on the impact of aggressive
over-booking in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1245--1256",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To lower hosting costs and service prices,
database-as-a-service (DBaaS) providers strive to
maximize cluster utilization without negatively
affecting their users' service experience. Some of the
most effective approaches for increasing service
efficiency result in the over-booking of the cluster
with user databases. For instance, one approach is to
reclaim cluster capacity from a database when it is
idle, temporarily re-using the capacity for some other
purpose, and over-booking the cluster's resources. Such
approaches are largely driven by policies that
determine when it is prudent to temporarily reclaim
capacity from an idle database. In this paper, we
examine policies that inherently tune the system's idle
sensitivity. Increased sensitivity to idleness leads to
aggressive over-booking while the converse leads to
conservative reclamation and lower utilization levels.
Aggressive over-booking also incurs a ``reserve''
capacity cost (for when we suddenly ``owe'' capacity to
previously idle databases.) We answer these key
questions in this paper: (1) how to find a ``good''
resource reclamation policy for a given DBaaS cluster
of users; and (2) how to forecast the needed near-term
reserve capacity. To help us answer these questions, we
used production user activity traces from Azure SQL DB
and built models of an over-booking mechanism. We show
that choosing the right policy can substantially boost
the efficiency of the service, facilitating lower
service prices via lower amortized infrastructure
costs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sevenich:2016:UDS,
author = "Martin Sevenich and Sungpack Hong and Oskar van Rest
and Zhe Wu and Jayanta Banerjee and Hassan Chafi",
title = "Using domain-specific languages for analytic graph
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1257--1268",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently graph has been drawing lots of attention both
as a natural data model that captures fine-grained
relationships between data entities and as a tool for
powerful data analysis that considers such
relationships. In this paper, we present a new graph
database system that integrates a robust graph storage
with an efficient graph analytics engine. Primarily,
our system adopts two domain-specific languages (DSLs),
one for describing graph analysis algorithms and the
other for graph pattern matching queries. Compared to
the API-based approaches in conventional graph
processing systems, the DSL-based approach provides
users with more flexible and intuitive ways of
expressing algorithms and queries. Moreover, the
DSL-based approach has significant performance benefits
as well, (1) by skipping (remote) API invocation
overhead and (2) by applying high-level optimization
from the compiler.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2016:KLM,
author = "Shaosu Liu and Bin Song and Sriharsha Gangam and
Lawrence Lo and Khaled Elmeleegy",
title = "{Kodiak}: leveraging materialized views for very
low-latency analytics over high-dimensional web-scale
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1269--1280",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Turn's online advertising campaigns produce petabytes
of data. This data is composed of trillions of events,
e.g. impressions, clicks, etc., spanning multiple
years. In addition to a timestamp, each event includes
hundreds of fields describing the user's attributes,
campaign's attributes, attributes of where the ad was
served, etc. Advertisers need advanced analytics to
monitor their running campaigns' performance, as well
as to optimize future campaigns. This involves slicing
and dicing the data over tens of dimensions over
arbitrary time ranges. Many of these queries need to
power the web portal to provide reports and dashboards.
For an interactive response time, they have to have
tens of milliseconds latency. At Turn's scale of
operations, no existing system was able to deliver this
performance in a cost effective manner. Kodiak, a
distributed analytical data platform for web-scale
high-dimensional data, was built to serve this need. It
relies on pre-computations to materialize thousands of
views to serve these advanced queries. These views are
partitioned and replicated across Kodiak's storage
nodes for scalability and reliability. They are system
maintained as new events arrive. At query time, the
system auto-selects the most suitable view to serve
each query. Kodiak has been used in production for over
a year. It hosts 2490 views for over three petabytes of
raw data serving over 200K queries daily. It has median
and 99\% query latencies of 8 ms and 252 ms
respectively. Our experiments show that its query
latency is 3 orders of magnitude faster than leading
big data platforms on head-to-head comparisons using
Turn's query workload. Moreover, Kodiak uses 4 orders
of magnitude less resources to run the same workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sharma:2016:GRT,
author = "Aneesh Sharma and Jerry Jiang and Praveen Bommannavar
and Brian Larson and Jimmy Lin",
title = "{GraphJet}: real-time content recommendations at
{Twitter}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1281--1292",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents GraphJet, a new graph-based system
for generating content recommendations at Twitter. As
motivation, we trace the evolution of our formulation
and approach to the graph recommendation problem,
embodied in successive generations of systems. Two
trends can be identified: supplementing batch with
real-time processing and a broadening of the scope of
recommendations from users to content. Both of these
trends come together in GraphJet, an in-memory graph
processing engine that maintains a real-time bipartite
interaction graph between users and tweets. The storage
engine implements a simple API, but one that is
sufficiently expressive to support a range of
recommendation algorithms based on random walks that we
have refined over the years. Similar to Cassovary, a
previous graph recommendation engine developed at
Twitter, GraphJet assumes that the entire graph can be
held in memory on a single server. The system organizes
the interaction graph into temporally-partitioned index
segments that hold adjacency lists. GraphJet is able to
support rapid ingestion of edges while concurrently
serving lookup queries through a combination of compact
edge encoding and a dynamic memory allocation scheme
that exploits power-law characteristics of the graph.
Each GraphJet server ingests up to one million graph
edges per second, and in steady state, computes up to
500 recommendations per second, which translates into
several million edge read operations per second.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ma:2016:DFP,
author = "Edward Ma and Vishrut Gupta and Meichun Hsu and
Indrajit Roy",
title = "\pkg{dmapply}: a functional primitive to express
distributed machine learning algorithms in {R}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1293--1304",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/s-plus.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Due to R's popularity as a data-mining tool, many
distributed systems expose an R-based API to users who
need to build a distributed application in R. As a
result, data scientists have to learn to use different
interfaces such as RHadoop, SparkR, Revolution R's
ScaleR, and HPE's Distributed R. Unfortunately, these
interfaces are custom, non-standard, and difficult to
learn. Not surprisingly, R applications written in one
framework do not work in another, and each backend
infrastructure has spent redundant effort in
implementing distributed machine learning algorithms.
Working with the members of R-core, we have created ddR
(Distributed Data structures in R), a unified system
that works across different distributed frameworks. In
ddR, we introduce a novel programming primitive called
dmapply that executes functions on distributed data
structures. The dmapply primitive encapsulates
different computation patterns: from function and data
broadcast to pair-wise communication. We show that
dmapply is powerful enough to express algorithms that
fit the statistical query model, which includes many
popular machine learning algorithms, as well as
applications written in MapReduce. We have integrated
ddR with many backends, such as R's single-node
parallel framework, multi-node SNOW framework, Spark,
and HPE Distributed R, with few or no modifications to
any of these systems. We have also implemented multiple
machine learning algorithms which are not only portable
across different distributed systems, but also have
performance comparable to the ``native''
implementations on the backends. We believe that ddR
will standardize distributed computing in R, just like
the SQL interface has standardized how relational data
is manipulated.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Pedreira:2016:CIM,
  author = {Pedro Pedreira and Chris Croswhite and Luis Bona},
  title = {{Cubrick}: indexing millions of records per second for
    interactive analytics},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1305--1316},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {This paper describes the architecture and design of
    Cubrick, a distributed multidimensional in-memory DBMS
    suited for interactive analytics over highly dynamic
    datasets. Cubrick has a strictly multidimensional data
    model composed of cubes, dimensions and metrics,
    supporting sub-second OLAP operations such as slice and
    dice, roll-up and drill-down over terabytes of data.
    All data stored in Cubrick is range partitioned by
    every dimension and stored within containers called
    bricks in an unordered and sparse fashion, providing
    high data ingestion rates and indexed access through
    any combination of dimensions. In this paper, we
    describe details about Cubrick's internal data
    structures, distributed model, query execution engine
    and a few details about the current implementation.
    Finally, we present results from a thorough
    experimental evaluation that leveraged datasets and
    queries collected from a few internal Cubrick
    deployments at Facebook.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Iosup:2016:LGB,
author = "Alexandru Iosup and Tim Hegeman and Wing Lung Ngai and
Stijn Heldens and Arnau Prat-P{\'e}rez and Thomas
Manhardt and Hassan Chafi and Mihai Capot{\u{a}} and
Narayanan Sundaram and Michael Anderson and Ilie
Gabriel T{\u{a}}nase and Yinglong Xia and Lifeng Nai and
Peter Boncz",
title = "{LDBC Graphalytics}: a benchmark for large-scale graph
analysis on parallel and distributed platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1317--1328",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we introduce LDBC Graphalytics, a new
industrial-grade benchmark for graph analysis
platforms. It consists of six deterministic algorithms,
standard datasets, synthetic dataset generators, and
reference output, that enable the objective comparison
of graph analysis platforms. Its test harness produces
deep metrics that quantify multiple kinds of system
scalability, such as horizontal/vertical and
weak/strong, and of robustness, such as failures and
performance variability. The benchmark comes with
open-source software for generating data and monitoring
performance. We describe and analyze six
implementations of the benchmark (three from the
community, three from the industry), providing insights
into the strengths and weaknesses of the platforms. Key
to our contribution, vendors perform the tuning and
benchmarking of their platforms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Lustosa:2016:DSS,
  author = {Hermano Lustosa and Fabio Porto and Patrick Valduriez
    and Pablo Blanco},
  title = {Database system support of simulation data},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1329--1340},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Supported by increasingly efficient HPC
    infra-structure, numerical simulations are rapidly
    expanding to fields such as oil and gas, medicine and
    meteorology. As simulations become more precise and
    cover longer periods of time, they may produce files
    with terabytes of data that need to be efficiently
    analyzed. In this paper, we investigate techniques for
    managing such data using an array DBMS. We take
    advantage of multidimensional arrays that nicely models
    the dimensions and variables used in numerical
    simulations. However, a naive approach to map
    simulation data files may lead to sparse arrays,
    impacting query response time, in particular, when the
    simulation uses irregular meshes to model its physical
    domain. We propose efficient techniques to map
    coordinate values in numerical simulations to evenly
    distributed cells in array chunks with the use of
    equi-depth histograms and space-filling curves. We
    implemented our techniques in SciDB and, through
    experiments over real-world data, compared them with
    two other approaches: row-store and column-store DBMS.
    The results indicate that multidimensional arrays and
    column-stores are much faster than a traditional
    row-store system for queries over a larger amount of
    simulation data. They also help identifying the
    scenarios where array DBMSs are most efficient, and
    those where they are outperformed by column-stores.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Jacques-Silva:2016:CRG,
author = "Gabriela Jacques-Silva and Fang Zheng and Daniel
Debrunner and Kun-Lung Wu and Victor Dogaru and Eric
Johnson and Michael Spicer and Ahmet Erdem
Sariy{\"u}ce",
title = "Consistent regions: guaranteed tuple processing in
{IBM Streams}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1341--1352",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Guaranteed tuple processing has become critically
important for many streaming applications. This paper
describes how we enabled IBM Streams, an
enterprise-grade stream processing system, to provide
data processing guarantees. Our solution goes from
language-level abstractions to a runtime protocol. As a
result, with a couple of simple annotations at the
source code level, IBM Streams developers can define
consistent regions, allowing any subgraph of their
streaming application to achieve guaranteed tuple
processing. At runtime, a consistent region
periodically executes a variation of the Chandy-Lamport
snapshot algorithm to establish a consistent global
state for that region. The coupling of consistent
states with data replay enables guaranteed tuple
processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Al-Kateb:2016:HRC,
  author = {Mohammed Al-Kateb and Paul Sinclair and Grace Au and
    Carrie Ballinger},
  title = {Hybrid row-column partitioning in {Teradata\reg}},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1353--1364},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Data partitioning is an indispensable ingredient of
    database systems due to the performance improvement it
    can bring to any given mixed workload. Data can be
    partitioned horizontally or vertically. While some
    commercial proprietary and open source database systems
    have one flavor or mixed flavors of these partitioning
    forms, Teradata Database offers a unique hybrid
    row-column store solution that seamlessly combines both
    of these partitioning schemes. The key feature of this
    hybrid solution is that either row, column, or combined
    partitions are all stored and handled in the same way
    internally by the underlying file system storage layer.
    In this paper, we present the main characteristics and
    explain the implementation approach of Teradata's
    row-column store. We also discuss query optimization
    techniques applicable specifically to partitioned
    tables. Furthermore, we present a performance study
    that demonstrates how different partitioning options
    impact the performance of various queries.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Fernandes:2016:THH,
  author = {Ricardo Fernandes and Piotr Zaczkowski and Bernd
    G{\"o}ttler and Conor Ettinoffe and Anis Moussa},
  title = {{TrafficDB}: {HERE}'s high performance shared-memory
    data store},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1365--1376},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {HERE's traffic-aware services enable route planning
    and traffic visualisation on web, mobile and connected
    car applications. These services process thousands of
    requests per second and require efficient ways to
    access the information needed to provide a timely
    response to end-users. The characteristics of road
    traffic information and these traffic-aware services
    require storage solutions with specific performance
    features. A route planning application utilising
    traffic congestion information to calculate the optimal
    route from an origin to a destination might hit a
    database with millions of queries per second. However,
    existing storage solutions are not prepared to handle
    such volumes of concurrent read operations, as well as
    to provide the desired vertical scalability. This paper
    presents TrafficDB, a shared-memory data store,
    designed to provide high rates of read operations,
    enabling applications to directly access the data from
    memory. Our evaluation demonstrates that TrafficDB
    handles millions of read operations and provides
    near-linear scalability on multi-core machines, where
    additional processes can be spawned to increase the
    systems' throughput without a noticeable impact on the
    latency of querying the data store. The paper concludes
    with a description of how TrafficDB improved the
    performance of our traffic-aware services running in
    production.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Scotti:2016:CBH,
  author = {Alex Scotti and Mark Hannum and Michael Ponomarenko
    and Dorin Hogea and Akshat Sikarwar and Mohit Khullar
    and Adi Zaimi and James Leddy and Rivers Zhang and
    Fabio Angius and Lingzhi Deng},
  title = {{Comdb2}: {Bloomberg}'s highly available relational
    database system},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1377--1388},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Comdb2 is a distributed database system designed for
    geographical replication and high availability. In
    contrast with the latest trends in this field, Comdb2
    offers full transactional support, a standard
    relational model, and the expressivity of SQL.
    Moreover, the system allows for rich stored procedures
    using a dialect of Lua. Comdb2 implements a
    serializable system in which reads from any node always
    return current values. Comdb2 provides transparent High
    Availability through built-in service discovery and
    sophisticated retry logic embedded in the standard API.
    In addition to the relational data model, Comdb2
    implements queues for publisher-to-subscriber message
    delivery. Queues can be combined with table triggers
    for time-consistent log distribution, providing
    functionality commonly needed in modern OLTP. In this
    paper we give an overview of our last twelve years of
    work. We focus on the design choices that have made
    Comdb2 the primary database solution within our
    company, Bloomberg LP (BLP).},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Srinivasan:2016:AAR,
  author = {V. Srinivasan and Brian Bulkowski and Wei-Ling Chu and
    Sunil Sayyaparaju and Andrew Gooding and Rajkumar Iyer
    and Ashish Shinde and Thomas Lopatic},
  title = {{Aerospike}: architecture of a real-time operational
    {DBMS}},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1389--1400},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {In this paper, we describe the solutions developed to
    address key technical challenges encountered while
    building a distributed database system that can
    smoothly handle demanding real-time workloads and
    provide a high level of fault tolerance. Specifically,
    we describe schemes for the efficient clustering and
    data partitioning for the automatic scale out of
    processing across multiple nodes and for optimizing the
    usage of CPUs, DRAM, SSDs and networks to efficiently
    scale up performance on one node. The techniques
    described here were used to develop Aerospike (formerly
    Citrusleaf), a high performance distributed database
    system built to handle the needs of today's interactive
    online services. Most real-time decision systems that
    use Aerospike require very high scale and need to make
    decisions within a strict SLA by reading from, and
    writing to, a database containing billions of data
    items at a rate of millions of operations per second
    with sub-millisecond latency. For over five years,
    Aerospike has been continuously used in over a hundred
    successful production deployments, as many enterprises
    have discovered that it can substantially enhance their
    user experience.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Chen:2016:MQO,
author = "Jack Chen and Samir Jindel and Robert Walzer and
Rajkumar Sen and Nika Jimsheleishvili and Michael
Andrews",
title = "The {MemSQL} query optimizer: a modern optimizer for
real-time analytics in a distributed database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1401--1412",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Real-time analytics on massive datasets has become a
very common need in many enterprises. These
applications require not only rapid data ingest, but
also quick answers to analytical queries operating on
the latest data. MemSQL is a distributed SQL database
designed to exploit memory-optimized, scale-out
architecture to enable real-time transactional and
analytical workloads which are fast, highly concurrent,
and extremely scalable. Many analytical queries in
MemSQL's customer workloads are complex queries
involving joins, aggregations, sub-queries, etc. over
star and snowflake schemas, often ad-hoc or produced
interactively by business intelligence tools. These
queries often require latencies of seconds or less, and
therefore require the optimizer to not only produce a
high quality distributed execution plan, but also
produce it fast enough so that optimization time does
not become a bottleneck. In this paper, we describe the
architecture of the MemSQL Query Optimizer and the
design choices and innovations which enable it quickly
produce highly efficient execution plans for complex
distributed queries. We discuss how query rewrite
decisions oblivious of distribution cost can lead to
poor distributed execution plans, and argue that to
choose high-quality plans in a distributed database,
the optimizer needs to be distribution-aware in
choosing join plans, applying query rewrites, and
costing plans. We discuss methods to make join
enumeration faster and more effective, such as a
rewrite-based approach to exploit bushy joins in
queries involving multiple star schemas without
sacrificing optimization time. We demonstrate the
effectiveness of the MemSQL optimizer over queries from
the TPC-H benchmark and a real customer workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Lakshman:2016:NFS,
  author = {Sarath Lakshman and Sriram Melkote and John Liang and
    Ravi Mayuram},
  title = {{Nitro}: a fast, scalable in-memory storage engine for
    {NoSQL} global secondary index},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1413--1424},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {We present Nitro, a high-performance in-memory
    key--value storage engine used in Couchbase 4.5 Global
    Secondary Indexes. The Nitro storage engine is well
    suited for the recent hardware trends like large
    amounts of memory and many CPU cores. The storage
    engine leverages latch-free data structures and tries
    to achieve linear scalability for the index read-write
    operations. The Nitro storage engine offers concurrent
    readers and writers, lightweight database snapshots,
    stable scan, backup and recovery operations. We
    integrated Nitro into the Couchbase Global Secondary
    Indexes (GSI) and observed significant improvement in
    performance compared to our disk oriented storage
    engine configured with the same amount of memory for
    buffer cache. On a 32 core machine, we observed an
    end-to-end GSI server insertion throughput of 1,650,000
    entries/sec and index update throughput of 822,000
    entries/sec. A single instance of Nitro data structure
    running on a 40 core machine achieved a peak insertion
    throughput of 4 million index entries/sec and entry
    lookup throughput of 10 million lookups/sec.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Boehm:2016:SDM,
author = "Matthias Boehm and Michael W. Dusenberry and Deron
Eriksson and Alexandre V. Evfimievski and Faraz Makari
Manshadi and Niketan Pansare and Berthold Reinwald and
Frederick R. Reiss and Prithviraj Sen and Arvind C.
Surve and Shirish Tatikonda",
title = "{SystemML}: declarative machine learning on {Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1425--1436",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The rising need for custom machine learning (ML)
algorithms and the growing data sizes that require the
exploitation of distributed, data-parallel frameworks
such as MapReduce or Spark, pose significant
productivity challenges to data scientists. Apache
SystemML addresses these challenges through declarative
ML by (1) increasing the productivity of data
scientists as they are able to express custom
algorithms in a familiar domain-specific language
covering linear algebra primitives and statistical
functions, and (2) transparently running these ML
algorithms on distributed, data-parallel frameworks by
applying cost-based compilation techniques to generate
efficient, low-level execution plans with in-memory
single-node and large-scale distributed operations.
This paper describes SystemML on Apache Spark, end to
end, including insights into various optimizer and
runtime techniques as well as performance
characteristics. We also share lessons learned from
porting SystemML to Spark and declarative ML in
general. Finally, SystemML is open-source, which allows
the database community to leverage it as a testbed for
further research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Mishra:2016:AAD,
  author = {Aurosish Mishra and Shasank Chavan and Allison
    Holloway and Tirthankar Lahiri and Zhen Hua Liu and
    Sunil Chakkappen and Dennis Lui and Vinita Subramanian
    and Ramesh Kumar and Maria Colgan and Jesse Kamp and
    Niloy Mukherjee and Vineet Marwah},
  title = {Accelerating analytics with dynamic in-memory
    expressions},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1437--1448},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Oracle Database In-Memory (DBIM) accelerates analytic
    workload performance by orders of magnitude through an
    in-memory columnar format utilizing techniques such as
    SIMD vector processing, in-memory storage indexes, and
    optimized predicate evaluation and aggregation. With
    Oracle Database 12.2, Database In-Memory is further
    enhanced to accelerate analytic processing through a
    novel lightweight mechanism known as Dynamic In-Memory
    Expressions (DIMEs). The DIME mechanism automatically
    detects frequently occurring expressions in a query
    workload, and then creates highly optimized,
    transactionally consistent, in-memory columnar
    representations of these expression results. At
    runtime, queries can directly access these DIMEs, thus
    avoiding costly expression evaluations. Furthermore,
    all the optimizations introduced in DBIM can apply
    directly to DIMEs. Since DIMEs are purely in-memory
    structures, no changes are required to the underlying
    tables. We show that DIMEs can reduce query elapsed
    times by several orders of magnitude without the need
    for costly pre-computed structures such as computed
    columns or materialized views or cubes.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Bhadange:2016:GSL,
  author = {Satyajit Bhadange and Akhil Arora and Arnab
    Bhattacharya},
  title = {{GARUDA}: a system for large-scale mining of
    statistically significant connected subgraphs},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1449--1452},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Unraveling ``interesting'' subgraphs corresponding to
    disease/crime hotspots or characterizing habitation
    shift patterns is an important graph mining task. With
    the availability and growth of large-scale real-world
    graphs, mining for such subgraphs has become the need
    of the hour for graph miners as well as non-technical
    end-users. In this demo, we present GARUDA, a system
    capable of mining large-scale graphs for statistically
    significant subgraphs in a scalable manner, and
    provide: (1) a detailed description of the various
    features and user-friendly GUI of GARUDA; (2) a brief
    description of the system architecture; and (3) a
    demonstration scenario for the audience. The
    demonstration showcases one real graph mining task as
    well as its ability to scale to large real graphs,
    portraying speed-ups of upto 8--10 times over the
    state-of-the-art MSCS algorithm.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Li:2016:VVT,
author = "Huan Li and Hua Lu and Xin Chen and Gang Chen and Ke
Chen and Lidan Shou",
title = "{Vita}: a versatile toolkit for generating indoor
mobility data for real-world buildings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1453--1456",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate a generic, user-configurable toolkit
for generating different types of indoor mobility data
for real-world buildings. Our prototype generates the
desired data in a three-layer pipeline. The
Infrastructure Layer accepts industry-standard digital
building information (DBI) files to generate the host
indoor environment, allowing users to configure the
generation of a variety of positioning devices, such as
Wi-Fi, Bluetooth, RFID, etc. The Moving Object Layer
offers the functionality of defining objects or
trajectories, with configurable indoor moving patterns,
distribution models, and sampling frequencies. The
Positioning Layer generates synthetic signal strength
measurements known as raw RSSI measurements
according to the positioning device data and trajectory
data generated at relevant layers. It also generates
different types of indoor positioning data through the
customization of all typical indoor positioning methods
on the raw RSSI data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@article{Bagan:2016:GFW,
  author = {Guillaume Bagan and Angela Bonifati and Radu Ciucanu
    and George H. L. Fletcher and Aur{\'e}lien Lemay and
    Nicky Advokaat},
  title = {Generating flexible workloads for graph databases},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1457--1460},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Graph data management tools are nowadays evolving at a
    great pace. Key drivers of progress in the design and
    study of data intensive systems are solutions for
    synthetic generation of data and workloads, for use in
    empirical studies. Current graph generators, however,
    provide limited or no support for workload generation
    or are limited to fixed use-cases. Towards addressing
    these limitations, we demonstrate gMark, the first
    domain- and query language-independent framework for
    synthetic graph and query workload generation. Its
    novel features are: (i) fine-grained control of graph
    instance and query workload generation via expressive
    user-defined schemas; (ii) the support of expressive
    graph query languages, including recursion among other
    features; and, (iii) selectivity estimation of the
    generated queries. During the demonstration, we will
    showcase the highly tunable generation of graphs and
    queries through various user-defined schemas and
    targeted selectivities, and the variety of supported
    practical graph query languages. We will also show a
    performance comparison of four state-of-the-art graph
    database engines, which helps us understand their
    current strengths and desirable future extensions.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Zhou:2016:AQP,
  author = {Xiaofeng Zhou and Yang Chen and Daisy Zhe Wang},
  title = {{ArchimedesOne}: query processing over probabilistic
    knowledge bases},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1461--1464},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Knowledge bases are becoming increasingly important in
    structuring and representing information from the web.
    Meanwhile, web-scale information poses significant
    scalability and quality challenges to knowledge base
    systems. To address these challenges, we develop a
    probabilistic knowledge base system, ArchimedesOne, by
    scaling up the knowledge expansion and statistical
    inference algorithms. We design a web interface for
    users to query and update large knowledge bases. In
    this paper, we demonstrate the ArchimedesOne system to
    showcase its efficient query and inference engines. The
    demonstration serves two purposes: (1) to provide an
    interface for users to interact with ArchimedesOne
    through load, search, and update queries; and (2) to
    validate our approaches of knowledge expansion by
    applying inference rules in batches using relational
    operations and query-driven inference by focusing
    computation on the query facts. We compare
    ArchimedesOne with state-of-the-art approaches using
    two knowledge bases: NELL-sports with 4.5 million facts
    and Reverb-Sherlock with 15 million facts.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@article{Milo:2016:RIR,
  author = {Tova Milo and Slava Novgorodov and Wang-Chiew Tan},
  title = {{Rudolf}: interactive rule refinement system for fraud
    detection},
  journal = j-PROC-VLDB-ENDOWMENT,
  volume = {9},
  number = {13},
  pages = {1465--1468},
  month = sep,
  year = {2016},
  CODEN = {????},
  ISSN = {2150-8097},
  ISSN-L = {2150-8097},
  bibdate = {Wed Oct 12 10:19:51 MDT 2016},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
  abstract = {Credit card frauds are unauthorized transactions that
    are made or attempted by a person or an organization
    that is not authorized by the card holders. In addition
    to machine learning-based techniques, credit card
    companies often employ domain experts to manually
    specify rules that exploit domain knowledge for
    improving the detection process. Over time, however, as
    new (fraudulent and legitimate) transaction arrive,
    these rules need to be updated and refined to capture
    the evolving (fraud and legitimate) activity patterns.
    The goal of the RUDOLF system that is demonstrated here
    is to guide and assist domain experts in this
    challenging task. RUDOLF automatically determines a
    best set of candidate adaptations to existing rules to
    capture all fraudulent transactions and, respectively,
    omit all legitimate transactions. The proposed
    modifications can then be further refined by domain
    experts based on their domain knowledge, and the
    process can be repeated until the experts are satisfied
    with the resulting rules. Our experimental results on
    real-life datasets demonstrate the effectiveness and
    efficiency of our approach. We showcase RUDOLF with two
    demonstration scenarios: detecting credit card frauds
    and network attacks. Our demonstration will engage the
    VLDB audience by allowing them to play the role of a
    security expert, a credit card fraudster, or a network
    attacker.},
  acknowledgement = ack-nhfb,
  ajournal = {Proc. VLDB Endowment},
  fjournal = {Proceedings of the VLDB Endowment},
  journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Maccioni:2016:GDB,
author = "Antonio Maccioni and Matteo Collina",
title = "Graph databases in the browser: using {LevelGraph} to
explore {New Delhi}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1469--1472",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The pervasiveness of graphs on the Web is growing;
however, the difficulty of managing complex graph
structures curbs the development of web-oriented
applications that embed network data. The open source
project, LevelGraph, aims to overcome the obstacles
that web developers face with graph data management.
LevelGraph is an easy-to-use graph database layer for
web applications. To demonstrate various capabilities
of the system, we developed a web-based application
that utilizes a graph database of a tourist network in
New Delhi. The application allows users to move around
the city while LevelGraph executes graph queries on the
underlying database. In this demonstration, we show how
LevelGraph's features facilitate development and
maintenance of web applications that embed graph
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sellam:2016:ZCQ,
author = "Thibault Sellam and Martin Kersten",
title = "{Ziggy}: characterizing query results for data
explorers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1473--1476",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data exploration has received much attention during
the last few years. The aim is to learn interesting new
facts from a possibly unfamiliar data set. Typically,
explorers operate by trial and error: they write a
query, inspect the results and refine their
specifications accordingly. In this demo proposal, we
present Ziggy, a system to help them understand their
query results. Ziggy's aim is to complement an existing
exploration system. It assumes that users already have
a query in mind, but they do not know what is
interesting about it. To assist them, it detects
characteristic views, that is, small sets of columns on
which the tuples in the results are different from
those in the rest of the database. Thanks to these
views, our explorers can understand why their selection
is unique and make more informed exploration
decisions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sellam:2016:BMN,
author = "Thibault Sellam and Robin Cijvat and Richard
Koopmanschap and Martin Kersten",
title = "{Blaeu}: mapping and navigating large tables with
cluster analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1477--1480",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Blaeu is an interactive database exploration tool. Its
aim is to guide casual users through large data tables,
ultimately triggering insights and serendipity. To do
so, it relies on a double cluster analysis mechanism.
It clusters the data vertically: it detects themes,
groups of mutually dependent columns that highlight one
aspect of the data. Then it clusters the data
horizontally. For each theme, it produces a data map,
an interactive visualization of the clusters in the
table. The data maps summarize the data. They provide a
visual synopsis of the clusters, as well as facilities
to inspect their content and annotate them. But they
also let the users navigate further. Our explorers can
change the active set of columns or drill down into the
clusters to refine their selection. Our prototype is
fully operational, ready to deliver insights from
complex databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{El-Roby:2016:SQR,
author = "Ahmed El-Roby and Khaled Ammar and Ashraf Aboulnaga
and Jimmy Lin",
title = "{Sapphire}: querying {RDF} data made simple",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1481--1484",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is currently a large amount of publicly
accessible structured data available as RDF data sets.
For example, the Linked Open Data (LOD) cloud now
consists of thousands of RDF data sets with over 30
billion triples, and the number and size of the data
sets is continuously growing. Many of the data sets in
the LOD cloud provide public SPARQL endpoints to allow
issuing queries over them. These end-points enable
users to retrieve data using precise and highly
expressive SPARQL queries. However, in order to do so,
the user must have sufficient knowledge about the data
sets that she wishes to query, that is, the structure
of data, the vocabulary used within the data set, the
exact values of literals, their data types, etc. Thus,
while SPARQL is powerful, it is not easy to use. An
alternative to SPARQL that does not require as much
prior knowledge of the data is some form of keyword
search over the structured data. Keyword search queries
are easy to use, but inherently ambiguous in describing
structured queries. This demonstration introduces
Sapphire, a system for querying RDF data that strikes a
middle ground between ambiguous keyword search and
difficult-to-use SPARQL. Our system does not replace
either, but utilizes both where they are most
effective. Sapphire helps the user construct expressive
SPARQL queries that represent her information needs
without requiring detailed knowledge about the queried
data sets. These queries are then executed over public
SPARQL endpoints from the LOD cloud. Sapphire guides
the user in the query writing process by showing
suggestions of query terms based on the queried data,
and by recommending changes to the query based on a
predictive user model.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amsterdamer:2016:DDT,
author = "Yael Amsterdamer and Tova Milo and Amit Somech and
Brit Youngmann",
title = "{December}: a declarative tool for crowd member
selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1485--1488",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Adequate crowd selection is an important factor in the
success of crowdsourcing platforms, increasing the
quality and relevance of crowd answers and their
performance in different tasks. The optimal crowd
selection can greatly vary depending on properties of
the crowd and of the task. To this end, we present
December, a declarative platform with novel
capabilities for flexible crowd selection. December
supports the personalized selection of crowd members
via a dedicated query language Member-QL. This language
enables specifying and combining common crowd selection
criteria such as properties of a crowd member's profile
and history, similarity between profiles in specific
aspects and relevance of the member to a given task.
This holistic, customizable approach differs from
previous work that has mostly focused on dedicated
algorithms for crowd selection in specific settings. To
allow efficient query execution, we implement novel
algorithms in December based on our generic,
semantically-aware definitions of crowd member
similarity and expertise. We demonstrate the
effectiveness of December and Member-QL by using the
VLDB community as crowd members and allowing conference
participants to choose from among these members for
different purposes and in different contexts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2016:DVV,
author = "Xi He and Nisarg Raval and Ashwin Machanavajjhala",
title = "A demonstration of {VisDPT}: visual exploration of
differentially private trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1489--1492",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The release of detailed taxi trips has motivated
numerous useful studies, but has also triggered
multiple privacy attacks on individuals' trips. Despite
these attacks, no tools are available for
systematically analyzing the privacy risk of released
trajectory data. While, recent studies have proposed
mechanisms to publish synthetic mobility data with
provable privacy guarantees, the questions on --- (1)
how to explain the theoretical privacy guarantee to
non-privacy experts; and (2) how well private data
preserves the properties of ground truth, remain
unclear. To address these issues, we propose a system
--- VisDPT that provides rich visualization of
sensitive information in trajectory databases and helps
data curators understand the impact on utility due to
privacy preserving mechanisms. We believe VisDPT will
enable data curators to take informed decisions while
publishing sanitized data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Scheuer:2016:JSA,
author = "Tobias Scheuer and Norman May and Alexander B{\"o}hm
and Daniel Scheibli",
title = "{JexLog}: a sonar for the abyss",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1493--1496",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's hardware architectures provide an
ever-increasing number of CPU cores that can be used
for running concurrent operations. A big challenge is
to ensure that these operations are properly
synchronized and make efficient use of the available
resources. Fellow database researchers have
appropriately described this problem as ``staring into
the abyss'' of complexity [12], where reasoning about
the interplay of jobs on a thousand cores becomes
extremely challenging. In this demonstration, we show
how a new tool, JexLog, can help to visually analyze
concurrent jobs in system software and how it is used
to optimize for modern hardware.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ikeda:2016:CCC,
author = "Kosetsu Ikeda and Atsuyuki Morishima and Habibur
Rahman and Senjuti Basu Roy and Saravanan
Thirumuruganathan and Sihem Amer-Yahia and Gautam Das",
title = "Collaborative crowdsourcing with {Crowd4u}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1497--1500",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Collaborative crowdsourcing is an emerging paradigm
where a set of workers, often with diverse and
complementary skills, form groups and work together to
complete complex tasks. While crowdsourcing has been
used successfully in many applications, collaboration
is essential for achieving a high quality outcome for a
number of emerging applications such as text
translation, citizen journalism and surveillance tasks.
However, no crowdsourcing platform today enables the
end-to-end deployment of collaborative tasks. We
demonstrate Crowd4U, a volunteer-based system that
enables the deployment of diverse crowdsourcing tasks
with complex data-flows, in a declarative manner. In
addition to treating workers and tasks as rich
entities, Crowd4U also provides an easy-to-use
form-based task UI. Crowd4U implements worker-to-task
assignment algorithms that are appropriate for each
kind of task. Once workers are assigned to tasks,
appropriate worker collaboration schemes are enforced
in order to enable effective result coordination.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2016:YWQ,
author = "Lei Chen and Jianliang Xu and Christian S. Jensen and
Yafei Li",
title = "{YASK}: a why-not question answering engine for
spatial keyword query services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1501--1504",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the proliferation of the mobile use of the web,
spatial keyword query (SKQ) services are gaining in
importance. However, state-of-the-art SKQ systems do
not provide systematic functionality that allows users
to ask why some known object is unexpectedly missing
from a query result and do not provide an explanation
for such missing objects. In this demonstration, we
present a system called YASK, a whY-not question
Answering engine for Spatial Keyword query services,
that is capable of answering why-not questions posed in
response to answers to spatial keyword top-$k$ queries.
Two explanation and query refinement models, namely
preference adjustment and keyword adaption, are
implemented in YASK. The system provides users not only
with the reasons why desired objects are missing from
query results, but provides also relevant refined
queries that revive the expected but missing objects.
This demonstration gives attendees hands-on experience
with YASK through a map-based GUI interface in which
attendees can issue spatial keyword queries, pose
why-not questions, and visualize the results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yi:2016:AVQ,
author = "Peipei Yi and Byron Choi and Sourav S. Bhowmick and
Jianliang Xu",
title = "{AutoG}: a visual query autocompletion framework for
graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1505--1508",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Composing queries is evidently a tedious task. This is
particularly true of graph queries as they are
typically complex and prone to errors, compounded by
the fact that graph schemas can be missing or too loose
to be helpful for query formulation. Despite the great
success of query formulation aids, in particular,
automatic query completion, graph query autocompletion
has received much less research attention. In this
demonstration, we present a novel interactive visual
subgraph query autocompletion framework called AutoG
which alleviates the potentially painstaking task of
graph query formulation. Specifically, given a large
collection of small or medium-sized graphs and a visual
query fragment q formulated by a user, AutoG returns
top-$k$ query suggestions $ Q'$ as output at
interactive time. Users may choose a query from $ Q'$
and iteratively apply AutoG to compose their queries.
We demonstrate various features of AutoG and its
superior ability to generate high quality suggestions
to aid visual subgraph query formulation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Miao:2016:SPR,
author = "Xiaoye Miao and Yunjun Gao and Gang Chen and Huiyong
Cui and Chong Guo and Weida Pan",
title = "{Si$^2$ p}: a restaurant recommendation system using
preference queries over incomplete information",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1509--1512",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The incomplete data is universal in many real-life
applications due to data integration, the limitation of
devices, etc. In this demonstration, we present Si$^2$
p, a restaurant recommendation System with Preference
queries on Incomplete Information. Si$^2$ p is capable
of friendly recommending desirable restaurants based on
preference queries that take the incomplete ratings
information into consideration. It adopts the
browser-server model, and incorporates three
functionality modules including friendly and convenient
query submission, flexible and useful result
explanation, timely and incremental dataset
interaction. Si$^2$ p provides the server side based on
an extended PostgreSQL database that integrates two
types of preference queries, namely, skyline and
top-$k$ dominating queries over incomplete data. It
also offers the browser-based interface for the users
to interact with the system. Using a real restaurant
dataset from TripAdvisor, we demonstrate Si$^2$ p can
recommend and explore the restaurants in a friendly
way.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bonaque:2016:MIQ,
author = "R. Bonaque and T. D. Cao and B. Cautis and F.
Goasdou{\'e} and J. Letelier and I. Manolescu and O.
Mendoza and S. Ribeiro and X. Tannier",
title = "Mixed-instance querying: a lightweight integration
architecture for data journalism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1513--1516",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As the world's affairs get increasingly more digital,
timely production and consumption of news require to
efficiently and quickly exploit heterogeneous data
sources. Discussions with journalists revealed that
content management tools currently at their disposal
fall very short of expectations. We demonstrate
Tatooine, a lightweight data integration prototype,
which allows to quickly set up integration queries
across (very) heterogeneous data sources, capitalizing
on the many data links (joins) available in this
application domain. Our demonstration is based on
scenarios we study in collaboration with Le Monde,
France's major newspaper.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Butterstein:2016:PPS,
author = "Dennis Butterstein and Torsten Grust",
title = "Precision performance surgery for {PostgreSQL}:
{LLVM}-based Expression Compilation, Just in Time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1517--1520",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate how the compilation of SQL expressions
into machine code leads to significant query runtime
improvements in PostgreSQL 9. Our primary goal is to
connect recent research in query code generation with
one of the most widely deployed database engines. The
approach calls on LLVM to translate arithmetic and
filter expressions into native x86 instructions just
before SQL query execution begins. We deliberately
follow a non-invasive design that does not turn
PostgreSQL on its head: interpreted and compiled
expression evaluation coexist and both are used to
execute the same query. We will bring an enhanced
version of PostgreSQL that exhibits notable runtime
savings and provides visual insight into exactly where
and how execution plans can benefit from SQL expression
compilation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yahya:2016:EQE,
author = "Mohamed Yahya and Klaus Berberich and Maya Ramanath
and Gerhard Weikum",
title = "Exploratory querying of extended knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1521--1524",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Knowledge graphs (KGs) are important assets for
search, analytics, and recommendations. However,
querying a KG to explore entities and discover facts is
difficult and tedious, even for users with skills in
SPARQL. First, users are not familiar with the
structure and labels of entities, classes and
relations. Second, KGs are bound to be incomplete, as
they capture only major facts about entities and their
relationships and miss out on many of the more subtle
aspects. We demonstrate TriniT, a system that
facilitates exploratory querying of large KGs, by
addressing these issues of ``vocabulary'' mismatch and
KG incompleteness. TriniT supports query relaxation
rules that are invoked to allow for relevant answers
which are not found otherwise. The incompleteness issue
is addressed by extending a KG with additional
text-style token triples obtained by running Open IE on
Web and text sources. The query language, relaxation
methods, and answer ranking are extended appropriately.
The demo shows automatic query relaxation and has
support for interactively adding user-customized
relaxations. In both situations, the demo provides
answer explanations and offers additional query
suggestions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Panev:2016:EDR,
author = "Kiril Panev and Sebastian Michel and Evica Milchevski
and Koninika Pal",
title = "Exploring databases via reverse engineering ranking
queries with {PALEO}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1525--1528",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A novel approach to explore databases using ranked
lists is demonstrated. Working with ranked lists,
capturing the relative performance of entities, is a
very intuitive and widely applicable concept. Users can
post lists of entities for which explanatory SQL
queries and full result lists are returned. By refining
the input, the results, or the queries, users can
interactively explore the database content. The
demonstrated system is centered around our PALEO
framework for reverse engineering OLAP-style database
queries and novel work on mining interesting
categorical attributes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bespinyowong:2016:EER,
author = "Ramon Bespinyowong and Wei Chen and H. V. Jagadish and
Yuxin Ma",
title = "{ExRank}: an exploratory ranking interface",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1529--1532",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Even with simple everyday tasks like online shopping
or choosing a restaurant, users are easily overwhelmed
with the large number of choices available today, each
with a large number of inter-related attributes. We
present ExRank, an interactive interface for exploring
data that helps users understand the relationship
between attribute values and find interesting items in
the dataset. Based on a kNN graph and a PageRank
algorithm, ExRank suggests which attributes the user
should look at, and how expressed choices in particular
attributes affect the distribution of values in other
attributes for candidate objects. It solves the problem
of empty result by showing similar items and when there
are too many results, it ranks the data for the user.
This demo consists of (1) the description of the
software architecture and the user interface (2) the
logic and reason behind our solution and (3) a list of
demonstration scenarios for showing to the audience.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Diaz:2016:SQR,
author = "Gonzalo Diaz and Marcelo Arenas and Michael Benedikt",
title = "{SPARQLByE}: querying {RDF} data by example",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1533--1536",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Semantic Web technologies such as RDF and its query
language, SPARQL, offer the possibility of opening up
the use of public datasets to a great variety of
ordinary users. But a key obstacle to the use of open
data is the unfamiliarity of users with the structure
of data or with SPARQL. To deal with these issues, we
introduce a system for querying RDF data by example. At
its core is a technique for reverse-engineering SPARQL
queries by example. We demonstrate how reverse
engineering along with other techniques, such as query
relaxation, enables our system, SPARQLByE, to guide
users who are unfamiliar with both the dataset and with
SPARQL to the desired query and result set.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2016:NNL,
author = "Daniel Deutch and Nave Frost and Amir Gilad",
title = "{NLProv}: natural language provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1537--1540",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose to present NLProv: an end-to-end Natural
Language (NL) interface for database queries. Previous
work has focused on interfaces for specifying NL
questions, which are then compiled into queries in a
formal language (e.g. SQL). We build upon this work,
but focus on presenting a detailed form of the answers
in Natural Language. The answers that we present are
importantly based on the provenance of tuples in the
query result, detailing not only which are the results
but also their explanations. We develop a novel method
for transforming provenance information to NL, by
leveraging the original NL question structure.
Furthermore, since provenance information is typically
large, we present two solutions for its effective
presentation as NL text: one that is based on
provenance factorization with novel desiderata relevant
to the NL case, and one that is based on
summarization.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandra:2016:PMA,
author = "Bikash Chandra and Mathew Joseph and Bharath
Radhakrishnan and Shreevidhya Acharya and S.
Sudarshan",
title = "Partial marking for automated grading of {SQL}
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1541--1544",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The XData system, currently being developed at IIT
Bombay, provides an automated and interactive platform
for grading student SQL queries, as well as for
learning SQL. Prior work on the XData system focused on
generating query specific test cases to catch common
errors in queries. These test cases are used to check
whether the student queries are correct or not. For
grading student assignments, it is usually not
sufficient to just check if a query is correct: if the
query is incorrect, partial marks may need to be given,
depending on how close the query is to being correct.
In this paper, we extend the XData system by adding
features that enable awarding of partial marks to
incorrect student queries. Our system is able to go
beyond numerous syntactic features when comparing a
student query with a correct query. These features of
our grading system allow the grading of SQL queries to
be fully automated, and scalable to even large class
sizes such as those of MOOCs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhao:2016:TPM,
author = "Kaiqi Zhao and Yiding Liu and Quan Yuan and Lisi Chen
and Zhida Chen and Gao Cong",
title = "Towards personalized maps: mining user preferences
from geo-textual data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1545--1548",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Rich geo-textual data is available online and the data
keeps increasing at a high speed. We propose two user
behavior models to learn several types of user
preferences from geo-textual data, and a prototype
system on top of the user preference models for mining
and search geo-textual data (called PreMiner) to
support personalized maps. Different from existing
recommender systems and data analysis systems, PreMiner
highly personalizes user experience on maps and
supports several applications, including user mobility
\& interests mining, opinion mining in regions, user
recommendation, point-of-interest recommendation, and
querying and subscribing on geo-textual data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Feng:2016:SRS,
author = "Kaiyu Feng and Kaiqi Zhao and Yiding Liu and Gao
Cong",
title = "A system for region search and exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1549--1552",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increasing popularity of mobile devices and
location based services, massive amount of geo-textual
data (e.g., geo-tagged tweets) is being generated
everyday. Compared with traditional spatial data, the
textual dimension of geo-textual data greatly enriches
the data. Meanwhile, the spatial dimension of
geo-textual data also adds a semantically rich new
aspect to textual data. The large volume, together with
its rich semantics, calls for the need for data
exploration. First, it has many applications to
retrieve a region for exploration that satisfies
user-specified conditions (e.g., the size and shape of
the region) while maximizing some other conditions
(e.g., the relevance to the query keywords of the
objects in the region). Second, it is useful to mine
and explore the topics of the geo-textual data within a
(specified or retrieved) region and perhaps a timespan.
This demonstration proposal presents the main ideas of
our system, the Region Search and Exploration System
(RISE), for efficiently supporting region search and
exploration, and our demonstration plan.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Vitorovic:2016:SSR,
author = "Aleksandar Vitorovic and Mohammed Elseidy and Khayyam
Guliyev and Khue Vu Minh and Daniel Espino and Mohammad
Dashti and Yannis Klonatos and Christoph Koch",
title = "{Squall}: scalable real-time analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1553--1556",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Squall is a scalable online query engine that runs
complex analytics in a cluster using skew-resilient,
adaptive operators. Squall builds on state-of-the-art
partitioning schemes and local algorithms, including
some of our own. This paper presents the overview of
Squall, including some novel join operators. The paper
also presents lessons learned over the five years of
working on this system, and outlines the plan for the
proposed system demonstration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khurana:2016:GBE,
author = "Udayan Khurana and Srinivasan Parthasarathy and Deepak
Turaga",
title = "Graph-based exploration of non-graph datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1557--1560",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphs or networks provide a powerful abstraction to
view and analyze relationships among different entities
present in a dataset. However, much of the data of
interest to analysts and data scientists resides in
non-graph forms such as relational databases, JSON,
XML, CSV and text. The effort and skill required in
identifying and extracting the relevant graph
representation from data is often prohibitive and
limits a wider adoption of graph-based analysis of
non-graph data. In this paper, we demonstrate our
system called GraphViewer, for accelerated graph-based
exploration and analysis. It automatically discovers
relevant graphs implicit within a given non-graph
dataset using a set of novel rule-based and data-driven
techniques, and optimizes their extraction and storage.
It computes several node and graph level metrics and
detects anomalous entities in data. Finally, it
summarizes the results to support interpretation by a
human analyst. While the system automates the
computationally intensive aspects of the process, it is
engineered to leverage human domain expertise and
instincts to fine tune the data exploration process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2016:RDF,
author = "Minjian Liu and Qing Wang",
title = "{Rogas}: a declarative framework for network
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1561--1564",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Network analytics has become increasingly popular in
recent years. Various graph systems have been developed
for analysing networks, while network data is still
largely stored and managed in relational database
systems in the first place. As two separate systems are
often used to manage and analyse network data, it not
only increases the difficulty for users to learn and
maintain these different systems simultaneously, but
also impedes performing more sophisticated analysis on
relational and topological properties of network data.
Aiming to tackle these issues, we present Rogas in this
paper, which is a declarative framework that allows the
user to formulate analysis queries naturally without
thinking about the tedious implementation details of
graph algorithms and query processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tang:2016:LDM,
author = "Mingjie Tang and Yongyang Yu and Qutaibah M. Malluhi
and Mourad Ouzzani and Walid G. Aref",
title = "{LocationSpark}: a distributed in-memory data
management system for big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1565--1568",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present LocationSpark, a spatial data processing
system built on top of Apache Spark, a widely used
distributed data processing system. LocationSpark
offers a rich set of spatial query operators, e.g.,
range search, kNN, spatio-textual operation,
spatial-join, and kNN-join. To achieve high
performance, LocationSpark employs various spatial
indexes for in-memory data, and guarantees that
immutable spatial indexes have low overhead with fault
tolerance. In addition, we build two new layers over
Spark, namely a query scheduler and a query executor.
The query scheduler is responsible for mitigating skew
in spatial queries, while the query executor selects
the best plan based on the indexes and the nature of
the spatial queries. Furthermore, to avoid unnecessary
network communication overhead when processing
overlapped spatial data, we embed an efficient spatial
Bloom filter into LocationSpark's indexes. Finally,
LocationSpark tracks frequently accessed spatial data,
and dynamically flushes less frequently accessed data
into disk. We evaluate our system on real workloads and
demonstrate that it achieves an order of magnitude
performance gain over a baseline framework.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shanbhag:2016:ASC,
author = "Anil Shanbhag and Alekh Jindal and Yi Lu and Samuel
Madden",
title = "{Amoeba}: a shape changing storage system for big
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1569--1572",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data partitioning significantly improves the query
performance in distributed database systems. A large
number of techniques have been proposed to efficiently
partition a dataset for a given query workload.
However, many modern analytic applications involve
ad-hoc or exploratory analysis where users do not have
a representative query workload upfront. Furthermore,
workloads change over time as businesses evolve or as
analysts gain better understanding of their data.
Static workload-based data partitioning techniques are
therefore not suitable for such settings. In this
paper, we describe the demonstration of Amoeba, a
distributed storage system which uses adaptive
multi-attribute data partitioning to efficiently
support ad-hoc as well as recurring queries. Amoeba
applies a robust partitioning algorithm such that
ad-hoc queries on all attributes have similar
performance gains. Thereafter, Amoeba adaptively
repartitions the data based on the observed query
sequence, i.e., the system improves over time. All
along Amoeba offers both adaptivity (i.e., adjustments
according to workload changes) as well as robustness
(i.e., avoiding performance spikes due to workload
changes). We propose to demonstrate Amoeba on scenarios
from an internet-of-things startup that tracks user
driving patterns. We invite the audience to
interactively fire fast ad-hoc queries, observe
multi-dimensional adaptivity, and play with a
robust/reactive knob in Amoeba. The web front end
displays the layout changes, runtime costs, and
compares it to Spark with both default and
workload-aware partitioning.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Olteanu:2016:FRM,
author = "Dan Olteanu and Maximilian Schleich",
title = "{F}: regression models over factorized views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1573--1576",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate F, a system for building regression
models over database views. At its core lies the
observation that the computation and representation of
materialized views, and in particular of joins, entail
non-trivial redundancy that is not necessary for the
efficient computation of aggregates used for building
regression models. F avoids this redundancy by
factorizing data and computation and can outperform the
state-of-the-art systems MADlib, R, and Python
StatsModels by orders of magnitude on real-world
datasets. We illustrate how to incrementally build
regression models over factorized views using both an
in-memory implementation of F and its SQL encoding. We
also showcase the effective use of F for model
selection: F decouples the data-dependent computation
step from the data-independent convergence of model
parameters and only performs once the former to explore
the entire model space.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rodriguez:2016:SMP,
author = "Miguel Rodr{\'\i}guez and Sean Goldberg and Daisy Zhe
Wang",
title = "{SigmaKB}: multiple probabilistic knowledge base
fusion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1577--1580",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The interest in integrating web-scale knowledge bases
(KBs) has intensified in the last several years.
Research has focused on knowledge base completion
between two KBs with complementary information, lacking
any notion of uncertainty or method of handling
conflicting information. We present SigmaKB, a
knowledge base system that utilizes Consensus
Maximization Fusion and user feedback to integrate and
improve the query results of a total of 71 KBs. This
paper presents the architecture and demonstration
details.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Konda:2016:MTBb,
author = "Pradap Konda and Sanjib Das and Paul Suganthan G. C.
and AnHai Doan and Adel Ardalan and Jeffrey R. Ballard
and Han Li and Fatemah Panahi and Haojun Zhang and Jeff
Naughton and Shishir Prasad and Ganesh Krishnan and
Rohit Deep and Vijay Raghavendra",
title = "{Magellan}: toward building entity matching management
systems over data science stacks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1581--1584",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity matching (EM) has been a long-standing
challenge in data management. Most current EM works,
however, focus only on developing matching algorithms.
We argue that far more efforts should be devoted to
building EM systems. We discuss the limitations of
current EM systems, then present Magellan, a new kind
of EM systems that addresses these limitations.
Magellan is novel in four important aspects. (1) It
provides a how-to guide that tells users what to do in
each EM scenario, step by step. (2) It provides tools
to help users do these steps; the tools seek to cover
the entire EM pipeline, not just matching and blocking
as current EM systems do. (3) Tools are built on top of
the data science stacks in Python, allowing Magellan to
borrow a rich set of capabilities in data cleaning, IE,
visualization, learning, etc. (4) Magellan provides a
powerful scripting environment to facilitate
interactive experimentation and allow users to quickly
write code to ``patch'' the system. We have extensively
evaluated Magellan with 44 students and users at
various organizations. In this paper we propose
demonstration scenarios that show the promise of the
Magellan approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alkowaileet:2016:LSC,
author = "Wail Y. Alkowaileet and Sattam Alsubaiee and Michael
J. Carey and Till Westmann and Yingyi Bu",
title = "Large-scale complex analytics on semi-structured
datasets using {AsterixDB} and {Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1585--1588",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large quantities of raw data are being generated by
many different sources in different formats. Private
and public sectors alike acclaim the valuable
information and insights that can be mined from such
data to better understand the dynamics of everyday
life, such as traffic, worldwide logistics, and social
behavior. For this reason, storing, managing, and
analyzing ``Big Data'' at scale is getting a tremendous
amount of attention, both in academia and industry. In
this paper, we demonstrate the power of a parallel
connection that we have built between Apache Spark and
Apache AsterixDB (Incubating) to enable complex
analytics such as machine learning and graph analysis
on data drawn from large semi-structured data
collections. The integration of these two systems
allows researchers and data scientists to leverage
AsterixDB capabilities, including fast ingestion and
indexing of semi-structured data and efficient
answering of geo-spatial and fuzzy text queries.
Complex data analytics can then be performed on the
resulting AsterixDB query output in order to obtain
additional insights by leveraging the power of Spark's
machine learning and graph libraries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Picado:2016:SIS,
author = "Jose Picado and Parisa Ataei and Arash Termehchy and
Alan Fern",
title = "Schema independent and scalable relational learning by
{Castor}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1589--1592",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Learning novel relations from relational databases is
an important problem with many applications in database
systems and machine learning. Relational learning
algorithms leverage the properties of the database
schema to find the definition of the target relation in
terms of the existing relations in the database.
However, the same data set may be represented under
different schemas for various reasons, such as
efficiency and data quality. Unfortunately, current
relational learning algorithms tend to vary quite
substantially over the choice of schema, which
complicates their off-the-shelf application. We
demonstrate Castor, a relational learning system that
efficiently learns the same definitions over common
schema variations. The results of Castor are more
accurate than well-known learning systems over large
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kannapalli:2016:AWA,
author = "Rajeshkumar Kannapalli and Azade Nazi and Mahashweta
Das and Gautam Das",
title = "{AD-WIRE}: add-on for {Web} item reviewing system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1593--1596",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over the past few decades as purchasing options moved
online, the widespread use and popularity of online
review sites has simultaneously increased. In spite of
the fact that a huge extent of buying choices today are
driven by numeric scores (e.g., rating a product),
detailed reviews play an important role for activities
like purchasing an expensive DSLR camera. Since writing
a detailed review for an item is usually
time-consuming, the number of reviews available in the
Web is far from many. In this paper, we build a system
AD-WIRE that given a user and an item, our system
identifies the top-k meaningful tags to help her
review the item easily. AD-WIRE allows a user to
compose her review by quickly selecting from among the
set of returned tags or by writing her own review. AD-WIRE
also visualizes the dependency of the tags to different
aspects of an item so a user can make an informed
decision quickly. The system can be used for different
types of products. The current demonstration is
built to explore review writing process for the mobile
phones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chaoji:2016:MLR,
author = "Vineet Chaoji and Rajeev Rastogi and Gourav Roy",
title = "Machine learning in the real world",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1597--1600",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine Learning (ML) has become a mature technology
that is being applied to a wide range of business
problems such as web search, online advertising,
product recommendations, object recognition, and so on.
As a result, it has become imperative for researchers
and practitioners to have a fundamental understanding
of ML concepts and practical knowledge of end-to-end
modeling. This tutorial takes a hands-on approach to
introducing the audience to machine learning. The first
part of the tutorial gives a broad overview and
discusses some of the key concepts within machine
learning. The second part of the tutorial takes the
audience through the end-to-end modeling pipeline for a
real-world income prediction problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bohm:2016:OAD,
author = "Alexander B{\"o}hm and Jens Dittrich and Niloy
Mukherjee and Ippokratis Pandis and Rajkumar Sen",
title = "Operational analytics data management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1601--1604",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Prior to mid-2000s, the space of data analytics was
mainly confined within the area of decision support
systems. It was a long era of isolated enterprise data
warehouses curating information from live data sources
and of business intelligence software used to query
such information. Most data sets were small enough in
volume and static enough in velocity to be segregated
in warehouses for analysis. Data analysis was not
ad-hoc; it required pre-requisite knowledge of
underlying data access patterns for the creation of
specialized access methods (e.g. covering indexes,
materialized views) in order to efficiently execute a
set of few focused queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2016:QDC,
author = "Xu Chu and Ihab F. Ilyas",
title = "Qualitative data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1605--1608",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data quality is one of the most important problems in
data management, since dirty data often leads to
inaccurate data analytics results and wrong business
decisions. Data cleaning exercise often consist of two
phases: error detection and error repairing. Error
detection techniques can either be quantitative or
qualitative; and error repairing is performed by
applying data transformation scripts or by involving
human experts, and sometimes both. In this tutorial, we
discuss the main facets and directions in designing
qualitative data cleaning techniques. We present a
taxonomy of current qualitative error detection
techniques, as well as a taxonomy of current data
repairing techniques. We will also discuss proposals
for tackling the challenges for cleaning ``big data''
in terms of scale and distribution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Larson:2016:MMM,
author = "Per-{\AA}ke Larson and Justin Levandoski",
title = "Modern main-memory database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1609--1610",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This tutorial provides an overview of recent
developments in main-memory database systems. With
growing memory sizes and memory prices dropping by a
factor of 10 every 5 years, data having a ``primary
home'' in memory is now a reality. Main-memory
databases eschew many of the traditional architectural
tenets of relational database systems that optimized
for disk-resident data. Innovative approaches to
fundamental issues such as concurrency control and
query processing are required to unleash the full
performance potential of main-memory databases. The
tutorial is focused around design issues and
architectural choices that must be made when building a
high performance database system optimized for
main-memory: data storage and indexing, concurrency
control, durability and recovery techniques, query
processing and compilation, support for high
availability, and ability to support hybrid
transactional and analytics workloads. This will be
illustrated by example solutions drawn from four
state-of-the-art systems: H-Store/VoltDB, Hekaton,
HyPeR, and SAP HANA. The tutorial will also cover
current and future research trends.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Machanavajjhala:2016:DPW,
author = "Ashwin Machanavajjhala and Xi He and Michael Hay",
title = "Differential privacy in the wild: a tutorial on
current practices \& open challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1611--1614",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy has emerged as an important
standard for privacy preserving computation over
databases containing sensitive information about
individuals. Research on differential privacy spanning
a number of research areas, including theory, security,
database, networks, machine learning, and statistics,
over the last decade has resulted in a variety of
privacy preserving algorithms for a number of analysis
tasks. Despite maturing research efforts, the adoption
of differential privacy by practitioners in industry,
academia, or government agencies has so far been rare.
Hence, in this tutorial, we will first describe the
foundations of differentially private algorithm design
that cover the state of the art in private computation
on tabular data. In the second half of the tutorial we
will highlight real world applications on complex data
types, and identify research challenges in applying
differential privacy to real world applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amer-Yahia:2016:HFC,
author = "Sihem Amer-Yahia and Senjuti Basu Roy",
title = "Human factors in crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1615--1618",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today, crowdsourcing is used to ``taskify'' any job
ranging from simple receipt transcription to
collaborative editing, fan-subbing, citizen science,
and citizen journalism. The crowd is typically
volatile, its arrival and departure asynchronous, and
its levels of attention and accuracy diverse. Tasks
vary in complexity and may necessitate the
participation of workers with varying degrees of
expertise. Sometimes, workers need to collaborate
explicitly and build on each other's contributions to
complete a single task. For example, in disaster
reporting, CrowdMap allows geographically close people
with diverse and complementary skills, to work together
to report details about the course of a typhoon or the
aftermath of an earthquake. This uber-ization of human
labor requires the understanding of workers motivation
in completing a task, their ability to work together in
collaborative tasks, as well as, helping workers find
relevant tasks. For over 40 years, organization studies
have thoroughly examined human factors that affect
workers in physical workplaces. More recently, computer
scientists have developed algorithms that verify and
leverage those findings in a virtual marketplace, in
this case, a crowdsourcing platform. The goal of this
tutorial is to review those two areas and discuss how
their combination may improve workers' experience, task
throughput and outcome quality for both micro-tasks and
collaborative tasks. We will start with a coverage of
motivation theory, team formation, and learning worker
profiles. We will then address open research questions
that result from this review.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Stoica:2016:TCB,
author = "Ion Stoica",
title = "Trends and challenges in big data processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1619--1619",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Almost six years ago we started the Spark project at
UC Berkeley. Spark is a cluster computing engine that
is optimized for in-memory processing, and unifies
support for a variety of workloads, including batch,
interactive querying, streaming, and iterative
computations. Spark is now the most active big data
project in the open source community, and is already
being used by over one thousand organizations. One of
the reasons behind Spark's success has been our early
bet on the continuous increase in the memory capacity
and the feasibility to fit many realistic workloads in
the aggregate memory of typical production clusters.
Today, we are witnessing new trends, such as Moore's
law slowing down, and the emergence of a variety of
computation and storage technologies, such as GPUs,
FPGAs, and 3D Xpoint. In this talk, I'll discuss some
of the lessons we learned in developing Spark as a
unified computation platform, and the implications of
today's hardware and software trends on the development
of the next generation of big data processing
systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rajaraman:2016:DDD,
author = "Anand Rajaraman",
title = "Data-driven disruption: the view from {Silicon
Valley}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1620--1620",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We live in an era where software is transforming
industries, the sciences, and society as a whole. This
exciting phenomenon has been described by the phrase
``software is eating the world.'' It is becoming
increasingly apparent that data is the fuel powering
software's conquests. Data is the new disruptor. It's
hard to believe that the first decade of the Big Data
era is already behind us. Silicon Valley has been at
the forefront of developing and applying data-driven
approaches to create disruption at many levels:
infrastructure (e.g., Hadoop and Spark), capabilities
(e.g., image recognition and machine translation), and
killer apps (e.g., self-driving cars and messaging
bots). In this talk, we first look back on the past
decade and share learnings from the frontlines of
data-driven disruption. Looking ahead, we then describe
challenges and opportunities for the next decade. Since
this has also been a personal journey, we will use
examples drawn from personal experience to illustrate
each point.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dong:2016:LNV,
author = "Xin Luna Dong",
title = "Leave no valuable data behind: the crazy ideas and the
business",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1621--1621",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the mission ``leave no valuable data behind'', we
developed techniques for knowledge fusion to guarantee
the correctness of the knowledge. This talk starts with
describing a few crazy ideas we have tested. The first,
known as ``Knowledge Vault'', used 15 extractors to
automatically extract knowledge from 1B+ Webpages,
obtaining 3B+ distinct (subject, predicate, object)
knowledge triples and predicting well-calibrated
probabilities for extracted triples. The second, known
as ``Knowledge-Based Trust'', estimated the
trustworthiness of 119M webpages and 5.6M websites
based on the correctness of their factual information.
We then present how we bring the ideas to business in
filling the gap between the knowledge at Google
Knowledge Graph and the knowledge in the world.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mokbel:2016:LDM,
author = "Mohamed Mokbel and Chi-Yin Chow and Walid Aref",
title = "Location data management: a tale of two systems and
the ``next destination''!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "13",
pages = "1622--1622",
month = sep,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:19:51 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In early 2000, we had the vision of ubiquitous
location services, where each object is aware of its
location, and continuously sends its location to a
designated database server. This flood of location data
opened the door for a myriad of location-based services
that were considered visionary at that time, yet today
they are a reality and have become ubiquitous. To
realize our early vision, we identified two main
challenges that needed to be addressed, namely,
scalability and privacy. We have addressed these
challenges through two main systems, PLACE and Casper.
PLACE, developed at Purdue University from 2000 to
2005, set up the environment for built-in database
support of scalable and continuous location-based
services. The Casper system, developed at University of
Minnesota from 2005 to 2010, was built inside the PLACE
server allowing it to provide its high quality scalable
service, while maintaining the privacy of its users'
locations. This talk will take you through a time
journey of location services from 2000 until today, and
beyond, highlighting the development efforts of the
PLACE and Casper systems, along with their impact on
current and future research initiatives in both
academia and industry.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2016:QET,
author = "Badrish Chandramouli and Raul Castro Fernandez and
Jonathan Goldstein and Ahmed Eldawy and Abdul Quamar",
title = "{Quill}: efficient, transferable, and rich analytics
at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1623--1634",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper introduces Quill (stands for a quadrillion
tuples per day), a library and distributed platform for
relational and temporal analytics over large datasets
in the cloud. Quill exposes a new abstraction for
parallel datasets and computation, called
ShardedStreamable. This abstraction provides the
ability to express efficient distributed physical query
plans that are transferable, i.e., movable from offline
to real-time and vice versa. ShardedStreamable
decouples incremental query logic specification, a
small but rich set of data movement operations, and
keying; this allows Quill to express a broad space of
plans with complex querying functionality, while
leveraging existing temporal libraries such as Trill.
Quill's layered architecture provides a careful
separation of responsibilities with independently
useful components, while retaining high performance. We
built Quill for the cloud, with a master-less design
where a language-integrated client library directly
communicates and coordinates with cloud workers using
off-the-shelf distributed cloud components such as
queues. Experiments on up to 400 cloud machines, and on
datasets up to 1TB, find Quill to incur low overheads
and outperform SparkSQL by up to orders-of-magnitude
for temporal and 6$ \times $ for relational queries,
while supporting a rich space of transferable,
programmable, and expressive distributed physical query
plans.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Walenz:2016:PAD,
author = "Brett Walenz and Jun Yang",
title = "Perturbation analysis of database queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1635--1646",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a system, Perada, for parallel perturbation
analysis of database queries. Perturbation analysis
considers the results of a query evaluated with (a
typically large number of) different parameter
settings, to help discover leads and evaluate claims
from data. Perada simplifies the development of
general, ad hoc perturbation analysis by providing a
flexible API to support a variety of optimizations such
as grouping, memoization, and pruning; by automatically
optimizing performance through run-time observation,
learning, and adaptation; and by hiding the complexity
of concurrency and failures from its developers. We
demonstrate Perada's efficacy and efficiency with real
workloads applying perturbation analysis to
computational journalism.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2016:HBG,
author = "Jing Li and Hung-Wei Tseng and Chunbin Lin and Yannis
Papakonstantinou and Steven Swanson",
title = "{HippogriffDB}: balancing {I/O} and {GPU} bandwidth in
big data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1647--1658",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data sets grow and conventional processor
performance scaling slows, data analytics move towards
heterogeneous architectures that incorporate hardware
accelerators (notably GPUs) to continue scaling
performance. However, existing GPU-based databases fail
to deal with big data applications efficiently: their
execution model suffers from scalability limitations on
GPUs whose memory capacity is limited; existing systems
fail to consider the discrepancy between fast GPUs and
slow storage, which can counteract the benefit of GPU
accelerators. In this paper, we propose HippogriffDB,
an efficient, scalable GPU-accelerated OLAP system. It
tackles the bandwidth discrepancy using compression and
an optimized data transfer path. HippogriffDB stores
tables in a compressed format and uses the GPU for
decompression, trading GPU cycles for the improved I/O
bandwidth. To improve the data transfer efficiency,
HippogriffDB introduces a peer-to-peer, multi-threaded
data transfer mechanism, directly transferring data
from the SSD to the GPU. HippogriffDB adopts a
query-over-block execution model that provides
scalability using a stream-based approach. The model
improves kernel efficiency with the operator fusion and
double buffering mechanism. We have implemented
HippogriffDB using an NVMe SSD, which talks directly to
a commercial GPU. Results on two popular benchmarks
demonstrate its scalability and efficiency.
HippogriffDB outperforms existing GPU-based databases
(YDB) and in-memory data analytics (MonetDB) by 1-2
orders of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeuch:2016:NIP,
author = "Steffen Zeuch and Holger Pirk and Johann-Christoph
Freytag",
title = "Non-invasive progressive optimization for in-memory
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1659--1670",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Progressive optimization introduces robustness for
database workloads against wrong estimates, skewed
data, correlated attributes, or outdated statistics.
Previous work focuses on cardinality estimates and rely
on expensive counting methods as well as complex
learning algorithms. In this paper, we utilize
performance counters to drive progressive optimization
during query execution. The main advantages are that
performance counters introduce virtually no costs on
modern CPUs and their usage enables a non-invasive
monitoring. We present fine-grained cost models to
detect differences between estimates and actual costs
which enables us to kick-start reoptimization. Based on
our cost models, we implement an optimization approach
that estimates the individual selectivities of a
multi-selection query efficiently. Furthermore, we are
able to learn properties like sortedness, skew, or
correlation during run-time. In our evaluation we show,
that the overhead of our approach is negligible, while
performance improvements are convincing. Using
progressive optimization, we improve runtime up to a
factor of three compared to average run-times and up to
a factor of 4.5 compared to worst case run-times. As a
result, we avoid costly operator execution orders and,
thus, make query execution highly robust.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2016:DSS,
author = "J. W. Zhang and Y. C. Tay",
title = "{Dscaler}: synthetically scaling a given relational
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1671--1682",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Dataset Scaling Problem (DSP) defined in previous
work states: Given an empirical set of relational
tables $D$ and a scale factor $s$, generate a database
state $\tilde{D}$ that is similar to $D$ but $s$ times its
size. A DSP solution is useful for application
development $ (s < 1) $, scalability testing $ (s > 1)
$ and anonymization $ (s = 1) $. Current solutions
assume all table sizes scale by the same ratio $s$.
However, a real database tends to have tables that grow
at different rates. This paper therefore considers
non-uniform scaling (nuDSP), a DSP generalization
where, instead of a single scale factor $s$, tables can
scale by different factors. Dscaler is the first
solution for nuDSP. It follows previous work in
achieving similarity by reproducing correlation among
the primary and foreign keys. However, it introduces
the concept of a correlation database that captures
fine-grained, per-tuple correlation. Experiments with
well-known real and synthetic datasets $D$ show that
Dscaler produces $\tilde{D}$ with greater similarity to $D$
than state-of-the-art techniques. Here, similarity is
measured by number of tuples, frequency distribution of
foreign key references, and multi-join aggregate
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:FAI,
author = "Sheng Wang and David Maier and Beng Chin Ooi",
title = "Fast and adaptive indexing of multi-dimensional
observational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1683--1694",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Sensing devices generate tremendous amounts of data
each day, which include large quantities of
multi-dimensional measurements. These data are expected
to be immediately available for real-time analytics as
they are streamed into storage. Such scenarios pose
challenges to state-of-the-art indexing methods, as
they must not only support efficient queries but also
frequent updates. We propose here a novel indexing
method that ingests multi-dimensional observational
data in real time. This method primarily guarantees
extremely high throughput for data ingestion, while it
can be continuously refined in the background to
improve query efficiency. Instead of representing
collections of points using Minimal Bounding Boxes as
in conventional indexes, we model sets of successive
points as line segments in hyperspaces, by exploiting
the intrinsic value continuity in observational data.
This representation reduces the number of index entries
and drastically reduces ``over-coverage'' by entries.
Experimental results show that our approach handles
real-world workloads gracefully, providing both
low-overhead indexing and excellent query efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Upadhyaya:2016:POQ,
author = "Prasang Upadhyaya and Magdalena Balazinska and Dan
Suciu",
title = "Price-optimal querying with data {APIs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1695--1706",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data is increasingly being purchased online in data
markets and REST APIs have emerged as a favored method
to acquire such data. Typically, sellers charge buyers
based on how much data they purchase. In many
scenarios, buyers need to make repeated calls to the
seller's API. The challenge is then for buyers to keep
track of the data they purchase and avoid purchasing
the same data twice. In this paper, we propose
lightweight modifications to data APIs to achieve
optimal history-aware pricing so that buyers are only
charged once for data that they have purchased and that
has not been updated. The key idea behind our approach
is the notion of refunds: buyers buy data as needed but
have the ability to ask for refunds of data that they
had already purchased before. We show that our
techniques can provide significant data cost savings
while reducing overheads by two orders of magnitude as
compared to the state-of-the-art competing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pirk:2016:VVA,
author = "Holger Pirk and Oscar Moll and Matei Zaharia and Sam
Madden",
title = "{Voodoo} --- a vector algebra for portable database
performance on modern hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "9",
number = "14",
pages = "1707--1718",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 12 10:14:56 MDT 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-memory databases require careful tuning and many
engineering tricks to achieve good performance. Such
database performance engineering is hard: a plethora of
data and hardware-dependent optimization techniques
form a design space that is difficult to navigate for a
skilled engineer --- even more so for a query compiler.
To facilitate performance-oriented design exploration
and query plan compilation, we present Voodoo, a
declarative intermediate algebra that abstracts the
detailed architectural properties of the hardware, such
as multi- or many-core architectures, caches and SIMD
registers, without losing the ability to generate
highly tuned code. Because it consists of a collection
of declarative, vector-oriented operations, Voodoo is
easier to reason about and tune than low-level C and
related hardware-focused extensions (Intrinsics,
OpenCL, CUDA, etc.). This enables our Voodoo compiler
to produce (OpenCL) code that rivals and even
outperforms the fastest state-of-the-art in memory
databases for both GPUs and CPUs. In addition, Voodoo
makes it possible to express techniques as diverse as
cache-conscious processing, predication and
vectorization (again on both GPUs and CPUs) with just a
few lines of code. Central to our approach is a novel
idea we termed control vectors, which allows a code
generating frontend to expose parallelism to the Voodoo
compiler in an abstract manner, enabling portable
performance across hardware platforms. We used Voodoo
to build an alternative backend for MonetDB, a popular
open-source in-memory database. Our backend allows
MonetDB to perform at the same level as highly tuned
in-memory databases, including HyPer and Ocelot. We
also demonstrate Voodoo's usefulness when investigating
hardware conscious tuning techniques, assessing their
performance on different queries, devices and data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jiang:2016:CQP,
author = "Dawei Jiang and Qingchao Cai and Gang Chen and H. V.
Jagadish and Beng Chin Ooi and Kian-Lee Tan and Anthony
K. H. Tung",
title = "Cohort query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "1",
pages = "1--12",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3015270.3015271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:50 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern Internet applications often produce a large
volume of user activity records. Data analysts are
interested in cohort analysis, or finding unusual user
behavioral trends, in these large tables of activity
records. In a traditional database system, cohort
analysis queries are both painful to specify and
expensive to evaluate. We propose to extend database
systems to support cohort analysis. We do so by
extending SQL with three new operators. We devise three
different evaluation schemes for cohort query
processing. Two of them adopt a non-intrusive approach.
The third approach employs a columnar based evaluation
scheme with optimizations specifically designed for
cohort query processing. Our experimental results
confirm the performance benefits of our proposed
columnar database system, compared against the two
non-intrusive approaches that implement cohort queries
on top of regular relational databases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2016:RWY,
author = "Yubao Wu and Yuchen Bian and Xiang Zhang",
title = "Remember where you came from: on the second-order
random walk based proximity measures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "1",
pages = "13--24",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3015270.3015272",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:50 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Measuring the proximity between different nodes is a
fundamental problem in graph analysis. Random walk
based proximity measures have been shown to be
effective and widely used. Most existing random walk
measures are based on the first-order Markov model,
i.e., they assume that the next step of the random
surfer only depends on the current node. However, this
assumption neither holds in many real-life applications
nor captures the clustering structure in the graph. To
address the limitation of the existing first-order
measures, in this paper, we study the second-order
random walk measures, which take the previously visited
node into consideration. While the existing first-order
measures are built on node-to-node transition
probabilities, in the second-order random walk, we need
to consider the edge-to-edge transition probabilities.
Using incidence matrices, we develop simple and elegant
matrix representations for the second-order proximity
measures. A desirable property of the developed
measures is that they degenerate to their original
first-order forms when the effect of the previous step
is zero. We further develop Monte Carlo methods to
efficiently compute the second-order measures and
provide theoretical performance guarantees.
Experimental results show that in a variety of
applications, the second-order measures can
dramatically improve the performance compared to their
first-order counterparts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{George:2016:MIL,
author = "Lars George and Bruno Cadonna and Matthias Weidlich",
title = "{IL-Miner}: instance-level discovery of complex event
patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "1",
pages = "25--36",
month = sep,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3015270.3015273",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:50 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Complex event processing (CEP) matches patterns over a
continuous stream of events to detect situations of
interest. Yet, the definition of an event pattern that
precisely characterises a particular situation is
challenging: there are manifold dimensions to correlate
events, including time windows and value predicates. In
the presence of historic event data that is labelled
with the situation to detect, event patterns can be
learned automatically. To cope with the combinatorial
explosion of pattern candidates, existing approaches
work on a type-level and discover patterns based on
predefined event abstractions, aka event types. Hence,
discovery is limited to patterns of a fixed granularity
and users face the burden to manually select
appropriate event abstractions. We present IL-Miner, a
system that discovers event patterns by genuinely
working on the instance-level, not assuming a priori
knowledge on event abstractions. In a multi-phase
process, IL-Miner first identifies relevant
abstractions for the construction of event patterns.
The set of events explored for pattern discovery is
thereby reduced, while still providing formal
guarantees on correctness, minimality, and completeness
of the discovery result. Experiments using real-world
datasets from diverse domains show that IL-Miner
discovers a much broader range of event patterns
compared to the state-of-the-art in the field.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Psaroudakis:2016:ANA,
author = "Iraklis Psaroudakis and Tobias Scheuer and Norman May
and Abdelkader Sellami and Anastasia Ailamaki",
title = "Adaptive {NUMA}-aware data placement and task
scheduling for analytical workloads in main-memory
column-stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "2",
pages = "37--48",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Non-uniform memory access (NUMA) architectures pose
numerous performance challenges for main-memory
column-stores in scaling up analytics on modern
multi-socket multi-core servers. A NUMA-aware execution
engine needs a strategy for data placement and task
scheduling that prefers fast local memory accesses over
remote memory accesses, and avoids an imbalance of
resource utilization, both CPU and memory bandwidth,
across sockets. State-of-the-art systems typically use
a static strategy that always partitions data across
sockets, and always allows inter-socket task stealing.
In this paper, we show that adapting data placement and
task stealing to the workload can improve throughput by
up to a factor of 4 compared to a static approach. We
focus on highly concurrent workloads dominated by
operators working on a single table or table group
(copartitioned tables). Our adaptive data placement
algorithm tracks the resource utilization of tasks,
partitions of tables and table groups, and sockets.
When a utilization imbalance across sockets is
detected, the algorithm corrects it by moving or
repartitioning tables. Also, inter-socket task stealing
is dynamically disabled for memory-intensive tasks that
could otherwise hurt performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:MOC,
author = "Tianzheng Wang and Hideaki Kimura",
title = "Mostly-optimistic concurrency control for highly
contended dynamic workloads on a thousand cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "2",
pages = "49--60",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Future servers will be equipped with thousands of CPU
cores and deep memory hierarchies. Traditional
concurrency control (CC) schemes---both optimistic and
pessimistic---slow down orders of magnitude in such
environments for highly contended workloads. Optimistic
CC (OCC) scales the best for workloads with few
conflicts, but suffers from clobbered reads for high
conflict workloads. Although pessimistic locking can
protect reads, it floods cache-coherence backbones in
deep memory hierarchies and can also cause numerous
deadlock aborts. This paper proposes a new CC scheme,
mostly-optimistic concurrency control (MOCC), to
address these problems. MOCC achieves orders of
magnitude higher performance for dynamic workloads on
modern servers. The key objective of MOCC is to avoid
clobbered reads for high conflict workloads, without
any centralized mechanisms or heavyweight interthread
communication. To satisfy such needs, we devise a
native, cancellable reader-writer spinlock and a
serializable protocol that can acquire, release and
re-acquire locks in any order without expensive
interthread communication. For low conflict workloads,
MOCC maintains OCC's high performance without taking
read locks. Our experiments with high conflict YCSB
workloads on a 288-core server reveal that MOCC
performs $ 8 \times $ and $ 23 \times $ faster than OCC
and pessimistic locking, respectively. It achieves 17
million TPS for TPC-C and more than 110 million TPS for
YCSB without conflicts, $ 170 \times $ faster than
pessimistic methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:EIA,
author = "Sibo Wang and Xiaokui Xiao and Yin Yang and Wenqing
Lin",
title = "Effective indexing for approximate constrained
shortest path queries on large road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "2",
pages = "61--72",
month = oct,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In a constrained shortest path (CSP) query, each edge
in the road network is associated with both a length
and a cost. Given an origin $s$, a destination $t$, and
a cost constraint $ \theta $, the goal is to find the
shortest path from $s$ to $t$ whose total cost does not
exceed $ \theta $. Because exact CSP is NP-hard,
previous work mostly focuses on approximate solutions.
Even so, existing methods are still prohibitively
expensive for large road networks. Two main reasons are
(i) that they fail to utilize the special properties of
road networks and (ii) that most of them process
queries without indices; the few existing indices
consume large amounts of memory and yet have limited
effectiveness in reducing query costs. Motivated by
this, we propose COLA, the first practical solution for
approximate CSP processing on large road networks. COLA
exploits the facts that a road network can be
effectively partitioned, and that there exists a
relatively small set of landmark vertices that commonly
appear in CSP results. Accordingly, COLA indexes the
vertices lying on partition boundaries, and applies an
on-the-fly algorithm called $ \alpha $-Dijk for path
computation within a partition, which effectively
prunes paths based on landmarks. Extensive experiments
demonstrate that on continent-sized road networks, COLA
answers an approximate CSP query in sub-second time,
whereas existing methods take hours. Interestingly,
even without an index, the $ \alpha $-Dijk algorithm in
COLA still outperforms previous solutions by more than
an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2016:THP,
author = "Qun Huang and Patrick P. C. Lee",
title = "Toward high-performance distributed stream processing
via approximate fault tolerance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "73--84",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fault tolerance is critical for distributed stream
processing systems, yet achieving error-free fault
tolerance often incurs substantial performance
overhead. We present AF-Stream, a distributed stream
processing system that addresses the trade-off between
performance and accuracy in fault tolerance. AF-Stream
builds on a notion called approximate fault tolerance,
whose idea is to mitigate backup overhead by adaptively
issuing backups, while ensuring that the errors upon
failures are bounded with theoretical guarantees. Our
AF-Stream design provides an extensible programming
model for incorporating general streaming algorithms,
and also exports only few threshold parameters for
configuring approximation fault tolerance. Experiments
on Amazon EC2 show that AF-Stream maintains high
performance (compared to no fault tolerance) and high
accuracy after multiple failures (compared to no
failures) under various streaming algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dai:2016:PCD,
author = "Jian Dai and Bin Yang and Chenjuan Guo and Christian
S. Jensen and Jilin Hu",
title = "Path cost distribution estimation using trajectory
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "85--96",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the growing volumes of vehicle trajectory data,
it becomes increasingly possible to capture
time-varying and uncertain travel costs in a road
network, including travel time and fuel consumption.
The current paradigm represents a road network as a
weighted graph; it blasts trajectories into small
fragments that fit the underlying edges to assign
weights to edges; and it then applies a routing
algorithm to the resulting graph. We propose a new
paradigm, the hybrid graph, that targets more accurate
and more efficient path cost distribution estimation.
The new paradigm avoids blasting trajectories into
small fragments and instead assigns weights to paths
rather than simply to the edges. We show how to compute
path weights using trajectory data while taking into
account the travel cost dependencies among the edges in
the paths. Given a departure time and a query path, we
show how to select an optimal set of weights with
associated paths that cover the query path and such
that the weights enable the most accurate joint cost
distribution estimation for the query path. The cost
distribution of the query path is then computed
accurately using the joint distribution. Finally, we
show how the resulting method for computing cost
distributions of paths can be integrated into existing
routing algorithms. Empirical studies with substantial
trajectory data from two different cities offer insight
into the design properties of the proposed method and
confirm that the method is effective in real-world
settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sariyuce:2016:FHC,
author = "Ahmet Erdem Sariy{\"u}ce and Ali Pinar",
title = "Fast hierarchy construction for dense subgraphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "97--108",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Discovering dense subgraphs and understanding the
relations among them is a fundamental problem in graph
mining. We want to not only identify dense subgraphs,
but also build a hierarchy among them (e.g., larger but
sparser subgraphs formed by two smaller dense
subgraphs). Peeling algorithms ($k$-core, $k$-truss, and
nucleus decomposition) have been effective to locate
many dense subgraphs. However, constructing a
hierarchical representation of density structure, even
correctly computing the connected $k$-cores and
$k$-trusses, have been mostly overlooked. Keeping track of
connected components during peeling requires an
additional traversal operation, which is as expensive
as the peeling process. In this paper, we start with a
thorough survey and point to nuances in problem
formulations that lead to significant differences in
runtimes. We then propose efficient and generic
algorithms to construct the hierarchy of dense
subgraphs for $k$-core, $k$-truss, or any nucleus
decomposition. Our algorithms leverage the disjoint-set
forest data structure to efficiently construct the
hierarchy during traversal. Furthermore, we introduce a
new idea to avoid traversal. We construct the subgraphs
while visiting neighborhoods in the peeling process,
and build the relations to previously constructed
subgraphs. We also consider an existing idea to find
the $k$-core hierarchy and adapt for our objectives
efficiently. Experiments on different types of large
scale real-world networks show significant speedups
over naive algorithms and existing alternatives. Our
algorithms also outperform the hypothetical limits of
any possible traversal-based solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2016:SEE,
author = "Xuhong Zhang and Jun Wang and Jiangling Yin",
title = "{Sapprox}: enabling efficient and accurate
approximations on sub-datasets with distribution-aware
online sampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "109--120",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we aim to enable both efficient and
accurate approximations on arbitrary sub-datasets of a
large dataset. Due to the prohibitive storage overhead
of caching offline samples for each sub-dataset,
existing offline sample based systems provide high
accuracy results for only a limited number of
sub-datasets, such as the popular ones. On the other
hand, current online sample based approximation
systems, which generate samples at runtime, do not take
into account the uneven storage distribution of a
sub-dataset. They work well for uniform distribution of
a sub-dataset while suffer low sampling efficiency and
poor estimation accuracy on unevenly distributed
sub-datasets. To address the problem, we develop a
distribution aware method called Sapprox. Our idea is
to collect the occurrences of a sub-dataset at each
logical partition of a dataset (storage distribution)
in the distributed system, and make good use of such
information to facilitate online sampling. There are
three thrusts in Sapprox. First, we develop a
probabilistic map to reduce the exponential number of
recorded sub-datasets to a linear one. Second, we apply
the cluster sampling with unequal probability theory to
implement a distribution-aware sampling method for
efficient online sub-dataset sampling. Third, we
quantitatively derive the optimal sampling unit size in
a distributed file system by associating it with
approximation costs and accuracy. We have implemented
Sapprox into Hadoop ecosystem as an example system and
open sourced it on GitHub. Our comprehensive
experimental results show that Sapprox can achieve a
speedup by up to $ 20 \times $ over the precise
execution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2016:MQO,
author = "Xuguang Ren and Junhu Wang",
title = "Multi-query optimization for subgraph isomorphism
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "121--132",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing work on subgraph isomorphism search mainly
focuses on a-query-at-a-time approaches: optimizing and
answering each query separately. When multiple queries
arrive at the same time, sequential processing is not
always the most efficient. In this paper, we study
multi-query optimization for subgraph isomorphism
search. We first propose a novel method for efficiently
detecting useful common subgraphs and a data structure
to organize them. Then we propose a heuristic algorithm
based on the data structure to compute a query
execution order so that cached intermediate results can
be effectively utilized. To balance memory usage and
the time for cached results retrieval, we present a
novel structure for caching the intermediate results.
We provide strategies to revise existing single-query
subgraph isomorphism algorithms to seamlessly utilize
the cached results, which leads to significant
performance improvement. Extensive experiments verified
the effectiveness of our solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Simpson:2016:ECF,
author = "Michael Simpson and Venkatesh Srinivasan and Alex
Thomo",
title = "Efficient computation of feedback arc set at
web-scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "133--144",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The minimum feedback arc set problem is an NP-hard
problem on graphs that seeks a minimum set of arcs
which, when removed from the graph, leave it acyclic.
In this work, we investigate several approximations for
computing a minimum feedback arc set with the goal of
comparing the quality of the solutions and the running
times. Our investigation is motivated by applications
in Social Network Analysis such as misinformation
removal and label propagation. We present careful
algorithmic engineering for multiple algorithms to
improve the scalability of each approach. In
particular, two approaches we optimize (one greedy and
one randomized) provide a nice balance between feedback
arc set size and running time complexity. We
experimentally compare the performance of a wide range
of algorithms on a broad selection of large online
networks including Twitter, LiveJournal, and the
ClueWeb12 dataset. The experiments reveal that our
greedy and randomized implementations outperform the
other approaches by simultaneously computing a feedback
arc set of competitive size and scaling to web-scale
graphs with billions of vertices and tens of billions
of arcs. Finally, we extend the algorithms considered
to the probabilistic case in which arcs are realized
with some fixed probability and provide detailed
experimental comparisons.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Antenucci:2016:DQP,
author = "Dolan Antenucci and Michael R. Anderson and Michael
Cafarella",
title = "A declarative query processing system for nowcasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "145--156",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowcasting is the practice of using social media data
to quantify ongoing real-world phenomena. It has been
used by researchers to measure flu activity,
unemployment behavior, and more. However, the typical
nowcasting workflow requires either slow and tedious
manual searching of relevant social media messages or
automated statistical approaches that are prone to
spurious and low-quality results. In this paper, we
propose a method for declaratively specifying a
nowcasting model; this method involves processing a
user query over a very large social media database,
which can take hours. Due to the human-in-the-loop
nature of constructing nowcasting models, slow runtimes
place an extreme burden on the user. Thus we also
propose a novel set of query optimization techniques,
which allow users to quickly construct nowcasting
models over very large datasets. Further, we propose a
novel query quality alarm that helps users estimate
phenomena even when historical ground truth data is not
available. These contributions allow us to build a
declarative nowcasting data management system,
RaccoonDB, which yields high-quality results in
interactive time. We evaluate RaccoonDB using 40
billion tweets collected over five years. We show that
our automated system saves work over traditional manual
approaches while improving result quality---57\% more
accurate in our user study---and that its query
optimizations yield a $ 424 \times $ speedup, allowing it
to process queries $ 123 \times $ faster than a 300-core Spark
cluster, using only 10\% of the computational
resources.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lulli:2016:NDS,
author = "Alessandro Lulli and Matteo Dell'Amico and Pietro
Michiardi and Laura Ricci",
title = "{NG-DBSCAN}: scalable density-based clustering for
arbitrary data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "157--168",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present NG-DBSCAN, an approximate density-based
clustering algorithm that operates on arbitrary data
and any symmetric distance measure. The distributed
design of our algorithm makes it scalable to very large
datasets; its approximate nature makes it fast, yet
capable of producing high quality clustering results.
We provide a detailed overview of the steps of
NG-DBSCAN, together with their analysis. Our results,
obtained through an extensive experimental campaign
with real and synthetic data, substantiate our claims
about NG-DBSCAN's performance and scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Neamtu:2016:ITS,
author = "Rodica Neamtu and Ramoza Ahsan and Elke Rundensteiner
and G{\'a}bor S{\'a}rk{\"o}zy",
title = "Interactive time series exploration powered by the
marriage of similarity distances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "169--180",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding similar trends among time series data is
critical for applications ranging from financial
planning to policy making. The detection of these
multifaceted relationships, especially time warped
matching of time series of different lengths and
alignments is prohibitively expensive to compute. To
achieve real time responsiveness on large time series
datasets, we propose a novel paradigm called Online
Exploration of Time Series (ONEX) employing a powerful
one-time preprocessing step that encodes critical
similarity relationships to support subsequent rapid
data exploration. Since the encoding of a huge number
of pairwise similarity relationships for all variable
lengths time series segments is not feasible, our work
rests on the important insight that clustering with
inexpensive point-to-point distances such as the
Euclidean Distance can support subsequent time warped
matching. Our ONEX framework overcomes the prohibitive
computational costs associated with a more robust
elastic distance namely the DTW by applying it over the
surprisingly compact knowledge base instead of the raw
data. Our comparative study reveals that ONEX is up to
19\% more accurate and several times faster than the
state-of-the-art. Beyond being a highly accurate and
fast domain independent solution, ONEX offers a truly
interactive exploration experience supporting novel
time series operations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2016:CLI,
author = "Youhuan Li and Lei Zou and Huaming Zhang and Dongyan
Zhao",
title = "Computing longest increasing subsequences over
sequential data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "181--192",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we propose a data structure, a
quadruple neighbor list (QN-list, for short), to
support real time queries of all longest increasing
subsequence (LIS) and LIS with constraints over
sequential data streams. The QN-List built by our
algorithm requires $ O(w) $ space, where w is the time
window size. The running time for building the initial
QN-List takes $ O(w \log w) $ time. Applying the
QN-List, insertion of the new item takes $ O(\log w) $
time and deletion of the first item takes $ O(w) $
time. To the best of our knowledge, this is the first
work to support both LIS enumeration and LIS with
constraints computation by using a single uniform data
structure for real time sequential data streams. Our
method outperforms the state-of-the-art methods in both
time and space cost, not only theoretically, but also
empirically.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chirigati:2016:KEU,
author = "Fernando Chirigati and Jialu Liu and Flip Korn and You
(Will) Wu and Cong Yu and Hao Zhang",
title = "Knowledge exploration using tables on the web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "193--204",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The increasing popularity of mobile device usage has
ushered in many features in modern search engines that
help users with various information needs. One of those
needs is Knowledge Exploration, where related documents
are returned in response to a user query, either
directly through right-hand side knowledge panels or
indirectly through navigable sections underneath
individual search results. Existing knowledge
exploration features have relied on a combination of
Knowledge Bases and query logs. In this paper, we
propose Knowledge Carousels of two modalities, namely
sideways and downwards, that facilitate exploration of
IS-A and HAS-A relationships, respectively, with regard
to an entity-seeking query, based on leveraging the
large corpus of tables on the Web. This brings many
technical challenges, including associating correct
carousels with the search entity, selecting the best
carousel from the candidates, and finding titles that
best describe the carousel. We describe how we address
these challenges and also experimentally demonstrate
through user studies that our approach produces better
result sets than baseline approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:HEI,
author = "Sibo Wang and Youze Tang and Xiaokui Xiao and Yin Yang
and Zengxiang Li",
title = "{HubPPR}: effective indexing for approximate
{Personalized Pagerank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "205--216",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Personalized PageRank (PPR) computation is a
fundamental operation in web search, social networks,
and graph analysis. Given a graph $G$, a source $s$,
and a target $t$, the PPR query $ \Pi (s, t)$ returns
the probability that a random walk on $G$ starting from
$s$ terminates at $t$. Unlike global PageRank which can
be effectively pre-computed and materialized, the PPR
result depends on both the source and the target,
rendering results materialization infeasible for large
graphs. Existing indexing techniques have rather
limited effectiveness; in fact, the current
state-of-the-art solution, BiPPR, answers individual
PPR queries without pre-computation or indexing, and
yet it outperforms all previous index-based solutions.
Motivated by this, we propose HubPPR, an effective
indexing scheme for PPR computation with controllable
tradeoffs for accuracy, query time, and memory
consumption. The main idea is to pre-compute and index
auxiliary information for selected hub nodes that are
often involved in PPR processing. Going one step
further, we extend HubPPR to answer top-$k$ PPR
queries, which returns the $k$ nodes with the highest
PPR values with respect to a source $s$, among a given
set $T$ of target nodes. Extensive experiments
demonstrate that compared to the current best solution
BiPPR, HubPPR achieves up to $ 10 \times $ and $ 220 \times $ speedup for
PPR and top-$k$ PPR processing, respectively, with
moderate memory consumption. Notably, with a single
commodity server, HubPPR answers a top-$k$ PPR query in
seconds on graphs with billions of edges, with high
accuracy and strong result quality guarantees.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lai:2016:SDS,
author = "Longbin Lai and Lu Qin and Xuemin Lin and Ying Zhang
and Lijun Chang and Shiyu Yang",
title = "Scalable distributed subgraph enumeration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "217--228",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Subgraph enumeration aims to find all the subgraphs of
a large data graph that are isomorphic to a given
pattern graph. As the subgraph isomorphism operation is
computationally intensive, researchers have recently
focused on solving this problem in distributed
environments, such as MapReduce and Pregel. Among them,
the state-of-the-art algorithm, Twin TwigJoin, is
proven to be instance optimal based on a left-deep join
framework. However, it is still not scalable to large
graphs because of the constraints in the left-deep join
framework and that each decomposed component (join
unit) must be a star. In this paper, we propose SEED
--- a scalable subgraph enumeration approach in the
distributed environment. Compared to Twin TwigJoin,
SEED returns optimal solution in a generalized join
framework without the constraints in Twin TwigJoin. We
use both star and clique as the join units, and design
an effective distributed graph storage mechanism to
support such an extension. We develop a comprehensive
cost model, that estimates the number of matches of any
given pattern graph by considering power-law degree
distribution in the data graph. We then generalize the
left-deep join framework and develop a
dynamic-programming algorithm to compute an optimal
bushy join plan. We also consider overlaps among the
join units. Finally, we propose clique compression to
further improve the algorithm by reducing the number of
the intermediate results. Extensive performance studies
are conducted on several real graphs, one containing
billions of edges. The results demonstrate that our
algorithm outperforms all other state-of-the-art
algorithms by more than one order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fujiwara:2016:FAL,
author = "Yasuhiro Fujiwara and Yasutoshi Ida and Junya Arai and
Mai Nishimura and Sotetsu Iwamura",
title = "Fast algorithm for the lasso based {$ L_1 $}-graph
construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "229--240",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The lasso-based $ L_1$-graph is used in many
applications since it can effectively model a set of
data points as a graph. The lasso is a popular
regression approach and the $ L_1$ -graph represents
data points as nodes by using the regression result.
More specifically, by solving the $ L_1$-optimization
problem of the lasso, the sparse regression
coefficients are used to obtain the weights of the
edges in the graph. Conventional graph structures such
as $k$-NN graph use two steps, adjacency searching and
weight selection, for constructing the graph whereas
the lasso-based $ L_1$ -graph derives the adjacency
structure as well as the edge weights simultaneously by
using a coordinate descent. However, the construction
cost of the lasso-based $ L_1$ -graph is impractical
for large data sets since the coordinate descent
iteratively updates the weights of all edges until
convergence. Our proposal, Castnet, can efficiently
construct the lasso-based $ L_1$ -graph. In order to
avoid updating the weights of all edges, we prune edges
that cannot have nonzero weights before entering the
iterations. In addition, we update edge weights only if
they are nonzero in the iterations. Experiments show
that Castnet is significantly faster than existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhai:2016:RTS,
author = "Ennan Zhai and Zhenhua Li and Zhenyu Li and Fan Wu and
Guihai Chen",
title = "Resisting tag spam by leveraging implicit user
behaviors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "241--252",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tagging systems are vulnerable to tag spam attacks.
However, defending against tag spam has been
challenging in practice, since adversaries can easily
launch spam attacks in various ways and scales. To
deeply understand users' tagging behaviors and explore
more effective defense, this paper first conducts
measurement experiments on public datasets of two
representative tagging systems: Del.icio.us and
CiteULike. Our key finding is that a significant
fraction of correct tag-resource annotations are
contributed by a small number of implicit similarity
cliques, where users annotate common resources with
similar tags. Guided by the above finding, we propose a
new service, called Spam-Resistance-as-a-Service (or
SRaaS), to effectively defend against heterogeneous tag
spam attacks even at very large scales. At the heart of
SRaaS is a novel reputation assessment protocol, whose
design leverages the implicit similarity cliques
coupled with the social networks inherent to typical
tagging systems. With such a design, SRaaS manages to
offer provable guarantees on diminishing the influence
of tag spam attacks. We build an SRaaS prototype and
evaluate it using a large-scale spam-oriented research
dataset (which is much more polluted by tag spam than
Del.icio.us and CiteULike datasets). Our evaluational
results demonstrate that SRaaS outperforms existing tag
spam defenses deployed in real-world systems, while
introducing low overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2016:GFE,
author = "Xiaowei Chen and Yongkun Li and Pinghui Wang and John
C. S. Lui",
title = "A general framework for estimating graphlet statistics
via random walk",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "253--264",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graphlets are induced subgraph patterns and have been
frequently applied to characterize the local topology
structures of graphs across various domains, e.g.,
online social networks (OSNs) and biological networks.
Discovering and computing graphlet statistics are
highly challenging. First, the massive size of
real-world graphs makes the exact computation of
graphlets extremely expensive. Secondly, the graph
topology may not be readily available so one has to
resort to web crawling using the available application
programming interfaces (APIs). In this work, we propose
a general and novel framework to estimate graphlet
statistics of ``any size.'' Our framework is based on
collecting samples through consecutive steps of random
walks. We derive an analytical bound on the sample size
(via the Chernoff--Hoeffding technique) to guarantee
the convergence of our unbiased estimator. To further
improve the accuracy, we introduce two novel
optimization techniques to reduce the lower bound on
the sample size. Experimental evaluations demonstrate
that our methods outperform the state-of-the-art method
up to an order of magnitude both in terms of accuracy
and time cost.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2016:FMS,
author = "Chunbin Lin and Benjamin Mandel and Yannis
Papakonstantinou and Matthias Springer",
title = "Fast in-memory {SQL} analytics on typed graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "265--276",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study a class of graph analytics SQL queries, which
we call relationship queries. These queries involving
aggregation, join, semijoin, intersection and selection
are a wide superset of fixed-length graph reachability
queries and of tree pattern queries. We present
real-world OLAP scenarios, where efficient relationship
queries are needed. However, row stores, column stores
and graph databases are unacceptably slow in such OLAP
scenarios. We propose a GQ-Fast database, which is an
indexed database that roughly corresponds to efficient
encoding of annotated adjacency lists that combines
salient features of column-based organization, indexing
and compression. GQ-Fast uses a bottom-up fully
pipelined query execution model, which enables (a)
aggressive compression (e.g., compressed bitmaps and
Huffman) and (b) avoids intermediate results that
consist of row IDs (which are typical in column
databases). GQ-Fast compiles query plans into
executable C++ source code. Besides achieving runtime
efficiency, GQ-Fast also reduces main memory
requirements because, unlike column databases, GQ-Fast
selectively allows dense forms of compression including
heavy-weight compressions, which do not support random
access. We used GQ-Fast to accelerate queries for two
OLAP dashboards in the biomedical field. GQ-Fast
outperforms PostgreSQL by 2--4 orders of magnitude and
MonetDB, Vertica and Neo4j by 1--3 orders of magnitude
when all of them are running on RAM. Our experiments
dissect GQ-Fast's advantage between (i) the use of
compiled code, (ii) the bottom-up pipelining execution
strategy, and (iii) the use of dense structures. Other
analysis and experiments show the space savings of
GQ-Fast due to the appropriate use of compression
methods. We also show that the runtime penalty incurred
by the dense compression methods decreases as the
number of CPU cores increases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2016:SDA,
author = "Zheng Li and Tingjian Ge",
title = "Stochastic data acquisition for answering queries as
time goes by",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "3",
pages = "277--288",
month = nov,
year = "2016",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Dec 1 09:02:03 MST 2016",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data and actions are tightly coupled. On one hand,
data analysis results trigger decision making and
actions. On the other hand, the action of acquiring
data is the very first step in the whole data
processing pipeline. Data acquisition almost always has
some costs, which could be either monetary costs or
computing resource costs such as sensor battery power,
network transfers, or I/O costs. Using out-dated data
to answer queries can avoid the data acquisition costs,
but there is a penalty of potentially inaccurate
results. Given a sequence of incoming queries over
time, we study the problem of sequential decision
making on when to acquire data and when to use existing
versions to answer each query. We propose two
approaches to solve this problem using reinforcement
learning and tailored locality-sensitive hashing. A
systematic empirical study using two real-world
datasets shows that our approaches are effective and
efficient.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dai:2016:FPI,
author = "Haipeng Dai and Muhammad Shahzad and Alex X. Liu and
Yuankun Zhong",
title = "Finding persistent items in data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "289--300",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025112",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Frequent item mining, which deals with finding items
that occur frequently in a given data stream over a
period of time, is one of the heavily studied problems
in data stream mining. A generalized version of
frequent item mining is the persistent item mining,
where a persistent item, unlike a frequent item, does
not necessarily occur more frequently compared to other
items over a short period of time, rather persists and
occurs more frequently over a long period of time. To
the best of our knowledge, there is no prior work on
mining persistent items in a data stream. In this
paper, we address the fundamental problem of finding
persistent items in a given data stream during a given
period of time at any given observation point. We
propose a novel scheme, PIE, that can accurately
identify each persistent item with a probability
greater than any desired false negative rate (FNR)
while using a very small amount of memory. The key idea
of PIE is that it uses Raptor codes to encode the ID of
each item that appears at the observation point during
a measurement period and stores only a few bits of the
encoded ID in the memory of that observation point
during that measurement period. The item that is
persistent occurs in enough measurement periods that
enough encoded bits for the ID can be retrieved from
the observation point to decode them correctly and get
the ID of the persistent item. We implemented and
extensively evaluated PIE using three real network
traffic traces and compared its performance with two
prior adapted schemes. Our results show that not only
PIE achieves the desired FNR in every scenario, its
FNR, on average, is 19.5 times smaller than the FNR of
the best adapted prior art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2016:BSD,
author = "Shuotao Xu and Sungjin Lee and Sang-Woo Jun and Ming
Liu and Jamey Hicks and Arvind",
title = "{BlueCache}: a scalable distributed flash-based
key--value store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "301--312",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025113",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A key--value store (KVS), such as memcached and Redis,
is widely used as a caching layer to augment the slower
persistent backend storage in data centers. DRAM-based
KVS provides fast key--value access, but its
scalability is limited by the cost, power and space
needed by the machine cluster to support a large amount
of DRAM. This paper offers a 10X to 100X cheaper
solution based on flash storage and hardware
accelerators. In BlueCache key--value pairs are stored
in flash storage and all KVS operations, including the
flash controller are directly implemented in hardware.
Furthermore, BlueCache includes a fast interconnect
between flash controllers to provide a scalable
solution. We show that BlueCache has 4.18X higher
throughput and consumes 25X less power than a
flash-backed KVS software implementation on x86
servers. We further show that BlueCache can outperform
DRAM-based KVS when the latter has more than 7.4\%
misses for a read-intensive application. BlueCache is an
attractive solution for both rack-level appliances and
data-center-scale key--value cache.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2016:GPP,
author = "Qi Fan and Dongxiang Zhang and Huayu Wu and Kian-Lee
Tan",
title = "A general and parallel platform for mining co-movement
patterns over large-scale trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "313--324",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025114",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Discovering co-movement patterns from large-scale
trajectory databases is an important mining task and
has a wide spectrum of applications. Previous studies
have identified several types of interesting
co-movement patterns and show-cased their usefulness.
In this paper, we make two key contributions to this
research field. First, we propose a more general
co-movement pattern to unify those defined in the past
literature. Second, we propose two types of parallel
and scalable frameworks and deploy them on Apache
Spark. To the best of our knowledge, this is the first
work to mine co-movement patterns in real life
trajectory databases with hundreds of millions of
points. Experiments on three real life large-scale
trajectory datasets have verified the efficiency and
scalability of our proposed solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shao:2016:VTE,
author = "Zhou Shao and Muhammad Aamir Cheema and David Taniar
and Hua Lu",
title = "{VIP-Tree}: an effective index for indoor spatial
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "325--336",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025115",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Due to the growing popularity of indoor location-based
services, indoor data management has received
significant research attention in the past few years.
However, we observe that the existing indexing and
query processing techniques for the indoor space do not
fully exploit the properties of the indoor space.
Consequently, they provide below par performance which
makes them unsuitable for large indoor venues with high
query workloads. In this paper, we propose two novel
indexes called Indoor Partitioning Tree (IP-Tree) and
Vivid IP-Tree (VIP-Tree) that are carefully designed by
utilizing the properties of indoor venues. The proposed
indexes are lightweight, have small pre-processing cost
and provide near-optimal performance for shortest
distance and shortest path queries. We also present
efficient algorithms for other spatial queries such as
k nearest neighbors queries and range queries. Our
extensive experimental study on real and synthetic data
sets demonstrates that our proposed indexes outperform
the existing algorithms by several orders of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arulraj:2016:WBL,
author = "Joy Arulraj and Matthew Perron and Andrew Pavlo",
title = "Write-behind logging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "337--348",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025116",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The design of the logging and recovery components of
database management systems (DBMSs) has always been
influenced by the difference in the performance
characteristics of volatile (DRAM) and non-volatile
storage devices (HDD/SSDs). The key assumption has been
that non-volatile storage is much slower than DRAM and
only supports block-oriented read/writes. But the
arrival of new non-volatile memory (NVM) storage that
is almost as fast as DRAM with fine-grained read/writes
invalidates these previous design choices. This paper
explores the changes that are required in a DBMS to
leverage the unique properties of NVM in systems that
still include volatile DRAM. We make the case for a new
logging and recovery protocol, called write-behind
logging, that enables a DBMS to recover nearly
instantaneously from system failures. The key idea is
that the DBMS logs what parts of the database have
changed rather than how it was changed. Using this
method, the DBMS flushes the changes to the database
{\em before} recording them in the log. Our evaluation shows
that this protocol improves a DBMS's transactional
throughput by 1.3$ \times $, reduces the recovery time
by more than two orders of magnitude, and shrinks the
storage footprint of the DBMS on NVM by 1.5$ \times $.
We also demonstrate that our logging protocol is
compatible with standard replication schemes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Papadopoulos:2016:TAD,
author = "Stavros Papadopoulos and Kushal Datta and Samuel
Madden and Timothy Mattson",
title = "The {TileDB} array data storage manager",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "349--360",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025117",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a novel storage manager for
multi-dimensional arrays that arise in scientific
applications, which is part of a larger scientific data
management system called TileDB. In contrast to
existing solutions, TileDB is optimized for both dense
and sparse arrays. Its key idea is to organize array
elements into ordered collections called fragments.
Each fragment is dense or sparse, and groups contiguous
array elements into data tiles of fixed capacity. The
organization into fragments turns random writes into
sequential writes, and, coupled with a novel read
algorithm, leads to very efficient reads. TileDB
enables parallelization via multi-threading and
multi-processing, offering thread-/process-safety and
atomicity via lightweight locking. We show that TileDB
delivers comparable performance to the HDF5 dense array
storage manager, while providing much faster random
writes. We also show that TileDB offers substantially
faster reads and writes than the SciDB array database
system with both dense and sparse arrays. Finally, we
demonstrate that TileDB is considerably faster than
adaptations of the Vertica relational column-store for
dense array storage management, and at least as fast
for the case of sparse arrays.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2016:DDA,
author = "Yudian Zheng and Guoliang Li and Reynold Cheng",
title = "{DOCS}: a domain-aware crowdsourcing system using
knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "361--372",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025118",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourcing is a new computing paradigm that
harnesses human effort to solve computer-hard problems,
such as entity resolution and photo tagging. The crowd
(or workers) have diverse qualities and it is important
to effectively model a worker's quality. Most of
existing worker models assume that workers have the
same quality on different tasks. In practice, however,
tasks belong to a variety of diverse domains, and
workers have different qualities on different domains.
For example, a worker who is a basketball fan should
have better quality for the task of labeling a photo
related to ``Stephen Curry'' than the one related to
``Leonardo DiCaprio''. In this paper, we study how to
leverage domain knowledge to accurately model a
worker's quality. We examine using knowledge base (KB),
e.g., Wikipedia and Freebase, to detect the domains of
tasks and workers. We develop Domain Vector Estimation,
which analyzes the domains of a task with respect to
the KB. We also study Truth Inference, which utilizes
the domain-sensitive worker model to accurately infer
the true answer of a task. We design an Online Task
Assignment algorithm, which judiciously and efficiently
assigns tasks to appropriate workers. To implement
these solutions, we have built DOCS, a system deployed
on the Amazon Mechanical Turk. Experiments show that
DOCS performs much better than the state-of-the-art
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2016:LHC,
author = "Yue Wang and Alexandra Meliou and Gerome Miklau",
title = "Lifting the haze off the cloud: a consumer-centric
market for database computation in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "373--384",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025119",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The availability of public computing resources in the
cloud has revolutionized data analysis, but requesting
cloud resources often involves complex decisions for
consumers. Estimating the completion time and cost of a
computation and requesting the appropriate cloud
resources are challenging tasks even for an expert
user. We propose a new market-based framework for
pricing computational tasks in the cloud. Our framework
introduces an agent between consumers and cloud
providers. The agent takes data and computational tasks
from users, estimates time and cost for evaluating the
tasks, and returns to consumers contracts that specify
the price and completion time. Our framework can be
applied directly to existing cloud markets without
altering the way cloud providers offer and price
services. In addition, it simplifies cloud use for
consumers by allowing them to compare contracts, rather
than choose resources directly. We present design,
analytical, and algorithmic contributions focusing on
pricing computation contracts, analyzing their
properties, and optimizing them in complex workflows.
We conduct an experimental evaluation of our market
framework over a real-world cloud service and
demonstrate empirically that our market ensures three
key properties: (a) that consumers benefit from using
the market due to competitiveness among agents, (b)
that agents have an incentive to price contracts
fairly, and (c) that inaccuracies in estimates do not
pose a significant risk to agents' profits. Finally, we
present a fine-grained pricing mechanism for complex
workflows and show that it can increase agent profits
by more than an order of magnitude in some cases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2016:TBO,
author = "Jia Yu and Mohamed Sarwat",
title = "Two birds, one stone: a fast, yet lightweight,
indexing scheme for modern database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "385--396",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025120",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Classic database indexes (e.g., B$^+$-Tree), though
speed up queries, suffer from two main drawbacks: (1)
An index usually yields 5\% to 15\% additional storage
overhead which results in non-ignorable dollar cost in
big data scenarios especially when deployed on modern
storage devices. (2) Maintaining an index incurs high
latency because the DBMS has to locate and update those
index pages affected by the underlying table changes.
This paper proposes Hippo a fast, yet scalable,
database indexing approach. It significantly shrinks
the index storage and mitigates maintenance overhead
without compromising much on the query execution
performance. Hippo stores disk page ranges instead of
tuple pointers in the indexed table to reduce the
storage space occupied by the index. It maintains
simplified histograms that represent the data
distribution and adopts a page grouping technique that
groups contiguous pages into page ranges based on the
similarity of their index key attribute distributions.
When a query is issued, Hippo leverages the page ranges
and histogram-based page summaries to recognize those
pages such that their tuples are guaranteed not to
satisfy the query predicates and inspects the remaining
pages. Experiments based on real and synthetic datasets
show that Hippo occupies up to two orders of magnitude
less storage space than that of the B$^+$-Tree while
still achieving comparable query execution performance
to that of the B$^+$-Tree for 0.1\%--1\%
selectivity factors. Also, the experiments show that
Hippo outperforms BRIN (Block Range Index) in executing
queries with various selectivity factors. Furthermore,
Hippo achieves up to three orders of magnitude less
maintenance overhead and up to an order of magnitude
higher throughput (for hybrid query/update workloads)
than its counterparts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2016:HMF,
author = "Zheng Li and Tingjian Ge",
title = "History is a mirror to the future: best-effort
approximate complex event matching with insufficient
resources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "397--408",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025121",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Complex event processing (CEP) has proven to be a
highly relevant topic in practice. As it is sensitive
to both errors in the stream and uncertainty in the
pattern, approximate complex event processing (ACEP) is
an important direction but has not been adequately
studied before. ACEP is costly, and is often performed
under insufficient computing resources. We propose an
algorithm that learns from the past behavior of ACEP
runs, and makes decisions on what to process first in
an online manner, so as to maximize the number of full
matches found. In addition, we devise effective
optimization techniques. Finally, we propose a
mechanism that uses reinforcement learning to
dynamically update the history structure without
incurring much overhead. Put together, these techniques
drastically improve the fraction of full matches found
in resource constrained environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Avni:2016:PHT,
author = "Hillel Avni and Trevor Brown",
title = "Persistent hybrid transactional memory for databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "409--420",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025122",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Processors with hardware support for transactional
memory (HTM) are rapidly becoming commonplace, and
processor manufacturers are currently working on
implementing support for upcoming non-volatile memory
(NVM) technologies. The combination of HTM and NVM
promises to be a natural choice for in-memory database
synchronization. However, limitations on the size of
hardware transactions and the lack of progress
guarantees by modern HTM implementations prevent some
applications from obtaining the full benefit of
hardware transactional memory. In this paper, we
propose a persistent hybrid TM algorithm called PHyTM
for systems that support NVM and HTM. PHyTM allows
hardware assisted ACID transactions to execute
concurrently with pure software transactions, which
allows applications to gain the benefit of persistent
HTM while simultaneously accommodating unbounded
transactions (with a high degree of concurrency).
Experimental simulations demonstrate that PHyTM is fast
and scalable for realistic workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2016:SOP,
author = "Liwen Sun and Michael J. Franklin and Jiannan Wang and
Eugene Wu",
title = "Skipping-oriented partitioning for columnar layouts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "421--432",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025123",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data volumes continue to grow, modern database
systems increasingly rely on data skipping mechanisms
to improve performance by avoiding access to irrelevant
data. Recent work [39] proposed a fine-grained
partitioning scheme that was shown to improve the
opportunities for data skipping in row-oriented
systems. Modern analytics and big data systems
increasingly adopt columnar storage schemes, and in
such systems, a row-based approach misses important
opportunities for further improving data skipping. The
flexibility of column-oriented organizations, however,
comes with the additional cost of tuple reconstruction.
In this paper, we develop Generalized Skipping-Oriented
Partitioning (GSOP), a novel hybrid data skipping
framework that takes into account these row-based and
column-based tradeoffs. In contrast to previous
column-oriented physical design work, GSOP considers
the tradeoffs between horizontal data skipping and
vertical partitioning jointly. Our experiments using
two public benchmarks and a real-world workload show
that GSOP can significantly reduce the amount of data
scanned and improve end-to-end query response times
over the state-of-the-art techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Singh:2016:EQU,
author = "Sneha Aman Singh and Divesh Srivastava and Srikanta
Tirthapura",
title = "Estimating quantiles from the union of historical and
streaming data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "433--444",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025124",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern enterprises generate huge amounts of streaming
data, for example, micro-blog feeds, financial data,
network monitoring and industrial application
monitoring. While Data Stream Management Systems have
proven successful in providing support for real-time
alerting, many applications, such as network monitoring
for intrusion detection and real-time bidding, require
complex analytics over historical and real-time data
over the data streams. We present a new method to
process one of the most fundamental analytical
primitives, quantile queries, on the union of
historical and streaming data. Our method combines an
index on historical data with a memory-efficient sketch
on streaming data to answer quantile queries with
accuracy-resource tradeoffs that are significantly
better than current solutions that are based solely on
disk-resident indexes or solely on streaming
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Serafini:2016:CFG,
author = "Marco Serafini and Rebecca Taft and Aaron J. Elmore
and Andrew Pavlo and Ashraf Aboulnaga and Michael
Stonebraker",
title = "{Clay}: fine-grained adaptive partitioning for general
database schemas",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "445--456",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025125",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Transaction processing database management systems
(DBMSs) are critical for today's data-intensive
applications because they enable an organization to
quickly ingest and query new information. Many of these
applications exceed the capabilities of a single
server, and thus their database has to be deployed in a
distributed DBMS. The key factor affecting such a
system's performance is how the database is
partitioned. If the database is partitioned
incorrectly, the number of distributed transactions can
be high. These transactions have to synchronize their
operations over the network, which is considerably
slower and leads to poor performance. Previous work on
elastic database repartitioning has focused on a
certain class of applications whose database schema can
be represented in a hierarchical tree structure. But
many applications cannot be partitioned in this manner,
and thus are subject to distributed transactions that
impede their performance and scalability. In this
paper, we present a new on-line partitioning approach,
called Clay, that supports both tree-based schemas and
more complex ``general'' schemas with arbitrary foreign
key relationships. Clay dynamically creates blocks of
tuples to migrate among servers during repartitioning,
placing no constraints on the schema but taking care to
balance load and reduce the amount of data migrated.
Clay achieves this goal by including in each block a
set of hot tuples and other tuples co-accessed with
these hot tuples. To evaluate our approach, we
integrate Clay in a distributed, main-memory DBMS and
show that it can generate partitioning schemes that
enable the system to achieve up to 15$ \times $ better
throughput and 99\% lower latency than existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Siddiqui:2016:EDE,
author = "Tarique Siddiqui and Albert Kim and John Lee and
Karrie Karahalios and Aditya Parameswaran",
title = "Effortless data exploration with zenvisage: an
expressive and interactive visual analytics system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "4",
pages = "457--468",
month = nov,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.14778/3025111.3025126",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data visualization is by far the most commonly used
mechanism to explore and extract insights from
datasets, especially by novice data scientists. And
yet, current visual analytics tools are rather limited
in their ability to operate on collections of
visualizations---by composing, filtering, comparing,
and sorting them---to find those that depict desired
trends or patterns. The process of visual data
exploration remains a tedious process of
trial-and-error. We propose zenvisage, a visual
analytics platform for effortlessly finding desired
visual patterns from large datasets. We introduce
zenvisage's general purpose visual exploration
language, ZQL (``zee-quel'') for specifying the desired
visual patterns, drawing from use-cases in a variety of
domains, including biology, mechanical engineering,
climate science, and commerce. We formalize the
expressiveness of ZQL via a visual exploration
algebra---an algebra on collections of
visualizations---and demonstrate that ZQL is as
expressive as that algebra. zenvisage exposes an
interactive front-end that supports the issuing of ZQL
queries, and also supports interactions that are
``short-cuts'' to certain commonly used ZQL queries. To
execute these queries, zenvisage uses a novel ZQL
graph-based query optimizer that leverages a suite of
optimizations tailored to the goal of processing
collections of visualizations in certain pre-defined
ways. Lastly, a user survey and study demonstrates that
data scientists are able to effectively use zenvisage
to eliminate error-prone and tedious exploration and
directly identify desired visualizations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ceccarello:2017:MSA,
author = "Matteo Ceccarello and Andrea Pietracaprina and Geppino
Pucci and Eli Upfal",
title = "{MapReduce} and streaming algorithms for diversity
maximization in metric spaces of bounded doubling
dimension",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "469--480",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a dataset of points in a metric space and an
integer $k$, a diversity maximization problem requires
determining a subset of $k$ points maximizing some
diversity objective measure, e.g., the minimum or the
average distance between two points in the subset.
Diversity maximization is computationally hard, hence
only approximate solutions can be hoped for. Although
its applications are mainly in massive data analysis,
most of the past research on diversity maximization
focused on the sequential setting. In this work we
present space and pass/round-efficient diversity
maximization algorithms for the Streaming and MapReduce
models and analyze their approximation guarantees for
the relevant class of metric spaces of bounded doubling
dimension. Like other approaches in the literature, our
algorithms rely on the determination of high-quality
core-sets, i.e., (much) smaller subsets of the input
which contain good approximations to the optimal
solution for the whole input. For a variety of
diversity objective functions, our algorithms attain an
$ (\alpha + \epsilon)$-approximation ratio, for any
constant $ \epsilon > 0$, where $ \alpha $ is the best
approximation ratio achieved by a polynomial-time,
linear-space sequential algorithm for the same
diversity objective. This improves substantially over
the approximation ratios attainable in Streaming and
MapReduce by state-of-the-art algorithms for general
metric spaces. We provide extensive experimental
evidence of the effectiveness of our algorithms on both
real world and synthetic datasets, scaling up to over a
billion points.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bindschaedler:2017:PDP,
author = "Vincent Bindschaedler and Reza Shokri and Carl A.
Gunter",
title = "Plausible deniability for privacy-preserving data
synthesis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "481--492",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Releasing full data records is one of the most
challenging problems in data privacy. On the one hand,
many of the popular techniques such as data
de-identification are problematic because of their
dependence on the background knowledge of adversaries.
On the other hand, rigorous methods such as the
exponential mechanism for differential privacy are
often computationally impractical to use for releasing
high dimensional data or cannot preserve high utility
of original data due to their extensive data
perturbation. This paper presents a criterion called
plausible deniability that provides a formal privacy
guarantee, notably for releasing sensitive datasets: an
output record can be released only if a certain amount
of input records are indistinguishable, up to a privacy
parameter. This notion does not depend on the
background knowledge of an adversary. Also, it can
efficiently be checked by privacy tests. We present
mechanisms to generate synthetic datasets with similar
statistical properties to the input data and the same
format. We study this technique both theoretically and
experimentally. A key theoretical result shows that,
with proper randomization, the plausible deniability
mechanism generates differentially private synthetic
data. We demonstrate the efficiency of this generative
technique on a large dataset; it is shown to preserve
the utility of original data with respect to various
statistical analysis and machine learning measures.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Verma:2017:ECP,
author = "Shiv Verma and Luke M. Leslie and Yosub Shin and
Indranil Gupta",
title = "An experimental comparison of partitioning strategies
in distributed graph processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "493--504",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we study the problem of choosing among
partitioning strategies in distributed graph processing
systems. To this end, we evaluate and characterize both
the performance and resource usage of different
partitioning strategies under various popular
distributed graph processing systems, applications,
input graphs, and execution environments. Through our
experiments, we found that no single partitioning
strategy is the best fit for all situations, and that
the choice of partitioning strategy has a significant
effect on resource usage and application run-time. Our
experiments demonstrate that the choice of partitioning
strategy depends on (1) the degree distribution of
input graph, (2) the type and duration of the
application, and (3) the cluster size. Based on our
results, we present rules of thumb to help users pick
the best partitioning strategy for their particular use
cases. We present results from each system, as well as
from all partitioning strategies implemented in one
common system (PowerLyra).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chandramouli:2017:SPR,
author = "Badrish Chandramouli and Jonathan Goldstein",
title = "Shrink: prescribing resiliency solutions for
streaming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "505--516",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Streaming query deployments make up a vital part of
cloud oriented applications. They vary widely in their
data, logic, and statefulness, and are typically
executed in multi-tenant distributed environments with
varying uptime SLAs. In order to achieve these SLAs,
one of a number of proposed resiliency strategies is
employed to protect against failure. This paper has
introduced the first, comprehensive, cloud friendly
comparison between different resiliency techniques for
streaming queries. In this paper, we introduce models
which capture the costs associated with different
resiliency strategies, and through a series of
experiments which implement and validate these models,
show that (1) there is no single resiliency strategy
which efficiently handles most streaming scenarios; (2)
the optimization space is too complex for a person to
employ a ``rules of thumb'' approach; and (3) there
exists a clear generalization of periodic checkpointing
that is worth considering in many cases. Finally, the
models presented in this paper can be adapted to fit a
wide variety of resiliency strategies, and likely have
important consequences for cloud services beyond those
that are obviously streaming.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Barthels:2017:DJA,
author = "Claude Barthels and Ingo M{\"u}ller and Timo Schneider
and Gustavo Alonso and Torsten Hoefler",
title = "Distributed join algorithms on thousands of cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "517--528",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional database operators such as joins are
relevant not only in the context of database engines
but also as a building block in many computational and
machine learning algorithms. With the advent of big
data, there is an increasing demand for efficient join
algorithms that can scale with the input data size and
the available hardware resources. In this paper, we
explore the implementation of distributed join
algorithms in systems with several thousand cores
connected by a low-latency network as used in high
performance computing systems or data centers. We
compare radix hash join to sort-merge join algorithms
and discuss their implementation at this scale. In the
paper, we explain how to use MPI to implement joins,
show the impact and advantages of RDMA, discuss the
importance of network scheduling, and study the
relative performance of sorting vs. hashing. The
experimental results show that the algorithms we
present scale well with the number of cores, reaching a
throughput of 48.7 billion input tuples per second on
4,096 cores.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2017:CBS,
author = "Junling Liu and Ke Deng and Huanliang Sun and Yu Ge
and Xiaofang Zhou and Christian S. Jensen",
title = "Clue-based spatio-textual query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "529--540",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Along with the proliferation of online digital map and
location-based service, very large POI (point of
interest) databases have been constructed where a
record corresponds to a POI with information including
name, category, address, geographical location and
other features. A basic spatial query in POI database
is POI retrieval. In many scenarios, a user cannot
provide enough information to pinpoint the POI except
some clue. For example, a user wants to identify a
caf{\'e} in a city visited many years ago. She cannot
remember the name and address but she still recalls
that ``the caf{\'e} is about 200 meters away from a
restaurant; and turning left at the restaurant there is
a bakery 500 meters away, etc.''. Intuitively, the
clue, even partial and approximate, describes the
spatio-textual context around the targeted POI.
Motivated by this observation, this work investigates
clue-based spatio-textual query which allows user
providing clue, i.e., some nearby POIs and the spatial
relationships between them, in POI retrieval. The
objective is to retrieve k POIs from a POI database
with the highest spatio-textual context similarities
against the clue. This work has deliberately designed
data-quality-tolerant spatio-textual context similarity
metric to cope with various data quality problems in
both the clue and the POI database. Through cross
validation, the query accuracy is further enhanced by
ensemble method. Also, this work has developed an index
called roll-out-star R-tree (RSR-tree) to dramatically
improve the query processing efficiency. The extensive
tests on data sets from the real world have verified
the superiority of our methods in all aspects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2017:TIC,
author = "Yudian Zheng and Guoliang Li and Yuanbing Li and
Caihua Shan and Reynold Cheng",
title = "Truth inference in crowdsourcing: is the problem
solved?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "541--552",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourcing has emerged as a novel problem-solving
paradigm, which facilitates addressing problems that
are hard for computers, e.g., entity resolution and
sentiment analysis. However, due to the openness of
crowdsourcing, workers may yield low-quality answers,
and a redundancy-based method is widely employed, which
first assigns each task to multiple workers and then
infers the correct answer (called truth) for the task
based on the answers of the assigned workers. A
fundamental problem in this method is Truth Inference,
which decides how to effectively infer the truth.
Recently, the database community and data mining
community independently study this problem and propose
various algorithms. However, these algorithms are not
compared extensively under the same framework and it is
hard for practitioners to select appropriate
algorithms. To alleviate this problem, we provide a
detailed survey on 17 existing algorithms and perform a
comprehensive evaluation using 5 real datasets. We make
all codes and datasets public for future research.
Through experiments we find that existing algorithms
are not stable across different datasets and there is
no algorithm that outperforms others consistently. We
believe that the truth inference problem is not fully
solved, and identify the limitations of existing
algorithms and point out promising research
directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Harding:2017:EDC,
author = "Rachael Harding and Dana {Van Aken} and Andrew Pavlo
and Michael Stonebraker",
title = "An evaluation of distributed concurrency control",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "553--564",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Increasing transaction volumes have led to a
resurgence of interest in distributed transaction
processing. In particular, partitioning data across
several servers can improve throughput by allowing
servers to process transactions in parallel. But
executing transactions across servers limits the
scalability and performance of these systems. In this
paper, we quantify the effects of distribution on
concurrency control protocols in a distributed
environment. We evaluate six classic and modern
protocols in an in-memory distributed database
evaluation framework called Deneva, providing an
apples-to-apples comparison between each. Our results
expose severe limitations of distributed transaction
processing engines. Moreover, in our analysis, we
identify several protocol-specific scalability
bottlenecks. We conclude that to achieve truly scalable
operation, distributed concurrency control solutions
must seek a tighter coupling with either novel network
hardware (in the local area) or applications (via data
modeling and semantically-aware execution), or both.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cui:2017:KLQ,
author = "Wanyun Cui and Yanghua Xiao and Haixun Wang and
Yangqiu Song and Seung-won Hwang and Wei Wang",
title = "{KBQA}: learning question answering over {QA} corpora
and knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "565--576",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Question answering (QA) has become a popular way for
humans to access billion-scale knowledge bases. Unlike
web search, QA over a knowledge base gives out accurate
and concise results, provided that natural language
questions can be understood and mapped precisely to
structured queries over the knowledge base. The
challenge, however, is that a human can ask one
question in many different ways. Previous approaches
have natural limits due to their representations: rule
based approaches only understand a small set of
``canned'' questions, while keyword based or synonym
based approaches cannot fully understand the questions.
In this paper, we design a new kind of question
representation: templates, over a billion scale
knowledge base and a million scale QA corpora. For
example, for questions about a city's population, we
learn templates such as What's the population of city?,
How many people are there in city?. We learned 27
million templates for 2782 intents. Based on these
templates, our QA system KBQA effectively supports
binary factoid questions, as well as complex questions
which are composed of a series of binary factoid
questions. Furthermore, we expand predicates in RDF
knowledge base, which boosts the coverage of knowledge
base by 57 times. Our QA system beats all other
state-of-art works on both effectiveness and efficiency
over QALD benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2017:PNL,
author = "Daniel Deutch and Nave Frost and Amir Gilad",
title = "Provenance for natural language queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "577--588",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multiple lines of research have developed Natural
Language (NL) interfaces for formulating database
queries. We build upon this work, but focus on
presenting a highly detailed form of the answers in NL.
The answers that we present are importantly based on
the provenance of tuples in the query result, detailing
not only the results but also their explanations. We
develop a novel method for transforming provenance
information to NL, by leveraging the original NL query
structure. Furthermore, since provenance information is
typically large and complex, we present two solutions
for its effective presentation as NL text: one that is
based on provenance factorization, with novel
desiderata relevant to the NL case, and one that is
based on summarization. We have implemented our
solution in an end-to-end system supporting questions,
answers and provenance, all expressed in NL. Our
experiments, including a user study, indicate the
quality of our solution and its scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2017:AAP,
author = "Yi Lu and Anil Shanbhag and Alekh Jindal and Samuel
Madden",
title = "{AdaptDB}: adaptive partitioning for distributed
joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "589--600",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big data analytics often involves complex join queries
over two or more tables. Such join processing is
expensive in a distributed setting both because large
amounts of data must be read from disk, and because of
data shuffling across the network. Many techniques
based on data partitioning have been proposed to reduce
the amount of data that must be accessed, often
focusing on finding the best partitioning scheme for a
particular workload, rather than adapting to changes in
the workload over time. In this paper, we present
AdaptDB, an adaptive storage manager for analytical
database workloads in a distributed setting. It works
by partitioning datasets across a cluster and
incrementally refining data partitioning as queries are
run. AdaptDB introduces a novel hyper-join that avoids
expensive data shuffling by identifying storage blocks
of the joining tables that overlap on the join
attribute, and only joining those blocks. Hyper-join
performs well when each block in one table overlaps
with few blocks in the other table, since that will
minimize the number of blocks that have to be accessed.
To minimize the number of overlapping blocks for common
join queries, AdaptDB uses smooth repartitioning to
repartition small portions of the tables on join
attributes as queries run. A prototype of AdaptDB
running on top of Spark improves query performance by
2--3x on TPC-H as well as real-world dataset, versus a
system that employs scans and shuffle-joins.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2017:EES,
author = "Zhipeng Zhang and Yingxia Shao and Bin Cui and Ce
Zhang",
title = "An experimental evaluation of {SimRank}-based
similarity search algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "601--612",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a graph, SimRank is one of the most popular
measures of the similarity between two vertices. We
focus on efficiently calculating SimRank, which has
been studied intensively over the last decade. This has
led to many algorithms that efficiently calculate or
approximate SimRank being proposed by researchers.
Despite these abundant research efforts, there is no
systematic comparison of these algorithms. In this
paper, we conduct a study to compare these algorithms
to understand their pros and cons. We first introduce a
taxonomy for different algorithms that calculate
SimRank and classify each algorithm into one of the
following three classes, namely, iterative-,
non-iterative-, and random walk-based method. We
implement ten algorithms published from 2002 to 2015,
and compare them using synthetic and real-world graphs.
To ensure the fairness of our study, our
implementations use the same data structure and
execution framework, and we try our best to optimize
each of these algorithms. Our study reveals that none
of these algorithms dominates the others: algorithms
based on iterative method often have higher accuracy
while algorithms based on random walk can be more
scalable. One noniterative algorithm has good
effectiveness and efficiency on graphs with medium
size. Thus, depending on the requirements of different
applications, the optimal choice of algorithms differs.
This paper provides an empirical guideline for making
such choices.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Faleiro:2017:HPT,
author = "Jose M. Faleiro and Daniel J. Abadi and Joseph M.
Hellerstein",
title = "High performance transactions via early write
visibility",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "613--624",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In order to guarantee recoverable transaction
execution, database systems permit a transaction's
writes to be observable only at the end of its
execution. As a consequence, there is generally a delay
between the time a transaction performs a write and the
time later transactions are permitted to read it. This
delayed write visibility can significantly impact the
performance of serializable database systems by
reducing concurrency among conflicting transactions.
This paper makes the observation that delayed write
visibility stems from the fact that database systems
can arbitrarily abort transactions at any point during
their execution. Accordingly, we make the case for
database systems which only abort transactions under a
restricted set of conditions, thereby enabling a new
recoverability mechanism, early write visibility, which
safely makes transactions' writes visible prior to the
end of their execution. We design a new serializable
concurrency control protocol, piece-wise visibility
(PWV), with the explicit goal of enabling early write
visibility. We evaluate PWV against state-of-the-art
serializable protocols and a highly optimized
implementation of read committed, and find that PWV can
outperform serializable protocols by an order of
magnitude and read committed by 3X on high contention
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eswaran:2017:ZBP,
author = "Dhivya Eswaran and Stephan G{\"u}nnemann and Christos
Faloutsos and Disha Makhija and Mohit Kumar",
title = "{ZooBP}: belief propagation for heterogeneous
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "5",
pages = "625--636",
month = jan,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a heterogeneous network, with nodes of different
types --- e.g., products, users and sellers from an
online recommendation site like Amazon --- and labels
for a few nodes ('honest', 'suspicious', etc), can we
find a closed formula for Belief Propagation (BP),
exact or approximate? Can we say whether it will
converge? BP, traditionally an inference algorithm for
graphical models, exploits so-called ``network
effects'' to perform graph classification tasks when
labels for a subset of nodes are provided; and it has
been successful in numerous settings like fraudulent
entity detection in online retailers and classification
in social networks. However, it does not have a
closed-form nor does it provide convergence guarantees
in general. We propose ZooBP, a method to perform fast
BP on undirected heterogeneous graphs with provable
convergence guarantees. ZooBP has the following
advantages: (1) Generality: It works on heterogeneous
graphs with multiple types of nodes and edges; (2)
Closed-form solution: ZooBP gives a closed-form
solution as well as convergence guarantees; (3)
Scalability: ZooBP is linear on the graph size and is
up to 600$ \times $ faster than BP, running on graphs
with 3.3 million edges in a few seconds. (4)
Effectiveness: Applied on real data (a Flipkart
e-commerce network with users, products and sellers),
ZooBP identifies fraudulent users with a near-perfect
precision of 92.3 \% over the top 300 results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lyu:2017:USV,
author = "Min Lyu and Dong Su and Ninghui Li",
title = "Understanding the sparse vector technique for
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "637--648",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Sparse Vector Technique (SVT) is a fundamental
technique for satisfying differential privacy and has
the unique quality that one can output some query
answers without apparently paying any privacy cost. SVT
has been used in both the interactive setting, where
one tries to answer a sequence of queries that are not
known ahead of the time, and in the non-interactive
setting, where all queries are known. Because of the
potential savings on privacy budget, many variants for
SVT have been proposed and employed in
privacy-preserving data mining and publishing. However,
most variants of SVT are actually not private. In this
paper, we analyze these errors and identify the
misunderstandings that likely contribute to them. We
also propose a new version of SVT that provides better
utility, and introduce an effective technique to
improve the performance of SVT. These enhancements can
be applied to improve utility in the interactive
setting. Through both analytical and experimental
comparisons, we show that, in the non-interactive
setting (but not the interactive setting), the SVT
technique is unnecessary, as it can be replaced by the
Exponential Mechanism (EM) with better accuracy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2017:OEA,
author = "Fan Zhang and Wenjie Zhang and Ying Zhang and Lu Qin
and Xuemin Lin",
title = "{OLAK}: an efficient algorithm to prevent unraveling
in social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "649--660",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we study the problem of the anchored
$k$-core. Given a graph $G$, an integer $k$ and a
budget $b$, we aim to identify $b$ vertices in $G$ so
that we can determine the largest induced subgraph $J$
in which every vertex, except the $b$ vertices, has at
least $k$ neighbors in $J$. This problem was introduced
by Bhawalkar and Kleinberg et al. in the context of
user engagement in social networks, where a user may
leave a community if he/she has less than $k$ friends
engaged. The problem has been shown to be NP-hard and
inapproximable. A polynomial-time algorithm for graphs
with bounded tree-width has been proposed. However,
this assumption usually does not hold in real-life
graphs, and their techniques cannot be extended to
handle general graphs. Motivated by this, we propose an
efficient algorithm, namely onion-layer based anchored
$k$-core (OLAK), for the anchored $k$-core problem on
large scale graphs. To facilitate computation of the
anchored $k$-core, we design an onion layer structure,
which is generated by a simple onion-peeling-like
algorithm against a small set of vertices in the graph.
We show that computation of the best anchor can simply
be conducted upon the vertices on the onion layers,
which significantly reduces the search space. Based on
the well-organized layer structure, we develop
efficient candidates exploration, early termination and
pruning techniques to further speed up computation.
Comprehensive experiments on 10 real-life graphs
demonstrate the effectiveness and efficiency of our
proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khan:2017:DTI,
author = "Meraj Khan and Larry Xu and Arnab Nandi and Joseph M.
Hellerstein",
title = "Data tweening: incremental visualization of data
transforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "661--672",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the context of interactive query sessions, it is
common to issue a succession of queries, transforming a
dataset to the desired result. It is often difficult to
comprehend a succession of transformations, especially
for complex queries. Thus, to facilitate understanding
of each data transformation and to provide continuous
feedback, we introduce the concept of ``data
tweening'', i.e., interpolating between resultsets,
presenting to the user a series of incremental visual
representations of a resultset transformation. We
present tweening methods that consider not just the
changes in the result, but also the changes in the
query. Through user studies, we show that data tweening
allows users to efficiently comprehend data transforms,
and also enables them to gain a better understanding of
the underlying query operations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bater:2017:SSQ,
author = "Johes Bater and Gregory Elliott and Craig Eggen and
Satyender Goel and Abel Kho and Jennie Rogers",
title = "{SMCQL}: secure querying for federated databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "673--684",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "People and machines are collecting data at an
unprecedented rate. Despite this newfound abundance of
data, progress has been slow in sharing it for open
science, business, and other data-intensive endeavors.
Many such efforts are stymied by privacy concerns and
regulatory compliance issues. For example, many
hospitals are interested in pooling their medical
records for research, but none may disclose arbitrary
patient records to researchers or other healthcare
providers. In this context we propose the Private Data
Network (PDN), a federated database for querying over
the collective data of mutually distrustful parties. In
a PDN, each member database does not reveal its tuples
to its peers nor to the query writer. Instead, the user
submits a query to an honest broker that plans and
coordinates its execution over multiple private
databases using secure multiparty computation (SMC).
Here, each database's query execution is oblivious, and
its program counters and memory traces are agnostic to
the inputs of others. We introduce a framework for
executing PDN queries named smcql. This system
translates SQL statements into SMC primitives to
compute query results over the union of its source
databases without revealing sensitive information about
individual tuples to peer data providers or the honest
broker. Only the honest broker and the querier receive
the results of a PDN query. For fast, secure query
evaluation, we explore a heuristics-driven optimizer
that minimizes the PDN's use of secure computation and
partitions its query evaluation into scalable slices.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zamanian:2017:EMD,
author = "Erfan Zamanian and Carsten Binnig and Tim Harris and
Tim Kraska",
title = "The end of a myth: distributed transactions can
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "685--696",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The common wisdom is that distributed transactions do
not scale. But what if distributed transactions could
be made scalable using the next generation of networks
and a redesign of distributed databases? There would no
longer be a need for developers to worry about
co-partitioning schemes to achieve decent performance.
Application development would become easier as data
placement would no longer determine how scalable an
application is. Hardware provisioning would be
simplified as the system administrator can expect a
linear scale-out when adding more machines rather than
some complex sub-linear function, which is highly
application specific. In this paper, we present the
design of our novel scalable database system NAM-DB and
show that distributed transactions with the very common
Snapshot Isolation guarantee can indeed scale using the
next generation of RDMA-enabled network technology
without any inherent bottlenecks. Our experiments with
the TPC-C benchmark show that our system scales
linearly to over 6.5 million new-order (14.5 million
total) distributed transactions per second on 56
machines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2017:NIG,
author = "Haohan Zhu and Xianrui Meng and George Kollios",
title = "{NED}: an inter-graph node metric based on edit
distance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "697--708",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Node similarity is fundamental in graph analytics.
However, node similarity between nodes in different
graphs (inter-graph nodes) has not received enough
attention yet. The inter-graph node similarity is
important in learning a new graph based on the
knowledge extracted from an existing graph (transfer
learning on graphs) and has applications in biological,
communication, and social networks. In this paper, we
propose a novel distance function for measuring
inter-graph node similarity with edit
distance, called NED. In NED, two nodes are compared
according to their local neighborhood topologies which
are represented as unordered $k$-adjacent trees, without
relying on any extra information. Due to the hardness
of computing tree edit distance on unordered trees
which is NP-Complete, we propose a modified tree edit
distance, called TED*, for comparing unordered and
unlabeled $k$-adjacent trees. TED* is a metric distance,
as the original tree edit distance, but more
importantly, TED* is polynomially computable. As a
metric distance, NED admits efficient indexing,
provides interpretable results, and shows to perform
better than existing approaches on a number of data
analysis tasks, including graph deanonymization.
Finally, the efficiency and effectiveness of NED are
empirically demonstrated using real-world graphs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2017:ECS,
author = "Yixiang Fang and Reynold Cheng and Xiaodong Li and
Siqiang Luo and Jiafeng Hu",
title = "Effective community search over large spatial graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "6",
pages = "709--720",
month = feb,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 25 09:01:51 MST 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Communities are prevalent in social networks,
knowledge graphs, and biological networks. Recently,
the topic of community search (CS) has received plenty
of attention. Given a query vertex, CS looks for a
dense subgraph that contains it. Existing CS solutions
do not consider the spatial extent of a community. They
can yield communities whose locations of vertices span
large areas. In applications that facilitate the
creation of social events (e.g., finding conference
attendees to join a dinner), it is important to find
groups of people who are physically close to each
other. In this situation, it is desirable to have a
spatial-aware community (or SAC), whose vertices are
close structurally and spatially. Given a graph G and a
query vertex q, we develop exact solutions for finding
an SAC that contains q. Since these solutions cannot
scale to large datasets, we have further designed three
approximation algorithms to compute an SAC. We have
performed an experimental evaluation for these
solutions on both large real and synthetic datasets.
Experimental results show that SAC is better than the
communities returned by existing solutions. Moreover,
our approximation solutions can find SACs accurately
and efficiently.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Szlichta:2017:ECD,
author = "Jaros{\l}aw Szlichta and Parke Godfrey and Lukasz
Golab and Mehdi Kargar and Divesh Srivastava",
title = "Effective and complete discovery of order dependencies
via set-based axiomatization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "721--732",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Integrity constraints (ICs) are useful for query
optimization and for expressing and enforcing
application semantics. However, formulating constraints
manually requires domain expertise, is prone to human
errors, and may be excessively time consuming,
especially on large datasets. Hence, proposals for
automatic discovery have been made for some classes of
ICs, such as functional dependencies (FDs), and
recently, order dependencies (ODs). ODs properly
subsume FDs, as they can additionally express business
rules involving order; e.g., an employee never has a
higher salary while paying lower taxes than another
employee. We present a new OD discovery algorithm
enabled by a novel polynomial mapping to a canonical
form of ODs, and a sound and complete set of axioms
(inference rules) for canonical ODs. Our algorithm has
exponential worst-case time complexity, $O(2^{|R|})$,
in the number of attributes $|R|$ and linear
complexity in the number of tuples. We prove that it
produces a complete and minimal set of ODs. Using real
and synthetic datasets, we experimentally show
orders-of-magnitude performance improvements over the
prior state-of-the-art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karnagel:2017:AWP,
author = "Tomas Karnagel and Dirk Habich and Wolfgang Lehner",
title = "Adaptive work placement for query processing on
heterogeneous computing resources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "733--744",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The hardware landscape is currently changing from
homogeneous multi-core systems towards heterogeneous
systems with many different computing units, each with
their own characteristics. This trend is a great
opportunity for database systems to increase the
overall performance if the heterogeneous resources can
be utilized efficiently. To achieve this, the main
challenge is to place the right work on the right
computing unit. Current approaches tackling this
placement for query processing assume that data
cardinalities of intermediate results can be correctly
estimated. However, this assumption does not hold for
complex queries. To overcome this problem, we propose
an adaptive placement approach being independent of
cardinality estimation of intermediate results. Our
approach is incorporated in a novel adaptive placement
sequence. Additionally, we implement our approach as an
extensible virtualization layer, to demonstrate the
broad applicability with multiple database systems. In
our evaluation, we clearly show that our approach
significantly improves OLAP query processing on
heterogeneous hardware, while being adaptive enough to
react to changing cardinalities of intermediate query
results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2017:LFE,
author = "Fan Yang and Fanhua Shang and Yuzhen Huang and James
Cheng and Jinfeng Li and Yunjian Zhao and Ruihao Zhao",
title = "{LFTF}: a framework for efficient tensor analytics at
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "745--756",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tensors are higher order generalizations of matrices
to model multi-aspect data, e.g., a set of purchase
records with the schema (user\_id, product\_id,
timestamp, feedback). Tensor factorization is a
powerful technique for generating a model from a
tensor, just like matrix factorization generates a
model from a matrix, but with higher accuracy and
richer information as more attributes are available in
a higher-order tensor than a matrix. The data model
obtained by tensor factorization can be used for
classification, recommendation, anomaly detection, and
so on. Though having a broad range of applications,
tensor factorization has not been popularly applied
compared with matrix factorization that has been widely
used in recommender systems, mainly due to the high
computational cost and poor scalability of existing
tensor factorization methods. Efficient and scalable
tensor factorization is particularly challenging
because real world tensor data are mostly sparse and
massive. In this paper, we propose a novel distributed
algorithm, called Lock-Free Tensor Factorization
(LFTF), which significantly improves the efficiency and
scalability of distributed tensor factorization by
exploiting asynchronous execution in a re-formulated
problem. Our experiments show that LFTF achieves much
higher CPU and network throughput than existing
methods, converges at least 17 times faster and scales
to much larger datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2017:LSM,
author = "Shalmoli Gupta and Ravi Kumar and Kefu Lu and Benjamin
Moseley and Sergei Vassilvitskii",
title = "Local search methods for $k$-means with outliers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "757--768",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of $k$-means clustering in the
presence of outliers. The goal is to cluster a set of
data points to minimize the variance of the points
assigned to the same cluster, with the freedom of
ignoring a small set of data points that can be labeled
as outliers. Clustering with outliers has received a
lot of attention in the data processing community, but
practical, efficient, and provably good algorithms
remain unknown for the most popular $k$-means objective.
Our work proposes a simple local search-based algorithm
for $k$-means clustering with outliers. We prove that
this algorithm achieves constant-factor approximate
solutions and can be combined with known sketching
techniques to scale to large data sets. Using empirical
evaluation on both synthetic and large-scale real-world
data, we demonstrate that the algorithm dominates
recently proposed heuristic approaches for the
problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Casanova:2017:DTR,
author = "Guillaume Casanova and Elias Englmeier and Michael E.
Houle and Peer Kr{\"o}ger and Michael Nett and Erich
Schubert and Arthur Zimek",
title = "Dimensional testing for reverse $k$-nearest neighbor
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "769--780",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a query object $q$, reverse $k$-nearest neighbor
(R$k$NN) search aims to locate those objects of the
database that have $q$ among their $k$-nearest neighbors.
In this paper, we propose an approximation method for
solving R$k$NN queries, where the pruning operations
and termination tests are guided by a characterization
of the intrinsic dimensionality of the data. The method
can accommodate any index structure supporting
incremental (forward) nearest-neighbor search for the
generation and verification of candidates, while
avoiding impractically-high preprocessing costs. We
also provide experimental evidence that our method
significantly outperforms its competitors in terms of
the tradeoff between execution time and the quality of
the approximation. Our approach thus addresses many of
the scalability issues surrounding the use of previous
methods in data mining.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2017:EEM,
author = "Yingjun Wu and Joy Arulraj and Jiexi Lin and Ran Xian
and Andrew Pavlo",
title = "An empirical evaluation of in-memory multi-version
concurrency control",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "781--792",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-version concurrency control (MVCC) is currently
the most popular transaction management scheme in
modern database management systems (DBMSs). Although
MVCC was discovered in the late 1970s, it is used in
almost every major relational DBMS released in the last
decade. Maintaining multiple versions of data
potentially increases parallelism without sacrificing
serializability when processing transactions. But
scaling MVCC in a multi-core and in-memory setting is
non-trivial: when there are a large number of threads
running in parallel, the synchronization overhead can
outweigh the benefits of multi-versioning. To
understand how MVCC performs when processing
transactions in modern hardware settings, we conduct an
extensive study of the scheme's four key design
decisions: concurrency control protocol, version
storage, garbage collection, and index management. We
implemented state-of-the-art variants of all of these
in an in-memory DBMS and evaluated them using OLTP
workloads. Our analysis identifies the fundamental
bottlenecks of each design choice.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2017:FDH,
author = "You Wu and Junyang Gao and Pankaj K. Agarwal and Jun
Yang",
title = "Finding diverse, high-value representatives on a
surface of answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "793--804",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In many applications, the system needs to selectively
present a small subset of answers to users. The set of
all possible answers can be seen as an elevation
surface over a domain, where the elevation measures the
quality of each answer, and the dimensions of the
domain correspond to attributes of the answers with
which similarity between answers can be measured. This
paper considers the problem of finding a diverse set of
$k$ high-quality representatives for such a surface. We
show that existing methods for diversified top-$k$ and
weighted clustering problems are inadequate for this
problem. We propose $k$-DHR as a better formulation for
the problem. We show that $k$-DHR has a submodular and
monotone objective function, and we develop efficient
algorithms for solving $k$-DHR with provable guarantees.
We conduct extensive experiments to demonstrate the
usefulness of the results produced by $k$-DHR for
applications in computational lead-finding and
fact-checking, as well as the efficiency and
effectiveness of our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2017:RTI,
author = "Yanhao Wang and Qi Fan and Yuchen Li and Kian-Lee
Tan",
title = "Real-time influence maximization on dynamic social
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "805--816",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Influence maximization (IM), which selects a set of
$k$ users (called seeds) to maximize the influence
spread over a social network, is a fundamental problem
in a wide range of applications such as viral marketing
and network monitoring. Existing IM solutions fail to
consider the highly dynamic nature of social influence,
which results in either poor seed qualities or long
processing time when the network evolves. To address
this problem, we define a novel IM query named Stream
Influence Maximization (SIM) on social streams.
Technically, SIM adopts the sliding window model and
maintains a set of $k$ seeds with the largest influence
value over the most recent social actions. Next, we
propose the Influential Checkpoints (IC) framework to
facilitate continuous SIM query processing. The IC
framework creates a checkpoint for each window shift
and ensures an $ \epsilon $-approximate solution. To
improve its efficiency, we further devise a Sparse
Influential Checkpoints (SIC) framework which
selectively keeps $O(\log N / \beta)$ checkpoints for
a sliding window of size $N$ and maintains an
$\epsilon (1 - \beta) / 2$-approximate solution.
Experimental results on both real-world and synthetic
datasets confirm the effectiveness and efficiency of
our proposed frameworks against the state-of-the-art IM
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cai:2017:CDC,
author = "Hongyun Cai and Vincent W. Zheng and Fanwei Zhu and
Kevin Chen-Chuan Chang and Zi Huang",
title = "From community detection to community profiling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "817--828",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most existing community-related studies focus on
detection, which aim to find the community membership
for each user from user friendship links. However,
membership alone, without a complete profile of what a
community is and how it interacts with other
communities, has limited applications. This motivates
us to consider systematically profiling the communities
and thereby developing useful community-level
applications. In this paper, we for the first time
formalize the concept of community profiling. With rich
user information on the network, such as user published
content and user diffusion links, we characterize a
community in terms of both its internal content profile
and external diffusion profile. The difficulty of
community profiling is often underestimated. We novelly
identify three unique challenges and propose a joint
Community Profiling and Detection (CPD) model to
address them accordingly. We also contribute a scalable
inference algorithm, which scales linearly with the
data size and it is easily parallelizable. We evaluate
CPD on large-scale real-world data sets, and show that
it is significantly better than the state-of-the-art
baselines in various tasks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jain:2017:UWD,
author = "Ayush Jain and Akash Das Sarma and Aditya Parameswaran
and Jennifer Widom",
title = "Understanding workers, developing effective tasks, and
enhancing marketplace dynamics: a study of a large
crowdsourcing marketplace",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "829--840",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We conduct an experimental analysis of a dataset
comprising over 27 million microtasks performed by over
70,000 workers issued to a large crowdsourcing
marketplace between 2012--2016. Using this data---never
before analyzed in an academic context---we shed light
on three crucial aspects of crowdsourcing: (1) Task
design---helping requesters understand what constitutes
an effective task, and how to go about designing one;
(2) Marketplace dynamics --- helping marketplace
administrators and designers understand the interaction
between tasks and workers, and the corresponding
marketplace load; and (3) Worker behavior ---
understanding worker attention spans, lifetimes, and
general behavior, for the improvement of the
crowdsourcing ecosystem as a whole.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2017:OPE,
author = "Xuelian Lin and Shuai Ma and Han Zhang and Tianyu Wo
and Jinpeng Huai",
title = "One-pass error bounded trajectory simplification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "7",
pages = "841--852",
month = mar,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Mar 27 20:45:15 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowadays, various sensors are collecting, storing and
transmitting tremendous trajectory data, and it is
known that raw trajectory data seriously wastes the
storage, network band and computing resource. Line
simplification (LS) algorithms are an effective
approach to attacking this issue by compressing data
points in a trajectory to a set of continuous line
segments, and are commonly used in practice. However,
existing LS algorithms are not sufficient for the needs
of sensors in mobile devices. In this study, we first
develop a one-pass error bounded trajectory
simplification algorithm (OPERB), which scans each data
point in a trajectory once and only once. We then
propose an aggressive one-pass error bounded trajectory
simplification algorithm (OPERB-A), which allows
interpolating new data points into a trajectory under
certain conditions. Finally, we experimentally verify
that our approaches (OPERB and OPERB-A) are both
efficient and effective, using four real-life
trajectory datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(8) entry; DOI, ISSN, and page range present and
%%% consistent with the neighboring entries from the same issue.
@Article{Wang:2017:MIL,
author = "Jianguo Wang and Chunbin Lin and Ruining He and Moojin
Chae and Yannis Papakonstantinou and Steven Swanson",
title = "{MILC}: inverted list compression in memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "8",
pages = "853--864",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3090163.3090164",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Inverted list compression is a topic that has been
studied for 50 years due to its fundamental importance
in numerous applications including information
retrieval, databases, and graph analytics. Typically,
an inverted list compression algorithm is evaluated on
its space overhead and query processing time. Earlier
list compression designs mainly focused on minimizing
the space overhead to reduce expensive disk I/O time in
disk-oriented systems. But the recent trend is shifted
towards reducing query processing time because the
underlying systems tend to be memory-resident. Although
there are many highly optimized compression approaches
in main memory, there is still a considerable
performance gap between query processing over
compressed lists and uncompressed lists, which
motivates this work. In this work, we set out to bridge
this performance gap for the first time by proposing a
new compression scheme, namely, MILC (memory inverted
list compression). MILC relies on a series of
techniques including offset-oriented fixed-bit
encoding, dynamic partitioning, in-block compression,
cache-aware optimization, and SIMD acceleration. We
conduct experiments on three real-world datasets in
information retrieval, databases, and graph analytics
to demonstrate the high performance and low space
overhead of MILC. We compare MILC with 12 recent
compression algorithms and experimentally show that
MILC improves the query performance by up to 13.2$
\times $ and reduces the space overhead by up to 4.7$
\times $.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(8) entry; DOI present. NOTE(review): the title
%%% spells the system name with a TeX en-dash (Cumulon--D) while the
%%% abstract uses a plain hyphen (Cumulon-D) --- confirm the intended
%%% hyphenation against the published paper before normalizing.
@Article{Huang:2017:CDD,
author = "Botong Huang and Jun Yang",
title = "{C{\"u}m{\"u}l{\"o}n--D}: data analytics in a dynamic
spot market",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "8",
pages = "865--876",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3090163.3090165",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a system called C{\"u}m{\"u}l{\"o}n-D for
matrix-based data analysis in a spot market of a public
cloud. Prices in such markets fluctuate over time:
while users can acquire machines usually at a very low
bid price, the cloud can terminate these machines as
soon as the market price exceeds their bid price. The
distinguishing features of C{\"u}m{\"u}l{\"o}n-D
include its continuous, proactive adaptation to the
changing market, and its ability to quantify and
control the monetary risk involved in paying for a
workflow execution. We solve the dynamic optimization
problem in a principled manner with a Markov decision
process, and account for practical details that are
often ignored previously but nonetheless important to
performance. We evaluate C{\"u}m{\"u}l{\"o}n-D's
effectiveness and advantages over previous approaches
with experiments on Amazon EC2.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(8) entry; DOI, ISSN, and page range present;
%%% author names use brace-protected TeX accents per house convention.
@Article{Then:2017:AAT,
author = "Manuel Then and Timo Kersten and Stephan G{\"u}nnemann
and Alfons Kemper and Thomas Neumann",
title = "Automatic algorithm transformation for efficient
multi-snapshot analytics on temporal graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "8",
pages = "877--888",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3090163.3090166",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analytical graph algorithms commonly compute metrics
for a graph at one point in time. In practice it is
often also of interest how metrics change over time,
e.g., to find trends. For this purpose, algorithms must
be executed for multiple graph snapshots. We present
Single Algorithm Multiple Snapshots (SAMS), a novel
approach to execute algorithms concurrently for
multiple graph snapshots. SAMS automatically transforms
graph algorithms to leverage similarities between the
analyzed graph snapshots. The automatic transformation
interleaves algorithm executions on multiple snapshots,
synergistically shares their graph accesses and
traversals, and optimizes the algorithm's data layout.
Thus, SAMS can amortize the cost of random data
accesses and improve memory bandwidth utilization---two
main cost factors in graph analytics. We extensively
evaluate SAMS using six well-known algorithms and
multiple synthetic as well as real-world graph
datasets. Our measurements show that in multi-snapshot
analyses, SAMS offers runtime improvements of up to two
orders of magnitude over traditional snapshot-at-a-time
execution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(8) entry; DOI, ISSN, and page range present and
%%% consistent with the neighboring entries from the same issue.
@Article{Zhu:2017:LAM,
author = "Jianqiao Zhu and Navneet Potti and Saket Saurabh and
Jignesh M. Patel",
title = "Looking ahead makes query plans robust: making the
initial case with in-memory star schema data warehouse
workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "8",
pages = "889--900",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3090163.3090167",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimizers and query execution engines cooperate
to deliver high performance on complex analytic
queries. Typically, the optimizer searches through the
plan space and sends a selected plan to the execution
engine. However, optimizers may at times miss the
optimal plan, with sometimes disastrous impact on
performance. In this paper, we develop the notion of
robustness of a query evaluation strategy with respect
to a space of query plans. We also propose a novel
query execution strategy called Lookahead Information
Passing (LIP) that is robust with respect to the space
of (fully pipeline-able) left-deep query plan trees for
in-memory star schema data warehouses. LIP ensures that
execution times for the best and the worst case plans
are far closer than without LIP. In fact, under certain
assumptions of independent and uniform distributions,
any plan in that space is theoretically guaranteed to
execute in near-optimal time. LIP ensures that the
execution time for every plan in the space is
nearly-optimal. In this paper, we also evaluate these
claims using workloads that include skew and
correlation. With LIP we make an initial foray into a
novel way of thinking about robustness from the
perspective of query evaluation, where we develop
strategies (like LIP) that collapse plan sub-spaces in
the overall global plan space.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(8) entry; DOI present; acronym HPC brace-protected
%%% in the title per house convention.
@Article{Anderson:2017:BGB,
author = "Michael Anderson and Shaden Smith and Narayanan
Sundaram and Mihai Capota and Zheguang Zhao and
Subramanya Dulloor and Nadathur Satish and Theodore L.
Willke",
title = "Bridging the gap between {HPC} and big data
frameworks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "8",
pages = "901--912",
month = apr,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3090163.3090168",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Apache Spark is a popular framework for data analytics
with attractive features such as fault tolerance and
interoperability with the Hadoop ecosystem.
Unfortunately, many analytics operations in Spark are
an order of magnitude or more slower compared to native
implementations written with high performance computing
tools such as MPI. There is a need to bridge the
performance gap while retaining the benefits of the
Spark ecosystem such as availability, productivity, and
fault tolerance. In this paper, we propose a system for
integrating MPI with Spark and analyze the costs and
benefits of doing so for four distributed graph and
machine learning applications. We show that offloading
computation to an MPI environment from within Spark
provides 3.1--17.7$ \times $ speedups on the four
sparse applications, including all of the overheads.
This opens up an avenue to reuse existing MPI libraries
in Spark with little effort.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it.
@Article{Huang:2017:RSS,
author = "Keke Huang and Sibo Wang and Glenn Bevilacqua and
Xiaokui Xiao and Laks V. S. Lakshmanan",
title = "Revisiting the stop-and-stare algorithms for influence
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "913--924",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Influence maximization is a combinatorial optimization
problem that finds important applications in viral
marketing, feed recommendation, etc. Recent research
has led to a number of scalable approximation
algorithms for influence maximization, such as TIM$^+$
and IMM, and more recently, SSA and D-SSA. The goal of
this paper is to conduct a rigorous theoretical and
experimental analysis of SSA and D-SSA and compare them
against the preceding algorithms. In doing so, we
uncover inaccuracies in previously reported technical
results on the accuracy and efficiency of SSA and
D-SSA, which we set right. We also attempt to reproduce
the original experiments on SSA and D-SSA, based on
which we provide interesting empirical insights. Our
evaluation confirms some results reported from the
original experiments, but it also reveals anomalies in
some other results and sheds light on the behavior of
SSA and D-SSA in some important settings not considered
previously. We also report on the performance of
SSA-Fix, our modification to SSA in order to restore
the approximation guarantee that was claimed for but
not enjoyed by SSA. Overall, our study suggests that
there exist opportunities for further scaling up
influence maximization with approximation guarantees.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Fixed a line-break
%%% hyphenation artifact in the abstract (state- of-the-art).
@Article{Wang:2017:LSR,
author = "Xubo Wang and Lu Qin and Xuemin Lin and Ying Zhang and
Lijun Chang",
title = "Leveraging set relations in exact set similarity
join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "925--936",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Exact set similarity join, which finds all the similar
set pairs from two collections of sets, is a
fundamental problem with a wide range of applications.
The existing solutions for set similarity join follow a
filtering-verification framework, which generates a
list of candidate pairs through scanning indexes in the
filtering phase, and reports those similar pairs in the
verification phase. Though much research has been
conducted on this problem, set relations, which we find
out is quite effective on improving the algorithm
efficiency through computational cost sharing, have
never been studied. Therefore, in this paper, instead
of considering each set individually, we explore the
set relations in different levels to reduce the overall
computational costs. First, it has been shown that most
of the computational time is spent on the filtering
phase, which can be quadratic to the number of sets in
the worst case for the existing solutions. Thus we
explore index-level set relations to reduce the
filtering cost to be linear to the size of the input
while keeping the same filtering power. We achieve this
by grouping related sets into blocks in the index and
skipping useless index probes in joins. Second, we
explore answer-level set relations to further improve
the algorithm based on the intuition that if two sets
are similar, their answers may have a large overlap. We
derive an algorithm which incrementally generates the
answer of one set from an already computed answer of
another similar set rather than compute the answer from
scratch to reduce the computational cost. Finally, we
conduct extensive performance studies using 21 real
datasets with various data properties from a wide range
of domains. The experimental results demonstrate that
our algorithm outperforms all the existing algorithms
across all datasets and can achieve more than an order
of magnitude speedup against the state-of-the-art
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Acronyms READS and SimRank
%%% are brace-protected in the title per house convention.
@Article{Jiang:2017:RRW,
author = "Minhao Jiang and Ada Wai-Chee Fu and Raymond Chi-Wing
Wong",
title = "{READS}: a random walk approach for efficient and
accurate dynamic {SimRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "937--948",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Similarity among entities in graphs plays a key role
in data analysis and mining. SimRank is a widely used
and popular measurement to evaluate the similarity
among the vertices. In real-life applications, graphs
do not only grow in size, requiring fast and precise
SimRank computation for large graphs, but also change
and evolve continuously over time, demanding an
efficient maintenance process to handle dynamic
updates. In this paper, we propose a random walk based
indexing scheme to compute SimRank efficiently and
accurately over large dynamic graphs. We show that our
algorithm outperforms the state-of-the-art static and
dynamic SimRank algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it.
@Article{Huang:2017:ADC,
author = "Xin Huang and Laks V. S. Lakshmanan",
title = "Attribute-driven community search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "949--960",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, community search over graphs has gained
significant interest. In applications such as analysis
of protein-protein interaction (PPI) networks, citation
graphs, and collaboration networks, nodes tend to have
attributes. Unfortunately, most previous community
search algorithms ignore attributes and result in
communities with poor cohesion w.r.t. their node
attributes. In this paper, we study the problem of
attribute-driven community search, that is, given an
undirected graph G where nodes are associated with
attributes, and an input query Q consisting of nodes
V$_q$ and attributes W$_q$, find the communities
containing V$_q$, in which most community members are
densely inter-connected and have similar attributes. We
formulate this problem as finding attributed truss
communities (ATC), i.e., finding connected and close
k-truss subgraphs containing V$_q$, with the largest
attribute relevance score. We design a framework of
desirable properties that good score function should
satisfy. We show that the problem is NP-hard. However,
we develop an efficient greedy algorithmic framework to
iteratively remove nodes with the least popular
attributes, and shrink the graph into an ATC. In
addition, we also build an elegant index to maintain k
-truss structure and attribute information, and propose
efficient query processing algorithms. Extensive
experiments on large real-world networks with
ground-truth communities show that our algorithms
significantly outperform the state of the art and
demonstrates their efficiency and effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Fixed a line-break
%%% hyphenation artifact in the abstract (bias- aware); the entry's own
%%% title confirms the hyphenated form.
@Article{Chen:2017:BAS,
author = "Jiecao Chen and Qin Zhang",
title = "Bias-aware sketches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "961--972",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Linear sketching algorithms have been widely used for
processing large-scale distributed and streaming
datasets. Their popularity is largely due to the fact
that linear sketches can be naturally composed in the
distributed model and be efficiently updated in the
streaming model. The errors of linear sketches are
typically expressed in terms of the sum of coordinates
of the input vector excluding those largest ones, or,
the mass on the tail of the vector. Thus, the
precondition for these algorithms to perform well is
that the mass on the tail is small, which is, however,
not always the case --- in many real-world datasets the
coordinates of the input vector have a bias, which will
generate a large mass on the tail. In this paper we
propose linear sketches that are bias-aware. We
rigorously prove that they achieve strictly better
error guarantees than the corresponding existing
sketches, and demonstrate their practicality and
superiority via an extensive experimental evaluation on
both real and synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Abstract math is wrapped
%%% in $...$ per house convention.
@Article{Cao:2017:DDA,
author = "Yang Cao and Wenfei Fan",
title = "Data driven approximation with bounded resources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "973--984",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper proposes BEAS, a resource-bounded scheme
for querying relations. It is parameterized with a
resource ratio $ \alpha \in (0, 1] $, indicating that
given a big dataset D, we can only afford to access an
$ \alpha $-fraction of D with limited resources. For a
query Q posed on D, BEAS computes exact answers Q(D) if
doable and otherwise approximate answers, by accessing
at most $ \alpha | D |$ amount of data in the entire
process. Underlying BEAS are (1) an access schema,
which helps us identify and fetch the part of data
needed to answer Q, (2) an accuracy measure to assess
approximate answers in terms of their relevance and
coverage w.r.t. exact answers, (3) an Approximability
Theorem for the feasibility of resource-bounded
approximation, and (4) algorithms for query evaluation
with bounded resources. A unique feature of BEAS is its
ability to answer unpredictable queries, aggregate or
not, using bounded resources and assuring a
deterministic accuracy lower bound. Using real-life and
synthetic data, we empirically verify the effectiveness
and efficiency of BEAS.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(9) errata entry; cross-references the corrected
%%% paper via the note field (cite key Khayyat:2015:LFS). No DOI field in
%%% the source metadata --- TODO: confirm the assigned DOI and add it.
@Article{Khayyat:2017:ELF,
author = "Zuhair Khayyat and William Lucia and Meghna Singh and
Mourad Ouzzani and Paolo Papotti and Jorge-Arnulfo
Quian{\'e}-Ruiz and Nan Tang and Panos Kalnis",
title = "Errata for {``Lightning Fast and Space Efficient
Inequality Joins'' (PVLDB 8(13): 2074--2085)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "9",
pages = "985--985",
month = may,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{Khayyat:2015:LFS}.",
abstract = "This is in response to recent feedback from some
readers, which requires some clarifications regarding
our IEJoin algorithm published in [1]. The feedback
revolves around four points: (1) a typo in our
illustrating example of the join process; (2) a naming
error for the index used by our algorithm to improve
the bit array scan; (3) the sort order used in our
algorithms; and (4) a missing explanation on how
duplicates are handled by our self join algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it.
@Article{Qin:2017:SAG,
author = "Chengjie Qin and Martin Torres and Florin Rusu",
title = "Scalable asynchronous gradient descent optimization
for out-of-core models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "986--997",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing data analytics systems have approached
predictive model training exclusively from a
data-parallel perspective. Data examples are
partitioned to multiple workers and training is
executed concurrently over different partitions, under
various synchronization policies that emphasize speedup
or convergence. Since models with millions and even
billions of features become increasingly common
nowadays, model management becomes an equally important
task for effective training. In this paper, we present
a general framework for parallelizing stochastic
optimization algorithms over massive models that cannot
fit in memory. We extend the lock-free HOGWILD!-family
of algorithms to disk-resident models by vertically
partitioning the model offline and asynchronously
updating the resulting partitions online. Unlike
HOGWILD!, concurrent requests to the common model are
minimized by a preemptive push-based sharing mechanism
that reduces the number of disk accesses. Experimental
results on real and synthetic datasets show that the
proposed framework achieves improved convergence over
HOGWILD! and is the only solution scalable to massive
models.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Title and abstract math
%%% use $...$ per house convention.
@Article{Zhang:2017:WEM,
author = "Fan Zhang and Ying Zhang and Lu Qin and Wenjie Zhang
and Xuemin Lin",
title = "When engagement meets similarity: efficient $ (k,
r)$-core computation on social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "998--1009",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we investigate the problem of $ (k,
r)$-core which intends to find cohesive subgraphs on
social networks considering both user engagement and
similarity perspectives. In particular, we adopt the
popular concept of $k$-core to guarantee the engagement
of the users (vertices) in a group (subgraph) where
each vertex in a $ (k, r)$-core connects to at least k
other vertices. Meanwhile, we consider the pairwise
similarity among users based on their attributes.
Efficient algorithms are proposed to enumerate all
maximal $ (k, r)$-cores and find the maximum $ (k,
r)$-core, where both problems are shown to be NP-hard.
Effective pruning techniques substantially reduce the
search space of two algorithms. A novel $ (k, k')$-core
based $ (k, r)$-core size upper bound enhances
performance of the maximum $ (k, r)$-core computation.
We also devise effective search orders for two mining
algorithms where search priorities for vertices are
different. Comprehensive experiments on real-life data
demonstrate that the maximal/maximum $ (k, r)$-cores
enable us to find interesting cohesive subgraphs, and
performance of two mining algorithms is effectively
improved by proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it.
@Article{Liu:2017:EEP,
author = "Yiding Liu and Tuan-Anh Nguyen Pham and Gao Cong and
Quan Yuan",
title = "An experimental evaluation of point-of-interest
recommendation in location-based social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1010--1021",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Point-of-interest (POI) recommendation is an important
service to Location-Based Social Networks (LBSNs) that
can benefit both users and businesses. In recent years,
a number of POI recommender systems have been proposed,
but there is still a lack of systematical comparison
thereof. In this paper, we provide an all-around
evaluation of 12 state-of-the-art POI recommendation
models. From the evaluation, we obtain several
important findings, based on which we can better
understand and utilize POI recommendation models in
various scenarios. We anticipate this work to provide
readers with an overall picture of the cutting-edge
research on POI recommendation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. Author accent uses the
%%% brace-protected form {\"u} per house convention.
@Article{Raasveldt:2017:DHM,
author = "Mark Raasveldt and Hannes M{\"u}hleisen",
title = "Don't hold my data hostage: a case for client protocol
redesign",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1022--1033",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Transferring a large amount of data from a database to
a client program is a surprisingly expensive operation.
The time this requires can easily dominate the query
execution time for large result sets. This represents a
significant hurdle for external data analysis, for
example when using statistical software. In this paper,
we explore and analyse the result set serialization
design space. We present experimental results from a
large chunk of the database market and show the
inefficiencies of current approaches. We then propose a
columnar serialization method that improves
transmission performance by an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it. NOTE(review): the system
%%% name appears as Auto-Join in the abstract but lowercase in the title;
%%% confirm before brace-protecting.
@Article{Zhu:2017:AJJ,
author = "Erkang Zhu and Yeye He and Surajit Chaudhuri",
title = "Auto-join: joining tables by leveraging
transformations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1034--1045",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional equi-join relies solely on string equality
comparisons to perform joins. However, in scenarios
such as ad-hoc data analysis in spreadsheets, users
increasingly need to join tables whose join-columns are
from the same semantic domain but use different textual
representations, for which transformations are needed
before equi-join can be performed. We developed
Auto-Join, a system that can automatically search over
a rich space of operators to compose a transformation
program, whose execution makes input tables
equi-join-able. We developed an optimal sampling
strategy that allows Auto-Join to scale to large
datasets efficiently, while ensuring joins succeed with
high probability. Our evaluation using real test cases
collected from both public web tables and proprietary
enterprise tables shows that the proposed system
performs the desired transformation joins efficiently
and with high quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Reviewed: PVLDB 10(10) entry; no DOI field in the source metadata ---
%%% TODO: confirm the assigned DOI and add it.
@Article{Zhang:2017:TSD,
author = "Aoqian Zhang and Shaoxu Song and Jianmin Wang and
Philip S. Yu",
title = "Time series data cleaning: from anomaly detection to
anomaly repairing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1046--1057",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Errors are prevalent in time series data, such as GPS
trajectories or sensor readings. Existing methods focus
more on anomaly detection but not on repairing the
detected anomalies. By simply filtering out the dirty
data via anomaly detection, applications could still be
unreliable over the incomplete time series. Instead of
simply discarding anomalies, we propose to
(iteratively) repair them in time series data, by
creatively bonding the beauty of temporal nature in
anomaly detection with the widely considered minimum
change principle in data repairing. Our major
contributions include: (1) a novel framework of
iterative minimum repairing (IMR) over time series
data, (2) explicit analysis on convergence of the
proposed iterative minimum repairing, and (3) efficient
estimation of parameters in each iteration. Remarkably,
with incremental computation, we reduce the complexity
of parameter estimation from O (n) to O (1).
Experiments on real datasets demonstrate the
superiority of our proposal compared to the
state-of-the-art approaches. In particular, we show
that (the proposed) repairing indeed improves the time
series classification application.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2017:PBM,
author = "Lu Chen and Yunjun Gao and Baihua Zheng and Christian
S. Jensen and Hanyu Yang and Keyu Yang",
title = "Pivot-based metric indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1058--1069",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The general notion of a metric space encompasses a
diverse range of data types and accompanying similarity
measures. Hence, metric search plays an important role
in a wide range of settings, including multimedia
retrieval, data mining, and data integration. With the
aim of accelerating metric search, a collection of
pivot-based indexing techniques for metric data has
been proposed, which reduces the number of potentially
expensive similarity comparisons by exploiting the
triangle inequality for pruning and validation.
However, no comprehensive empirical study of those
techniques exists. Existing studies each offers only a
narrower coverage, and they use different pivot
selection strategies that affect performance
substantially and thus render cross-study comparisons
difficult or impossible. We offer a survey of existing
pivot-based indexing techniques, and report a
comprehensive empirical comparison of their
construction costs, update efficiency, storage sizes,
and similarity search performance. As part of the
study, we provide modifications for two existing
indexing techniques to make them more competitive. The
findings and insights obtained from the study reveal
different strengths and weaknesses of different
indexing techniques, and offer guidance on selecting an
appropriate indexing technique for a given setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guerraoui:2017:HRW,
author = "Rachid Guerraoui and Anne-Marie Kermarrec and Tao Lin
and Rhicheek Patra",
title = "Heterogeneous recommendations: what you might like to
read after watching {Interstellar}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1070--1081",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recommenders, as widely implemented nowadays by major
e-commerce players like Netflix or Amazon, use
collaborative filtering to suggest the most relevant
items to their users. Clearly, the effectiveness of
recommenders depends on the data they can exploit,
i.e., the feedback of users conveying their
preferences, typically based on their past ratings. As
of today, most recommenders are homogeneous in the
sense that they utilize one specific application at a
time. In short, Alice will only get recommended a movie
if she has been rating movies. But what if she has been
only rating books and would like to get recommendations
for a movie? Clearly, the multiplicity of web
applications is calling for heterogeneous recommenders
that could utilize ratings in one application to
provide recommendations in another one. This paper
presents X-Map, a heterogeneous recommender. X-Map
leverages meta-paths between heterogeneous items over
several application domains, based on users who rated
across these domains. These meta-paths are then used in
X-Map to generate, for every user, a profile (AlterEgo)
in a domain where the user might not have rated any
item yet. Not surprisingly, leveraging meta-paths poses
non-trivial issues of (a) meta-path-based inter-item
similarity, in order to enable accurate predictions,
(b) scalability, given the amount of computation
required, as well as (c) privacy, given the need to
aggregate information across multiple applications. We
show in this paper how X-Map addresses the
above-mentioned issues to achieve accuracy, scalability
and differential privacy. In short, X-Map weights the
meta-paths based on several factors to compute
inter-item similarities, and ensures scalability
through a layer-based pruning technique. X-Map
guarantees differential privacy using an exponential
scheme that leverages the meta-path-based similarities
while determining the probability of item selection to
construct the AlterEgos. We present an exhaustive
experimental evaluation of X-Map using real traces from
Amazon. We show that, in terms of accuracy, X-Map
outperforms alternative heterogeneous recommenders and,
in terms of throughput, X-Map achieves a linear speedup
with an increasing number of machines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deng:2017:SEM,
author = "Dong Deng and Albert Kim and Samuel Madden and Michael
Stonebraker",
title = "{SilkMoth}: an efficient method for finding related
sets with maximum matching constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1082--1093",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Determining if two sets are related --- that is, if
they have similar values or if one set contains the
other --- is an important problem with many
applications in data cleaning, data integration, and
information retrieval. For example, set relatedness can
be a useful tool to discover whether columns from two
different databases are joinable; if enough of the
values in the columns match, it may make sense to join
them. A common metric is to measure the relatedness of
two sets by treating the elements as vertices of a
bipartite graph and calculating the score of the
maximum matching pairing between elements. Compared to
other metrics which require exact matchings between
elements, this metric uses a similarity function to
compare elements between the two sets, making it robust
to small dissimilarities in elements and more useful
for real-world, dirty data. Unfortunately, the metric
suffers from expensive computational cost, taking
O(n$^3$) time, where n is the number of elements in the
sets, for each set-to-set comparison. Thus for
applications that try to search for all pairings of
related sets in a brute-force manner, the runtime
becomes unacceptably large. To address this challenge,
we developed SilkMoth, a system capable of rapidly
discovering related set pairs in collections of sets.
Internally, SilkMoth creates a signature for each set,
with the property that any other set which is related
must match the signature. SilkMoth then uses these
signatures to prune the search space, so only sets that
match the signatures are left as candidates. Finally,
SilkMoth applies the maximum matching metric on
remaining candidates to verify which of these
candidates are truly related sets. An important
property of SilkMoth is that it is guaranteed to output
exactly the same related set pairings as the
brute-force method, unlike approximate techniques.
Thus, a contribution of this paper is the
characterization of the space of signatures which
enable this property. We show that selecting the
optimal signature in this space is NP-complete, and
based on insights from the characterization of the
space, we propose two novel filters which help to prune
the candidates further before verification. In
addition, we introduce a simple optimization to the
calculation of the maximum matching metric itself based
on the triangle inequality. Compared to related
approaches, SilkMoth is much more general, handling a
larger space of similarity functions and relatedness
metrics, and is an order of magnitude more efficient on
real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chung:2017:DQM,
author = "Yeounoh Chung and Sanjay Krishnan and Tim Kraska",
title = "A data quality metric {(DQM)}: how to estimate the
number of undetected errors in data sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1094--1105",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data cleaning, whether manual or algorithmic, is
rarely perfect leaving a dataset with an unknown number
of false positives and false negatives after cleaning.
In many scenarios, quantifying the number of remaining
errors is challenging because our data integrity rules
themselves may be incomplete, or the available
gold-standard datasets may be too small to extrapolate.
As the use of inherently fallible crowds becomes more
prevalent in data cleaning problems, it is important to
have estimators to quantify the extent of such errors.
We propose novel species estimators to estimate the
number of distinct remaining errors in a dataset after
it has been cleaned by a set of crowd workers ---
essentially, quantifying the utility of hiring
additional workers to clean the dataset. This problem
requires new estimators that are robust to false
positives and false negatives, and we empirically show
on three real-world datasets that existing species
estimators are unstable for this problem, while our
proposed techniques quickly converge.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Olma:2017:SCT,
author = "Matthaios Olma and Manos Karpathiotakis and Ioannis
Alagiannis and Manos Athanassoulis and Anastasia
Ailamaki",
title = "{Slalom}: coasting through raw data via adaptive
partitioning and indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1106--1117",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The constant flux of data and queries alike has been
pushing the boundaries of data analysis systems. The
increasing size of raw data files has made data loading
an expensive operation that delays the data-to-insight
time. Hence, recent in-situ query processing systems
operate directly over raw data, alleviating the loading
cost. At the same time, analytical workloads have
increasing number of queries. Typically, each query
focuses on a constantly shifting --- yet small ---
range. Minimizing the workload latency, now, requires
the benefits of indexing in in-situ query processing.
In this paper, we present Slalom, an in-situ query
engine that accommodates workload shifts by monitoring
user access patterns. Slalom makes on-the-fly
partitioning and indexing decisions, based on
information collected by lightweight monitoring. Slalom
has two key components: (i) an online partitioning and
indexing scheme, and (ii) a partitioning and indexing
tuner tailored for in-situ query engines. When compared
to the state of the art, Slalom offers performance
benefits by taking into account user query patterns to
(a) logically partition raw data files and (b) build
for each partition lightweight partition-specific
indexes. Due to its lightweight and adaptive nature,
Slalom achieves efficient accesses to raw data with
minimal memory consumption. Our experimentation with
both micro-benchmarks and real-life workloads shows
that Slalom outperforms state-of-the-art in-situ
engines (3--10$ \times $), and achieves comparable
query response times with fully indexed DBMS, offering
much lower ($ \approx 3 \times $) cumulative query
execution times for query workloads with increasing
size and unpredictable access patterns.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:MFJ,
author = "Yinan Li and Nikos R. Katsipoulakis and Badrish
Chandramouli and Jonathan Goldstein and Donald
Kossmann",
title = "{Mison}: a fast {JSON} parser for data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1118--1129",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The growing popularity of the JSON format has fueled
increased interest in loading and processing JSON data
within analytical data processing systems. However, in
many applications, JSON parsing dominates performance
and cost. In this paper, we present a new JSON parser
called Mison that is particularly tailored to this
class of applications, by pushing down both projection
and filter operators of analytical queries into the
parser. To achieve these features, we propose to
deviate from the traditional approach of building
parsers using finite state machines (FSMs). Instead, we
follow a two-level approach that enables the parser to
jump directly to the correct position of a queried
field without having to perform expensive tokenizing
steps to find the field. At the upper level, Mison
speculatively predicts the logical locations of queried
fields based on previously seen patterns in a dataset.
At the lower level, Mison builds structural indices on
JSON data to map logical locations to physical
locations. Unlike all existing FSM-based parsers,
building structural indices converts control flow into
data flow, thereby largely eliminating inherently
unpredictable branches in the program and exploiting
the parallelism available in modern processors. We
experimentally evaluate Mison using representative
real-world JSON datasets and the TPC-H benchmark, and
show that Mison produces significant performance
benefits over the best existing JSON parsers; in some
cases, the performance improvement is over one order of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2017:OBV,
author = "Silu Huang and Liqi Xu and Jialin Liu and Aaron J.
Elmore and Aditya Parameswaran",
title = "{OrpheusDB}: bolt-on versioning for relational
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1130--1141",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data science teams often collaboratively analyze
datasets, generating dataset versions at each stage of
iterative exploration and analysis. There is a pressing
need for a system that can support dataset versioning,
enabling such teams to efficiently store, track, and
query across dataset versions. We introduce OrpheusDB,
a dataset version control system that ``bolts on''
versioning capabilities to a traditional relational
database system, thereby gaining the analytics
capabilities of the database ``for free''. We develop
and evaluate multiple data models for representing
versioned data, as well as a light-weight partitioning
scheme, LyreSplit, to further optimize the models for
reduced query latencies. With LyreSplit, OrpheusDB is
on average $ 10^3 \times $ faster in finding effective
(and better) partitionings than competing approaches,
while also reducing the latency of version retrieval by
up to $ 20 \times $ relative to schemes without
partitioning. LyreSplit can be applied in an online
fashion as new versions are added, alongside an
intelligent migration scheme that reduces migration
time by $ 10 \times $ on average.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Galakatos:2017:RRA,
author = "Alex Galakatos and Andrew Crotty and Emanuel Zgraggen
and Carsten Binnig and Tim Kraska",
title = "Revisiting reuse for approximate query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1142--1153",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Visual data exploration tools allow users to quickly
gather insights from new datasets. As dataset sizes
continue to increase, though, new techniques will be
necessary to maintain the interactivity guarantees that
these tools require. Approximate query processing (AQP)
attempts to tackle this problem and allows systems to
return query results at ``human speed.'' However,
existing AQP techniques start to break down when
confronted with ad hoc queries that target the tails of
the distribution. We therefore present an AQP
formulation that can provide low-error approximate
results at interactive speeds, even for queries over
rare subpopulations. In particular, our formulation
treats query results as random variables in order to
leverage the ample opportunities for result reuse
inherent in interactive data exploration. As part of
our approach, we apply a variety of optimization
techniques that are based on probability theory,
including new query rewrite rules and index structures.
We implemented these techniques in a prototype system
and show that they can achieve interactivity where
alternative approaches cannot.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Orr:2017:PDS,
author = "Laurel Orr and Magdalena Balazinska and Dan Suciu",
title = "Probabilistic database summarization for interactive
data exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "10",
pages = "1154--1165",
month = jun,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 17:12:46 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a probabilistic approach to generate a
small, query-able summary of a dataset for interactive
data exploration. Departing from traditional
summarization techniques, we use the Principle of
Maximum Entropy to generate a probabilistic
representation of the data that can be used to give
approximate query answers. We develop the theoretical
framework and formulation of our probabilistic
representation and show how to use it to answer
queries. We then present solving techniques and give
three critical optimizations to improve preprocessing
time and query accuracy. Lastly, we experimentally
evaluate our work using a 5 GB dataset of flights
within the United States and a 210 GB dataset from an
astronomy particle simulation. While our current work
only supports linear queries, we show that our
technique can successfully answer queries faster than
sampling while introducing, on average, no more error
than sampling and can better distinguish between rare
and nonexistent values.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Oukid:2017:MMT,
author = "Ismail Oukid and Daniel Booss and Adrien Lespinasse
and Wolfgang Lehner and Thomas Willhalm and
Gr{\'e}goire Gomes",
title = "Memory management techniques for large-scale
persistent-main-memory systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1166--1177",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Storage Class Memory (SCM) is a novel class of memory
technologies that promise to revolutionize database
architectures. SCM is byte-addressable and exhibits
latencies similar to those of DRAM, while being
non-volatile. Hence, SCM could replace both main memory
and storage, enabling a novel single-level database
architecture without the traditional I/O bottleneck.
Fail-safe persistent SCM allocation can be considered
conditio sine qua non for enabling this novel
architecture paradigm for database management systems.
In this paper we present PAllocator, a fail-safe
persistent SCM allocator whose design emphasizes high
concurrency and capacity scalability. Contrary to
previous works, PAllocator thoroughly addresses the
important challenge of persistent memory fragmentation
by implementing an efficient defragmentation algorithm.
We show that PAllocator outperforms state-of-the-art
persistent allocators by up to one order of magnitude,
both in operation throughput and recovery time, and
enables up to $ 2.39 \times $ higher operation
throughput on a persistent B-Tree.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shang:2017:TSJ,
author = "Shuo Shang and Lisi Chen and Zhewei Wei and Christian
S. Jensen and Kai Zheng and Panos Kalnis",
title = "Trajectory similarity join in spatial networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1178--1189",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The matching of similar pairs of objects, called
similarity join, is fundamental functionality in data
management. We consider the case of trajectory
similarity join (TS-Join), where the objects are
trajectories of vehicles moving in road networks. Thus,
given two sets of trajectories and a threshold $ \theta
$, the TS-Join returns all pairs of trajectories from
the two sets with similarity above $ \theta $. This
join targets applications such as trajectory
near-duplicate detection, data cleaning, ridesharing
recommendation, and traffic congestion prediction. With
these applications in mind, we provide a purposeful
definition of similarity. To enable efficient TS-Join
processing on large sets of trajectories, we develop
search space pruning techniques and take into account
the parallel processing capabilities of modern
processors. Specifically, we present a two-phase
divide-and-conquer algorithm. For each trajectory, the
algorithm first finds similar trajectories. Then it
merges the results to achieve a final result. The
algorithm exploits an upper bound on the spatiotemporal
similarity and a heuristic scheduling strategy for
search space pruning. The algorithm's per-trajectory
searches are independent of each other and can be
performed in parallel, and the merging has constant
cost. An empirical study with real data offers insight
in the performance of the algorithm and demonstrates
that it is capable of outperforming a well-designed
baseline algorithm by an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rekatsinas:2017:HHD,
author = "Theodoros Rekatsinas and Xu Chu and Ihab F. Ilyas and
Christopher R{\'e}",
title = "{HoloClean}: holistic data repairs with probabilistic
inference",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1190--1201",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce HoloClean, a framework for holistic data
repairing driven by probabilistic inference. HoloClean
unifies qualitative data repairing, which relies on
integrity constraints or external data sources, with
quantitative data repairing methods, which leverage
statistical properties of the input data. Given an
inconsistent dataset as input, HoloClean automatically
generates a probabilistic program that performs data
repairing. Inspired by recent theoretical advances in
probabilistic inference, we introduce a series of
optimizations which ensure that inference over
HoloClean's probabilistic model scales to instances
with millions of tuples. We show that HoloClean finds
data repairs with an average precision of $ \approx $
90\% and an average recall of above $ \approx $ 76\%
across a diverse array of datasets exhibiting different
types of errors. This yields an average F1 improvement
of more than $ 2 \times $ against state-of-the-art
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Istvan:2017:CID,
author = "Zsolt Istv{\'a}n and David Sidler and Gustavo Alonso",
title = "{Caribou}: intelligent distributed storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1202--1213",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ever increasing amount of data being handled in
data centers causes an intrinsic inefficiency: moving
data around is expensive in terms of bandwidth,
latency, and power consumption, especially given the
low computational complexity of many database
operations. In this paper we explore near-data
processing in database engines, i.e., the option of
offloading part of the computation directly to the
storage nodes. We implement our ideas in Caribou, an
intelligent distributed storage layer incorporating
many of the lessons learned while building systems with
specialized hardware. Caribou provides access to
DRAM/NVRAM storage over the network through a simple
key--value store interface, with each storage node
providing high-bandwidth near-data processing at line
rate and fault tolerance through replication. The
result is a highly efficient, distributed, intelligent
data storage that can be used to both boost performance
and reduce power consumption and real estate usage in
the data center thanks to the micro-server architecture
adopted.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2017:TLA,
author = "Lingjiao Chen and Arun Kumar and Jeffrey Naughton and
Jignesh M. Patel",
title = "Towards linear algebra over normalized data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1214--1225",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Providing machine learning (ML) over relational data
is a mainstream requirement for data analytics systems.
While almost all ML tools require the input data to be
presented as a single table, many datasets are
multi-table. This forces data scientists to join those
tables first, which often leads to data redundancy and
runtime waste. Recent works on ``factorized'' ML
mitigate this issue for a few specific ML algorithms by
pushing ML through joins. But their approaches require
a manual rewrite of ML implementations. Such piecemeal
methods create a massive development overhead when
extending such ideas to other ML algorithms. In this
paper, we show that it is possible to mitigate this
overhead by leveraging a popular formal algebra to
represent the computations of many ML algorithms:
linear algebra. We introduce a new logical data type to
represent normalized data and devise a framework of
algebraic rewrite rules to convert a large set of
linear algebra operations over denormalized data into
operations over normalized data. We show how this
enables us to automatically ``factorize'' several
popular ML algorithms, thus unifying and generalizing
several prior works. We prototype our framework in the
popular ML environment R and an industrial R-over-RDBMS
tool. Experiments with both synthetic and real
normalized data show that our framework also yields
significant speed-ups, up to $ 36 \times $ on real
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mehta:2017:CEB,
author = "Parmita Mehta and Sven Dorkenwald and Dongfang Zhao
and Tomer Kaftan and Alvin Cheung and Magdalena
Balazinska and Ariel Rokem and Andrew Connolly and
Jacob Vanderplas and Yusra AlSayyad",
title = "Comparative evaluation of big-data systems on
scientific image analytics workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1226--1237",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Scientific discoveries are increasingly driven by
analyzing large volumes of image data. Many new
libraries and specialized database management systems
(DBMSs) have emerged to support such tasks. It is
unclear how well these systems support real-world image
analysis use cases, and how performant the image
analytics tasks implemented on top of such systems are.
In this paper, we present the first comprehensive
evaluation of large-scale image analysis systems using
two real-world scientific image data processing use
cases. We evaluate five representative systems (SciDB,
Myria, Spark, Dask, and TensorFlow) and find that each
of them has shortcomings that complicate implementation
or hurt performance. Such shortcomings lead to new
research opportunities in making large-scale image
analysis both efficient and easy to use.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aslay:2017:RMI,
author = "Cigdem Aslay and Francesco Bonchi and Laks V. S.
Lakshmanan and Wei Lu",
title = "Revenue maximization in incentivized social
advertising",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1238--1249",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Incentivized social advertising, an emerging marketing
model, provides monetization opportunities not only to
the owners of the social networking platforms but also
to their influential users by offering a ``cut'' on the
advertising revenue. We consider a social network (the
host) that sells ad-engagements to advertisers by
inserting their ads, in the form of promoted posts,
into the feeds of carefully selected ``initial
endorsers'' or seed users: these users receive monetary
incentives in exchange for their endorsements. The
endorsements help propagate the ads to the feeds of
their followers. Whenever any user engages with an ad,
the host is paid some fixed amount by the advertiser,
and the ad further propagates to the feed of her
followers, potentially recursively. In this context,
the problem for the host is to allocate ads to
influential users, taking into account the propensity
of ads for viral propagation, and carefully
apportioning the monetary budget of each of the
advertisers between incentives to influential users and
ad-engagement costs, with the rational goal of
maximizing its own revenue. We show that, taking all
important factors into account, the problem of revenue
maximization in incentivized social advertising
corresponds to the problem of monotone submodular
function maximization, subject to a partition matroid
constraint on the ads-to-seeds allocation, and
submodular knapsack constraints on the advertisers'
budgets. We show that this problem is NP-hard and
devise two greedy algorithms with provable
approximation guarantees, which differ in their
sensitivity to seed user incentive costs. Our
approximation algorithms require repeatedly estimating
the expected marginal gain in revenue as well as in
advertiser payment. By exploiting a connection to the
recent advances made in scalable estimation of expected
influence spread, we devise efficient and scalable
versions of our two greedy algorithms. An extensive
experimental assessment confirms the high quality of
our proposal.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rupprecht:2017:SNA,
author = "Lukas Rupprecht and William Culhane and Peter
Pietzuch",
title = "{SquirrelJoin}: network-aware distributed join
processing with lazy partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1250--1261",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To execute distributed joins in parallel on compute
clusters, systems partition and exchange data records
between workers. With large datasets, workers spend a
considerable amount of time transferring data over the
network. When compute clusters are shared among
multiple applications, workers must compete for network
bandwidth with other applications. These variances in
the available network bandwidth lead to network skew,
which causes straggling workers to prolong the join
completion time. We describe SquirrelJoin, a
distributed join processing technique that uses lazy
partitioning to adapt to transient network skew in
clusters. Workers maintain in-memory lazy partitions to
withhold a subset of records, i.e. not sending them
immediately to other workers for processing. Lazy
partitions are then assigned dynamically to other
workers based on network conditions: each worker takes
periodic throughput measurements to estimate its
completion time, and lazy partitions are allocated as
to minimise the join completion time. We implement
SquirrelJoin as part of the Apache Flink distributed
dataflow framework and show that, under transient
network contention in a shared compute cluster,
SquirrelJoin speeds up join completion times by up to $
2.9 \times $ with only a small, fixed overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rahman:2017:ISE,
author = "Sajjadur Rahman and Maryam Aliakbarpour and Ha Kyung
Kong and Eric Blais and Karrie Karahalios and Aditya
Parameswaran and Ronitt Rubinfeld",
title = "{I}'ve seen ``enough'': incrementally improving
visualizations to support rapid decision making",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1262--1273",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data visualization is an effective mechanism for
identifying trends, insights, and anomalies in data. On
large datasets, however, generating visualizations can
take a long time, delaying the extraction of insights,
hampering decision making, and reducing exploration
time. One solution is to use online sampling-based
schemes to generate visualizations faster while
improving the displayed estimates incrementally,
eventually converging to the exact visualization
computed on the entire data. However, the intermediate
visualizations are approximate, and often fluctuate
drastically, leading to potentially incorrect
decisions. We propose sampling-based incremental
visualization algorithms that reveal the ``salient''
features of the visualization quickly --- with a $ 46
\times $ speedup relative to baselines --- while
minimizing error, thus enabling rapid and error-free
decision making. We demonstrate that these algorithms
are optimal in terms of sample complexity, in that
given the level of interactivity, they generate
approximations that take as few samples as possible. We
have developed the algorithms in the context of an
incremental visualization tool, titled IncVisage, for
trendline and heatmap visualizations. We evaluate the
usability of IncVisage via user studies and demonstrate
that users are able to make effective decisions with
incrementally improving visualizations, especially
compared to vanilla online-sampling based schemes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:MRT,
author = "Lei Li and Wen Hua and Xingzhong Du and Xiaofang
Zhou",
title = "Minimal on-road time route scheduling on
time-dependent graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1274--1285",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "On time-dependent graphs, fastest path query is an
important problem and has been well studied. It focuses
on minimizing the total travel time (waiting time +
on-road time) but does not allow waiting on any
intermediate vertex if the FIFO property is applied.
However, in practice, waiting on a vertex can reduce
the time spent on the road (for example, resuming
traveling after a traffic jam). In this paper, we study
how to find a path with the minimal on-road time on
time-dependent graphs by allowing waiting on some
predefined parking vertices. The existing works are
based on the following fact: the arrival time of a
vertex v is determined by the arrival time of its
in-neighbor u, which does not hold in our scenario
since we also consider the waiting time on u if u
allows waiting. Thus, determining the waiting time on
each parking vertex to achieve the minimal on-road time
becomes a big challenge, which further breaks FIFO
property. To cope with this challenging problem, we
propose two efficient algorithms using minimum on-road
travel cost function to answer the query. The
evaluations on multiple real-world time-dependent
graphs show that the proposed algorithms are more
accurate and efficient than the extensions of existing
algorithms. In addition, the results further indicate,
if the parking facilities are enabled in the route
scheduling algorithms, the on-road time will reduce
significantly compared to the fastest path
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Katsipoulakis:2017:HVS,
author = "Nikos R. Katsipoulakis and Alexandros Labrinidis and
Panos K. Chrysanthis",
title = "A holistic view of stream partitioning costs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1286--1297",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream processing has become the dominant processing
model for monitoring and real-time analytics. Modern
Parallel Stream Processing Engines (pSPEs) have made it
feasible to increase the performance in both monitoring
and analytical queries by parallelizing a query's
execution and distributing the load on multiple
workers. A determining factor for the performance of a
pSPE is the partitioning algorithm used to disseminate
tuples to workers. Until now, partitioning methods in
pSPEs have been similar to the ones used in parallel
databases and only recently load-aware algorithms have
been employed to improve the effectiveness of parallel
execution. We identify and demonstrate the need to
incorporate aggregation costs in the partitioning model
when executing stateful operations in parallel, in
order to minimize the overall latency and/or
throughput. Towards this, we propose new stream
partitioning algorithms, that consider both tuple
imbalance and aggregation cost. We evaluate our
proposed algorithms and show that they can achieve up
to an order of magnitude better performance, compared
to the current state of the art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Akbas:2017:TBC,
author = "Esra Akbas and Peixiang Zhao",
title = "Truss-based community search: a truss-equivalence
based indexing approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1298--1309",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider the community search problem defined upon
a large graph G: given a query vertex q in G, to find
as output all the densely connected subgraphs of G,
each of which contains the query vertex q. As an online,
query-dependent variant of the well-known community
detection problem, community search enables
personalized community discovery that has found widely
varying applications in real-world, large-scale graphs.
In this paper, we study the community search problem in
the truss-based model aimed at discovering all dense
and cohesive k -truss communities to which the query
vertex q belongs. We introduce a novel equivalence
relation, k-truss equivalence, to model the intrinsic
density and cohesiveness of edges in k -truss
communities. Consequently, all the edges of G can be
partitioned to a series of k -truss equivalence classes
that constitute a space-efficient, truss-preserving
index structure, EquiTruss. Community search can be
henceforth addressed directly upon EquiTruss without
repeated, time-demanding accesses to the original
graph, G, which proves to be theoretically optimal. In
addition, EquiTruss can be efficiently updated in a
dynamic fashion when G evolves with edge insertion and
deletion. Experimental studies in real-world,
large-scale graphs validate the efficiency and
effectiveness of EquiTruss, which has achieved at least
an order of magnitude speedup in community search over
the state-of-the-art method, TCP-Index.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cambronero:2017:QOD,
author = "Jos{\'e} Cambronero and John K. Feser and Micah J.
Smith and Samuel Madden",
title = "Query optimization for dynamic imputation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1310--1321",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Missing values are common in data analysis and present
a usability challenge. Users are forced to pick between
removing tuples with missing values or creating a
cleaned version of their data by applying a relatively
expensive imputation strategy. Our system, ImputeDB,
incorporates imputation into a cost-based query
optimizer, performing necessary imputations on-the-fly
for each query. This allows users to immediately
explore their data, while the system picks the optimal
placement of imputation operations. We evaluate this
approach on three real-world survey-based datasets. Our
experiments show that our query plans execute between
10 and 140 times faster than first imputing the base
tables. Furthermore, we show that the query results
from on-the-fly imputation differ from the traditional
base-table imputation approach by 0--8\%. Finally, we
show that while dropping tuples with missing values
that fail query constraints discards 6--78\% of the
data, on-the-fly imputation loses only 0--21\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marchant:2017:SER,
author = "Neil G. Marchant and Benjamin I. P. Rubinstein",
title = "In search of an entity resolution {OASIS}: optimal
asymptotic sequential importance sampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1322--1333",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity resolution (ER) presents unique challenges for
evaluation methodology. While crowdsourcing platforms
acquire ground truth, sound approaches to sampling must
drive labelling efforts. In ER, extreme class imbalance
between matching and non-matching records can lead to
enormous labelling requirements when seeking
statistically consistent estimates for rigorous
evaluation. This paper addresses this important
challenge with the OASIS algorithm: a sampler and
F-measure estimator for ER evaluation. OASIS draws
samples from a (biased) instrumental distribution,
chosen to ensure estimators with optimal asymptotic
variance. As new labels are collected OASIS updates
this instrumental distribution via a Bayesian latent
variable model of the annotator oracle, to quickly
focus on unlabelled items providing more information.
We prove that resulting estimates of F-measure,
precision, recall converge to the true population
values. Thorough comparisons of sampling methods on a
variety of ER datasets demonstrate significant
labelling reductions of up to 83\% without loss to
estimate accuracy.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tong:2017:FOT,
author = "Yongxin Tong and Libin Wang and Zimu Zhou and Bolin
Ding and Lei Chen and Jieping Ye and Ke Xu",
title = "Flexible online task assignment in real-time spatial
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1334--1345",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The popularity of Online To Offline (O2O) service
platforms has spurred the need for online task
assignment in real-time spatial data, where streams of
spatially distributed tasks and workers are matched in
real time such that the total number of assigned pairs
is maximized. Existing online task assignment models
assume that each worker is either assigned a task
immediately or waits for a subsequent task at a fixed
location once she/he appears on the platform. Yet in
practice a worker may actively move around rather than
passively wait in place if no task is assigned. In this
paper, we define a new problem Flexible Two-sided
Online task Assignment (FTOA). FTOA aims to guide idle
workers based on the prediction of tasks and workers so
as to increase the total number of assigned worker-task
pairs. To address the FTOA problem, we face two
challenges: (i) How to generate guidance for idle
workers based on the prediction of the spatiotemporal
distribution of tasks and workers? (ii) How to leverage
the guidance of workers' movements to optimize the
online task assignment? To this end, we propose a novel
two-step framework, which integrates offline prediction
and online task assignment. Specifically, we estimate
the distributions of tasks and workers per time slot
and per unit area, and design an online task assignment
algorithm, Prediction-oriented Online task Assignment
in Real-time spatial data (POLAR-OP). It yields a
0.47-competitive ratio, which is nearly twice better
than that of the state-of-the-art. POLAR-OP also
reduces the time complexity to process each
newly-arrived task/worker to $ O(1) $. We validate the
effectiveness and efficiency of our methods via
extensive experiments on both synthetic datasets and
real-world datasets from a large-scale taxi-calling
platform.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bouros:2017:FSB,
author = "Panagiotis Bouros and Nikos Mamoulis",
title = "A forward scan based plane sweep algorithm for
parallel interval joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1346--1357",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The interval join is a basic operation that finds
application in temporal, spatial, and uncertain
databases. Although a number of centralized and
distributed algorithms have been proposed for the
efficient evaluation of interval joins, classic plane
sweep approaches have not been considered at their full
potential. A recent piece of related work proposes an
optimized approach based on plane sweep (PS) for modern
hardware, showing that it greatly outperforms previous
work. However, this approach depends on the development
of a complex data structure and its parallelization has
not been adequately studied. In this paper, we explore
the applicability of a largely ignored forward scan
(FS) based plane sweep algorithm, which is extremely
simple to implement. We propose two optimizations of FS
that greatly reduce its cost, making it competitive to
the state-of-the-art single-threaded PS algorithm while
achieving a lower memory footprint. In addition, we
show the drawbacks of a previously proposed hash-based
partitioning approach for parallel join processing and
suggest a domain-based partitioning approach that does
not produce duplicate results. Within our approach we
propose a novel breakdown of the partition join jobs
into a small number of independent mini-join jobs with
varying cost and manage to avoid redundant comparisons.
Finally, we show how these mini-joins can be scheduled
in multiple CPU cores and propose an adaptive domain
partitioning, aiming at load balancing. We include an
experimental study that demonstrates the efficiency of
our optimized FS and the scalability of our
parallelization framework.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rong:2017:APA,
author = "Kexin Rong and Peter Bailis",
title = "{ASAP}: prioritizing attention via time series
smoothing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1358--1369",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Time series visualization of streaming telemetry
(i.e., charting of key metrics such as server load over
time) is increasingly prevalent in modern data
platforms and applications. However, many existing
systems simply plot the raw data streams as they
arrive, often obscuring large-scale trends due to
small-scale noise. We propose an alternative: to better
prioritize end users' attention, smooth time series
visualizations as much as possible to remove noise,
while retaining large-scale structure to highlight
significant deviations. We develop a new analytics
operator called ASAP that automatically smooths
streaming time series by adaptively optimizing the
trade-off between noise reduction (i.e., variance) and
trend retention (i.e., kurtosis). We introduce metrics
to quantitatively assess the quality of smoothed plots
and provide an efficient search strategy for optimizing
these metrics that combines techniques from stream
processing, user interface design, and signal
processing via autocorrelation-based pruning,
pixel-aware preaggregation, and on-demand refresh. We
demonstrate that ASAP can improve users' accuracy in
identifying long-term deviations in time series by up
to 38.4\% while reducing response times by up to
44.3\%. Moreover, ASAP delivers these results several
orders of magnitude faster than alternative search
strategies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:KVL,
author = "Furong Li and Xin Luna Dong and Anno Langen and Yang
Li",
title = "Knowledge verification for long-tail verticals",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1370--1381",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Collecting structured knowledge for real-world
entities has become a critical task for many
applications. A big gap between the knowledge in
existing knowledge repositories and the knowledge in
the real world is the knowledge on tail verticals
(i.e., less popular domains). Such knowledge, though
not necessarily globally popular, can be personal
hobbies to many people and thus collectively impactful.
This paper studies the problem of knowledge
verification for tail verticals; that is, deciding the
correctness of a given triple. Through comprehensive
experimental study we answer the following questions.
(1) Can we find evidence for tail knowledge from an
extensive set of sources, including knowledge bases,
the web, and query logs? (2) Can we judge correctness
of the triples based on the collected evidence? (3) How
can we further improve knowledge verification on tail
verticals? Our empirical study suggests a new
knowledge-verification framework, which we call Facty,
that applies various kinds of evidence collection
techniques followed by knowledge fusion. Facty can
verify 50\% of the (correct) tail knowledge with a
precision of 84\%, and it significantly outperforms
state-of-the-art methods. Detailed error analysis on
the obtained results suggests future research
directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pande:2017:SRR,
author = "Shiladitya Pande and Sayan Ranu and Arnab
Bhattacharya",
title = "{SkyGraph}: retrieving regions of interest using
skyline subgraph queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1382--1393",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Several services today are annotated with points of
interest (PoIs) such as ``coffee shop'', ``park'', etc.
A region of interest (RoI) is a neighborhood that
contains PoIs relevant to the user. In this paper, we
study the scenario where a user wants to identify the
best RoI in a city. The user expresses relevance
through a set of keywords denoting PoIs. Ideally, the
RoI should be small enough in size such that the user
can conveniently explore the PoIs. On the other hand,
it should be as relevant as possible. How does one
balance the importance of size versus relevance? To a
user exploring the RoI on foot, size is more critical.
However, for a user equipped with a vehicle, relevance
is a more important factor. In this paper, we solve
this dilemma through skyline subgraph queries on
keyword-embedded road networks. Skyline subgraphs
subsume the choice of optimization function for an RoI
since the optimal RoI for any rational user is
necessarily a part of the skyline set. Our analysis
reveals that the problem of computing the skyline set
is NP-hard. We overcome the computational bottleneck by
proposing a polynomial-time approximation algorithm
called SkyGraph. To further expedite the running time,
we develop an index structure, Partner Index, that
drastically prunes the search space and provides up to
3 orders of magnitude speed-up on real road networks
over the baseline approach. The datasets and
executables are available at
http://www.cse.iitd.ac.in/~sayan/software.html.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tan:2017:REA,
author = "Wei Chit Tan and Meihui Zhang and Hazem Elmeleegy and
Divesh Srivastava",
title = "Reverse engineering aggregation queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1394--1405",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query reverse engineering seeks to re-generate the SQL
query that produced a given query output table from a
given database. In this paper, we solve this problem
for OLAP queries with group-by and aggregation. We
develop a novel three-phase algorithm named REGAL
for this problem. First, based on a lattice graph
structure, we identify a set of group-by candidates for
the desired query. Second, we apply a set of
aggregation constraints that are derived from the
properties of aggregate operators at both the
table-level and the group-level to discover candidate
combinations of group-by columns and aggregations that
are consistent with the given query output table.
Finally, we find a multi-dimensional filter, i.e., a
conjunction of selection predicates over the base table
attributes, that is needed to generate the exact query
output table. We conduct an extensive experimental
study over the TPC-H dataset to demonstrate the
effectiveness and efficiency of our proposal.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yut:2017:LRL,
author = "Lele Yu and Ce Zhang and Yingxia Shao and Bin Cui",
title = "{LDA*}: a robust and large-scale topic modeling
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1406--1417",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present LDA*, a system that has been deployed in
one of the largest Internet companies to fulfil their
requirements of ``topic modeling as an internal
service'' --- relying on thousands of machines,
engineers in different sectors submit their data, some
are as large as 1.8TB, to LDA* and get results back in
hours. LDA* is motivated by the observation that none
of the existing topic modeling systems is robust enough
--- Each of these existing systems is designed for a
specific point in the tradeoff space that can be
sub-optimal, sometimes by up to $ 10 \times $, across
workloads. Our first contribution is a systematic study
of all recently proposed samplers: AliasLDA, F+LDA,
LightLDA, and WarpLDA. We discovered a novel system
tradeoff among these samplers. Each sampler has
different sampling complexity and performs differently,
sometimes by $ 5 \times $, on documents with different
lengths. Based on this tradeoff, we further developed a
hybrid sampler that uses different samplers for
different types of documents. This hybrid approach
works across a wide range of workloads and outperforms
the fastest sampler by up to $ 2 \times $. We then
focused on distributed environments in which thousands
of workers, each with different performance (due to
virtualization and resource sharing), coordinate to
train a topic model. Our second contribution is an
asymmetric parameter server architecture that pushes
some computation to the parameter server side. This
architecture is motivated by the skew of the word
frequency distribution and a novel tradeoff we
discovered between communication and computation. With
this architecture, we outperform the traditional,
symmetric architecture by up to $ 2 \times $. With
these two contributions, together with a carefully
engineered implementation, our system is able to
outperform existing systems by up to $ 10 \times $ and
has already been running to provide topic modeling
services for more than six months.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kabiljo:2017:SHP,
author = "Igor Kabiljo and Brian Karrer and Mayank Pundir and
Sergey Pupyrev and Alon Shalita",
title = "Social hash partitioner: a scalable distributed
hypergraph partitioner",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1418--1429",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We design and implement a distributed algorithm for
balanced $k$-way hypergraph partitioning that minimizes
fanout, a fundamental hypergraph quantity also known as
the communication volume and $ (k - 1)$-cut metric, by
optimizing a novel objective called probabilistic
fanout. This choice allows a simple local search
heuristic to achieve comparable solution quality to the
best existing hypergraph partitioners. Our algorithm is
arbitrarily scalable due to a careful design that
controls computational complexity, space complexity,
and communication. In practice, we commonly process
hypergraphs with billions of vertices and hyperedges in
a few hours. We explain how the algorithm's
scalability, both in terms of hypergraph size and
bucket count, is limited only by the number of machines
available. We perform an extensive comparison to
existing distributed hypergraph partitioners and find
that our approach is able to optimize hypergraphs
roughly 100 times bigger on the same set of machines.
We call the resulting tool Social Hash Partitioner, and
accompanying this paper, we open-source the most
scalable version based on recursive bisection.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Graph Priority Sampling for massive graph streams, PVLDB 10(11).
%%% NOTE(review): no DOI recorded for this 10(11) entry; TODO: confirm
%%% via the ACM Digital Library. The lowercase "gps" in the abstract
%%% mirrors the paper's own typography --- leave as is.
@Article{Ahmed:2017:SMG,
author = "Nesreen K. Ahmed and Nick Duffield and Theodore L.
Willke and Ryan A. Rossi",
title = "On sampling from massive graph streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1430--1441",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose Graph Priority Sampling (gps), a new
paradigm for order-based reservoir sampling from
massive graph streams. gps provides a general way to
weight edge sampling according to auxiliary and/or size
variables so as to accomplish various estimation goals
of graph properties. In the context of subgraph
counting, we show how edge sampling weights can be
chosen so as to minimize the estimation variance of
counts of specified sets of subgraphs. In distinction
with many prior graph sampling schemes, gps separates
the functions of edge sampling and subgraph estimation.
We propose two estimation frameworks: (1) Post-Stream
estimation, to allow gps to construct a reference
sample of edges to support retrospective graph queries,
and (2) In-Stream estimation, to allow gps to obtain
lower variance estimates by incrementally updating the
subgraph count estimates during stream processing.
Unbiasedness of subgraph estimators is established
through a new Martingale formulation of graph stream
order sampling, in which subgraph estimators, written
as a product of constituent edge estimators, are
unbiased, even when computed at different points in the
stream. The separation of estimation and sampling
enables significant resource savings relative to
previous work. We illustrate our framework with
applications to triangle and wedge counting. We perform
a large-scale experimental study on real-world graphs
from various domains and types. gps achieves high
accuracy with < 1\% error for triangle and wedge
counting, while storing a small fraction of the graph
with average update times of a few microseconds per
edge. Notably, for billion-scale graphs, gps accurately
estimates triangle and wedge counts with < 1\% error,
while storing a small fraction of < 0.01\% of the total
edges in the graph.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Pyramid sketch framework for stream frequency estimation, PVLDB
%%% 10(11). NOTE(review): no DOI recorded --- confirm via the ACM DL.
%%% The trailing "[1]" in the abstract is a dangling citation marker
%%% carried over from the paper's own abstract text.
@Article{Yang:2017:PSS,
author = "Tong Yang and Yang Zhou and Hao Jin and Shigang Chen
and Xiaoming Li",
title = "Pyramid sketch: a sketch framework for frequency
estimation of data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1442--1453",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Sketch is a probabilistic data structure, and is used
to store and query the frequency of any item in a given
multiset. Due to its high memory efficiency, it has
been applied to various fields in computer science,
such as stream database, network traffic measurement,
etc. The key metrics of sketches for data streams are
accuracy, speed, and memory usage. Various sketches
have been proposed, but they cannot achieve both high
accuracy and high speed using limited memory,
especially for skewed datasets. To address this issue,
we propose a sketch framework, the Pyramid sketch,
which can significantly improve accuracy as well as
update and query speed. To verify the effectiveness and
efficiency of our framework, we applied our framework
to four typical sketches. Extensive experimental
results show that the accuracy is improved up to 3.50
times, while the speed is improved up to 2.10 times. We
have released our source codes at Github [1].",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Restricted skyline (R-skyline) queries unifying skyline and ranking,
%%% PVLDB 10(11). NOTE(review): no DOI recorded for this 10(11) entry;
%%% TODO: confirm via the ACM Digital Library.
@Article{Ciaccia:2017:RSR,
author = "Paolo Ciaccia and Davide Martinenghi",
title = "Reconciling skyline and ranking queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1454--1465",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditionally, skyline and ranking queries have been
treated separately as alternative ways of discovering
interesting data in potentially large datasets. While
ranking queries adopt a specific scoring function to
rank tuples, skyline queries return the set of
non-dominated tuples and are independent of attribute
scales and scoring functions. Ranking queries are thus
less general, but usually cheaper to compute and widely
used in data management systems. We propose a framework
to seamlessly integrate these two approaches by
introducing the notion of restricted skyline queries
(R-skylines). We propose R-skyline operators that
generalize both skyline and ranking queries by applying
the notion of dominance to a set of scoring functions
of interest. Such sets can be characterized, e.g., by
imposing constraints on the function's parameters, such
as the weights in a linear scoring function. We discuss
the formal properties of these new operators, show how
to implement them efficiently, and evaluate them on
both synthetic and real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% CleanM data-cleaning query language, PVLDB 10(11). NOTE(review):
%%% restored the hyphen in "scale-out" in the abstract (a text-extraction
%%% artifact; the entry's own title field spells it "scale-out"). No DOI
%%% recorded --- TODO: confirm via the ACM Digital Library.
@Article{Giannakopoulou:2017:COQ,
author = "Stella Giannakopoulou and Manos Karpathiotakis and
Benjamin Gaidioz and Anastasia Ailamaki",
title = "{CleanM}: an optimizable query language for unified
scale-out data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1466--1477",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data cleaning has become an indispensable part of data
analysis due to the increasing amount of dirty data.
Data scientists spend most of their time preparing
dirty data before it can be used for data analysis. At
the same time, the existing tools that attempt to
automate the data cleaning procedure typically focus on
a specific use case and operation. Still, even such
specialized tools exhibit long running times or fail to
process large datasets. Therefore, from a user's
perspective, one is forced to use a different,
potentially inefficient tool for each category of
errors. This paper addresses the coverage and
efficiency problems of data cleaning. It introduces
CleanM (pronounced clean'em), a language which can
express multiple types of cleaning operations. CleanM
goes through a three-level translation process for
optimization purposes; a different family of
optimizations is applied in each abstraction level.
Thus, CleanM can express complex data cleaning tasks,
optimize them in a unified way, and deploy them in a
scale-out fashion. We validate the applicability of
CleanM by using it on top of CleanDB, a newly designed
and implemented framework which can query heterogeneous
data. When compared to existing data cleaning
solutions, CleanDB (a) covers more data corruption
cases, (b) scales better, and can handle cases for
which its competitors are unable to terminate, and (c)
uses a single interface for querying and for data
cleaning.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Distributed trajectory similarity search on Spark, PVLDB 10(11).
%%% NOTE(review): restored the dropped word "and" in the abstract
%%% ("both the Hausdorff distance and the Fr{\'e}chet distance" ---
%%% "both X the Y" is ungrammatical and "both" requires the
%%% conjunction). No DOI recorded --- TODO: confirm via the ACM DL.
@Article{Xie:2017:DTS,
author = "Dong Xie and Feifei Li and Jeff M. Phillips",
title = "Distributed trajectory similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1478--1489",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Mobile and sensing devices have already become
ubiquitous. They have made tracking moving objects an
easy task. As a result, mobile applications like Uber
and many IoT projects have generated massive amounts of
trajectory data that can no longer be processed by a
single machine efficiently. Among the typical query
operations over trajectories, similarity search is a
common yet expensive operator in querying trajectory
data. It is useful for applications in different
domains such as traffic and transportation
optimizations, weather forecast and modeling, and
sports analytics. It is also a fundamental operator for
many important mining operations such as clustering and
classification of trajectories. In this paper, we
propose a distributed query framework to process
trajectory similarity search over a large set of
trajectories. We have implemented the proposed
framework in Spark, a popular distributed data
processing engine, by carefully considering different
design choices. Our query framework supports both the
Hausdorff distance and the Fr{\'e}chet distance. Extensive
experiments have demonstrated the excellent scalability
and query efficiency achieved by our design, compared
to other methods and design alternatives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Runtime map-side vs. reduce-side join placement via ski-rental,
%%% PVLDB 10(11). NOTE(review): no DOI recorded for this 10(11) entry;
%%% TODO: confirm via the ACM Digital Library.
@Article{Chandra:2017:ROJ,
author = "Bikash Chandra and S. Sudarshan",
title = "Runtime optimization of join location in parallel data
management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1490--1501",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Applications running on parallel systems often need to
join a streaming relation or a stored relation with
data indexed in a parallel data storage system. Some
applications also compute UDFs on the joined tuples.
The join can be done at the data storage nodes,
corresponding to reduce side joins, or by fetching data
from the storage system to compute nodes, corresponding
to map side join. Both may be suboptimal: reduce side
joins may cause skew, while map side joins may lead to
a lot of data being transferred and replicated. In this
paper, we present techniques to make runtime decisions
between the two options on a per key basis, in order to
improve the throughput of the join, accounting for UDF
computation if any. Our techniques are based on an
extended ski-rental algorithm and provide worst-case
performance guarantees with respect to the optimal
point in the space considered by us. Our techniques use
load balancing taking into account the CPU, network and
I/O costs as well as the load on compute and storage
nodes. We have implemented our techniques on Hadoop,
Spark and the Muppet stream processing engine. Our
experiments show that our optimization techniques
provide a significant improvement in throughput over
existing techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Stitching small web tables to improve schema matching (T2K Match,
%%% COMA), PVLDB 10(11). NOTE(review): no DOI recorded for this 10(11)
%%% entry; TODO: confirm via the ACM Digital Library.
@Article{Lehmberg:2017:SWT,
author = "Oliver Lehmberg and Christian Bizer",
title = "Stitching web tables for improving matching quality",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1502--1513",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "HTML tables on web pages (``web tables'') cover a wide
variety of topics. Data from web tables can thus be
useful for tasks such as knowledge base completion or
ad hoc table extension. Before table data can be used
for these tasks, the tables must be matched to the
respective knowledge base or base table. The challenges
of web table matching are the high heterogeneity and
the small size of the tables. Though it is known that
the majority of web tables are very small, the gold
standards that are used to compare web table matching
systems mostly consist of larger tables. In this
experimental paper, we evaluate T2K Match, a web table
to knowledge base matching system, and COMA, a standard
schema matching tool, using a sample of web tables that
is more realistic than the gold standards that were
previously used. We find that both systems fail to
produce correct results for many of the very small
tables in the sample. As a remedy, we propose to stitch
(combine) the tables from each web site into larger
ones and match these enlarged tables to the knowledge
base or base table afterwards. For this stitching
process, we evaluate different schema matching methods
in combination with holistic correspondence refinement.
Limiting the stitching procedure to web tables from the
same web site decreases the heterogeneity and allows us
to stitch tables with very high precision. Our
experiments show that applying table stitching before
running the actual matching method improves the
matching results by 0.38 in F1-measure for T2K Match
and by 0.14 for COMA. Also, stitching the tables allows
us to reduce the amount of tables in our corpus from 5
million original web tables to as few as 100,000
stitched tables.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% DigitHist multi-dimensional histogram summary, PVLDB 10(11).
%%% NOTE(review): removed the stray space in "u -error" in the abstract
%%% (a text-extraction artifact splitting the hyphenated term
%%% "u-error"). No DOI recorded --- TODO: confirm via the ACM DL.
@Article{Shekelyan:2017:DHB,
author = "Michael Shekelyan and Anton Dign{\"o}s and Johann
Gamper",
title = "{DigitHist}: a histogram-based data summary with tight
error bounds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1514--1525",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose DigitHist, a histogram summary for
selectivity estimation on multi-dimensional data with
tight error bounds. By combining multi-dimensional and
one-dimensional histograms along regular grids of
different resolutions, DigitHist provides an accurate
and reliable histogram approach for multi-dimensional
data. To achieve a compact summary, we use a sparse
representation combined with a novel histogram
compression technique that chooses a higher resolution
in dense regions and a lower resolution elsewhere. For
the construction of DigitHist, we propose a new error
measure, termed u-error, which minimizes the width
between the guaranteed upper and lower bounds of the
selectivity estimate. The construction algorithm
performs a single data scan and has linear time
complexity. An in-depth experimental evaluation shows
that DigitHist delivers superior precision and error
bounds than state-of-the-art competitors at a
comparable query time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% TellStore: fast scans on key--value stores, PVLDB 10(11).
%%% NOTE(review): no DOI recorded for this 10(11) entry; TODO: confirm
%%% via the ACM Digital Library.
@Article{Pilman:2017:FSK,
author = "Markus Pilman and Kevin Bocksrocker and Lucas Braun
and Renato Marroqu{\'\i}n and Donald Kossmann",
title = "Fast scans on key--value stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1526--1537",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Key-Value Stores (KVS) are becoming increasingly
popular because they scale up and down elastically,
sustain high throughputs for get/put workloads and have
low latencies. KVS owe these advantages to their
simplicity. This simplicity, however, comes at a cost:
It is expensive to process complex, analytical queries
on top of a KVS because today's generation of KVS does
not support an efficient way to scan the data. The
problem is that there are conflicting goals when
designing a KVS for analytical queries and for simple
get/put workloads: Analytical queries require high
locality and a compact representation of data whereas
elastic get/put workloads require sparse indexes. This
paper shows that it is possible to have it all, with
reasonable compromises. We studied the KVS design space
and built TellStore, a distributed KVS, that performs
almost as well as state-of-the-art KVS for get/put
workloads and orders of magnitude better for analytical
and mixed workloads. This paper presents the results of
comprehensive experiments with an extended version of
the YCSB benchmark and a workload from the
telecommunication industry.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Randomized maximum-clique algorithm (RMC) for massive graphs, PVLDB
%%% 10(11). NOTE(review): the "[EQUATION]" placeholders in the abstract
%%% are artifacts of the ACM text extraction; the lost formulas cannot
%%% be reconstructed from this record --- restore them from the paper.
%%% No DOI recorded --- TODO: confirm via the ACM Digital Library.
@Article{Lu:2017:FMC,
author = "Can Lu and Jeffrey Xu Yu and Hao Wei and Yikai Zhang",
title = "Finding the maximum clique in massive graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1538--1549",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cliques refer to subgraphs in an undirected graph such
that vertices in each subgraph are pairwise adjacent.
The maximum clique problem, to find the clique with
most vertices in a given graph, has been extensively
studied. Besides its theoretical value as an NP-hard
problem, the maximum clique problem is known to have
direct applications in various fields, such as
community search in social networks and social media,
team formation in expert networks, gene expression and
motif discovery in bioinformatics and anomaly detection
in complex networks, revealing the structure and
function of networks. However, algorithms designed for
the maximum clique problem are expensive to deal with
real-world networks. In this paper, we devise a
randomized algorithm for the maximum clique problem.
Different from previous algorithms that search from
each vertex one after another, our approach RMC, for
the randomized maximum clique problem, employs a binary
search while maintaining a lower bound $ \omega_c $ and
an upper bound [EQUATION] of $ \omega (G) $. In each
iteration, RMC attempts to find a $ \omega_t $ -clique
where [EQUATION]. As finding $ \omega_t $ in each
iteration is NP-complete, we extract a seed set S such
that the problem of finding a $ \omega_t$-clique in G
is equivalent to finding a $ \omega_t$-clique in S with
probability guarantees $ (\geq 1 - n^{-c})$. We propose
a novel iterative algorithm to determine the maximum
clique by searching a $k$-clique in $S$ starting from $
k = \omega_c + 1$ until $S$ becomes [EQUATION], when
more iterations benefit marginally. As confirmed by the
experiments, our approach is much more efficient and
robust than previous solutions and can always find the
exact maximum clique.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Privacy-preserving network provenance via structured encryption,
%%% PVLDB 10(11); also indexed in cryptography2010.bib. NOTE(review):
%%% no DOI recorded --- TODO: confirm via the ACM Digital Library.
@Article{Zhang:2017:PPN,
author = "Yuankai Zhang and Adam O'Neill and Micah Sherr and
Wenchao Zhou",
title = "Privacy-preserving network provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1550--1561",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Network accountability, forensic analysis, and failure
diagnosis are becoming increasingly important for
network management and security. Network provenance
significantly aids network administrators in these
tasks by explaining system behavior and revealing the
dependencies between system states. Although
resourceful, network provenance can sometimes be too
rich, revealing potentially sensitive information that
was involved in system execution. In this paper, we
propose a cryptographic approach to preserve the
confidentiality of provenance (sub)graphs while
allowing users to query and access the parts of the
graph for which they are authorized. Our proposed
solution is a novel application of searchable symmetric
encryption (SSE) and more generally structured
encryption (SE). Our SE-enabled provenance system
allows a node to enforce access control policies over
its provenance data even after the data has been
shipped to remote nodes (e.g., for optimization
purposes). We present a prototype of our design and
demonstrate its practicality, scalability, and
efficiency for both provenance maintenance and
querying.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Bayesian/Kalman truth discovery for crowdsourced spatio-temporal
%%% events, PVLDB 10(11). NOTE(review): no DOI recorded for this 10(11)
%%% entry; TODO: confirm via the ACM Digital Library.
@Article{Garcia-Ulloa:2017:TDS,
author = "Daniel A. Garcia-Ulloa and Li Xiong and Vaidy
Sunderam",
title = "Truth discovery for spatio-temporal events from
crowdsourced data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1562--1573",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "One of the greatest challenges in spatial
crowdsourcing is determining the veracity of reports
from multiple users about a particular event or
phenomenon. In this paper, we address the difficulties
of truth discovery in spatio-temporal tasks and present
a new method based on recursive Bayesian estimation
(BE) from multiple reports of users. Our method
incorporates a reliability model for users, which
improves as more reports arrive while increasing the
accuracy of the model in labeling the state of the
event. The model is further improved by Kalman
estimation (BE+KE) that models the spatio-temporal
correlations of the events and predicts the next state
of an event and is corrected when new reports arrive.
The methods are tested in a simulated environment, as
well as using real-world data. Experimental results
show that our methods are adaptable to the available
data, can incorporate previous beliefs, and outperform
existing truth discovery methods of spatio-temporal
events.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Data vocalization: optimizing voice output of relational data,
%%% PVLDB 10(11). NOTE(review): no DOI recorded for this 10(11) entry;
%%% TODO: confirm via the ACM Digital Library.
@Article{Trummer:2017:DVO,
author = "Immanuel Trummer and Jiancheng Zhu and Mark Bryan",
title = "Data vocalization: optimizing voice output of
relational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1574--1585",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Research on data visualization aims at finding the
best way to present data via visual interfaces. We
introduce the complementary problem of ``data
vocalization''. Our goal is to present relational data
in the most efficient way via voice output. This
problem setting is motivated by emerging tools and
devices (e.g., Google Home, Amazon Echo, Apple's Siri,
or voice-based SQL interfaces) that communicate data
primarily via audio output to their users. We treat
voice output generation as an optimization problem. The
goal is to minimize speaking time while transmitting an
approximation of a relational table to the user. We
consider constraints on the precision of the
transmitted data as well as on the cognitive load
placed on the listener. We formalize voice output
optimization and show that it is NP-hard. We present
three approaches to solve that problem. First, we show
how the problem can be translated into an integer
linear program which enables us to apply corresponding
solvers. Second, we present a two-phase approach that
forms groups of similar rows in a pre-processing step,
using a variant of the apriori algorithm. Then, we
select an optimal combination of groups to generate a
speech. Finally, we present a greedy algorithm that
runs in polynomial time. Under simplifying assumptions,
we prove that it generates near-optimal output by
leveraging the sub-modularity property of our cost
function. We compare our algorithms experimentally and
analyze their complexity.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% NoScope: cascaded-model optimization of neural network video
%%% queries, PVLDB 10(11). NOTE(review): no DOI recorded for this
%%% 10(11) entry; TODO: confirm via the ACM Digital Library.
@Article{Kang:2017:NON,
author = "Daniel Kang and John Emmons and Firas Abuzaid and
Peter Bailis and Matei Zaharia",
title = "{NoScope}: optimizing neural network queries over
video at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "11",
pages = "1586--1597",
month = aug,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Sep 5 16:07:00 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent advances in computer vision---in the form of
deep neural networks---have made it possible to query
increasing volumes of video data with high accuracy.
However, neural network inference is computationally
expensive at scale: applying a state-of-the-art object
detector in real time (i.e., 30+ frames per second) to
a single video requires a \$4000 GPU. In response, we
present NoScope, a system for querying videos that can
reduce the cost of neural network video analysis by up
to three orders of magnitude via inference-optimized
model search. Given a target video, object to detect,
and reference neural network, NoScope automatically
searches for and trains a sequence, or cascade, of
models that preserves the accuracy of the reference
network but is specialized to the target video and are
therefore far less computationally expensive. NoScope
cascades two types of models: specialized models that
forego the full generality of the reference model but
faithfully mimic its behavior for the target video and
object; and difference detectors that highlight
temporal differences across frames. We show that the
optimal cascade architecture differs across videos and
objects, so NoScope uses an efficient cost-based
optimizer to search across models and cascades. With
this approach, NoScope achieves two to three order of
magnitude speed-ups (265--15,500$ \times $ real-time)
on binary classification tasks over fixed-angle webcam
and surveillance video while maintaining accuracy
within 1--5\% of state-of-the-art neural networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Asynchronous Parallel Table Replication (ATR) across row/column
%%% formats in SAP HANA, PVLDB 10(12). Complete record (includes DOI).
@Article{Lee:2017:PRA,
author = "Juchang Lee and SeungHyun Moon and Kyu Hwan Kim and
Deok Hoe Kim and Sang Kyun Cha and Wook-Shin Han",
title = "Parallel replication across formats in {SAP HANA} for
scaling out mixed {OLTP\slash OLAP} workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1598--1609",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137767",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern in-memory database systems are facing the need
of efficiently supporting mixed workloads of OLTP and
OLAP. A conventional approach to this requirement is to
rely on ETL-style, application-driven data replication
between two very different OLTP and OLAP systems,
sacrificing real-time reporting on operational data. An
alternative approach is to run OLTP and OLAP workloads
in a single machine, which eventually limits the
maximum scalability of OLAP query performance. In order
to tackle this challenging problem, we propose a novel
database replication architecture called Asynchronous
Parallel Table Replication (ATR). ATR supports OLTP
workloads in one primary machine, while it supports
heavy OLAP workloads in replicas. Here, row-store
formats can be used for OLTP transactions at the
primary, while column-store formats are used for OLAP
analytical queries at the replicas. ATR is designed to
support elastic scalability of OLAP query performance
while it minimizes the overhead for transaction
processing at the primary and minimizes CPU consumption
for replayed transactions at the replicas. ATR employs
a novel optimistic lock-free parallel log replay scheme
which exploits characteristics of multi-version
concurrency control (MVCC) in order to enable real-time
reporting by minimizing the propagation delay between
the primary and replicas. Through extensive experiments
with a concrete implementation available in a
commercial database system, we demonstrate that ATR
achieves sub-second visibility delay even for
update-intensive workloads, providing scalable OLAP
performance without notable overhead to the primary.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Low-dimensional patient class profiles from respiration-induced
%%% tumor motion signals, PVLDB 10(12). Complete record (includes DOI);
%%% the "[1]" in the abstract is the paper's own citation marker.
@Article{Shamsuddin:2017:DLD,
author = "Rittika Shamsuddin and Amit Sawant and Balakrishnan
Prabhakaran",
title = "Developing a low dimensional patient class profile in
accordance to their respiration-induced tumor motion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1610--1621",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137768",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tumor location displacement caused by
respiration-induced motion reduces the efficacy of
radiation therapy. Three medically relevant patterns
are often observed in the respiration-induced motion
signal: baseline shift, ES-Range shift, and D-Range
shift. In this paper, for patients with lower body
cancer, we develop class profiles (a low dimensional
pattern frequency structure) that characterize them in
terms of these three medically relevant patterns. We
propose an adaptive segmentation technique that turns
each respiration-induced motion signal into a multi-set
of segments based on persistent variations within the
signal. These multi-sets of segments is then probed for
base behaviors. These base behaviors are then used to
develop the group/class profiles using a modified
version of the clustering technique described in [1].
Finally, via quantitative analysis, we provide a
medical characterization for the class profiles, which
can be used to explore breathing intervention
technique. We show that, with (i) carefully designed
feature sets, (ii) the proposed adaptive segmentation
technique, (iii) the reasonable modifications to an
existing clustering algorithm for multi-sets, and (iv)
the proposed medical characterization methodology, it
is possible to reduce the time series
respiration-induced motion signals into a compact class
profile. One of our co-authors is a medical physician
and we used his expert opinion to verify the results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ziauddin:2017:DBD,
author = "Mohamed Ziauddin and Andrew Witkowski and You Jung Kim
and Dmitry Potapov and Janaki Lahorani and Murali
Krishna",
title = "Dimensions based data clustering and zone maps",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1622--1633",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137769",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, the data warehouse industry has
witnessed decreased use of indexing but increased use
of compression and clustering of data facilitating
efficient data access and data pruning in the query
processing area. A classic example of data pruning is
the partition pruning, which is used when table data is
range or list partitioned. But lately, techniques have
been developed to prune data at a lower granularity
than a table partition or sub-partition. A good example
is the use of data pruning structure called zone map. A
zone map prunes zones of data from a table on which it
is defined. Data pruning via zone map is very effective
when the table data is clustered by the filtering
columns. The database industry has offered support to
cluster data in tables by its local columns, and to
define zone maps on clustering columns of such tables.
This has helped improve the performance of queries that
contain filter predicates on local columns. However,
queries in data warehouses are typically based on
star/snowflake schema with filter predicates usually on
columns of the dimension tables joined to a fact table.
Given this, the performance of data warehouse queries
can be significantly improved if the fact table data is
clustered by columns of dimension tables together with
zone maps that maintain min/max value ranges of these
clustering columns over zones of fact table data. In
recognition of this opportunity of significantly
improving the performance of data warehouse queries,
Oracle 12c release 1 has introduced the support for
dimension based clustering of fact tables together with
data pruning of the fact tables via dimension based
zone maps.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Noghabi:2017:SSS,
author = "Shadi A. Noghabi and Kartik Paramasivam and Yi Pan and
Navina Ramesh and Jon Bringhurst and Indranil Gupta and
Roy H. Campbell",
title = "{Samza}: stateful scalable stream processing at
{LinkedIn}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1634--1645",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137770",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed stream processing systems need to support
stateful processing, recover quickly from failures to
resume such processing, and reprocess an entire data
stream quickly. We present Apache Samza, a distributed
system for stateful and fault-tolerant stream
processing. Samza utilizes a partitioned local state
along with a low-overhead background changelog
mechanism, allowing it to scale to massive state sizes
(hundreds of TB) per application. Recovery from
failures is sped up by re-scheduling based on Host
Affinity. In addition to processing infinite streams of
events, Samza supports processing a finite dataset as a
stream, from either a streaming source (e.g., Kafka), a
database snapshot (e.g., Databus), or a file system
(e.g. HDFS), without having to change the application
code (unlike the popular Lambda-based architectures
which necessitate maintenance of separate code bases
for batch and stream path processing). Samza is
currently in use at LinkedIn by hundreds of production
                 applications with more than 10,000 containers. Samza
is an open-source Apache project adopted by many
top-tier companies (e.g., LinkedIn, Uber, Netflix,
TripAdvisor, etc.). Our experiments show that Samza:
(a) handles state efficiently, improving latency and
throughput by more than 100X compared to using a remote
storage; (b) provides recovery time independent of
state size; (c) scales performance linearly with number
of containers; and (d) supports reprocessing of the
data stream quickly and with minimal interference on
real-time traffic.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Falk:2017:QAK,
author = "Eric Falk and Vijay K. Gurbani and Radu State",
title = "Query-able {Kafka}: an agile data analytics pipeline
for mobile wireless networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1646--1657",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137771",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Due to their promise of delivering real-time network
insights, today's streaming analytics platforms are
increasingly being used in the communications networks
where the impact of the insights go beyond sentiment
and trend analysis to include real-time detection of
security attacks and prediction of network state (i.e.,
is the network transitioning towards an outage).
Current streaming analytics platforms operate under the
assumption that arriving traffic is to the order of
kilobytes produced at very high frequencies. However,
communications networks, especially the
telecommunication networks, challenge this assumption
because some of the arriving traffic in these networks
is to the order of gigabytes, but produced at medium to
low velocities. Furthermore, these large datasets may
need to be ingested in their entirety to render network
insights in real-time. Our interest is to subject
today's streaming analytics platforms --- constructed
from state-of-the art software components (Kafka,
Spark, HDFS, ElasticSearch) --- to traffic densities
observed in such communications networks. We find that
filtering on such large datasets is best done in a
common upstream point instead of being pushed to, and
repeated, in downstream components. To demonstrate the
advantages of such an approach, we modify Apache Kafka
to perform limited native data transformation and
filtering, relieving the downstream Spark application
from doing this. Our approach outperforms four
prevalent analytics pipeline architectures with
negligible overhead compared to standard Kafka. (Our
modifications to Apache Kafka are publicly available at
https://github.com/Esquive/queryable-kafka.git)",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nica:2017:SDS,
author = "Anisoara Nica and Reza Sherkat and Mihnea Andrei and
Xun Cheng and Martin Heidel and Christian Bensberg and
Heiko Gerwens",
title = "{Statisticum}: data statistics management in {SAP
HANA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1658--1669",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137772",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce a new concept of leveraging traditional
data statistics as dynamic data integrity constraints.
These data statistics produce transient database
constraints, which are valid as long as they can be
proven to be consistent with the current data. We
denote this type of data statistics by constraint data
statistics, their properties needed for consistency
checking by consistency metadata, and their implied
integrity constraints by implied data statistics
constraints (implied constraints for short). Implied
constraints are valid integrity constraints which are
powerful query optimization tools employed, just as
traditional database constraints, in semantic query
transformation (aka query reformulation), partition
pruning, runtime optimization, and semi-join reduction,
to name a few. To our knowledge, this is the first work
introducing this novel and powerful concept of deriving
implied integrity constraints from data statistics. We
discuss theoretical aspects of the constraint data
statistics concept and their integration into query
processing. We present the current architecture of data
statistics management in SAP HANA and detail how
constraint data statistics are designed and integrated
into this architecture. As an instantiation of this
framework, we consider dynamic partition pruning for
data aging scenarios. We discuss our current
implementation for constraint data statistics objects
in SAP HANA which can be used for dynamic partition
pruning. We enumerate their properties and show how
consistency checking for implied integrity constraints
is supported in the data statistics architecture. Our
experimental evaluations on the TPC-H benchmark and a
real customer application confirm the effectiveness of
the implied integrity constraints; (1) for 59\% of
TPC-H queries, constraint data statistics utilization
results in pruning cold partitions and reducing memory
consumption, and (2) we observe up to 3 orders of
magnitude speed-up in query processing time, for a real
customer running an S/4HANA application.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gessert:2017:QQW,
author = "Felix Gessert and Michael Schaarschmidt and Wolfram
Wingerath and Erik Witt and Eiko Yoneki and Norbert
Ritter",
title = "{Quaestor}: query web caching for
database-as-a-service providers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1670--1681",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137773",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today, web performance is primarily governed by
round-trip latencies between end devices and cloud
services. To improve performance, services need to
minimize the delay of accessing data. In this paper, we
propose a novel approach to low latency that relies on
existing content delivery and web caching
infrastructure. The main idea is to enable
application-independent caching of query results and
records with tunable consistency guarantees, in
particular bounded staleness. Quaestor (Query Store)
employs two key concepts to incorporate both
expiration-based and invalidation-based web caches: (1)
an Expiring Bloom Filter data structure to indicate
potentially stale data, and (2) statistically derived
cache expiration times to maximize cache hit rates.
Through a distributed query invalidation pipeline,
changes to cached query results are detected in
real-time. The proposed caching algorithms offer a new
means for data-centric cloud services to trade latency
against staleness bounds, e.g. in a
database-as-a-service. Quaestor is the core technology
of the backend-as-a-service platform Baqend, a cloud
service for low-latency websites. We provide empirical
evidence for Quaestor's scalability and performance
through both simulation and experiments. The results
indicate that for read-heavy workloads, up to tenfold
speed-ups can be achieved through Quaestor's caching.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gasiunas:2017:FBA,
author = "Vaidas Gasiunas and David Dominguez-Sal and Ralph
Acker and Aharon Avitzur and Ilan Bronshtein and Rushan
Chen and Eli Ginot and Norbert Martinez-Bazan and
Michael M{\"u}ller and Alexander Nozdrin and Weijie Ou
and Nir Pachter and Dima Sivov and Eliezer Levy",
title = "Fiber-based architecture for {NFV} cloud databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1682--1693",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137774",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The telco industry is gradually shifting from using
monolithic software packages deployed on custom
hardware to using modular virtualized software
functions deployed on cloudified data centers using
commodity hardware. This transformation is referred to
as Network Function Virtualization (NFV). The
scalability of the databases (DBs) underlying the
virtual network functions is the cornerstone for
reaping the benefits from the NFV transformation. This
paper presents an industrial experience of applying
shared-nothing techniques in order to achieve the
scalability of a DB in an NFV setup. The special
combination of requirements in NFV DBs are not easily
met with conventional execution models. Therefore, we
designed a special shared-nothing architecture that is
based on cooperative multi-tasking using user-level
threads (fibers). We further show that the fiber-based
approach outperforms the approach built using
conventional multi-threading and meets the variable
deployment needs of the NFV transformation.
Furthermore, fibers yield a simpler-to-maintain
software and enable controlling a trade-off between
long-duration computations and real-time requests.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bose:2017:PDF,
author = "Joos-Hendrik B{\"o}se and Valentin Flunkert and Jan
Gasthaus and Tim Januschowski and Dustin Lange and
David Salinas and Sebastian Schelter and Matthias
Seeger and Yuyang Wang",
title = "Probabilistic demand forecasting at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1694--1705",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137775",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a platform built on large-scale,
data-centric machine learning (ML) approaches, whose
particular focus is demand forecasting in retail. At
its core, this platform enables the training and
application of probabilistic demand forecasting models,
and provides convenient abstractions and support
functionality for forecasting problems. The platform
comprises of a complex end-to-end machine learning
system built on Apache Spark, which includes data
preprocessing, feature engineering, distributed
learning, as well as evaluation, experimentation and
ensembling. Furthermore, it meets the demands of a
production system and scales to large catalogues
containing millions of items. We describe the
challenges of building such a platform and discuss our
design decisions. We detail aspects on several levels
of the system, such as a set of general distributed
learning schemes, our machinery for ensembling
predictions, and a high-level dataflow abstraction for
modeling complex ML pipelines. To the best of our
knowledge, we are not aware of prior work on real-world
demand forecasting systems which rivals our approach in
terms of scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lee:2017:EBG,
author = "Jinho Lee and Heesu Kim and Sungjoo Yoo and Kiyoung
Choi and H. Peter Hofstee and Gi-Joon Nam and Mark R.
Nutter and Damir Jamsek",
title = "{ExtraV}: boosting graph processing near storage with
a coherent accelerator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1706--1717",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137776",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we propose ExtraV, a framework for
near-storage graph processing. It is based on the novel
concept of graph virtualization, which efficiently
utilizes a cache-coherent hardware accelerator at the
storage side to achieve performance and flexibility at
the same time. ExtraV consists of four main components:
(1) host processor, (2) main memory, (3) AFU
(Accelerator Function Unit) and (4) storage. The AFU, a
hardware accelerator, sits between the host processor
and storage. Using a coherent interface that allows
main memory accesses, it performs graph traversal
functions that are common to various algorithms while
the program running on the host processor (called the
host program) manages the overall execution along with
more application-specific tasks. Graph virtualization
is a high-level programming model of graph processing
that allows designers to focus on algorithm-specific
functions. Realized by the accelerator, graph
virtualization gives the host programs an illusion that
the graph data reside on the main memory in a layout
that fits with the memory access behavior of host
programs even though the graph data are actually stored
in a multi-level, compressed form in storage. We
prototyped ExtraV on a Power8 machine with a
CAPI-enabled FPGA. Our experiments on a real system
prototype offer significant speedup compared to
state-of-the-art software only implementations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Carbone:2017:SMA,
author = "Paris Carbone and Stephan Ewen and Gyula F{\'o}ra and
Seif Haridi and Stefan Richter and Kostas Tzoumas",
title = "State management in {Apache Flink\reg}: consistent
stateful distributed stream processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1718--1729",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137777",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream processors are emerging in industry as an
apparatus that drives analytical but also mission
critical services handling the core of persistent
application logic. Thus, apart from scalability and
low-latency, a rising system need is first-class
support for application state together with strong
consistency guarantees, and adaptivity to cluster
reconfigurations, software patches and partial
failures. Although prior systems research has addressed
some of these specific problems, the practical
challenge lies on how such guarantees can be
materialized in a transparent, non-intrusive manner
that relieves the user from unnecessary constraints.
Such needs served as the main design principles of
state management in Apache Flink, an open source,
scalable stream processor. We present Flink's core
pipelined, in-flight mechanism which guarantees the
creation of lightweight, consistent, distributed
snapshots of application state, progressively, without
impacting continuous execution. Consistent snapshots
cover all needs for system reconfiguration, fault
tolerance and version management through coarse grained
rollback recovery. Application state is declared
explicitly to the system, allowing efficient
partitioning and transparent commits to persistent
storage. We further present Flink's backend
implementations and mechanisms for high availability,
external state queries and output commit. Finally, we
demonstrate how these mechanisms behave in practice
with metrics and large-deployment insights exhibiting
the low performance trade-offs of our approach and the
general benefits of exploiting asynchrony in
continuous, yet sustainable system deployments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2017:PHA,
author = "Jianjun Zheng and Qian Lin and Jiatao Xu and Cheng Wei
and Chuwei Zeng and Pingan Yang and Yunfan Zhang",
title = "{PaxosStore}: high-availability storage made practical
in {WeChat}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1730--1741",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137778",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present PaxosStore, a
high-availability storage system developed to support
the comprehensive business of WeChat. It employs a
combinational design in the storage layer to engage
multiple storage engines constructed for different
storage models. PaxosStore is characteristic of
extracting the Paxos-based distributed consensus
protocol as a middleware that is universally accessible
to the underlying multi-model storage engines. This
facilitates tuning, maintaining, scaling and extending
the storage engines. According to our experience in
engineering practice, to achieve a practical consistent
read/write protocol is far more complex than its
theory. To tackle such engineering complexity, we
propose a layered design of the Paxos-based storage
protocol stack, where PaxosLog, the key data structure
used in the protocol, is devised to bridge the
programming-oriented consistent read/write to the
storage-oriented Paxos procedure. Additionally, we
present optimizations based on Paxos that made
fault-tolerance more efficient. Discussion throughout
the paper primarily focuses on pragmatic solutions that
could be insightful for building practical distributed
storage systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Antonopoulos:2017:ROI,
author = "Panagiotis Antonopoulos and Hanuma Kodavalla and Alex
Tran and Nitish Upreti and Chaitali Shah and Mirek
Sztajno",
title = "Resumable online index rebuild in {SQL} server",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1742--1753",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137779",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Azure SQL Database and the upcoming release of SQL
Server enhance Online Index Rebuild to provide
fault-tolerance and allow index rebuild operations to
resume after a system failure or a user-initiated
pause. SQL Server is the first commercial DBMS to
support pause and resume functionality for index
rebuilds. This is achieved by splitting the operation
into incremental units of work and persisting the
required state so that it can be resumed later with
minimal loss of progress. At the same time, the
proposed technology minimizes the log space required
for the operation to succeed, making it possible to
rebuild large indexes using only a small, constant
amount of log space. These capabilities are critical to
guarantee the reliability of these operations in an
environment where (a) the database sizes are increasing
at a much faster pace compared to the available
hardware, (b) system failures are frequent in Cloud
architectures using commodity hardware, (c) software
upgrades and other maintenance tasks are automatically
handled by the Cloud platforms, introducing further
unexpected failures for the users and (d) most modern
applications need to be available 24/7 and have very
tight maintenance windows. This paper describes the
design of ``Resumable Online Index Rebuild'' and
discusses how this technology can be extended to cover
more schema management operations in the future.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Andrei:2017:SHA,
author = "Mihnea Andrei and Christian Lemke and G{\"u}nter
Radestock and Robert Schulze and Carsten Thiel and
Rolando Blanco and Akanksha Meghlan and Muhammad
Sharique and Sebastian Seifert and Surendra Vishnoi and
Daniel Booss and Thomas Peh and Ivan Schreter and
Werner Thesing and Mehul Wagle and Thomas Willhalm",
title = "{SAP HANA} adoption of non-volatile memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1754--1765",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137780",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Non-Volatile RAM (NVRAM) is a novel class of hardware
technology which is an interesting blend of two storage
paradigms: byte-addressable DRAM and block-addressable
storage (e.g. HDD/SSD). Most of the existing enterprise
relational data management systems such as SAP HANA
have their internal architecture based on the inherent
assumption that memory is volatile and base their
persistence on explicit handling of block-oriented
storage devices. In this paper, we present the early
adoption of Non-Volatile Memory within the SAP HANA
Database, from the architectural and technical angles.
We discuss our architectural choices, dive deeper into
a few challenges of the NVRAM integration and their
solutions, and share our experimental results. As we
present our solutions for the NVRAM integration, we
also give, as a basis, a detailed description of the
relevant HANA internals.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2017:CIS,
author = "Mingming Zhang and Tianyu Wo and Tao Xie and Xuelian
Lin and Yaxiao Liu",
title = "{CarStream}: an industrial system of big data
processing for {Internet-of-Vehicles}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1766--1777",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137781",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As the Internet-of-Vehicles (IoV) technology becomes
an increasingly important trend for future
transportation, designing large-scale IoV systems has
become a critical task that aims to process big data
uploaded by fleet vehicles and to provide data-driven
services. The IoV data, especially high-frequency
vehicle statuses (e.g., location, engine parameters),
are characterized as large volume with a low density of
value and low data quality. Such characteristics pose
challenges for developing real-time applications based
on such data. In this paper, we address the challenges
in designing a scalable IoV system by describing
CarStream, an industrial system of big data processing
for chauffeured car services. Connected with over
30,000 vehicles, CarStream collects and processes
multiple types of driving data including vehicle
status, driver activity, and passenger-trip
information. Multiple services are provided based on
the collected data. CarStream has been deployed and
maintained for three years in industrial usage,
collecting over 40 terabytes of driving data. This
paper shares our experiences on designing CarStream
based on large-scale driving-data streams, and the
lessons learned from the process of addressing the
challenges in designing and maintaining CarStream.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bonetta:2017:FJF,
author = "Daniele Bonetta and Matthias Brantner",
title = "{FAD.js}: fast {JSON} data access using {JIT}-based
speculative optimizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1778--1789",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137782",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "JSON is one of the most popular data encoding formats,
with wide adoption in Databases and BigData frameworks
as well as native support in popular programming
languages such as JavaScript/Node.js, Python, and R.
Nevertheless, JSON data processing can easily become a
performance bottleneck in data-intensive applications
because of parse and serialization overhead. In this
                 paper, we introduce Fad.js, a runtime system for
efficient processing of JSON objects in data-intensive
applications. Fad.js is based on (1) speculative
just-in-time (JIT) compilation and (2) selective access
to data. Experiments show that applications using
Fad.js achieve speedups up to 2.7x for encoding and
9.9x for decoding JSON data when compared to
state-of-the art JSON processing libraries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aggour:2017:CCL,
author = "Kareem S. Aggour and Jenny Weisenberg Williams and
Justin McHugh and Vijay S. Kumar",
title = "{Colt}: concept lineage tool for data flow metadata
capture and analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1790--1801",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137783",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Most organizations are becoming increasingly
data-driven, often processing data from many different
sources to enable critical business operations. Beyond
the well-addressed challenge of storing and processing
large volumes of data, financial institutions in
particular are increasingly subject to federal
regulations requiring high levels of accountability for
the accuracy and lineage of this data. For companies
like GE Capital, which maintain data across a globally
interconnected network of thousands of systems, it is
becoming increasingly challenging to capture an
accurate understanding of the data flowing between
those systems. To address this problem, we designed and
developed a concept lineage tool allowing
organizational data flows to be modeled, visualized and
interactively explored. This tool has novel features
that allow a data flow network to be contextualized in
terms of business-specific metadata such as the
concept, business, and product for which it applies.
Key analysis features have been implemented, including
the ability to trace the origination of particular
datasets, and to discover all systems where data is
found that meets some user-defined criteria. This tool
has been readily adopted by users at GE Capital and in
a short time has already become a business-critical
application, with over 2,200 data systems and over
1,000 data flows captured.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yeh:2017:MPI,
author = "Chin-Chia Michael Yeh and Nickolas Kavantzas and
Eamonn Keogh",
title = "Matrix profile {IV}: using weakly labeled time series
to predict outcomes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1802--1812",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137784",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In academic settings over the last decade, there has
been significant progress in time series
classification. However, much of this work makes
assumptions that are simply unrealistic for deployed
industrial applications. Examples of these unrealistic
assumptions include the following: assuming that data
subsequences have a single fixed-length, are precisely
extracted from the data, and are correctly labeled
according to their membership in a set of equal-size
classes. In real-world industrial settings, these
patterns can be of different lengths, the class
annotations may only belong to a general region of the
data, may contain errors, and finally, the class
distribution is typically highly skewed. Can we learn
from such weakly labeled data? In this work, we
introduce SDTS, a scalable algorithm that can learn in
such challenging settings. We demonstrate the utility
of our ideas by learning from diverse datasets with
millions of datapoints. As we shall demonstrate, our
domain-agnostic parameter-free algorithm can be
competitive with domain-specific algorithms used in
neuroscience and entomology, even when those algorithms
have been tuned by domain experts to incorporate domain
knowledge.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chakkappen:2017:ASO,
author = "Sunil Chakkappen and Suratna Budalakoti and Ramarajan
Krishnamachari and Satyanarayana R. Valluri and Alan
Wood and Mohamed Zait",
title = "Adaptive statistics in {Oracle 12c}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1813--1824",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137785",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database Management Systems (DBMS) continue to be the
foundation of mission critical applications, both OLTP
and Analytics. They provide a safe, reliable and
efficient platform to store and retrieve data. SQL is
the lingua franca of the database world. A database
developer writes a SQL statement to specify data
sources and express the desired result and the DBMS
will figure out the most efficient way to implement it.
The query optimizer is the component in a DBMS
responsible for finding the best execution plan for a
given SQL statement based on statistics, access
structures, location, and format. At the center of a
query optimizer is a cost model that consumes the above
information and helps the optimizer make decisions
related to query transformations, join order, join
methods, access paths, and data movement. The final
execution plan produced by the query optimizer depends
on the quality of information used by the cost model,
as well as the sophistication of the cost model. In
addition to statistics about the data, the cost model
also relies on statistics generated internally for
intermediate results, e.g. size of the output of a join
operation. This paper presents the problems caused by
incorrect statistics of intermediate results, surveys
the existing solutions and presents our solution
introduced in Oracle 12c. The solution includes
validating the generated statistics using table data
and via the automatic creation of auxiliary statistics
structures. We limit the overhead of the additional
work by confining their use to cases where it matters
the most, caching the computed statistics, and using
table samples. The statistics management is automated.
We demonstrate the benefits of our approach based on
experiments using two SQL workloads, a benchmark that
uses data from the Internet Movie Database (IMDB) and
a real customer workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Floratou:2017:DSR,
author = "Avrilia Floratou and Ashvin Agrawal and Bill Graham
and Sriram Rao and Karthik Ramasamy",
title = "{Dhalion}: self-regulating stream processing in
{Heron}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1825--1836",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137786",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, there has been an explosion of
large-scale real-time analytics needs and a plethora of
streaming systems have been developed to support such
applications. These systems are able to continue stream
processing even when faced with hardware and software
failures. However, these systems do not address some
crucial challenges facing their operators: the manual,
time-consuming and error-prone tasks of tuning various
configuration knobs to achieve service level objectives
(SLO) as well as the maintenance of SLOs in the face of
sudden, unpredictable load variation and hardware or
software performance degradation. In this paper, we
introduce the notion of self-regulating streaming
systems and the key properties that they must satisfy.
We then present the design and evaluation of Dhalion, a
system that provides self-regulation capabilities to
underlying streaming systems. We describe our
implementation of the Dhalion framework on top of
Twitter Heron, as well as a number of policies that
automatically reconfigure Heron topologies to meet
throughput SLOs, scaling resource consumption up and
down as needed. We experimentally evaluate our Dhalion
policies in a cloud environment and demonstrate their
effectiveness. We are in the process of open-sourcing
our Dhalion policies as part of the Heron project.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2017:INO,
author = "Erkang Zhu and Ken Q. Pu and Fatemeh Nargesian and
Ren{\'e}e J. Miller",
title = "Interactive navigation of open data linkages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1837--1840",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137788",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We developed Toronto Open Data Search to support the
ad hoc, interactive discovery of connections or
linkages between datasets. It can be used to
efficiently navigate through the open data cloud. Our
system consists of three parts: a user-interface
provided by a Web application; a scalable backend
infrastructure that supports navigational queries; and
a dynamic repository of open data tables. Our system
uses LSH Ensemble, an efficient index structure, to
compute linkages (attributes in two datasets with high
containment score) in real time at Internet scale. Our
application allows users to navigate along these
linkages by joining datasets. LSH Ensemble is scalable,
providing millisecond response times for linkage
discovery queries even over millions of datasets. Our
system offers users a highly interactive experience
making unrelated (and unlinked) dynamic collections of
datasets appear as a richly connected cloud of data
that can be navigated and combined easily in real
time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pimentel:2017:NTC,
author = "Jo{\~a}o Felipe Pimentel and Leonardo Murta and
Vanessa Braganholo and Juliana Freire",
title = "{noWorkflow}: a tool for collecting, analyzing, and
managing provenance from {Python} scripts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1841--1844",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137789",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present noWorkflow, an open-source tool that
systematically and transparently collects provenance
from Python scripts, including data about the script
execution and how the script evolves over time. During
the demo, we will show how noWorkflow collects and
manages provenance, as well as how it supports the
analysis of computational experiments. We will also
encourage attendees to use noWorkflow for their own
scripts.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2017:ACB,
author = "Chao Wang and Yihao Feng and Qi Guo and Zhaoxian Li
and Kexin Liu and Zijian Tang and Anthony K. H. Tung
and Lifu Wu and Yuxin Zheng",
title = "{ARShop}: a cloud-based augmented reality system for
shopping",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1845--1848",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137790",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "ARShop is a one-stop solution for shopping in the
cyber-physical world with the help of crowd knowledge
and augmented reality. Its ultimate goal is to improve
customers' shopping experience. When a customer enters
a physical shop and snaps a shot, the enriched cyber
information of the surroundings will pop up and be
augmented on the screen. ARShop can also be the
customer's personal shopping assistant who can show
routes to the shops that the customer is interested in.
In addition, ARShop provides merchants with a web-based
interface to manage their shops and promote their
business to customers, and provides customers with an
Android App to query using images.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Aberger:2017:MGB,
author = "Christopher R. Aberger and Andrew Lamb and Kunle
Olukotun and Christopher R{\'e}",
title = "Mind the gap: bridging multi-domain query workloads
with {EmptyHeaded}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1849--1852",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137791",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Executing domain specific workloads from a relational
data warehouse is an increasingly popular task.
Unfortunately, classic relational database management
systems (RDBMS) are suboptimal in many domains (e.g.,
graph and linear algebra queries), and it is
challenging to transfer data from an RDBMS to a domain
specific toolkit in an efficient manner. This
demonstration showcases the EmptyHeaded engine: an
interactive query processing engine that leverages a
novel query architecture to support efficient execution
in multiple domains. To enable a unified design, the
EmptyHeaded architecture is built around recent
theoretical advancements in join processing and
automated in-query data transformations. This
demonstration highlights the strengths and weaknesses
of this novel type of query processing architecture
while showcasing its flexibility in multiple domains.
In particular, attendees will use EmptyHeaded's Jupyter
notebook front-end to interactively learn the
theoretical advantages of this new (and largely
unknown) approach and directly observe its performance
impact in multiple domains.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Maccioni:2017:CFL,
author = "Antonio Maccioni and Riccardo Torlone",
title = "Crossing the finish line faster when paddling the data
lake with {KAYAK}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1853--1856",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137792",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Paddling in a data lake is strenuous for a data
scientist. Being a loosely-structured collection of raw
data with little or no meta-information available, the
difficulties of extracting insights from a data lake
start from the initial phases of data analysis. Indeed,
data preparation, which involves many complex
operations (such as source and feature selection,
exploratory analysis, data profiling, and data
curation), is a long and involved activity for
navigating the lake before getting precious insights at
the finish line. In this framework, we demonstrate
KAYAK, a framework that supports data preparation in a
data lake with ad-hoc primitives and allows data
scientists to cross the finish line sooner. KAYAK takes
into account the tolerance of the user in waiting for
the primitives' results and it uses incremental
execution strategies to produce informative previews of
these results. The framework is based on a wise
management of metadata and on features that limit human
intervention, thus scaling smoothly when the data lake
evolves.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Niu:2017:DTT,
author = "Xing Niu and Bahareh Sadat Arab and Seokki Lee and Su
Feng and Xun Zou and Dieter Gawlick and Vasudha
Krishnaswamy and Zhen Hua Liu and Boris Glavic",
title = "Debugging transactions and tracking their provenance
with reenactment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1857--1860",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137793",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Debugging transactions and understanding their
execution are of immense importance for developing OLAP
applications, to trace causes of errors in production
systems, and to audit the operations of a database.
However, debugging transactions is hard for several
reasons: (1) after the execution of a transaction, its
input is no longer available for debugging, (2)
internal states of a transaction are typically not
accessible, and (3) the execution of a transaction may
be affected by concurrently running transactions. We
present a debugger for transactions that enables
non-invasive, postmortem debugging of transactions with
provenance tracking and supports what-if scenarios
(changes to transaction code or data). Using
reenactment, a declarative replay technique we have
developed, a transaction is replayed over the state of
the DB seen by its original execution including all its
interactions with concurrently executed transactions
from the history. Importantly, our approach uses the
temporal database and audit logging capabilities
available in many DBMS and does not require any
modifications to the underlying database system nor
transactional workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2017:PES,
author = "Kai Huang and Sourav S. Bhowmick and Shuigeng Zhou and
Byron Choi",
title = "{\tt picasso}: exploratory search of connected
subgraph substructures in graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1861--1864",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137794",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, exploratory search has received much
attention in information retrieval and database fields.
This search paradigm assists users who do not have a
clear search intent and are unfamiliar with the
underlying data space. Specifically, query formulation
evolves iteratively as the user becomes more familiar
with the content. Despite its growing importance,
exploratory search on graph-structured data has
received little attention in the literature. We
demonstrate a system called {\tt picasso} to realize
exploratory sub-structure search on a graph database
containing a set of small or medium-sized data graphs.
{\tt picasso} embodies several novel features such as
progressive (i.e., iterative) formulation of queries
visually and incremental processing, multi-stream
results exploration wall to visualize, explore, and
analyze search results to identify possible search
directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cai:2017:DDI,
author = "Ruichu Cai and Zijie Lu and Li Wang and Zhenjie Zhang
and Tom Z. J. Fu and Marianne Winslett",
title = "{DITIR}: distributed index for high throughput
trajectory insertion and real-time temporal range
query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1865--1868",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137795",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The prosperity of mobile social network and
location-based services, e.g., Uber, is backing the
explosive growth of spatial temporal streams on the
Internet. It raises new challenges to the underlying
data store system, which is supposed to support
extremely high-throughput trajectory insertion and
low-latency querying with spatial and temporal
constraints. State-of-the-art solutions, e.g., HBase,
do not render satisfactory performance, due to the high
overhead on index update. In this demonstration, we
present DITIR, our new system prototype tailored to
efficiently processing temporal and spatial queries
over historical data as well as latest updates. Our
system provides better performance guarantee, by
physically partitioning the incoming data tuples on
their arrivals and exploiting a template-based
insertion schema, to reach the desired ingestion
throughput. Load balancing mechanism is also introduced
to DITIR, by using which the system is capable of
achieving reliable performance against workload
dynamics. Our demonstration shows that DITIR supports
over 1 million tuple insertions in a second, when
running on a 10-node cluster. It also significantly
outperforms HBase by 7 times on ingestion throughput
and 5 times faster on query latency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pang:2017:FIV,
author = "Zhifei Pang and Sai Wu and Gang Chen and Ke Chen and
Lidan Shou",
title = "{FlashView}: an interactive visual explorer for raw
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1869--1872",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137796",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "New data has been generated in an unexpected high
speed. To get insight of those data, data analysts will
perform a thorough study using state-of-the-art big
data analytical tools. Before the analysis starts, a
preprocessing is conducted, where data analyst tends to
issue a few ad-hoc queries on a new dataset to explore
and gain a better understanding. However, it is costly
to perform such ad-hoc queries on large scale data
using traditional data management systems, e.g., DBMS,
because data loading and indexing are very expensive.
In this demo, we propose a novel visual data explorer
system, FlashView, which omits the loading process by
directly querying raw data. FlashView applies
approximate query processing technique to achieve
real-time query results. It builds both in-memory index
and disk index to facilitate the data scanning. It also
supports tracking and updating multiple queries
concurrently. Note that FlashView is not designed as a
replacement of full-fledged DBMS. Instead, it tries to
help the analysts quickly understand the
characteristics of data, so he/she can selectively load
data into the DBMS to do more sophisticated analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Subercaze:2017:UPT,
author = "Julien Subercaze and Christophe Gravier and Syed
Gillani and Abderrahmen Kammoun and Fr{\'e}d{\'e}rique
Laforest",
title = "{Upsortable}: programming top-$k$ queries over data
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1873--1876",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137797",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Top-$k$ queries over data streams is a well studied
problem. There exists numerous systems allowing to
process continuous queries over sliding windows. At the
opposite, non-append only streams call for ad-hoc
solutions, e.g. tailor-made solutions implemented in a
mainstream programming language. In the meantime, the
Stream API and lambda expressions have been added in
Java 8, thus gaining powerful operations for data
stream processing. However, the Java Collections
Framework does not provide data structures to safely
and conveniently support sorted collections of evolving
data. In this paper, we demonstrate Upsortable, an
annotation-based approach that allows to use existing
sorted collections from the standard Java API for
dynamic data management. Our approach relies on a
combination of pre-compilation abstract syntax tree
modifications and runtime analysis of bytecode.
Upsortable offers the developer a safe and
time-efficient solution for developing top-$k$ queries
on data streams while keeping a full compatibility with
standard Java.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chamanara:2017:QSH,
author = "Javad Chamanara and Birgitta K{\"o}nig-Ries and H. V.
Jagadish",
title = "{QUIS}: in-situ heterogeneous data source querying",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1877--1880",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137798",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing data integration frameworks are poorly suited
for the special requirements of scientists. To answer a
specific research question, often, excerpts of data
from different sources need to be integrated. The
relevant parts and the set of underlying sources may
differ from query to query. The analyses also
oftentimes involve frequently changing data and
exploratory querying. Additionally, the data sources
not only store data in different formats, but also
provide inconsistent data access functionality. The
classic Extract-Transform-Load (ETL) approach seems too
complex and time-consuming and does not fit well with
interest and expertise of the scientists. With QUIS
(QUery In-Situ), we provide a solution for this
problem. QUIS is an open source heterogeneous in-situ
data querying system. It utilizes a federated query
virtualization approach that is built upon plugged-in
adapters. QUIS takes a user query and transforms
appropriate portions of it into the corresponding
computation model on individual data sources and
executes it. It complements the segments of the query
that the target data sources can not execute. Hence, it
guarantees full syntax and semantic support for its
language on all data sources. QUIS's in-situ querying
facility almost eliminates the time to prepare the data
while maintaining a competitive performance and steady
scalability. The present demonstration illustrates
interesting features of the system: virtual schemas,
heterogeneous joins, and visual query results. We
provide a realistic data processing scenario to examine
the system's features. Users can interact with QUIS
using its desktop workbench, command line interface, or
from any R client including RStudio Server.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alawini:2017:ADC,
author = "Abdussalam Alawini and Susan B. Davidson and Wei Hu
and Yinjun Wu",
title = "Automating data citation in {CiteDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1881--1884",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137799",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An increasing amount of information is being collected
in structured, evolving, curated databases, driving the
question of how information extracted from such
datasets via queries should be cited. While several
databases say how data should be cited for web-page
views of the database, they leave it to users to
manually construct the citations. Furthermore, they do
not say how data extracted by queries other than
web-page views --- general queries --- should be cited.
This demo shows how citations can be specified for a
small set of views of the database, and used to
automatically generate citations for general queries
against the database.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2017:CEB,
author = "Yixiang Fang and Reynold Cheng and Siqiang Luo and
Jiafeng Hu and Kai Huang",
title = "{C-explorer}: browsing communities in large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1885--1888",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137800",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Community retrieval (CR) algorithms, which enable the
extraction of subgraphs from large social networks
(e.g., Facebook and Twitter), have attracted tremendous
interest. Various CR solutions, such as $k$-core and
codicil, have been proposed to obtain graphs whose
vertices are closely related. In this paper, we propose
the C-Explorer system to assist users in extracting,
visualizing, and analyzing communities. C-Explorer
provides online and interactive CR facilities, allowing
a user to view her interesting graphs, indicate her
required vertex q, and display the communities to which
q belongs. A seminal feature of C-Explorer is that it
uses an attributed graph, whose vertices are associated
with labels and keywords, and looks for an attributed
community (or AC), whose vertices are structurally and
semantically related. Moreover, C-Explorer implements
several state-of-the-art CR algorithms, as well as
functions for analyzing their effectiveness. We plan to
make C-Explorer an open-source web-based platform, and
design API functions for software developers to test
their CR algorithms in our system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2017:GPS,
author = "Wenfei Fan and Jingbo Xu and Yinghui Wu and Wenyuan Yu
and Jiaxin Jiang",
title = "{GRAPE}: parallelizing sequential graph computations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1889--1892",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137801",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate GRAPE, a parallel GRAPh query Engine.
GRAPE advocates a parallel model based on a
simultaneous fixed point computation in terms of
partial and incremental evaluation. It differs from
prior systems in its ability to parallelize existing
sequential graph algorithms as a whole, without the
need for recasting the entire algorithms into a new
model. One of its unique features is that under a
monotonic condition, GRAPE parallelization guarantees
to terminate with correct answers as long as the
sequential algorithms ``plugged in'' are correct. We
demonstrate its parallel computations, ease-of-use and
performance compared with the state-of-the-art graph
systems. We also demonstrate a use case of GRAPE in
social media marketing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khoshkbarforoushha:2017:FDA,
author = "Alireza Khoshkbarforoushha and Rajiv Ranjan and Qing
Wang and Carsten Friedrich",
title = "{Flower}: a data analytics flow elasticity manager",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1893--1896",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137802",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A data analytics flow typically operates on three
layers: ingestion, analytics, and storage, each of
which is provided by a data-intensive system. These
systems are often available as cloud managed services,
enabling the users to have pain-free deployment of data
analytics flow applications such as click-stream
analytics. Despite straightforward orchestration,
elasticity management of the flows is challenging. This
is due to: (a) heterogeneity of workloads and diversity
of cloud resources such as queue partitions, compute
servers and NoSQL throughputs capacity, (b) workload
dependencies between the layers, and (c) different
performance behaviours and resource consumption
patterns. In this demonstration, we present Flower, a
holistic elasticity management system that exploits
advanced optimization and control theory techniques to
manage elasticity of complex data analytics flows on
clouds. Flower analyzes statistics and data collected
from different data-intensive systems to provide the
user with a suite of rich functionalities, including:
workload dependency analysis, optimal resource share
analysis, dynamic resource provisioning, and
cross-platform monitoring. We will showcase various
features of Flower using a real-world data analytics
flow. We will allow the audience to explore Flower by
visually defining and configuring a data analytics flow
elasticity manager and get hands-on experience with
integrated data analytics flow management.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2017:SAD,
author = "Zhiyi Wang and Dongyan Zhou and Shimin Chen",
title = "{STEED}: an analytical database system for
tree-structured data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1897--1900",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137803",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tree-structured data formats, such as JSON and
Protocol Buffers, are capable of expressing
sophisticated data types, including nested, repeated,
and missing values. While such expressive power
contributes to their popularity in real-world
applications, it presents a significant challenge for
systems supporting tree-structured data. Existing
systems have focused on general-purpose solutions
either extending RDBMSs or designing native systems.
However, the general-purpose approach often results in
sophisticated data structures and algorithms, which may
not reflect and optimize for the actual structure
patterns in the real world. In this demonstration, we
showcase Steed, an analytical database System for
tree-structured data. We use the insights gained by
analyzing representative real-world tree structured
data as guidelines in the design of Steed. Steed learns
and extracts a schema tree for a data set and uses the
schema tree to reduce the storage space and improve the
efficiency of data field accesses. We observe that
sub-structures in real world data are often simple,
while the tree-structured data types can support very
sophisticated structures. We optimize the storage
structure, the column assembling algorithm, and the
in-memory layout for the simple sub-structures (a.k.a.
simple paths). Compared to representative
state-of-the-art systems (i.e. PostgreSQL/JSON,
MongoDB, and Hive+Parquet), Steed achieves orders of
magnitude better performance for data analysis
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xiao:2017:LLC,
author = "Yonghui Xiao and Li Xiong and Si Zhang and Yang Cao",
title = "{LocLok}: location cloaking with differential privacy
via hidden {Markov} model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1901--1904",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137804",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate LocLok, a LOCation-cLOaKing system to
protect the locations of a user with differential
privacy. LocLok has two features: (a) it protects
locations under temporal correlations described through
hidden Markov model; (b) it releases the optimal noisy
location with the planar isotropic mechanism (PIM), the
first mechanism that achieves the lower bound of
differential privacy. We show the detailed computation
of LocLok with the following components: (a) how to
generate the possible locations with Markov model, (b)
how to perturb the location with PIM, and (c) how to
make inference about the true location in Markov model.
An online system with real-world dataset will be
presented with the computation details.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2017:SAI,
author = "Xiangnan Ren and Olivier Cur{\'e} and Li Ke and Jeremy
Lhez and Badre Belabbess and Tendry Randriamalala and
Yufan Zheng and Gabriel Kepeklian",
title = "{Strider}: an adaptive, inference-enabled distributed
{RDF} stream processing engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1905--1908",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137805",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Real-time processing of data streams emanating from
sensors is becoming a common task in industrial
scenarios. An increasing number of processing jobs
executed over such platforms are requiring reasoning
mechanisms. The key implementation goal is thus to
efficiently handle massive incoming data streams and
support reasoning, data analytic services. Moreover, in
an on-going industrial project on anomaly detection in
large potable water networks, we are facing the effect
of dynamically changing data and work characteristics
in stream processing. The Strider system addresses
these research and implementation challenges by
considering scalability, fault-tolerance, high
throughput and acceptable latency properties. We will
demonstrate the benefits of Strider on an Internet of
Things-based real world and industrial setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:CAT,
author = "Yan Li and Ngai Meng Kou and Hao Wang and Leong Hou U.
and Zhiguo Gong",
title = "A confidence-aware top-$k$ query processing toolkit on
crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1909--1912",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137806",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Ranking techniques have been widely used in ubiquitous
applications like recommendation, information
retrieval, etc. For ranking computation hostile but
human friendly items, crowdsourcing is considered as an
emerging technique to process the ranking by human
power. However, there is a lack of an easy-to-use
toolkit for answering crowdsourced top-$k$ query with
minimal effort. In this work, we demonstrate an
interactive programming toolkit that is a unified
solution for answering the crowd-sourced top-$k$
queries. The toolkit employs a new confidence-aware
crowdsourced top-$k$ algorithm, SPR. The whole progress
of the algorithm is monitored and visualized to end
users in a timely manner. Besides the visualized result
and the statistics, the system also reports the
estimation of the monetary cost and the breakdown of
each phase. Based on the estimation, end users can
strike a balance between the budget and the quality
through the interface of this toolkit.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fionda:2017:EQK,
author = "Valeria Fionda and Giuseppe Pirr{\`o}",
title = "Explaining and querying knowledge graphs by
relatedness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1913--1916",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137807",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate RECAP, a tool that explains relatedness
between entities in Knowledge Graphs (KGs) and
implements a query by relatedness paradigm that allows
to retrieve entities related to those in input. One of
the peculiarities of RECAP is that it does not require
any data preprocessing and can combine knowledge from
multiple KGs. The underlying algorithmic techniques are
reduced to the execution of SPARQL queries plus some
local refinement. This makes the tool readily available
on a large variety of KGs accessible via SPARQL
endpoints. To show the general applicability of the
tool, we will cover a set of use cases drawn from a
variety of knowledge domains (e.g., biology, movies,
co-authorship networks) and report on the concrete
usage of RECAP in the SENSE4US FP7 project. We will
underline the technical aspects of the system and give
details on its implementation. The target audience of
the demo includes both researchers and practitioners
and aims at reporting on the benefits of RECAP in
practical knowledge discovery applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kunjir:2017:TAM,
author = "Mayuresh Kunjir and Shivnath Babu",
title = "{Thoth} in action: memory management in modern data
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1917--1920",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137808",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Allocation and usage of memory in modern
data-processing platforms is based on an interplay of
algorithms at multiple levels: (i) at the
resource-management level across containers allocated
by resource managers like Mesos and Yarn, (ii) at the
container level among the OS and processes such as the
Java Virtual Machine (JVM), (iii) at the framework
level for caching, aggregation, data shuffles, and
application data structures, and (iv) at the JVM level
across various pools such as the Young and Old
Generation as well as the heap versus off-heap. We use
Thoth, a data-driven platform for multi-system cluster
management, to build a deep understanding of different
interplays in memory management options. Through
multiple memory management apps built in Thoth, we
demonstrate how Thoth can deal with multiple levels of
memory management as well as multi-tenant nature of
clusters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Schule:2017:MSS,
author = "Maximilian E. Sch{\"u}le and Pascal M. N. Schliski and
Thomas Hutzelmann and Tobias Rosenberger and Viktor
Leis and Dimitri Vorona and Alfons Kemper and Thomas
Neumann",
title = "{Monopedia}: staying single is good enough --- the
hyper way for web scale applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1921--1924",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137809",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In order to handle the database load for web scale
applications, the conventional wisdom is that a cluster
of database servers and a caching layer are essential.
In this work, we argue that modern main memory database
systems are often fast enough to consolidate this
complex architecture into a single server (plus an
additional fail over system). To demonstrate this
claim, we design the Monopedia Benchmark, a benchmark
for web scale applications modeled after Wikipedia.
Using this benchmark, we show that it is indeed
possible to run the database workload of one of the
largest web sites in the world on a single database
server.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2017:DDM,
author = "Ji Sun and Zeyuan Shang and Guoliang Li and Dong Deng
and Zhifeng Bao",
title = "{Dima}: a distributed in-memory similarity-based query
processing system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1925--1928",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137810",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts in industries spend more than 80\% of
time on data cleaning and integration in the whole
process of data analytics due to data errors and
inconsistencies. It calls for effective query
processing techniques to tolerate the errors and
inconsistencies. In this paper, we develop a
distributed in-memory similarity-based query processing
system called Dima. Dima supports two core
similarity-based query operations, i.e., similarity
search and similarity join. Dima extends the SQL
programming interface for users to easily invoke these
two operations in their data analysis jobs. To avoid
expensive data transformation in a distributed
environment, we design selectable signatures where two
records approximately match if they share common
signatures. More importantly, we can adaptively select
the signatures to balance the workload. Dima builds
signature-based global indexes and local indexes to
support efficient similarity search and join. Since
Spark is one of the widely adopted distributed
in-memory computing systems, we have seamlessly
integrated Dima into Spark and developed effective
query optimization techniques in Spark. To the best of
our knowledge, this is the first full-fledged
distributed in-memory system that can support
similarity-based query processing. We demonstrate our
system in several scenarios, including entity matching,
web table integration and query recommendation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chekol:2017:TTC,
author = "Melisachew W. Chekol and Giuseppe Pirr{\`o} and Joerg
Schoenfisch and Heiner Stuckenschmidt",
title = "{TeCoRe}: temporal conflict resolution in knowledge
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1929--1932",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137811",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The management of uncertainty is crucial when
harvesting structured content from unstructured and
noisy sources. Knowledge Graphs (kgs), maintaining both
numerical and non-numerical facts supported by an
underlying schema, are a prominent example. Knowledge
Graph management is challenging because: (i) most of
existing kgs focus on static data, thus impeding the
availability of timewise knowledge; (ii) facts in kgs
are usually accompanied by a confidence score, which
witnesses how likely it is for them to hold. We
demonstrate TeCoRe, a system for temporal inference and
conflict resolution in uncertain temporal knowledge
graphs (utkgs). At the heart of TeCoRe are two
state-of-the-art probabilistic reasoners that are able
to deal with temporal constraints efficiently. While
one is scalable, the other can cope with more
expressive constraints. The demonstration will focus on
enabling users and applications to find inconsistencies
in utkgs. TeCoRe provides an interface allowing to
select utkgs and editing constraints; shows the maximal
consistent subset of the utkg, and displays statistics
(e.g., number of noisy facts removed) about the
debugging process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:MTD,
author = "Xupeng Li and Bin Cui and Yiru Chen and Wentao Wu and
Ce Zhang",
title = "{MLog}: towards declarative in-database machine
learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1933--1936",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137812",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate MLog, a high-level language that
integrates machine learning into data management
systems. Unlike existing machine learning frameworks
(e.g., TensorFlow, Theano, and Caffe), MLog is
declarative, in the sense that the system manages all
data movement, data persistency, and machine-learning
related optimizations (such as data batching)
automatically. Our interactive demonstration will show
audience how this is achieved based on the novel notion
of tensoral views (TViews), which are similar to
relational views but operate over tensors with linear
algebra. With MLog, users can succinctly specify not
only simple models such as SVM (in just two lines), but
also sophisticated deep learning models that are not
supported by existing in-database analytics systems
(e.g., MADlib, PAL, and SciDB), as a series of cascaded
TViews. Given the declarative nature of MLog, we
further demonstrate how query/program optimization
techniques can be leveraged to translate MLog programs
into native TensorFlow programs. The performance of the
automatically generated TensorFlow programs is
comparable to that of hand-optimized ones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Demiralp:2017:FRV,
author = "{\c{C}}agatay Demiralp and Peter J. Haas and
Srinivasan Parthasarathy and Tejaswini Pedapati",
title = "{Foresight}: recommending visual insights",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1937--1940",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137813",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Current tools for exploratory data analysis (EDA)
require users to manually select data attributes,
statistical computations and visual encodings. This can
be daunting for large-scale, complex data. We introduce
Foresight, a system that helps the user rapidly
discover visual insights from large high-dimensional
datasets. Formally, an ``insight'' is a strong
manifestation of a statistical property of the data,
e.g., high correlation between two attributes, high
skewness or concentration about the mean of a single
attribute, a strong clustering of values, and so on.
For each insight type, Foresight initially presents
visualizations of the top k instances in the data,
based on an appropriate ranking metric. The user can
then look at ``nearby'' insights by issuing ``insight
queries'' containing constraints on insight strengths
and data attributes. Thus the user can directly explore
the space of insights, rather than the space of data
dimensions and visual encodings as in other visual
recommender systems. Foresight also provides ``global''
views of insight space to help orient the user and
ensure a thorough exploration process. Furthermore,
Foresight facilitates interactive exploration of large
datasets through fast, approximate sketching.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jacobs:2017:BDT,
author = "Steven Jacobs and Md Yusuf Sarwar Uddin and Michael
Carey and Vagelis Hristidis and Vassilis J. Tsotras and
N. Venkatasubramanian and Yao Wu and Syed Safir and
Purvi Kaul and Xikui Wang and Mohiuddin Abdul Qader and
Yawei Li",
title = "A {BAD} demonstration: towards {Big Active Data}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1941--1944",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137814",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nearly all of today's Big Data systems are passive in
nature. We demonstrate our Big Active Data (``BAD'')
system, a scalable system that continuously and
reliably captures Big Data and facilitates the timely
and automatic delivery of new information to a large
population of interested users as well as supporting
analyses of historical information. We built our BAD
project by extending an existing scalable, open-source
BDMS (AsterixDB [1]) in this active direction. In this
demonstration, we allow our audience to participate in
an emergency notification application built on top of
our BAD platform, and highlight its capabilities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hassan:2017:CFE,
author = "Naeemul Hassan and Gensheng Zhang and Fatma Arslan and
Josue Caraballo and Damian Jimenez and Siddhant Gawsane
and Shohedul Hasan and Minumol Joseph and Aaditya
Kulkarni and Anil Kumar Nayak and Vikas Sable and
Chengkai Li and Mark Tremayne",
title = "{ClaimBuster}: the first-ever end-to-end fact-checking
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1945--1948",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137815",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Our society is struggling with an unprecedented amount
of falsehoods, hyperboles, and half-truths. Politicians
and organizations repeatedly make the same false
claims. Fake news floods the cyberspace and even
allegedly influenced the 2016 election. In fighting
false information, the number of active fact-checking
organizations has grown from 44 in 2014 to 114 in early
2017.$^1$ Fact-checkers vet claims by investigating
relevant data and documents and publish their verdicts.
For instance, PolitiFact.com, one of the earliest and
most popular fact-checking projects, gives factual
claims truthfulness ratings such as True, Mostly True,
Half True, Mostly False, False, and even ``Pants on
Fire''. In the U.S., the election year made
fact-checking a part of household terminology. For
example, during the first presidential debate on
September 26, 2016, NPR.org's live fact-checking
website drew 7.4 million page views and delivered its
biggest traffic day ever.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deep:2017:QDR,
author = "Shaleen Deep and Paraschos Koutris and Yash
Bidasaria",
title = "{QIRANA} demonstration: real time scalable query
pricing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1949--1952",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137816",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The last decade has seen a deluge in data collection
and dissemination across a broad range of areas. This
phenomena has led to creation of online data markets
where entities engage in sale and purchase of data. In
this scenario, the key challenge for the data market
platform is to ensure that it allows real time,
scalable, arbitrage-free pricing of user queries. At
the same time, the platform needs to be flexible enough
for sellers in order to customize the setup of the data
to be sold. In this paper, we describe the
demonstration of Qirana, a lightweight framework that
implements query-based pricing at scale. The framework
acts as a layer between the end users (buyers and
sellers) and the database. Qirana's demonstration
features that we highlight are: (i) allows sellers to
choose from a variety of pricing functions based on
their requirements and incorporates price points as a
guide for query pricing; (ii) helps the seller set
parameters by mocking workloads; (iii) buyers engage
with the platform by directly asking queries and track
their budget per dataset. We demonstrate the tunable
parameters of our framework over a real-world dataset,
illustrating the promise of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khan:2017:DDT,
author = "Meraj Khan and Larry Xu and Arnab Nandi and Joseph M.
Hellerstein",
title = "{DataTweener}: a demonstration of a tweening engine
for incremental visualization of data transforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1953--1956",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137817",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the development and advancement of new data
interaction modalities, data exploration and analysis
has become a highly interactive process situating the
user in a session of successive queries. With rapidly
changing results, it becomes difficult for the end user
to fully comprehend transformations, especially the
transforms corresponding to complex queries. We
introduce ``data tweening'' as an informative way of
visualizing structural data transforms, presenting the
users with a series of incremental visual
representations of a resultset transformation. We
present transformations as ordered sequences of basic
structural transforms and visual cues. The sequences
are generated using an automated framework which
utilizes differences between the consecutive resultsets
and queries in a query session. We evaluate the
effectiveness of tweening as a visualization method
through a user study.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Salimi:2017:ZCI,
author = "Babak Salimi and Corey Cole and Dan R. K. Ports and
Dan Suciu",
title = "{ZaliQL}: causal inference from observational data at
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1957--1960",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137818",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Causal inference from observational data is a subject
of active research and development in statistics and
computer science. Many statistical software packages
have been developed for this purpose. However, these
toolkits do not scale to large datasets. We propose and
demonstrate ZaliQL: a SQL-based framework for drawing
causal inference from observational data. ZaliQL
supports the state-of-the-art methods for causal
inference and runs at scale within PostgreSQL database
system. In addition, we built a visual interface to
wrap around ZaliQL. In our demonstration, we will use
this GUI to show a live investigation of the causal
effect of different weather conditions on flight
delays.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Alarabi:2017:DSH,
author = "Louai Alarabi and Mohamed F. Mokbel",
title = "A demonstration of {ST-Hadoop}: a {MapReduce}
framework for big spatio-temporal data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1961--1964",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137819",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This demo presents ST-Hadoop; the first full-fledged
open-source MapReduce framework with a native support
for spatio-temporal data. ST-Hadoop injects
spatio-temporal awareness in the Hadoop base code,
which results in achieving order(s) of magnitude better
performance than Hadoop and SpatialHadoop when dealing
with spatio-temporal data and queries. The key idea
behind ST-Hadoop is its ability in indexing
spatio-temporal data within Hadoop Distributed File
System (HDFS). A real system prototype of ST-Hadoop,
running on a local cluster of 24 machines, is
demonstrated with two big-spatio-temporal datasets of
Twitter and NYC Taxi data, each of around one billion
records.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bharadwaj:2017:CIL,
author = "S. Bharadwaj and L. Chiticariu and M. Danilevsky and
S. Dhingra and S. Divekar and A. Carreno-Fuentes and H.
Gupta and N. Gupta and S.-D. Han and M. Hern{\'a}ndez
and H. Ho and P. Jain and S. Joshi and H. Karanam and
S. Krishnan and R. Krishnamurthy and Y. Li and S.
Manivannan and A. Mittal and F. {\"O}zcan and A. Quamar
and P. Raman and D. Saha and K. Sankaranarayanan and J.
Sen and P. Sen and S. Vaithyanathan and M. Vasa and H.
Wang and H. Zhu",
title = "Creation and interaction with large-scale
domain-specific knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1965--1968",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137820",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to create and interact with large-scale
domain-specific knowledge bases from
unstructured/semi-structured data is the foundation for
many industry-focused cognitive systems. We will
demonstrate the Content Services system that provides
cloud services for creating and querying high-quality
domain-specific knowledge bases by analyzing and
integrating multiple (un/semi)structured content
sources. We will showcase an instantiation of the
system for a financial domain. We will also demonstrate
both cross-lingual natural language queries and
programmatic API calls for interacting with this
knowledge base.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jonathan:2017:DSC,
author = "Christopher Jonathan and Mohamed F. Mokbel",
title = "A demonstration of {Stella}: a crowdsourcing-based
geotagging framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1969--1972",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137821",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper demonstrates Stella; an efficient
crowdsourcing-based geotagging framework for any types
of objects. In this demonstration, we showcase the
effectiveness of Stella in geotagging images via two
different scenarios: (1) we provide a graphical
interface to show the process of a geotagging process
that has been done by using Amazon Mechanical Turk,
(2) we seek help from the conference attendees to
propose an image to be geotagged or to help us geotag
an image by using our application during the
demonstration period. At the end of the demonstration
period, we will show the geotagging result.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Moll:2017:EBV,
author = "Oscar Moll and Aaron Zalewski and Sudeep Pillai and
Sam Madden and Michael Stonebraker and Vijay
Gadepally",
title = "Exploring big volume sensor data with {Vroom}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1973--1976",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137822",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "State of the art sensors within a single autonomous
vehicle (AV) can produce video and LIDAR data at rates
greater than 30 GB/hour. Unsurprisingly, even small AV
research teams can accumulate tens of terabytes of
sensor data from multiple trips and multiple vehicles.
AV practitioners would like to extract information
about specific locations or specific situations for
further study, but are often unable to. Queries over AV
sensor data are different from generic analytics or
spatial queries because they demand reasoning about
fields of view as well as heavy computation to extract
features from scenes. In this article and demo we
present Vroom, a system for ad-hoc queries over AV
sensor databases. Vroom combines domain specific
properties of AV datasets with selective indexing and
multi-query optimization to address challenges posed by
AV sensor data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mottin:2017:NTE,
author = "Davide Mottin and Matteo Lissandrini and Yannis
Velegrakis and Themis Palpanas",
title = "New trends on exploratory methods for data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1977--1980",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137824",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data usually comes in a plethora of formats and
dimensions, rendering the exploration and information
extraction processes cumbersome. Thus, being able to
cast exploratory queries in the data with the intent of
having an immediate glimpse on some of the data
properties is becoming crucial. An exploratory query
should be simple enough to avoid complicated declarative
languages (such as SQL) and mechanisms, and at the same
time retain the flexibility and expressiveness of such
languages. Recently, we have witnessed a rediscovery of
the so called example-based methods, in which the user,
or the analyst circumvent query languages by using
examples as input. An example is a representative of
the intended results, or in other words, an item from
the result set. Example-based methods exploit inherent
characteristics of the data to infer the results that
the user has in mind, but may not be able to (easily)
express. They can be useful both in cases where a user
is looking for information in an unfamiliar dataset, or
simply when she is exploring the data without knowing
what to find in there. In this tutorial, we present an
excursus over the main methods for exploratory
analysis, with a particular focus on example-based
methods. We show how different data types require
different techniques, and present algorithms that are
specifically designed for relational, textual, and
graph data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Khan:2017:SSD,
author = "Arijit Khan and Sourav S. Bhowmick and Francesco
Bonchi",
title = "Summarizing static and dynamic big graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1981--1984",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137825",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale, highly-interconnected networks pervade
our society and the natural world around us, including
the World Wide Web, social networks, knowledge graphs,
genome and scientific databases, medical and government
records. The massive scale of graph data often
surpasses the available computation and storage
resources. Besides, users get overwhelmed by the
daunting task of understanding and using such graphs
due to their sheer volume and complexity. Hence, there
is a critical need to summarize large graphs into
concise forms that can be more easily visualized,
processed, and managed. Graph summarization has indeed
attracted a lot of interests from various research
communities, such as sociology, physics, chemistry,
bioinformatics, and computer science. Different ways of
summarizing graphs have been invented that are often
complementary to each other. In this tutorial, we
discuss algorithmic advances on graph summarization in
the context of both classical (e.g., static graphs) and
emerging (e.g., dynamic and stream graphs)
applications. We emphasize the current challenges and
highlight some future research directions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mouratidis:2017:GAT,
author = "Kyriakos Mouratidis",
title = "Geometric approaches for top-$k$ queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1985--1987",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137826",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Top-$k$ processing is a well-studied problem with
numerous applications that is becoming increasingly
relevant with the growing availability of
recommendation systems and decision making software.
The objective of this tutorial is twofold. First, we
will delve into the geometric aspects of top-$k$
processing. Second, we will cover complementary
features to top-$k$ queries, with strong practical
relevance and important applications, that have a
computational geometric nature. The tutorial will close
with insights in the effect of dimensionality on the
meaningfulness of top-$k$ queries, and interesting
similarities to nearest neighbor search.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tong:2017:SCC,
author = "Yongxin Tong and Lei Chen and Cyrus Shahabi",
title = "Spatial crowdsourcing: challenges, techniques, and
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1988--1991",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137827",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Crowdsourcing is a new computing paradigm where humans
are actively enrolled to participate in the procedure
of computing, especially for tasks that are
intrinsically easier for humans than for computers. The
popularity of mobile computing and sharing economy has
extended conventional web-based crowdsourcing to
spatial crowdsourcing (SC), where spatial data such as
location, mobility and the associated contextual
information, plays a central role. In fact, spatial
crowdsourcing has stimulated a series of recent
industrial successes including Citizen Sensing (Waze),
P2P ride-sharing (Uber) and Real-time Online-To-Offline
(O2O) services (Instacart and Postmates). In this
tutorial, we review the paradigm shift from web-based
crowdsourcing to spatial crowdsourcing. We dive deep
into the challenges and techniques brought by the
unique spatio-temporal characteristics of spatial
crowdsourcing. Particularly, we survey new designs in
task assignment, quality control, incentive mechanism
design and privacy protection on spatial crowdsourcing
platforms, as well as the new trend to incorporate
crowdsourcing to enhance existing spatial data
processing techniques. We also discuss case studies of
representative spatial crowdsourcing systems and raise
open questions and current challenges for the audience
to easily comprehend the tutorial and to advance this
important research area.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eldawy:2017:EBS,
author = "Ahmed Eldawy and Mohamed F. Mokbel",
title = "The era of big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1992--1995",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137828",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this tutorial, we present the recent work in the
database community for handling Big Spatial Data. This
topic became very hot due to the recent explosion in
the amount of spatial data generated by smart phones,
satellites and medical devices, among others. This
tutorial goes beyond the use of existing systems as-is
(e.g., Hadoop, Spark or Impala), and digs deep into the
core components of big systems (e.g., indexing and
query processing) to describe how they are designed to
handle big spatial data. During this 90-minute
tutorial, we review the state-of-the-art work in the
area of Big Spatial Data while classifying the existing
research efforts according to the implementation
approach, underlying architecture, and system
components. In addition, we provide case studies of
full-fledged systems and applications that handle Big
Spatial Data which allows the audience to better
comprehend the whole tutorial.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Giatrakos:2017:CER,
author = "Nikos Giatrakos and Alexander Artikis and Antonios
Deligiannakis and Minos Garofalakis",
title = "Complex event recognition in the big data era",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "1996--1999",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137829",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The concept of event processing is established as a
generic computational paradigm in various application
fields, ranging from data processing in Web
environments, over maritime and transport, to finance
and medicine. Events report on state changes of a
system and its environment. Complex Event Recognition
(CER) in turn, refers to the identification of
complex/composite events of interest, which are
collections of simple events that satisfy some pattern,
thereby providing the opportunity for reactive and
proactive measures. Examples include the recognition of
attacks in computer network nodes, human activities on
video content, emerging stories and trends on the
Social Web, traffic and transport incidents in smart
cities, fraud in electronic marketplaces, cardiac
arrhythmias, and epidemic spread. In each scenario, CER
allows to make sense of Big event Data streams and
react accordingly. The goal of this tutorial is to
provide a step-by-step guide for realizing CER in the
Big Data era. To do so, it elaborates on major
challenges and describes algorithmic toolkits for
optimized manipulation of event streams characterized
by high volume, velocity and/or lack of veracity,
placing emphasis on distributed CER over potentially
heterogeneous (data variety) event sources. Finally, we
highlight future research directions in the field.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mohan:2017:TBD,
author = "C. Mohan",
title = "Tutorial: blockchains and databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2000--2001",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137830",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the last few years, blockchain (also known as
distributed ledger), the underlying technology of the
permissionless or public Bitcoin network, has become
very popular for use in private or permissioned
environments. Computer companies like IBM and
Microsoft, and many key players in different vertical
industry segments have recognized the utility of
blockchains for securely managing assets
(physical/digital) other than cryptocurrencies. IBM did
some pioneering work by architecting and implementing a
private blockchain system, and then open sourcing it.
That system, which has since then been named Fabric, is
being enhanced via the Hyperledger Consortium set up
under the auspices of the Linux Foundation. Other
efforts in the industry include Enterprise Ethereum, R3
Corda and BigchainDB.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zakhary:2017:CWS,
author = "Victor Zakhary and Divyakant Agrawal and Amr {El
Abbadi}",
title = "Caching at the web scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2002--2005",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137831",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's web applications and social networks are
serving billions of users around the globe. These users
generate billions of key lookups and millions of data
object updates per second. A single user's social
network page load requires hundreds of key lookups.
This scale creates many design challenges for the
underlying storage systems. First, these systems have
to serve user requests with low latency. Any increase
in the request latency leads to a decrease in user
interest. Second, storage systems have to be highly
available. Failures should be handled seamlessly
without affecting user requests. Third, users consume
an order of magnitude more data than they produce.
Therefore, storage systems have to be optimized for
read-intensive workloads. To address these challenges,
distributed in-memory caching services have been widely
deployed on top of persistent storage. In this
tutorial, we survey the recent developments in
distributed caching services. We present the
algorithmic and architectural efforts behind these
systems focusing on the challenges in addition to open
research questions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:HLD,
author = "Guoliang Li",
title = "Human-in-the-loop data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2006--2017",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137833",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data integration aims to integrate data in different
sources and provide users with a unified view. However,
data integration cannot be completely addressed by
purely automated methods. We propose a hybrid
human-machine data integration framework that harnesses
human ability to address this problem, and apply it
initially to the problem of entity matching. The
framework first uses rule-based algorithms to identify
possible matching pairs and then utilizes the crowd to
refine these candidate pairs in order to compute actual
matching pairs. In the first step, we propose
similarity-based rules and knowledge-based rules to
obtain some candidate matching pairs, and develop
effective algorithms to learn these rules based on some
given positive and negative examples. We build a
distributed in-memory system DIMA to efficiently apply
these rules. In the second step, we propose a
selection-inference-refine framework that uses the
crowd to verify the candidate pairs. We first select
some ``beneficial'' tasks to ask the crowd and then use
transitivity and partial order to infer the answers of
unasked tasks based on the crowdsourcing results of the
asked tasks. Next we refine the inferred answers with
high uncertainty due to the disagreement from the
crowd. We develop a crowd-powered database system CDB
and deploy it on real crowdsourcing platforms. CDB
allows users to utilize a SQL-like language for
processing crowd-based queries. Lastly, we provide
emerging challenges in human-in-the-loop data
integration.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lehner:2017:DCU,
author = "Wolfgang Lehner",
title = "The data center under your desk: how disruptive is
modern hardware for {DB} system design?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2018--2019",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137834",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While we are already used to see more than 1,000 cores
within a single machine, the next processing platforms
for database engines will be heterogeneous with
built-in GPU-style processors as well as specialized
FPGAs or chips with domain-specific instruction sets.
Moreover, the traditional volatile as well as the
upcoming non-volatile RAM with capacities in the 100s
of TBytes per machine will provide great opportunities
for storage engines but also call for radical changes
on the architecture of such systems. Finally, the
emergence of economically affordable,
high-speed/low-latency interconnects as a basis for
rack-scale computing is questioning long-standing
folklore algorithmic assumptions but will certainly
play an important role in the big picture of building
modern data management platforms. In this talk, we will
try to classify and review existing approaches from a
performance, robustness, as well as energy efficiency
perspective and pinpoint interesting starting points
for further research activities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Milo:2017:SMM,
author = "Tova Milo",
title = "7 secrets that my mother didn't tell me",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2020--2020",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137835",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "What does it take to be a good researcher? And, is it
different when you are a woman? These are questions
that many of us are wondering about throughout our
career. Being honored with a VLDB Women in Database
Research Award, I would like to share with you in this
talk some of the secrets to successful research that I
have learned over the years. These secrets highlight
some of the fundamental research directions that I have
taken. No less importantly, they explain how I
successfully got to work on them, both personally and
professionally.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lv:2017:IPL,
author = "Qin Lv and William Josephson and Zhe Wang and Moses
Charikar and Kai Li",
title = "Intelligent probing for locality sensitive hashing:
multi-probe {LSH} and beyond",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "12",
pages = "2021--2024",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3137765.3137836",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:19 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The past decade has been marked by the (continued)
explosion of diverse data content and the fast
development of intelligent data analytics techniques.
One problem we identified in the mid-2000s was
similarity search of feature-rich data. The challenge
here was achieving both high accuracy and high
efficiency in high-dimensional spaces. Locality
sensitive hashing (LSH), which uses certain random
space partitions and hash table lookups to find
approximate nearest neighbors, was a promising approach
with theoretical guarantees. But LSH alone was
insufficient since a large number of hash tables were
required to achieve good search quality. Building on an
idea of Panigrahy, our multi-probe LSH method
introduced the idea of intelligent probing. Given a
query object, we strategically probe its neighboring
hash buckets (in a query-dependent fashion) by
calculating the statistical probabilities of similar
objects falling into each bucket. Such intelligent
probing can significantly reduce the number of hash
tables while achieving high quality. In this paper, we
revisit the problem motivation, the challenges, the key
design considerations of multi-probe LSH, as well as
discuss recent developments in this space and some
questions for further research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qin:2017:SRB,
author = "Dai Qin and Angela Demke Brown and Ashvin Goel",
title = "Scalable replay-based replication for fast databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2025--2036",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Primary-backup replication is commonly used for
providing fault tolerance in databases. It is performed
by replaying the database recovery log on a backup
server. Such a scheme raises several challenges for
modern, high-throughput multi-core databases. It is
hard to replay the recovery log concurrently, and so
the backup can become the bottleneck. Moreover, with
the high transaction rates on the primary, the log
transfer can cause network bottlenecks. Both these
bottlenecks can significantly slow the primary
database. In this paper, we propose using record-replay
for replicating fast databases. Our design enables
replay to be performed scalably and concurrently, so
that the backup performance scales with the primary
performance. At the same time, our approach requires
only 15--20\% of the network bandwidth required by
traditional logging, reducing network infrastructure
costs significantly.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2017:SSE,
author = "Kai Ren and Qing Zheng and Joy Arulraj and Garth
Gibson",
title = "{SlimDB}: a space-efficient key--value storage engine
for semi-sorted data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2037--2048",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern key--value stores often use write-optimized
indexes and compact in-memory indexes to speed up read
and write performance. One popular write-optimized
index is the Log-structured merge-tree (LSM-tree) which
provides indexed access to write-intensive data. It has
been increasingly used as a storage backbone for many
services, including file system metadata management,
graph processing engines, and machine learning feature
storage engines. Existing LSM-tree implementations
often exhibit high write amplifications caused by
compaction, and lack optimizations to maximize read
performance on solid-state disks. The goal of this
paper is to explore techniques that leverage common
workload characteristics shared by many systems using
key--value stores to reduce the read/write
amplification overhead typically associated with
general-purpose LSM-tree implementations. Our
experiments show that by applying these design
techniques, our new implementation of a key--value
store, SlimDB, can be two to three times faster, use
less memory to cache metadata indices, and show lower
tail latency in read operations compared to popular
LSM-tree implementations such as LevelDB and RocksDB.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abdelaziz:2017:SEC,
author = "Ibrahim Abdelaziz and Razen Harbi and Zuhair Khayyat
and Panos Kalnis",
title = "A survey and experimental comparison of distributed
{SPARQL} engines for very large {RDF} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2049--2060",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed SPARQL engines promise to support very
large RDF datasets by utilizing shared-nothing computer
clusters. Some are based on distributed frameworks such
as MapReduce; others implement proprietary distributed
processing; and some rely on expensive preprocessing
for data partitioning. These systems exhibit a variety
of trade-offs that are not well-understood, due to the
lack of any comprehensive quantitative and qualitative
evaluation. In this paper, we present a survey of 22
state-of-the-art systems that cover the entire spectrum
of distributed RDF data processing and categorize them
by several characteristics. Then, we select 12
representative systems and perform extensive
experimental evaluation with respect to preprocessing
cost, query performance, scalability and workload
adaptability, using a variety of synthetic and real
large datasets with up to 4.3 billion triples. Our
results provide valuable insights for practitioners to
understand the trade-offs for their usage scenarios.
Finally, we publish online our evaluation framework,
including all datasets and workloads, for researchers
to compare their novel systems against the existing
ones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kunft:2017:BEM,
author = "Andreas Kunft and Asterios Katsifodimos and Sebastian
Schelter and Tilmann Rabl and Volker Markl",
title = "{Blockjoin}: efficient matrix partitioning through
joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2061--2072",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Linear algebra operations are at the core of many
Machine Learning (ML) programs. At the same time, a
considerable amount of the effort for solving data
analytics problems is spent in data preparation. As a
result, end-to-end ML pipelines often consist of (i)
relational operators used for joining the input data,
(ii) user defined functions used for feature extraction
and vectorization, and (iii) linear algebra operators
used for model training and cross-validation. Often,
these pipelines need to scale out to large datasets. In
this case, these pipelines are usually implemented on
top of dataflow engines like Hadoop, Spark, or Flink.
These dataflow engines implement relational operators
on row-partitioned datasets. However, efficient linear
algebra operators use block-partitioned matrices. As a
result, pipelines combining both kinds of operators
require rather expensive changes to the physical
representation, in particular re-partitioning steps. In
this paper, we investigate the potential of reducing
shuffling costs by fusing relational and linear algebra
operations into specialized physical operators. We
present BlockJoin, a distributed join algorithm which
directly produces block-partitioned results. To
minimize shuffling costs, BlockJoin applies database
techniques known from columnar processing, such as
index-joins and late materialization, in the context of
parallel dataflow engines. Our experimental evaluation
shows speedups up to 6$ \times $ and the skew
resistance of BlockJoin compared to state-of-the-art
pipelines implemented in Spark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Choi:2017:EMR,
author = "Dong-Wan Choi and Jian Pei and Thomas Heinis",
title = "Efficient mining of regional movement patterns in
semantic trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2073--2084",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Semantic trajectory pattern mining is becoming more
and more important with the rapidly growing volumes of
semantically rich trajectory data. Extracting
sequential patterns in semantic trajectories plays a
key role in understanding semantic behaviour of human
movement, which can widely be used in many applications
such as location-based advertising, road capacity
optimisation, and urban planning. However, most of
existing works on semantic trajectory pattern mining
focus on the entire spatial area, leading to missing
some locally significant patterns within a region.
Based on this motivation, this paper studies a regional
semantic trajectory pattern mining problem, aiming at
identifying all the regional sequential patterns in
semantic trajectories. Specifically, we propose a new
density scheme to quantify the frequency of a
particular pattern in space, and thereby formulate a
new mining problem of finding all the regions in which
such a pattern densely occurs. For the proposed
problem, we develop an efficient mining algorithm,
called RegMiner (Regional Semantic Trajectory Pattern
Miner), which effectively reveals movement patterns
that are locally frequent in such a region but not
necessarily dominant in the entire space. Our empirical
study using real trajectory data shows that RegMiner
finds many interesting local patterns that are hard to
find by a state-of-the-art global pattern mining
scheme, and it also runs several orders of magnitude
faster than the global pattern mining algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kiefer:2017:EJS,
author = "Martin Kiefer and Max Heimel and Sebastian Bre{\ss}
and Volker Markl",
title = "Estimating join selectivities using
bandwidth-optimized kernel density models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "10",
number = "13",
pages = "2085--2096",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:20 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Accurately predicting the cardinality of intermediate
plan operations is an essential part of any modern
relational query optimizer. The accuracy of said
estimates has a strong and direct impact on the quality
of the generated plans, and incorrect estimates can
have a negative impact on query performance. One of the
biggest challenges in this field is to predict the
result size of join operations. Kernel Density
Estimation (KDE) is a statistical method to estimate
multivariate probability distributions from a data
sample. Previously, we introduced a modern, self-tuning
selectivity estimator for range scans based on KDE that
out-performs state-of-the-art multidimensional
histograms and is efficient to evaluate on graphics
cards. In this paper, we extend these
bandwidth-optimized KDE models to estimate the result
size of single and multiple joins. In particular, we
propose two approaches: (1) Building a KDE model from a
sample drawn from the join result. (2) Efficiently
combining the information from base table KDE models.
We evaluated our KDE-based join estimators on a variety
of synthetic and real-world datasets, demonstrating
that they are superior to state-of-the art join
estimators based on sketching or sampling.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Menon:2017:ROF,
author = "Prashanth Menon and Todd C. Mowry and Andrew Pavlo",
title = "Relaxed operator fusion for in-memory databases:
making compilation, vectorization, and prefetching work
together at last",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "1--13",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In-memory database management systems (DBMSs) are a
key component of modern on-line analytic processing
(OLAP) applications, since they provide low-latency
access to large volumes of data. Because disk accesses
are no longer the principal bottleneck in such systems,
the focus in designing query execution engines has
shifted to optimizing CPU performance. Recent systems
have revived an older technique of using just-in-time
(JIT) compilation to execute queries as native code
instead of interpreting a plan. The state-of-the-art in
query compilation is to fuse operators together in a
query plan to minimize materialization overhead by
passing tuples efficiently between operators. Our
empirical analysis shows, however, that more tactful
materialization yields better performance. We present a
query processing model called ``relaxed operator
fusion'' that allows the DBMS to introduce staging
points in the query plan where intermediate results are
temporarily materialized. This allows the DBMS to take
advantage of inter-tuple parallelism inherent in the
plan using a combination of prefetching and SIMD
vectorization to support faster query execution on data
sets that exceed the size of CPU-level caches. Our
evaluation shows that our approach reduces the
execution time of OLAP queries by up to 2.2$ \times $
and achieves up to 1.8$ \times $ better performance
compared to other in-memory DBMSs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2017:PSS,
author = "Yu Liu and Bolong Zheng and Xiaodong He and Zhewei Wei
and Xiaokui Xiao and Kai Zheng and Jiaheng Lu",
title = "{Probesim}: scalable single-source and top-$k$
{SimRank} computations on dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "14--26",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Single-source and top-k SimRank queries are two
important types of similarity search in graphs with
numerous applications in web mining, social network
analysis, spam detection, etc. A plethora of techniques
have been proposed for these two types of queries, but
very few can efficiently support similarity search over
large dynamic graphs, due to either significant
preprocessing time or large space overheads. This paper
presents ProbeSim, an index-free algorithm for
single-source and top-k SimRank queries that provides
a non-trivial theoretical guarantee in the absolute
error of query results. ProbeSim estimates SimRank
similarities without precomputing any indexing
structures, and thus can naturally support real-time
SimRank queries on dynamic graphs. Besides the
theoretical guarantee, ProbeSim also offers satisfying
practical efficiency and effectiveness due to
non-trivial optimizations. We conduct extensive
experiments on a number of benchmark datasets, which
demonstrate that our solutions outperform the existing
methods in terms of efficiency and effectiveness.
Notably, our experiments include the first empirical
study that evaluates the effectiveness of SimRank
algorithms on graphs with billion edges, using the idea
of pooling.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guagliardo:2017:FSS,
author = "Paolo Guagliardo and Leonid Libkin",
title = "A formal semantics of {SQL} queries, its validation,
and applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "27--39",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While formal semantics of theoretical languages
underlying SQL have been provided in the past, they all
made simplifying assumptions ranging from changes in
the syntax to omitting bag semantics and nulls. This
situation is reminiscent of what happens in the field
of programming languages, where semantics of formal
calculi underlying the main features of languages are
abundant, but formal semantics of real languages that
people use are few and far between. We consider the
basic class of SQL queries --- essentially
SELECT-FROM-WHERE queries with subqueries, set/bag
operations, and nulls --- and define a formal semantics
for it, without any departures from the real language.
This fragment already requires decisions related to the
data model and handling variable names that are
normally disregarded by simplified semantics. To
justify our choice of the semantics, we validate it
experimentally on a large number of randomly generated
queries and databases. We give two applications of the
semantics. One is the first formal proof of the
equivalence of basic SQL and relational algebra that
extends to bag semantics and nulls. The other
application looks at the three-valued logic employed by
SQL, which is universally assumed to be necessary to
handle nulls. We prove however that this is not so, as
three-valued logic does not add expressive power: every
SQL query in our fragment can be evaluated under the
usual two-valued Boolean semantics of conditions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2017:EHS,
author = "Jinhyun Kim and Jun-Ki Min and Kyuseok Shim",
title = "Efficient {Haar$^+$} synopsis construction for the
maximum absolute error measure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "40--52",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Several wavelet synopsis construction algorithms were
previously proposed based on dynamic programming for
unrestricted Haar wavelet synopses as well as Haar$^+$
synopses. However, they find an optimal synopsis for
every incoming value in each node of a coefficient
tree, even if different incoming values share an
identical optimal synopsis. To alleviate the
limitation, we present novel algorithms, which keep
only a minimal set of the distinct optimal synopses in
each node of the tree, for the error-bounded synopsis
problem. Furthermore, we propose the methods to
restrict coefficient values to be considered to compute
the optimal synopses in each node. In addition, by
partitioning all optimal synopses in each node into a
set of groups, such that every group can be represented
by a compact representation, we significantly improve
the performance of the proposed algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tao:2017:ASJ,
author = "Wenbo Tao and Dong Deng and Michael Stonebraker",
title = "Approximate string joins with abbreviations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "53--65",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "String joins have wide applications in data
integration and cleaning. The inconsistency of data
caused by data errors, term variations and missing
values has led to the need for approximate string joins
(ASJ). In this paper, we study ASJ with abbreviations,
which are a frequent type of term variation. Although
prior works have studied ASJ given a user-inputted
dictionary of synonym rules, they have three common
limitations. First, they suffer from low precision in
the presence of abbreviations having multiple full
forms. Second, their join algorithms are not scalable
due to the exponential time complexity. Third, the
dictionary may not exist since abbreviations are highly
domain-dependent. We propose an end-to-end workflow to
address these limitations. There are three main
components in the workflow: (1) a new similarity
measure taking abbreviations into account that can
handle abbreviations having multiple full forms, (2) an
efficient join algorithm following the
filter-verification framework and (3) an unsupervised
approach to learn a dictionary of abbreviation rules
from input strings. We evaluate our workflow on four
real-world datasets and show that our workflow outputs
accurate join results, scales well as input size grows
and greatly outperforms state-of-the-art approaches in
both accuracy and efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nguyen:2017:QDF,
author = "Dat Ba Nguyen and Abdalghani Abujabal and Nam Khanh
Tran and Martin Theobald and Gerhard Weikum",
title = "Query-driven on-the-fly knowledge base construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "66--79",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's openly available knowledge bases, such as
DBpedia, Yago, Wikidata or Freebase, capture billions
of facts about the world's entities. However, even the
largest among these (i) are still limited in up-to-date
coverage of what happens in the real world, and (ii)
miss out on many relevant predicates that precisely
capture the wide variety of relationships among
entities. To overcome both of these limitations, we
propose a novel approach to build on-the-fly knowledge
bases in a query-driven manner. Our system, called
QKBfly, supports analysts and journalists as well as
question answering on emerging topics, by dynamically
acquiring relevant facts as timely and comprehensively
as possible. QKBfly is based on a semantic-graph
representation of sentences, by which we perform three
key IE tasks, namely named-entity disambiguation,
co-reference resolution and relation extraction, in a
light-weight and integrated manner. In contrast to Open
IE, our output is canonicalized. In contrast to
traditional IE, we capture more predicates, including
ternary and higher-arity ones. Our experiments
demonstrate that QKBfly can build high-quality,
on-the-fly knowledge bases that can readily be
deployed, e.g., for the task of ad-hoc question
answering.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Poppe:2017:GGB,
author = "Olga Poppe and Chuan Lei and Elke A. Rundensteiner and
David Maier",
title = "{GRETA}: graph-based real-time event trend
aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "80--92",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Streaming applications from algorithmic trading to
traffic management deploy Kleene patterns to detect and
aggregate arbitrarily-long event sequences, called
event trends. State-of-the-art systems process such
queries in two steps. Namely, they first construct all
trends and then aggregate them. Due to the exponential
costs of trend construction, this two-step approach
suffers from both long delays and high memory costs.
To overcome these limitations, we propose the
Graph-based Real-time Event Trend Aggregation (GRETA)
approach that dynamically computes event trend
aggregation without first constructing these trends. We
define the GRETA graph to compactly encode all trends.
Our GRETA runtime incrementally maintains the graph,
while dynamically propagating aggregates along its
edges. Based on the graph, the final aggregate is
incrementally updated and instantaneously returned at
the end of each query window. Our GRETA runtime
represents a win-win solution, reducing both the time
complexity from exponential to quadratic and the space
complexity from exponential to linear in the number of
events. Our experiments demonstrate that GRETA achieves
up to four orders of magnitude speed-up and up to
50-fold memory reduction compared to the
state-of-the-art two-step approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2017:PPP,
author = "Wentian Guo and Yuchen Li and Mo Sha and Kian-Lee
Tan",
title = "Parallel {Personalized PageRank} on dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "93--106",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Personalized PageRank (PPR) is a well-known proximity
measure in graphs. To meet the need for dynamic PPR
maintenance, recent works have proposed a local update
scheme to support incremental computation.
Nevertheless, sequential execution of the scheme is
still too slow for highspeed stream processing.
Therefore, we are motivated to design a parallel
approach for dynamic PPR computation. First, as updates
always come in batches, we devise a batch processing
method to reduce synchronization cost among every
single update and enable more parallelism for iterative
parallel execution. Our theoretical analysis shows that
the parallel approach has the same asymptotic
complexity as the sequential approach. Second, we
devise novel optimization techniques to effectively
reduce runtime overheads for parallel processes.
Experimental evaluation shows that our parallel
algorithm can achieve orders of magnitude speedups on
GPUs and multi-core CPUs compared with the
state-of-the-art sequential algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sha:2017:ADG,
author = "Mo Sha and Yuchen Li and Bingsheng He and Kian-Lee
Tan",
title = "Accelerating dynamic graph analytics on {GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "1",
pages = "107--120",
month = sep,
year = "2017",
CODEN = "????",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Oct 10 17:16:21 MDT 2017",
bibsource = "http://portal.acm.org/;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As graph analytics often involves compute-intensive
operations, GPUs have been extensively used to
accelerate the processing. However, in many
applications such as social networks, cyber security,
and fraud detection, their representative graphs evolve
frequently and one has to perform a rebuild of the
graph structure on GPUs to incorporate the updates.
Hence, rebuilding the graphs becomes the bottleneck of
processing high-speed graph streams. In this paper, we
propose a GPU-based dynamic graph storage scheme to
support existing graph algorithms easily. Furthermore,
we propose parallel update algorithms to support
efficient stream updates so that the maintained graph
is immediately available for high-speed analytic
processing on GPUs. Our extensive experiments with
three streaming applications on large-scale real and
synthetic datasets demonstrate the superior performance
of our proposed approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Appuswamy:2017:AIS,
author = "Raja Appuswamy and Angelos C. Anadiotis and Danica
Porobic and Mustafa K. Iman and Anastasia Ailamaki",
title = "Analyzing the impact of system architecture on the
scalability of {OLTP} engines for high-contention
workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "121--134",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149194",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Main-memory OLTP engines are being increasingly
deployed on multicore servers that provide abundant
thread-level parallelism. However, recent research has
shown that even the state-of-the-art OLTP engines are
unable to exploit available parallelism for high
contention workloads. While previous studies have shown
the lack of scalability of all popular concurrency
control protocols, they consider only one system
architecture---a non-partitioned, shared everything one
where transactions can be scheduled to run on any core
and can access any data or metadata stored in shared
memory. In this paper, we perform a thorough analysis
of the impact of other architectural alternatives
(Data-oriented transaction execution, Partitioned
Serial Execution, and Delegation) on scalability under
high contention scenarios. In doing so, we present
Trireme, a main-memory OLTP engine testbed that
implements four system architectures and several
popular concurrency control protocols in a single code
base. Using Trireme, we present an extensive
experimental study to understand (i) the impact of each
system architecture on overall scalability, (ii) the
interaction between system architecture and concurrency
control protocols, and (iii) the pros and cons of new
architectures that have been proposed recently to
explicitly deal with high-contention workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jung:2017:SDL,
author = "Hyungsoo Jung and Hyuck Han and Sooyong Kang",
title = "Scalable database logging for multicores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "135--148",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149195",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern databases, guaranteeing atomicity and
durability, store transaction logs in a volatile,
central log buffer and then flush the log buffer to
non-volatile storage by the write-ahead logging
principle. Buffering logs in central log store has
recently faced a severe multicore scalability problem,
and log flushing has been challenged by synchronous I/O
delay. We have designed and implemented a fast and
scalable logging method, Eleda, that can migrate a
surge of transaction logs from volatile memory to
stable storage without risking durable transaction
atomicity. Our efficient implementation of Eleda is
enabled by a highly concurrent data structure,
Grasshopper, that eliminates a multicore scalability
problem of centralized logging and enhances system
utilization in the presence of synchronous I/O delay.
We implemented Eleda and plugged it to WiredTiger and
Shore-MT by replacing their log managers. Our
evaluation showed that Eleda-based transaction systems
improve performance up to $ 71 \times $, thus showing
the applicability of Eleda.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bonifati:2017:ASL,
author = "Angela Bonifati and Wim Martens and Thomas Timm",
title = "An analytical study of large {SPARQL} query logs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "149--161",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149196",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the adoption of RDF as the data model for Linked
Data and the Semantic Web, query specification from
end-users has become more and more common in SPARQL
endpoints. In this paper, we conduct an in-depth
analytical study of the queries formulated by end-users
and harvested from large and up-to-date query logs from
a wide variety of RDF data sources. As opposed to
previous studies, ours is the first assessment on a
voluminous query corpus, spanning over several years
and covering many representative SPARQL endpoints.
Apart from the syntactical structure of the queries,
that exhibits already interesting results on this
generalized corpus, we drill deeper in the structural
characteristics related to the graph and hypergraph
representation of queries. We outline the most common
shapes of queries when visually displayed as undirected
graphs, and characterize their (hyper-)tree width.
Moreover, we analyze the evolution of queries over
time, by introducing the novel concept of a streak,
i.e., a sequence of queries that appear as subsequent
modifications of a seed query. Our study offers several
fresh insights on the already rich query features of
real SPARQL queries formulated by real users, and
brings us to draw a number of conclusions and pinpoint
future directions for SPARQL query evaluation, query
optimization, tuning, and benchmarking.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2017:ACT,
author = "Pinghui Wang and Yiyan Qi and Yu Sun and Xiangliang
Zhang and Jing Tao and Xiaohong Guan",
title = "Approximately counting triangles in large graph
streams including edge duplicates with a fixed memory
usage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "162--175",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149197",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Counting triangles in a large graph is important for
detecting network anomalies such as spam web pages and
suspicious accounts (e.g., fraudsters and advertisers)
on online social networks. However, it is challenging
to compute the number of triangles in a large graph
represented as a stream of edges with a low
computational cost when given a limited memory.
Recently, several effective sampling-based
approximation methods have been developed to solve this
problem. However, they assume the graph stream of
interest contains no duplicate edges, which does not
hold in many real-world graph streams (e.g., phone
calling networks). In this paper, we observe that these
methods exhibit a large estimation error or
computational cost even when modified to deal with
duplicate edges using deduplication techniques such as
Bloom filter and hash-based sampling. To solve this
challenge, we design a one-pass streaming algorithm for
uniformly sampling distinct edges at a high speed.
Compared to state-of-the-art algorithms, our algorithm
reduces the sampling cost per edge from $ O(\log k) $
($k$ is the maximum number of sampled edges determined
by the available memory space) to $ O(1) $ without using any
additional memory space. Based on sampled edges, we
develop a simple yet accurate method to infer the
number of triangles in the original graph stream. We
conduct extensive experiments on a variety of
real-world large graphs, and the results demonstrate
that our method is several times more accurate and
faster than state-of-the-art methods with the same
memory usage.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiao:2017:SMC,
author = "Miao Qiao and Hao Zhang and Hong Cheng",
title = "Subgraph matching: on compression and computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "176--188",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149198",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Subgraph matching finds a set I of all occurrences of
a pattern graph in a target graph. It has a wide range
of applications while suffers an expensive computation.
This efficiency issue has been studied extensively. All
existing approaches, however, turn a blind eye to the
output crisis, that is, when the system has to
materialize I as a preprocessing/intermediate/final
result or an index, the cost of the export of I
dominates the overall cost, which could be prohibitive
even for a small pattern graph. This paper studies
subgraph matching via two problems. (1) Is there an
ideal compression of I? (2) Will the compression of I
reversely boost the computation of I? For the problem
(1), we propose a technique called VCBC to compress I
to code(I) which serves effectively the same as I. For
problem (2), we propose a subgraph matching computation
framework CBF which computes code(I) instead of I to
bring down the output cost. CBF further reduces the
overall cost by reducing the intermediate results.
Extensive experiments show that the compression ratio
of VCBC can be up to $ 10^5 $ which also significantly
lowers the output cost of CBF. Extensive experiments
show the superior performance of CBF over existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Singh:2017:SEM,
author = "Rohit Singh and Venkata Vamsikrishna Meduri and Ahmed
Elmagarmid and Samuel Madden and Paolo Papotti and
Jorge-Arnulfo Quian{\'e}-Ruiz and Armando Solar-Lezama
and Nan Tang",
title = "Synthesizing entity matching rules by examples",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "189--202",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149199",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity matching (EM) is a critical part of data
integration. We study how to synthesize entity matching
rules from positive-negative matching examples. The
core of our solution is program synthesis, a powerful
tool to automatically generate rules (or programs) that
satisfy a given high-level specification, via a
predefined grammar. This grammar describes a General
Boolean Formula ( GBF ) that can include arbitrary
attribute matching predicates combined by conjunctions
($ \wedge $), disjunctions ($ \vee $) and negations ($
\neg $), and is expressive enough to model EM
problems, from capturing arbitrary attribute
combinations to handling missing attribute values. The
rules in the form of GBF are more concise than
traditional EM rules represented in Disjunctive Normal
Form ( DNF ). Consequently, they are more interpretable
than decision trees and other machine learning
algorithms that output deep trees with many branches.
We present a new synthesis algorithm that, given only
positive-negative examples as input, synthesizes EM
rules that are effective over the entire dataset.
Extensive experiments show that we outperform other
interpretable rules (e.g., decision trees with low
depth) in effectiveness, and are comparable with
non-interpretable tools (e.g., decision trees with high
depth, gradient-boosting trees, random forests and
SVM).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2017:SST,
author = "Liang He and Bin Shao and Yatao Li and Huanhuan Xia
and Yanghua Xiao and Enhong Chen and Liang Jeff Chen",
title = "{Stylus}: a strongly-typed store for serving massive
{RDF} data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "203--216",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149200",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "RDF is one of the most commonly used knowledge
representation forms. Many highly influential knowledge
bases, such as Freebase and PubChemRDF, are in RDF
format. An RDF data set is usually represented as a
collection of subject-predicate-object triples. Despite
the flexibility of RDF triples, it is challenging to
serve SPARQL queries on RDF data efficiently by
directly managing triples due to the following two
reasons. First, heavy joins on a large number of
triples are needed for query processing, resulting in a
large number of data scans and large redundant
intermediate results; Second, weakly-typed triple
representation provides suboptimal random access ---
typically with logarithmic complexity. This data access
challenge, unfortunately, cannot be easily met by a
better query optimizer as large graph processing is
extremely I/O-intensive. In this paper, we argue that
strongly-typed graph representation is the key to
high-performance RDF query processing. We propose
Stylus --- a strongly-typed store for serving massive
RDF data. Stylus exploits a strongly-typed storage
scheme to boost the performance of RDF query
processing. The storage scheme is essentially a
materialized join view on entities, it thus can
eliminate a large number of unnecessary joins on
triples. Moreover, it is equipped with a compact
representation for intermediate results and an
efficient graph-decomposition based query planner.
Experimental results on both synthetic and real-life
RDF data sets confirm that the proposed approach can
dramatically boost the performance of SPARQL query
processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ioannou:2017:HQE,
author = "Ekaterini Ioannou and Minos Garofalakis",
title = "Holistic query evaluation over information extraction
pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "217--229",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149201",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce holistic in-database query processing
over information extraction pipelines. This requires
considering the joint conditional distribution over
generic Conditional Random Fields that uses factor
graphs to encode extraction tasks. Our approach
introduces Canopy Factor Graphs, a novel probabilistic
model for effectively capturing the joint conditional
distribution given a canopy clustering of the data, and
special query operators for retrieving resolution
information. Since inference on such models is
intractable, we introduce an approximate technique for
query processing and optimizations that cut across the
integrated tasks for reducing the required processing
time. Effectiveness and scalability are verified
through an extensive experimental evaluation using real
and synthetic data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Psaropoulos:2017:ICP,
author = "Georgios Psaropoulos and Thomas Legler and Norman May
and Anastasia Ailamaki",
title = "Interleaving with coroutines: a practical approach for
robust index joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "2",
pages = "230--242",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3149193.3149202",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 30 06:16:03 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Index join performance is determined by the efficiency
of the lookup operation on the involved index. Although
database indexes are highly optimized to leverage
processor caches, main memory accesses inevitably
increase lookup runtime when the index outsizes the
last-level cache; hence, index join performance drops.
Still, robust index join performance becomes possible
with instruction stream interleaving: given a group of
lookups, we can hide cache misses in one lookup with
instructions from other lookups by switching among
their respective instruction streams upon a cache miss.
In this paper, we propose interleaving with coroutines
for any type of index join. We showcase our proposal on
SAP HANA by implementing binary search and CSB$^+$-tree
traversal for an instance of index join related to
dictionary compression. Coroutine implementations not
only perform similarly to prior interleaving
techniques, but also resemble the original code
closely, while supporting both interleaved and
non-interleaved execution. Thus, we claim that
coroutines make interleaving practical for use in real
DBMS codebases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wen:2017:ESG,
author = "Dong Wen and Lu Qin and Ying Zhang and Lijun Chang and
Xuemin Lin",
title = "Efficient structural graph clustering: an index-based
approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "243--255",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157795",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph clustering is a fundamental problem widely
experienced across many industries. The structural
graph clustering (SCAN) method obtains not only
clusters but also hubs and outliers. However, the
clustering results closely depend on two sensitive
parameters, $ \epsilon $ and $ \mu $, while the optimal
parameter setting depends on different graph properties
and various user requirements. Moreover, all existing
SCAN solutions need to scan at least the whole graph,
even if only a small number of vertices belong to
clusters. In this paper we propose an index-based
method for SCAN. Based on our index, we cluster the
graph for any $ \epsilon $ and $ \mu $ in $ O(\sum_{c
\in C} |E_C|) $ time, where $C$ is the result set
of all clusters and $ | E_C |$ is the number of edges
in a specific cluster $C$. In other words, the time
expended to compute structural clustering depends only
on the result size, not on the size of the original
graph. Our index's space complexity is bounded by $
O(m)$, where $m$ is the number of edges in the graph.
To handle dynamic graph updates, we propose algorithms
and several optimization techniques for maintaining our
index. We conduct extensive experiments to practically
evaluate the performance of all our proposed algorithms
on 10 real-world networks, one of which contains more
than 1 billion edges. The experimental results
demonstrate that our approaches significantly
outperform existing solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{DeCapitanidiVimercati:2017:AMM,
author = "Sabrina {De Capitani di Vimercati} and Sara Foresti
and Sushil Jajodia and Giovanni Livraga and Stefano
Paraboschi and Pierangela Samarati",
title = "An authorization model for multi provider queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "256--268",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157796",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a novel approach for the specification and
enforcement of authorizations that enables controlled
data sharing for collaborative queries in the cloud.
Data authorities can establish authorizations
regulating access to their data distinguishing three
visibility levels (no visibility, encrypted visibility,
and plaintext visibility). Authorizations are enforced
in the query execution by possibly restricting
operation assignments to other parties and by adjusting
visibility of data on-the-fly. Our approach enables
users and data authorities to fully enjoy the benefits
and economic savings of the competitive open cloud
market, while maintaining control over data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ratner:2017:SRT,
author = "Alexander Ratner and Stephen H. Bach and Henry
Ehrenberg and Jason Fries and Sen Wu and Christopher
R{\'e}",
title = "{Snorkel}: rapid training data creation with weak
supervision",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "269--282",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157797",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Labeling training data is increasingly the largest
bottleneck in deploying machine learning systems. We
present Snorkel, a first-of-its-kind system that
enables users to train state-of-the-art models without
hand labeling any training data. Instead, users write
labeling functions that express arbitrary heuristics,
which can have unknown accuracies and correlations.
Snorkel denoises their outputs without access to ground
truth by incorporating the first end-to-end
implementation of our recently proposed machine
learning paradigm, data programming. We present a
flexible interface layer for writing labeling functions
based on our experience over the past year
collaborating with companies, agencies, and research
labs. In a user study, subject matter experts build
models $ 2.8 \times $ faster and increase predictive
performance an average 45.5\% versus seven hours of
hand labeling. We study the modeling tradeoffs in this
new setting and propose an optimizer for automating
tradeoff decisions that gives up to $ 1.8 \times $
speedup per pipeline execution. In two collaborations,
with the U.S. Department of Veterans Affairs and the
U.S. Food and Drug Administration, and on four
open-source text and image data sets representative of
other deployments, Snorkel provides 132\% average
improvements to predictive performance over prior
heuristic approaches and comes within an average 3.60\%
of the predictive performance of large hand-curated
training sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:VPV,
author = "Yuliang Li and Alin Deutsch and Victor Vianu",
title = "{VERIFAS}: a practical verifier for artifact systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "283--296",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157798",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-driven workflows, of which IBM's Business
Artifacts are a prime exponent, have been successfully
deployed in practice, adopted in industrial standards,
and have spawned a rich body of research in academia,
focused primarily on static analysis. The present
research bridges the gap between the theory and
practice of artifact verification with VERIFAS, the
first implementation of practical significance of an
artifact verifier with full support for unbounded data.
VERIFAS verifies within seconds linear-time temporal
properties over real-world and synthetic workflows of
complexity in the range recommended by software
engineering practice. Compared to our previous
implementation based on the widely-used Spin model
checker, VERIFAS not only supports a model with richer
data manipulations but also outperforms it by over an
order of magnitude. VERIFAS' good performance is due to
a novel symbolic representation approach and a family
of specialized optimizations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jia:2017:DMG,
author = "Zhihao Jia and Yongkee Kwon and Galen Shipman and Pat
McCormick and Mattan Erez and Alex Aiken",
title = "A distributed multi-{GPU} system for fast graph
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "297--310",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157799",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present Lux, a distributed multi-GPU system that
achieves fast graph processing by exploiting the
aggregate memory bandwidth of multiple GPUs and taking
advantage of locality in the memory hierarchy of
multi-GPU clusters. Lux provides two execution models
that optimize algorithmic efficiency and enable
important GPU optimizations, respectively. Lux also
uses a novel dynamic load balancing strategy that is
cheap and achieves good load balance across GPUs. In
addition, we present a performance model that
quantitatively predicts the execution times and
automatically selects the runtime configurations for
Lux applications. Experiments show that Lux achieves up
to 20X speedup over state-of-the-art shared memory
systems and up to two orders of magnitude speedup over
distributed systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bleifuss:2017:EDC,
author = "Tobias Bleifu{\ss} and Sebastian Kruse and Felix
Naumann",
title = "Efficient denial constraint discovery with {Hydra}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "311--323",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157800",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Denial constraints (DCs) are a generalization of many
other integrity constraints (ICs) widely used in
databases, such as key constraints, functional
dependencies, or order dependencies. Therefore, they
can serve as a unified reasoning framework for all of
these ICs and express business rules that cannot be
expressed by the more restrictive IC types. The process
of formulating DCs by hand is difficult, because it
requires not only domain expertise but also database
knowledge, and due to DCs' inherent complexity, this
process is tedious and error-prone. Hence, an automatic
DC discovery is highly desirable: we search for all
valid denial constraints in a given database instance.
However, due to the large search space, the problem of
DC discovery is computationally expensive. We propose a
new algorithm Hydra, which overcomes the quadratic
runtime complexity in the number of tuples of
state-of-the-art DC discovery methods. The new
algorithm's experimentally determined runtime grows
only linearly in the number of tuples. This results in
a speedup by orders of magnitude, especially for
datasets with a large number of tuples. Hydra can
deliver results in a matter of seconds that to date
took hours to compute.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Azim:2017:RRC,
author = "Tahir Azim and Manos Karpathiotakis and Anastasia
Ailamaki",
title = "{ReCache}: reactive caching for fast analytics over
heterogeneous data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "324--337",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157801",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As data continues to be generated at exponentially
growing rates in heterogeneous formats, fast analytics
to extract meaningful information is becoming
increasingly important. Systems widely use in-memory
caching as one of their primary techniques to speed up
data analytics. However, caches in data analytics
systems cannot rely on simple caching policies and a
fixed data layout to achieve good performance.
Different datasets and workloads require different
layouts and policies to achieve optimal performance.
This paper presents ReCache, a cache-based performance
accelerator that is reactive to the cost and
heterogeneity of diverse raw data formats. Using timing
measurements of caching operations and selection
operators in a query plan, ReCache accounts for the
widely varying costs of reading, parsing, and caching
data in nested and tabular formats. Combining these
measurements with information about frequently accessed
data fields in the workload, ReCache automatically
decides whether a nested or relational column-oriented
layout would lead to better query performance.
Furthermore, ReCache keeps track of commonly utilized
operators to make informed cache admission and eviction
decisions. Experiments on synthetic and real-world
datasets show that our caching techniques decrease
caching overhead for individual queries by an average
of 59\%. Furthermore, over the entire workload, ReCache
reduces execution time by 19-75\% compared to existing
techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2017:EED,
author = "Long Yuan and Lu Qin and Xuemin Lin and Lijun Chang
and Wenjie Zhang",
title = "Effective and efficient dynamic graph coloring",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "338--351",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157802",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph coloring is a fundamental graph problem that is
widely applied in a variety of applications. The aim of
graph coloring is to minimize the number of colors used
to color the vertices in a graph such that no two
incident vertices have the same color. Existing
solutions for graph coloring mainly focus on computing
a good coloring for a static graph. However, since many
real-world graphs are highly dynamic, in this paper, we
aim to incrementally maintain the graph coloring when
the graph is dynamically updated. We target on two
goals: high effectiveness and high efficiency. To
achieve high effectiveness, we maintain the graph
coloring in a way such that the coloring result is
consistent with one of the best static graph coloring
algorithms for large graphs. To achieve high
efficiency, we investigate efficient incremental
algorithms to update the graph coloring by exploring a
small number of vertices. We design a color-propagation
based algorithm which only explores the vertices within
the 2-hop neighbors of the update-related and
color-changed vertices. We then propose a novel color
index to maintain some summary color information and,
thus, bound the explored vertices within the neighbors
of these vertices. Moreover, we derive some effective
pruning rules to further reduce the number of
propagated vertices. The experimental results
demonstrate the high effectiveness and efficiency of
our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zacharatou:2017:GRR,
author = "Eleni Tzirita Zacharatou and Harish Doraiswamy and
Anastasia Ailamaki and Cl{\'a}udio T. Silva and Juliana
Freire",
title = "{GPU} rasterization for real-time spatial aggregation
over arbitrary polygons",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "352--365",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157803",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Visual exploration of spatial data relies heavily on
spatial aggregation queries that slice and summarize
the data over different regions. These queries comprise
computationally-intensive point-in-polygon tests that
associate data points to polygonal regions, challenging
the responsiveness of visualization tools. This
challenge is compounded by the sheer amounts of data,
requiring a large number of such tests to be performed.
Traditional pre-aggregation approaches are unsuitable
in this setting since they fix the query constraints
and support only rectangular regions. On the other
hand, query constraints are defined interactively in
visual analytics systems, and polygons can be of
arbitrary shapes. In this paper, we convert a spatial
aggregation query into a set of drawing operations on a
canvas and leverage the rendering pipeline of the
graphics hardware (GPU) to enable interactive response
times. Our technique trades-off accuracy for response
time by adjusting the canvas resolution, and can even
provide accurate results when combined with a polygon
index. We evaluate our technique on two large
real-world data sets, exhibiting superior performance
compared to index-based approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shah:2017:KFK,
author = "Vraj Shah and Arun Kumar and Xiaojin Zhu",
title = "Are key--foreign key joins safe to avoid when learning
high-capacity classifiers?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "366--379",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157804",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning (ML) over relational data is a
booming area of data management. While there is a lot
of work on scalable and fast ML systems, little work
has addressed the pains of sourcing data for ML tasks.
Real-world relational databases typically have many
tables (often, dozens) and data scientists often
struggle to even obtain all tables for joins before ML.
In this context, Kumar et al. showed recently that
key-foreign key dependencies (KFKDs) between tables
often lets us avoid such joins without significantly
affecting prediction accuracy-an idea they called
``avoiding joins safely.'' While initially
controversial, this idea has since been used by
multiple companies to reduce the burden of data
sourcing for ML. But their work applied only to linear
classifiers. In this work, we verify if their results
hold for three popular high-capacity classifiers:
decision trees, non-linear SVMs, and ANNs. We conduct
an extensive experimental study using both real-world
datasets and simulations to analyze the effects of
avoiding KFK joins on such models. Our results show
that these high-capacity classifiers are surprisingly
and counter-intuitively more robust to avoiding KFK
joins compared to linear classifiers, refuting an
intuition from the prior work's analysis. We explain
this behavior intuitively and identify open questions
at the intersection of data management and ML
theoretical research. All of our code and datasets are
available for download from
http://cseweb.ucsd.edu/~arunkk/hamlet.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2017:WRC,
author = "Zheng Liu and Lei Chen",
title = "Worker recommendation for crowdsourced {Q\&A}
services: a triple-factor aware approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "3",
pages = "380--392",
month = nov,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.14778/3157794.3157805",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Dec 11 16:07:56 MST 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Worker Recommendation (WR) is one of the most
important functions for crowdsourced Q\&A services.
Specifically, given a set of tasks to be solved, WR
recommends each task with a certain group of workers,
whom are expected to give timely answers with high
qualities. To address the WR problem, recent studies
have introduced a number of recommendation approaches,
which take advantage of workers' expertises or
preferences towards different types of tasks. However,
without a thorough consideration of workers'
characters, such approaches will lead to either
inadequate task fulfillment or inferior answer quality.
In this work, we propose the Triple-factor Aware Worker
Recommendation framework, which collectively considers
workers' expertises, preferences and activenesses to
maximize the overall production of high quality
answers. We construct the Latent Hierarchical
Factorization Model, which is able to infer the tasks'
underlying categories and workers' latent characters
from the historical data; and we propose a novel
parameter inference method, which only requires the
processing of positive instances, giving rise to
significantly higher time efficiency and better
inference quality. What's more, the sampling-based
recommendation algorithm is developed, such that the
near optimal worker recommendation can be generated for
a presented batch of tasks with considerably reduced
time consumption. Comprehensive experiments have been
carried out using both real and synthetic datasets,
whose results verify the effectiveness and efficiency
of our proposed methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gong:2017:CSD,
author = "Shufeng Gong and Yanfeng Zhang and Ge Yu",
title = "Clustering stream data by exploring the evolution of
density mountain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "393--405",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164136",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream clustering is a fundamental problem in many
streaming data analysis applications. Comparing to
classical batch-mode clustering, there are two key
challenges in stream clustering: (i) Given that input
data are changing continuously, how to incrementally
update their clustering results efficiently? (ii) Given
that clusters continuously evolve with the evolution of
data, how to capture the cluster evolution activities?
Unfortunately, most of existing stream clustering
algorithms can neither update the cluster result in
real-time nor track the evolution of clusters. In this
paper, we propose a stream clustering algorithm
EDMStream by exploring the Evolution of Density
Mountain. The density mountain is used to abstract the
data distribution, the changes of which indicate data
distribution evolution. We track the evolution of
clusters by monitoring the changes of density
mountains. We further provide efficient data structures
and filtering schemes to ensure that the update of
density mountains is in real-time, which makes online
clustering possible. The experimental results on
synthetic and real datasets show that, comparing to the
state-of-the-art stream clustering algorithms, e.g.,
D-Stream, DenStream, DBSTREAM and MR-Stream, our
algorithm is able to response to a cluster update much
faster (say 7-15x faster than the best of the
competitors) and at the same time achieve comparable
cluster quality. Furthermore, EDMStream successfully
captures the cluster evolution activities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2017:QFL,
author = "Tianzheng Wang and Ryan Johnson and Ippokratis
Pandis",
title = "Query fresh: log shipping on steroids",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "406--419",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hot standby systems often have to trade safety (i.e.,
not losing committed work) and freshness (i.e., having
access to recent updates) for performance. Guaranteeing
safety requires synchronous log shipping that blocks
the primary until the log records are durably
replicated in one or multiple backups; maintaining
freshness necessitates fast log replay on backups, but
is often defeated by the dual-copy architecture and
serial replay: a backup must generate the ``real'' data
from the log to make recent updates accessible to
read-only queries. This paper proposes Query Fresh, a
hot standby system that provides both safety and
freshness while maintaining high performance on the
primary. The crux is an append-only storage
architecture used in conjunction with fast networks
(e.g., InfiniBand) and byte-addressable, non-volatile
memory (NVRAM). Query Fresh avoids the dual-copy design
and treats the log as the database, enabling
lightweight, parallel log replay that does not block
the primary. Experimental results using the TPC-C
benchmark show that under Query Fresh, backup servers
can replay log records faster than they are generated
by the primary server, using one quarter of the
available compute resources. With a 56Gbps network,
Query Fresh can support up to 4--5 synchronous
replicas, each of which receives and replays $ \approx
$1.4GB of log records per second, with up to 4--6\%
overhead on the primary compared to a standalone server
that achieves 620kTPS without replication.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sahu:2017:ULG,
author = "Siddhartha Sahu and Amine Mhedhbi and Semih Salihoglu
and Jimmy Lin and M. Tamer {\"O}zsu",
title = "The ubiquity of large graphs and surprising challenges
of graph processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "420--431",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph processing is becoming increasingly prevalent
across many application domains. In spite of this
prevalence, there is little research about how graphs
are actually used in practice. We conducted an online
survey aimed at understanding: (i) the types of graphs
users have; (ii) the graph computations users run;
(iii) the types of graph software users use; and (iv)
the major challenges users face when processing their
graphs. We describe the participants' responses to our
questions highlighting common patterns and challenges.
We further reviewed user feedback in the mailing lists,
bug reports, and feature requests in the source
repositories of a large suite of software products for
processing graphs. Through our review, we were able to
answer some new questions that were raised by
participants' responses and identify specific
challenges that users face when using different classes
of graph software. The participants' responses and data
we obtained revealed surprising facts about graph
processing in practice. In particular, real-world
graphs represent a very diverse range of entities and
are often very large, and scalability and visualization
are undeniably the most pressing challenges faced by
participants. We hope these findings can guide future
research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ramachandra:2017:FOI,
author = "Karthik Ramachandra and Kwanghyun Park and K.
Venkatesh Emani and Alan Halverson and C{\'e}sar
Galindo-Legaria and Conor Cunningham",
title = "{Froid}: optimization of imperative programs in a
relational database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "432--444",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "For decades, RDBMSs have supported declarative SQL as
well as imperative functions and procedures as ways for
users to express data processing tasks. While the
evaluation of declarative SQL has received a lot of
attention resulting in highly sophisticated techniques,
the evaluation of imperative programs has remained
na{\"\i}ve and highly inefficient. Imperative programs
offer several benefits over SQL and hence are often
preferred and widely used. But unfortunately, their
abysmal performance discourages, and even prohibits
their use in many situations. We address this important
problem that has hitherto received little attention. We
present Froid, an extensible framework for optimizing
imperative programs in relational databases. Froid's
novel approach automatically transforms entire User
Defined Functions (UDFs) into relational algebraic
expressions, and embeds them into the calling SQL
query. This form is now amenable to cost-based
optimization and results in efficient, set-oriented,
parallel plans as opposed to inefficient, iterative,
serial execution of UDFs. Froid's approach additionally
brings the benefits of many compiler optimizations to
UDFs with no additional implementation effort. We
describe the design of Froid and present our
experimental evaluation that demonstrates performance
improvements of up to multiple orders of magnitude on
real workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2017:ESH,
author = "Ye Li and U, Leong Hou and Man Lung Yiu and Ngai Meng
Kou",
title = "An experimental study on hub labeling based shortest
path algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "445--457",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Shortest path distance retrieval is a core component
in many important applications. For a decade, hub
labeling (HL) techniques have been considered as a
practical solution with fast query response time (e.g.,
1--3 orders of magnitude faster), competitive indexing
time, and slightly larger storage overhead (e.g.,
several times larger). These techniques enhance query
throughput up to hundred thousands queries per second,
which is particularly helpful in large user
environment. Despite the importance of HL techniques,
we are not aware of any comprehensive experimental
study on HL techniques. Thus it is difficult for a
practitioner to adopt HL techniques for her
applications. To address the above issues, we provide a
comprehensive experimental study on the
state-of-the-art HL techniques with analysis of their
efficiency, effectiveness and applicability. From
insightful summary of different HL techniques, we
further develop a simple yet effective HL technique
called Significant path based Hub Pushing (SHP) which
greatly improves indexing time of previous techniques
while retains good query performance. We also
complement extensive comparisons between HL techniques
and other shortest path solutions to demonstrate
robustness and efficiency of HL techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Merritt:2017:CLS,
author = "Alexander Merritt and Ada Gavrilovska and Yuan Chen
and Dejan Milojicic",
title = "Concurrent log-structured memory for many-core
key--value stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "458--471",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Key--value stores are an important tool in managing and
accessing large in-memory data sets. As many
applications benefit from having as much of their
working state fit into main memory, an important design
of the memory management of modern key--value stores is
the use of log-structured approaches, enabling
efficient use of the memory capacity, by compacting
objects to avoid fragmented states. However, with the
emergence of thousand-core and peta-byte memory
platforms (DRAM or future storage-class memories)
log-structured designs struggle to scale, preventing
parallel applications from exploiting the full
capabilities of the hardware: careful coordination is
required for background activities (compacting and
organizing memory) to remain asynchronous with respect
to the use of the interface, and for insertion
operations to avoid contending for centralized
resources such as the log head and memory pools. In
this work, we present the design of a log-structured
key--value store called Nibble that incorporates a
multi-head log for supporting concurrent writes, a
novel distributed epoch mechanism for scalable memory
reclamation, and an optimistic concurrency index. We
implement Nibble in the Rust language in ca. 4000 lines
of code, and evaluate it across a variety of
data-serving workloads on a 240-core cache-coherent
server. Our measurements show Nibble scales linearly in
uniform YCSB workloads, matching competitive
non-log-structured key--value stores for write-
dominated traces at 50 million operations per second on
1 TiB-sized working sets. Our memory analysis shows
Nibble is efficient, requiring less than 10\%
additional capacity, whereas memory use by
non-log-structured key--value store designs may be as
high as 2x.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ceccarello:2017:CUG,
author = "Matteo Ceccarello and Carlo Fantozzi and Andrea
Pietracaprina and Geppino Pucci and Fabio Vandin",
title = "Clustering uncertain graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "472--484",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An uncertain graph $ G = (V, E, p : E \to (0, 1]) $
can be viewed as a probability space whose outcomes
(referred to as possible worlds) are subgraphs of $G$
where any edge $ e \in E$ occurs with probability
$ p(e)$, independently of the other edges. These graphs
naturally arise in many application domains where data
management systems are required to cope with
uncertainty in interrelated data, such as computational
biology, social network analysis, network reliability,
and privacy enforcement, among the others. For this
reason, it is important to devise fundamental querying
and mining primitives for uncertain graphs. This paper
contributes to this endeavor with the development of
novel strategies for clustering uncertain graphs.
Specifically, given an uncertain graph $G$ and an
integer $k$, we aim at partitioning its nodes into $k$
clusters, each featuring a distinguished center node,
so to maximize the minimum/average connection
probability of any node to its cluster's center, in a
random possible world. We assess the NP-hardness of
maximizing the minimum connection probability, even in
the presence of an oracle for the connection
probabilities, and develop efficient approximation
algorithms for both problems and some useful variants.
Unlike previous works in the literature, our algorithms
feature provable approximation guarantees and are
capable to keep the granularity of the returned
clustering under control. Our theoretical findings are
complemented with several experiments that compare our
algorithms against some relevant competitors, with
respect to both running-time and quality of the
returned clusterings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abdelaziz:2017:LSQ,
author = "Ibrahim Abdelaziz and Essam Mansour and Mourad Ouzzani
and Ashraf Aboulnaga and Panos Kalnis",
title = "{Lusail}: a system for querying linked data at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "485--498",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The RDF data model allows publishing interlinked RDF
datasets, where each dataset is independently
maintained and is queryable via a SPARQL endpoint. Many
applications would benefit from querying the resulting
large, decentralized, geo-distributed graph through a
federated SPARQL query processor. A crucial factor for
good performance in federated query processing is
pushing as much computation as possible to the local
endpoints. Surprisingly, existing federated SPARQL
engines are not effective at this task since they rely
only on schema information. Consequently, they cause
unnecessary data retrieval and communication, leading
to poor scalability and response time. This paper
addresses these limitations and presents Lusail, a
scalable and efficient federated SPARQL system for
querying large RDF graphs that are geo-distributed on
different endpoints. Lusail uses a novel query
rewriting algorithm to push computation to the local
endpoints by relying on information about the RDF
instances and not only the schema. The query rewriting
algorithm has the additional advantage of exposing
parallelism in query processing, which Lusail exploits
through advanced scheduling at query run time. Our
experiments on billions of triples of real and
synthetic data show that Lusail outperforms
state-of-the-art systems by orders of magnitude in
terms of scalability and response time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Harmouch:2017:CEE,
author = "Hazar Harmouch and Felix Naumann",
title = "Cardinality estimation: an experimental survey",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "499--512",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data preparation and data profiling comprise many both
basic and complex tasks to analyze a dataset at hand
and extract metadata, such as data distributions, key
candidates, and functional dependencies. Among the most
important types of metadata is the number of distinct
values in a column, also known as the zeroth-frequency
moment. Cardinality estimation itself has been an
active research topic in the past decades due to its
many applications. The aim of this paper is to review
the literature of cardinality estimation and to present
a detailed experimental study of twelve algorithms,
scaling far beyond the original experiments. First, we
outline and classify approaches to solve the problem of
cardinality estimation --- we describe their main idea,
error-guarantees, advantages, and disadvantages. Our
experimental survey then compares the performance of all
twelve cardinality estimation algorithms. We evaluate
the algorithms' accuracy, runtime, and memory
consumption using synthetic and real-world datasets.
Our results show that different algorithms excel in
different categories, and we highlight their
trade-offs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2017:SSL,
author = "Jong-Hyeok Park and Gihwan Oh and Sang-Won Lee",
title = "{SQL} statement logging for making {SQLite} truly
lite",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "4",
pages = "513--525",
month = dec,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164146",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Feb 15 16:29:05 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The lightweight codebase of SQLite was helpful in
making it become the de-facto standard database in most
mobile devices, but, at the same time, forced it to
take less-complicated transactional schemes, such as
physical page logging, journaling, and force commit,
which in turn cause excessive write amplification.
Thus, the write IO cost in SQLite is not lightweight at
all. In this paper, to make SQLite truly lite in terms
of IO efficiency for the transactional support, we
propose SQLite/SSL, a per-transaction SQL statement
logging scheme: when a transaction commits, SQLite/SSL
ensures its durability by storing only SQL statements
of small size, thus writing less and performing faster
at no compromise of transactional solidity. Our main
contribution is to show that, based on the observation
that mobile transactions tend to be short and exhibit
strong update locality, logical logging can, though
long discarded, become an elegant and perfect fit for
SQLite-based mobile applications. Further, we leverage
the WAL journal mode in vanilla SQLite as a
transaction-consistent checkpoint mechanism which is
indispensable in any logical logging scheme. In
addition, we show for the first time that
byte-addressable NVM (non-volatile memory) in host-side
can realize the full potential of logical logging
because it allows to store fine-grained logs quickly.
We have prototyped SQLite/SSL by augmenting vanilla
SQLite with a transaction-consistent checkpoint
mechanism and a redo-only recovery logic, and have
evaluated its performance using a set of synthetic and
real workloads. When a real NVM board is used as its
log device, SQLite/SSL can outperform vanilla SQLite's
WAL mode by up to 300x and also outperform the
state-of-the-art SQLite/PPL scheme by several folds in
terms of IO time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
remark = "The speedups come from employing nonvolatile memory
(which costs about 10 times as much as DRAM) for
database updates, and delaying writes to the
filesystem, which is important for SSD devices that
have limited write life. The target platform is mobile
devices. There is no mention of whether the extensions
to the public-domain SQLite3 code are available to
others.",
}
@Article{Johnson:2018:TPD,
author = "Noah Johnson and Joseph P. Near and Dawn Song",
title = "Towards practical differential privacy for {SQL}
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "526--539",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177733",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy promises to enable general data
analytics while protecting individual privacy, but
existing differential privacy mechanisms do not support
the wide variety of features and databases used in
real-world SQL-based analytics systems. This paper
presents the first practical approach for differential
privacy of SQL queries. Using 8.1 million real-world
queries, we conduct an empirical study to determine the
requirements for practical differential privacy, and
discuss limitations of previous approaches in light of
these requirements. To meet these requirements we
propose elastic sensitivity, a novel method for
approximating the local sensitivity of queries with
general equijoins. We prove that elastic sensitivity is
an upper bound on local sensitivity and can therefore
be used to enforce differential privacy using any local
sensitivity-based mechanism. We build FLEX, a practical
end-to-end system to enforce differential privacy for
SQL queries using elastic sensitivity. We demonstrate
that FLEX is compatible with any existing database, can
enforce differential privacy for real-world SQL
queries, and incurs negligible (0.03\%) performance
overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shraer:2018:CSS,
author = "Alexander Shraer and Alexandre Aybes and Bryan Davis
and Christos Chrysafis and Dave Browning and Eric
Krugler and Eric Stone and Harrison Chandler and Jacob
Farkas and John Quinn and Jonathan Ruben and Michael
Ford and Mike McMahon and Nathan Williams and Nicolas
Favre-Felix and Nihar Sharma and Ori Herrnstadt and
Paul Seligman and Raghav Pisolkar and Scott Dugas and
Scott Gray and Sytze Harkema and Valentin Kravtsov and
Vanessa Hong and Wan Ling Yih and Yizuo Tian",
title = "{CloudKit}: structured storage for mobile
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "540--552",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164138",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "CloudKit is Apple's cloud backend service and
application development framework that provides
strongly-consistent storage for structured data and
makes it easy to synchronize data across user devices
or share it among multiple users. Launched more than 3
years ago, CloudKit forms the foundation for more than
50 Apple apps, including many of our most important and
popular applications such as Photos, iCloud Drive,
Notes, Keynote, and News, as well as many third-party
apps. To deliver this at large scale, CloudKit
explicitly leverages multi-tenancy at the application
level as well as at the user level to guide efficient
data placement and distribution. By using CloudKit
application developers are free to focus on delivering
the application front-end and logic while relying on
CloudKit for scale, consistency, durability and
security. CloudKit manages petabytes of data and
handles hundreds of millions of users around the world
on a daily basis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arulraj:2018:BHP,
author = "Joy Arulraj and Justin Levandoski and Umar Farooq
Minhas and Per-{\AA}ke Larson",
title = "{BzTree}: a high-performance latch-free range index
for non-volatile memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "553--565",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3164135.3164147",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Storing a database (rows and indexes) entirely in
non-volatile memory (NVM) potentially enables both high
performance and fast recovery. To fully exploit
parallelism on modern CPUs, modern main-memory
databases use latch-free (lock-free) index structures,
e.g. Bw-tree or skip lists. To achieve high performance
NVM-resident indexes also need to be latch-free. This
paper describes the design of the BzTree, a latch-free
B-tree index designed for NVM. The BzTree uses a
persistent multi-word compare-and-swap operation
(PMwCAS) as a core building block, enabling an index
design that has several important advantages compared
with competing index structures such as the Bw-tree.
First, the BzTree is latch-free yet simple to
implement. Second, the BzTree is fast --- showing up to
2x higher throughput than the Bw-tree in our
experiments. Third, the BzTree does not require any
special-purpose recovery code. Recovery is
near-instantaneous and only involves rolling back (or
forward) any PMwCAS operations that were in-flight
during failure. Our end-to-end recovery experiments of
BzTree report an average recovery time of 145 $ \mu $
s. Finally, the same BzTree implementation runs
seamlessly on both volatile RAM and NVM, which greatly
reduces the cost of code maintenance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2018:FFP,
author = "Yuzhen Huang and Tatiana Jin and Yidi Wu and Zhenkun
Cai and Xiao Yan and Fan Yang and Jinfeng Li and Yuying
Guo and James Cheng",
title = "{FlexPS}: flexible parallelism control in parameter
server architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "566--579",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177734",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As a general abstraction for coordinating the
distributed storage and access of model parameters, the
parameter server (PS) architecture enables distributed
machine learning to handle large datasets and high
dimensional models. Many systems, such as Parameter
Server and Petuum, have been developed based on the PS
architecture and widely used in practice. However, none
of these systems supports changing parallelism during
runtime, which is crucial for the efficient execution
of machine learning tasks with dynamic workloads. We
propose a new system, called FlexPS, which introduces a
novel multi-stage abstraction to support flexible
parallelism control. With the multi-stage abstraction,
a machine learning task can be mapped to a series of
stages and the parallelism for a stage can be set
according to its workload. Optimizations such as stage
scheduler, stage-aware consistency controller, and
direct model transfer are proposed for the efficiency
of multi-stage machine learning in FlexPS. As a general
and complete PS system, FlexPS also incorporates many
optimizations that are not limited to multi-stage
machine learning. We conduct extensive experiments
using a variety of machine learning workloads, showing
that FlexPS achieves significant speedups and resource
saving compared with the state-of-the-art PS systems
such as Petuum and Multiverso.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yaghmazadeh:2018:AMH,
author = "Navid Yaghmazadeh and Xinyu Wang and Isil Dillig",
title = "Automated migration of hierarchical data to relational
tables using programming-by-example",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "580--593",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177735",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "While many applications export data in hierarchical
formats like XML and JSON, it is often necessary to
convert such hierarchical documents to a relational
representation. This paper presents a novel
programming-by-example approach, and its implementation
in a tool called Mitra, for automatically migrating
tree-structured documents to relational tables. We have
evaluated the proposed technique using two sets of
experiments. In the first experiment, we used Mitra to
automate 98 data transformation tasks collected from
StackOverflow. Our method can generate the desired
program for 94\% of these benchmarks with an average
synthesis time of 3.8 seconds. In the second
experiment, we used Mitra to generate programs that can
convert real-world XML and JSON datasets to
full-fledged relational databases. Our evaluation shows
that Mitra can automate the desired transformation for
all datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Luo:2018:TTO,
author = "Siqiang Luo and Ben Kao and Guoliang Li and Jiafeng Hu
and Reynold Cheng and Yudian Zheng",
title = "{TOAIN}: a throughput optimizing adaptive index for
answering dynamic {$k$NN} queries on road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "594--606",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177736",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the classical kNN queries on road networks.
Existing solutions mostly focus on reducing query
processing time. In many applications, however, system
throughput is a more important measure. We devise a
mathematical model that describes throughput in terms
of a number of system characteristics. We show that
query time is only one of the many parameters that
impact throughput. Others include update time and
query/update arrival rates. We show that the
traditional approach of improving query time alone is
generally inadequate in optimizing throughput.
Moreover, existing solutions lack flexibility in
adapting to environments of different characteristics.
We propose Toain, which is a very flexible algorithm
that can be easily trained to adapt to a given
environment for maximizing query throughput. We conduct
extensive experiments on both real and synthetic data
and show that Toain gives significantly higher
throughput compared with existing solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2018:EMT,
author = "Tian Li and Jie Zhong and Ji Liu and Wentao Wu and Ce
Zhang",
title = "{Ease.ml}: towards multi-tenant resource sharing for
machine learning workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "607--620",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177737",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present ease.ml, a declarative machine learning
service platform. With ease.ml, a user defines the
high-level schema of an ML application and submits the
task via a Web interface. The system then deals with
the rest, such as model selection and data movement.
The ultimate question we hope to understand is that, as
a ``service provider'' that manages a shared cluster of
machines running machine learning workloads, what is
the resource sharing strategy that maximizes the global
satisfaction of all our users? This paper does not
completely answer this general question, but focuses on
solving the first technical challenge we were facing
when trying to build ease.ml. We observe that resource
sharing is a critical yet subtle issue in this
multi-tenant scenario, as we have to balance between
efficiency and fairness. We first formalize the problem
that we call multi-tenant model selection, aiming for
minimizing the total regret of all users running
automatic model selection tasks. We then develop a
novel algorithm that combines multi-armed bandits with
Bayesian optimization and prove a regret bound under
the multi-tenant setting. Finally, we report our
evaluation of ease.ml on synthetic data and on two
services we are providing to our users, namely, image
classification with deep neural networks and binary
classification with Azure ML Studio. Our experimental
evaluation results show that our proposed solution can
be up to 9.8x faster in achieving the same global
average accuracy for all users as the two popular
heuristics used by our users before ease.ml, and 4.1x
faster than state-of-the-art systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qi:2018:TOE,
author = "Jianzhong Qi and Yufei Tao and Yanchuan Chang and Rui
Zhang",
title = "Theoretically optimal and empirically efficient
{R}-trees with strong parallelizability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "621--634",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177738",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The massive amount of data and large variety of data
distributions in the big data era call for access
methods that are efficient in both query processing and
index bulk-loading, and over both practical and
worst-case workloads. To address this need, we revisit
a classic multidimensional access method --- the
R-tree. We propose a novel R-tree packing strategy that
produces R-trees with an asymptotically optimal I/O
complexity for window queries in the worst case. Our
experiments show that the R-trees produced by the
proposed strategy are highly efficient on real and
synthetic data of different distributions. The proposed
strategy is also simple to parallelize, since it relies
only on sorting. We propose a parallel algorithm for
R-tree bulk-loading based on the proposed packing
strategy, and analyze its performance under the
massively parallel communication model. Experimental
results confirm the efficiency and scalability of the
parallel algorithm over large data sets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2018:DAM,
author = "Xueling Lin and Lei Chen",
title = "Domain-aware multi-truth discovery from conflicting
sources",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "635--647",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177739",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the Big Data era, truth discovery has served as a
promising technique to solve conflicts in the facts
provided by numerous data sources. The most significant
challenge for this task is to estimate source
reliability and select the answers supported by high
quality sources. However, existing works assume that
one data source has the same reliability on any kinds
of entity, ignoring the possibility that a source may
vary in reliability on different domains. To capture
the influence of various levels of expertise in
different domains, we integrate domain expertise
knowledge to achieve a more precise estimation of
source reliability. We propose to infer the domain
expertise of a data source based on its data richness
in different domains. We also study the mutual
influence between domains, which will affect the
inference of domain expertise. Through leveraging the
unique features of the multi-truth problem that sources
may provide partially correct values of a data item, we
assign more reasonable confidence scores to value sets.
We propose an integrated Bayesian approach to
incorporate the domain expertise of data sources and
confidence scores of value sets, aiming to find
multiple possible truths without any supervision.
Experimental results on two real-world datasets
demonstrate the feasibility, efficiency and
effectiveness of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tian:2018:CAL,
author = "Boyu Tian and Jiamin Huang and Barzan Mozafari and
Grant Schoenebeck",
title = "Contention-aware lock scheduling for transactional
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "5",
pages = "648--662",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3177732.3177740",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 17 07:25:04 MST 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Lock managers are among the most studied components in
concurrency control and transactional systems. However,
one question seems to have been generally overlooked:
``When there are multiple lock requests on the same
object, which one(s) should be granted first?'' Nearly
all existing systems rely on a FIFO (first in, first
out) strategy to decide which transaction(s) to grant
the lock to. In this paper, however, we show that the
lock scheduling choices have significant ramifications
on the overall performance of a transactional system.
Despite the large body of research on job scheduling
outside the database context, lock scheduling presents
subtle but challenging requirements that render
existing results on scheduling inapt for a
transactional database. By carefully studying this
problem, we present the concept of contention-aware
scheduling, show the hardness of the problem, and
propose novel lock scheduling algorithms (LDSF and
bLDSF), which guarantee a constant factor approximation
of the best scheduling. We conduct extensive
experiments using a popular database on both TPC-C and
a microbenchmark. Compared to FIFO---the default
scheduler in most database systems---our bLDSF
algorithm yields up to 300x speedup in overall
transaction latency. Alternatively, our LDSF algorithm,
which is simpler and achieves comparable performance to
bLDSF, has already been adopted by open-source
community, and was chosen as the default scheduling
strategy in MySQL 8.0.3+.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Patel:2018:QDP,
author = "Jignesh M. Patel and Harshad Deshmukh and Jianqiao Zhu
and Navneet Potti and Zuyu Zhang and Marc Spehlmann and
Hakan Memisoglu and Saket Saurabh",
title = "{Quickstep}: a data platform based on the scaling-up
approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "6",
pages = "663--676",
month = feb,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3184470.3184471",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 10 06:50:54 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern servers pack enough storage and computing power
that just a decade ago was spread across a modest-sized
cluster. This paper presents a prototype system, called
Quickstep, to exploit the large amount of parallelism
that is packed inside modern servers. Quickstep builds
on a vast body of previous methods for organizing data,
optimizing, scheduling and executing queries, and
brings them together in a single system. Quickstep also
includes new query processing methods that go beyond
previous approaches. To keep the project focused, the
project's initial target is read-mostly in-memory data
warehousing workloads in single-node settings. In this
paper, we describe the design and implementation of
Quickstep for this target application space. We also
present experimental results comparing the performance
of Quickstep to a number of other systems,
demonstrating that Quickstep is often faster than many
other contemporary systems, and in some cases faster by
orders-of-magnitude. Quickstep is an Apache
(incubating) project.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kondylakis:2018:CSB,
author = "Haridimos Kondylakis and Niv Dayan and Kostas
Zoumpatianos and Themis Palpanas",
title = "{Coconut}: a scalable bottom-up approach for building
data series indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "6",
pages = "677--690",
month = feb,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3184470.3184472",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 10 06:50:54 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many modern applications produce massive amounts of
data series that need to be analyzed, requiring
efficient similarity search operations. However, the
state-of-the-art data series indexes that are used for
this purpose do not scale well for massive datasets in
terms of performance, or storage costs. We pinpoint the
problem to the fact that existing summarizations of
data series used for indexing cannot be sorted while
keeping similar data series close to each other in the
sorted order. This leads to two design problems. First,
traditional bulk-loading algorithms based on sorting
cannot be used. Instead, index construction takes place
through slow top-down insertions, which create a
non-contiguous index that results in many random I/Os.
Second, data series cannot be sorted and split across
nodes evenly based on their median value; thus, most
leaf nodes are in practice nearly empty. This further
slows down query speed and amplifies storage costs. To
address these problems, we present Coconut. The first
innovation in Coconut is an inverted, sortable data
series summarization that organizes data series based
on a z-order curve, keeping similar series close to
each other in the sorted order. As a result, Coconut is
able to use bulk-loading techniques that rely on
sorting to quickly build a contiguous index using large
sequential disk I/Os. We then explore prefix-based and
median-based splitting policies for bottom-up
bulk-loading, showing that median-based splitting
outperforms the state of the art, ensuring that all
nodes are densely populated. Overall, we show
analytically and empirically that Coconut dominates the
state-of-the-art data series indexes in terms of
construction speed, query speed, and storage costs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ammar:2018:DES,
author = "Khaled Ammar and Frank McSherry and Semih Salihoglu
and Manas Joglekar",
title = "Distributed evaluation of subgraph queries using
worst-case optimal low-memory dataflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "6",
pages = "691--704",
month = feb,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3184470.3184473",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 10 06:50:54 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of finding and monitoring
fixed-size subgraphs in a continually changing
large-scale graph. We present the first approach that
(i) performs worst-case optimal computation and
communication, (ii) maintains a total memory footprint
linear in the number of input edges, and (iii) scales
down per-worker computation, communication, and memory
requirements linearly as the number of workers
increases, even on adversarially skewed inputs. Our
approach is based on worst-case optimal join
algorithms, recast as a data-parallel dataflow
computation. We describe the general algorithm and
modifications that make it robust to skewed data, prove
theoretical bounds on its resource requirements in the
massively parallel computing model, and implement and
evaluate it on graphs containing as many as 64 billion
edges. The underlying algorithm and ideas generalize
from finding and monitoring subgraphs to the more
general problem of computing and maintaining relational
equi-joins over dynamic relations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2018:MFC,
author = "Teng Li and Zhiyuan Xu and Jian Tang and Yanzhi Wang",
title = "Model-free control for distributed stream data
processing using deep reinforcement learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "6",
pages = "705--718",
month = feb,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3184470.3184474",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 10 06:50:54 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we focus on general-purpose Distributed
Stream Data Processing Systems (DSDPSs), which deal
with processing of unbounded streams of continuous data
at scale distributedly in real or near-real time. A
fundamental problem in a DSDPS is the scheduling
problem (i.e., assigning workload to workers/machines)
with the objective of minimizing average end-to-end
tuple processing time. A widely-used solution is to
distribute workload evenly over machines in the cluster
in a round-robin manner, which is obviously not
efficient due to lack of consideration for
communication delay. Model-based approaches (such as
queueing theory) do not work well either due to the
high complexity of the system environment. We aim to
develop a novel model-free approach that can learn to
well control a DSDPS from its experience rather than
accurate and mathematically solvable system models,
just as a human learns a skill (such as cooking,
driving, swimming, etc). Specifically, we, for the
first time, propose to leverage emerging Deep
Reinforcement Learning (DRL) for enabling model-free
control in DSDPSs; and present design, implementation
and evaluation of a novel and highly effective
DRL-based control framework, which minimizes average
end-to-end tuple processing time by jointly learning
the system environment via collecting very limited
runtime statistics data and making decisions under the
guidance of powerful Deep Neural Networks (DNNs). To
validate and evaluate the proposed framework, we
implemented it based on a widely-used DSDPS, Apache
Storm, and tested it with three representative
applications: continuous queries, log stream processing
and word count (stream version). Extensive experimental
results show (1) Compared to Storm's default scheduler
and the state-of-the-art model-based method, the
proposed framework reduces average tuple processing by
33.5\% and 14.0\% respectively on average. (2) The
proposed framework can quickly reach a good scheduling
solution during online learning, which justifies its
practicability for online control in DSDPSs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Psallidas:2018:SFG,
author = "Fotis Psallidas and Eugene Wu",
title = "{Smoke}: fine-grained lineage at interactive speed",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "6",
pages = "719--732",
month = feb,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3184470.3184475",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 10 06:50:54 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data lineage describes the relationship between
individual input and output data items of a workflow
and is an integral ingredient for both traditional
(e.g., debugging or auditing) and emergent (e.g.,
explanations or cleaning) applications. The core,
long-standing problem that lineage systems need to
address---and the main focus of this paper---is to
quickly capture lineage across a workflow in order to
speed up future queries over lineage. Current lineage
systems, however, either incur high lineage capture
overheads, high lineage query processing costs, or
both. In response, developers resort to manual
implementations of applications that, in principle, can
be expressed and optimized in lineage terms. This paper
describes Smoke, an in-memory database engine that
provides both fast lineage capture and lineage query
processing. To do so, Smoke tightly integrates the
lineage capture logic into physical database operators;
stores lineage in efficient lineage representations;
and employs optimizations if future lineage queries are
known up-front. Our experiments on microbenchmarks and
realistic workloads show that Smoke reduces the lineage
capture overhead and lineage query costs by multiple
orders of magnitude as compared to state-of-the-art
alternatives. On real-world applications, we show that
Smoke meets the latency requirements of interactive
visualizations (e.g., $<$ 150ms) and outperforms
hand-written implementations of data profiling
primitives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Idris:2018:CQI,
author = "Muhammad Idris and Mart{\'\i}n Ugarte and Stijn
Vansummeren and Hannes Voigt and Wolfgang Lehner",
title = "Conjunctive queries with inequalities under updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "733--745",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192966",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern application domains such as Composite Event
Recognition (CER) and real-time Analytics require the
ability to dynamically refresh query results under high
update rates. Traditional approaches to this problem
are based either on the materialization of subresults
(to avoid their recomputation) or on the recomputation
of subresults (to avoid the space overhead of
materialization). Both techniques have recently been
shown suboptimal: instead of materializing results and
subresults, one can maintain a data structure that
supports efficient maintenance under updates and can
quickly enumerate the full query output, as well as the
changes produced under single updates. Unfortunately,
these data structures have been developed only for
aggregate-join queries composed of equi-joins, limiting
their applicability in domains such as CER where
temporal joins are commonplace. In this paper, we
present a new approach for dynamically evaluating
queries with multi-way $ \theta $-joins under updates
that is effective in avoiding both materialization and
recomputation of results, while supporting a wide range
of applications. To do this we generalize Dynamic
Yannakakis, an algorithm for dynamically processing
acyclic equi-join queries. In tandem, and of
independent interest, we generalize the notions of
acyclicity and free-connexity to arbitrary $ \theta
$-joins. We instantiate our framework to the case where
$ \theta $-joins are only composed of equalities and
inequalities ($<$, $ \leq $, $=$, $>$, $ \geq $) and
experimentally compare this algorithm, called IEDyn, to
state of the art CER systems as well as incremental
view maintenance engines. IEDyn performs consistently
better than the competitor systems with up to two
orders of magnitude improvements in both time and
memory consumption.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yint:2018:BER,
author = "Zhicheng Yin and Jin Sun and Ming Li and Jaliya
Ekanayake and Haibo Lin and Marc Friedman and Jos{\'e}
A. Blakeley and Clemens Szyperski and Nikhil R.
Devanur",
title = "Bubble execution: resource-aware reliable analytics at
cloud scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "746--758",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192967",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Enabling interactive data exploration at cloud scale
requires minimizing end-to-end query execution latency,
while guaranteeing fault tolerance, and query execution
under resource-constraints. Typically, such a query
execution involves orchestrating the execution of
hundreds or thousands of related tasks on cloud scale
clusters. Without any resource constraints, all query
tasks can be scheduled to execute simultaneously (gang
scheduling) while connected tasks stream data between
them. When the data size referenced by a query
increases, gang scheduling may be resource-wasteful or
un-satisfiable with a limited, per-query resource
budget. This paper introduces Bubble Execution, a new
query processing framework for interactive workloads at
cloud scale, that balances cost-based query
optimization, fault tolerance, optimal resource
management, and execution orchestration. Bubble
execution involves dividing a query execution graph
into a collection of query sub-graphs (bubbles), and
scheduling them within a per-query resource budget. The
query operators (tasks) inside a bubble stream data
between them while fault tolerance is handled by
persisting temporary results at bubble boundaries. Our
implementation enhances our JetScope service, for
interactive workloads, deployed in production clusters
at Microsoft. Experiments with TPC-H queries show that
bubble execution can reduce resource usage
significantly in the presence of failures while
maintaining performance competitive with gang
execution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kruse:2018:EDA,
author = "Sebastian Kruse and Felix Naumann",
title = "Efficient discovery of approximate dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "759--772",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192968",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Functional dependencies (FDs) and unique column
combinations (UCCs) form a valuable ingredient for many
data management tasks, such as data cleaning, schema
recovery, and query optimization. Because these
dependencies are unknown in most scenarios, their
automatic discovery has been well researched. However,
existing methods mostly discover only exact
dependencies, i.e., those without violations.
Real-world dependencies, in contrast, are frequently
approximate due to data exceptions, ambiguities, or
data errors. This relaxation to approximate
dependencies renders their discovery an even harder
task than the already challenging exact dependency
discovery. To this end, we propose the novel and highly
efficient algorithm Pyro to discover both approximate
FDs and approximate UCCs. Pyro combines a
separate-and-conquer search strategy with
sampling-based guidance that quickly detects dependency
candidates and verifies them. In our broad experimental
evaluation, Pyro outperforms existing discovery
algorithms by a factor of up to 33, scales to larger
datasets, and at the same time requires the least main
memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2018:RID,
author = "Yue Wang and Alexandra Meliou and Gerome Miklau",
title = "{RC-index}: diversifying answers to range queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "773--786",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192969",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query result diversification is widely used in data
exploration, Web search, and recommendation systems.
The problem of returning diversified query results
consists of finding a small subset of valid query
answers that are representative and different from one
another, usually quantified by a diversity score. Most
existing techniques for query diversification first
compute all valid query results and then find a diverse
subset. These techniques are inefficient when the set
of valid query results is large. Other work has
proposed efficient solutions for restricted application
settings, where results are shared across multiple
queries. In this paper, our goal is to support result
diversification for general range queries over a single
relation. We propose the RC-Index, a novel index
structure that achieves efficiency by reducing the
number of items that must be retrieved by the database
to form a diverse set of the desired size (about 1
second for a dataset of 1 million items). Further, we
prove that an RC-Index offers strong approximation
guarantees. To the best of our knowledge, this is the
first index-based diversification method with a
guaranteed approximation ratio for range queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2018:UUP,
author = "Xin Ding and Lu Chen and Yunjun Gao and Christian S.
Jensen and Hujun Bao",
title = "{UlTraMan}: a unified platform for big trajectory data
management and analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "787--799",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192970",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Massive trajectory data is being generated by
GPS-equipped devices, such as cars and mobile phones,
which is used increasingly in transportation,
location-based services, and urban computing. As a
result, a variety of methods have been proposed for
trajectory data management and analytics. However,
traditional systems and methods are usually designed
for very specific data management or analytics needs,
which forces users to stitch together heterogeneous
systems to analyze trajectory data in an inefficient
manner. Targeting the overall data pipeline of big
trajectory data management and analytics, we present a
unified platform, termed as UlTraMan. In order to
achieve scalability, efficiency, persistence, and
flexibility, (i) we extend Apache Spark with respect to
both data storage and computing by seamlessly
integrating a key--value store, and (ii) we enhance the
MapReduce paradigm to allow flexible optimizations
based on random data access. We study the resulting
system's flexibility using case studies on data
retrieval, aggregation analyses, and pattern mining.
Extensive experiments on real and synthetic trajectory
data are reported to offer insight into the scalability
and performance of UlTraMan.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jindal:2018:SSM,
author = "Alekh Jindal and Konstantinos Karanasos and Sriram Rao
and Hiren Patel",
title = "Selecting subexpressions to materialize at datacenter
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "800--812",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192971",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We observe significant overlaps in the computations
performed by user jobs in modern shared analytics
clusters. Na{\"\i}vely computing the same
subexpressions multiple times results in wasting
cluster resources and longer execution times. Given
that these shared cluster workloads consist of tens of
thousands of jobs, identifying overlapping computations
across jobs is of great interest to both cluster
operators and users. Nevertheless, existing approaches
support orders of magnitude smaller workloads or employ
heuristics with limited effectiveness. In this paper,
we focus on the problem of subexpression selection for
large workloads, i.e., selecting common parts of job
plans and materializing them to speed-up the evaluation
of subsequent jobs. We provide an ILP-based formulation
of our problem and map it to a bipartite graph labeling
problem. Then, we introduce BigSubs, a vertex-centric
graph algorithm to iteratively choose in parallel which
subexpressions to materialize and which subexpressions
to use for evaluating each job. We provide a
distributed implementation of our approach using our
internal SQL-like execution framework, SCOPE, and
assess its effectiveness over production workloads.
BigSubs supports workloads with tens of thousands of
jobs, yielding savings of up to 40\% in machine-hours.
We are currently integrating our techniques with the
SCOPE runtime in our production clusters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nargesian:2018:TUS,
author = "Fatemeh Nargesian and Erkang Zhu and Ken Q. Pu and
Ren{\'e}e J. Miller",
title = "Table union search on open data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "813--825",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192973",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We define the table union search problem and present a
probabilistic solution for finding tables that are
unionable with a query table within massive
repositories. Two tables are unionable if they share
attributes from the same domain. Our solution
formalizes three statistical models that describe how
unionable attributes are generated from set domains,
semantic domains with values from an ontology, and
natural language domains. We propose a data-driven
approach that automatically determines the best model
to use for each pair of attributes. Through a
distribution-aware algorithm, we are able to find the
optimal number of attributes in two tables that can be
unioned. To evaluate accuracy, we created and
open-sourced a benchmark of Open Data tables. We show
that our table union search outperforms in speed and
accuracy existing algorithms for finding related tables
and scales to provide efficient search over Open Data
repositories containing more than one million
attributes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2018:STH,
author = "Jianfei Chen and Jun Zhu and Jie Lu and Shixia Liu",
title = "Scalable training of hierarchical topic models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "7",
pages = "826--839",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3192965.3192972",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale topic models serve as basic tools for
feature extraction and dimensionality reduction in many
practical applications. As a natural extension of flat
topic models, hierarchical topic models (HTMs) are able
to learn topics of different levels of abstraction,
which lead to deeper understanding and better
generalization than their flat counterparts. However,
existing scalable systems for flat topic models cannot
handle HTMs, due to their complicated data structures
such as trees and concurrent dynamically growing
matrices, as well as their susceptibility to local
optima. In this paper, we study the hierarchical latent
Dirichlet allocation (hLDA) model which is a powerful
nonparametric Bayesian HTM. We propose an efficient
partially collapsed Gibbs sampling algorithm for hLDA,
as well as an initialization strategy to deal with
local optima introduced by tree-structured models. We
also identify new system challenges in building
scalable systems for HTMs, and propose efficient data
layout for vectorizing HTM as well as distributed data
structures including dynamic matrices and trees.
Empirical studies show that our system is 87 times more
efficient than the previous open-source implementation
for hLDA, and can scale to thousands of CPU cores. We
demonstrate our scalability on a 131-million-document
corpus with 28 billion tokens, which is 4--5 orders of
magnitude larger than previously used corpus. Our
distributed implementation can extract 1,722 topics
from the corpus with 50 machines in just 7 hours.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Coskun:2018:IFN,
author = "Mustafa Coskun and Ananth Grama and Mehmet
Koyut{\"u}rk",
title = "Indexed fast network proximity querying",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "840--852",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204029",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Node proximity queries are among the most common
operations on network databases. A common measure of
node proximity is random walk based proximity, which
has been shown to be less susceptible to noise and
missing data. Real-time processing of random-walk based
proximity queries poses significant computational
challenges for larger graphs with over billions of
nodes and edges, since it involves solution of large
linear systems of equations. Due to the importance of
this operation, significant effort has been devoted to
developing efficient methods for random-walk based node
proximity computations. These methods either aim to
speed up iterative computations by exploiting numerical
properties of random walks, or rely on computation and
storage of matrix inverses to avoid computation during
query processing. Although both approaches have been
well studied, the speedup achieved by iterative
approaches does not translate to real-time query
processing, and the storage requirements of
inversion-based approaches prohibit their use on very
large graph databases. We present a novel approach to
significantly reducing the computational cost of random
walk based node proximity queries with scalable
indexing. Our approach combines domain
graph-partitioning based indexing with fast iterative
computations during query processing using Chebyshev
polynomials over the complex elliptic plane. This
approach combines the query processing benefits of
inversion techniques with the memory and storage
benefits of iterative approaches. Using real-world
networks with billions of nodes and edges, and top-k
proximity queries as the benchmark problem, we show
that our algorithm, I-Chopper, significantly
outperforms existing methods. Specifically, it
drastically reduces convergence time of the iterative
procedure, while also reducing storage requirements for
indexing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zheng:2018:ODP,
author = "Libin Zheng and Lei Chen and Jieping Ye",
title = "Order dispatch in price-aware ridesharing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "853--865",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204030",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the prevalence of car-hailing applications,
ridesharing becomes more and more popular because of
its great potential in monetary saving and
environmental protection. Order dispatch is the key
problem in ridesharing, which has a strong impact on
riders' experience and platform's performance. Existing
order dispatch research works fail to consider the
price of the orders, which can be an important
reference because it directly relates to the platform's
profit. Our work takes the order price into concern,
and formulates a constrained optimization problem,
which takes platform's profit as the optimization
objective and performs controls on riders' detour
distance and waiting time. We prove the problem is
NP-hard, thus, we propose approximation methods. We
further develop a simulation framework based on real
ridesharing order and vehicle data. We conduct
experiments with this simulation framework to evaluate
the effectiveness and efficiency of the proposed
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mouratidis:2018:EPU,
author = "Kyriakos Mouratidis and Bo Tang",
title = "Exact processing of uncertain top-$k$ queries in
multi-criteria settings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "866--879",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204031",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional rank-aware processing assumes a dataset
that contains available options to cover a specific
need (e.g., restaurants, hotels, etc) and users who
browse that dataset via top-k queries with linear
scoring functions, i.e., by ranking the options
according to the weighted sum of their attributes, for
a set of given weights. In practice, however, user
preferences (weights) may only be estimated with
bounded accuracy, or may be inherently uncertain due to
the inability of a human user to specify exact weight
values with absolute accuracy. Motivated by this, we
introduce the uncertain top-k query (UTK). Given
uncertain preferences, that is, an approximate
description of the weight values, the UTK query reports
all options that may belong to the top-k set. A second
version of the problem additionally reports the exact
top-k set for each of the possible weight settings. We
develop a scalable processing framework for both UTK
versions, and demonstrate its efficiency using standard
benchmark datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Berti-Equille:2018:DGF,
author = "Laure Berti-{\'E}quille and Hazar Harmouch and Felix
Naumann and No{\"e}l Novelli and Saravanan
Thirumuruganathan",
title = "Discovery of genuine functional dependencies from
relational data with missing values",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "880--892",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204032",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Functional dependencies (FDs) play an important role
in maintaining data quality. They can be used to
enforce data consistency and to guide repairs over a
database. In this work, we investigate the problem of
missing values and its impact on FD discovery. When
using existing FD discovery algorithms, some genuine
FDs could not be detected precisely due to missing
values or some non-genuine FDs can be discovered even
though they are caused by missing values with a certain
NULL semantics. We define a notion of genuineness and
propose algorithms to compute the genuineness score of
a discovered FD. This can be used to identify the
genuine FDs among the set of all valid dependencies
that hold on the data. We evaluate the quality of our
method over various real-world and semi-synthetic
datasets with extensive experiments. The results show
that our method performs well for relatively large FD
sets and is able to accurately capture genuine FDs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cai:2018:ETD,
author = "Qingchao Cai and Zhongle Xie and Meihui Zhang and Gang
Chen and H. V. Jagadish and Beng Chin Ooi",
title = "Effective temporal dependence discovery in time series
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "893--905",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204033",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To analyze user behavior over time, it is useful to
group users into cohorts, giving rise to cohort
analysis. We identify several crucial limitations of
current cohort analysis, motivated by the unmet need
for temporal dependence discovery. To address these
limitations, we propose a generalization that we call
recurrent cohort analysis. We introduce a set of
operators for recurrent cohort analysis and design
access methods specific to these operators in both
single-node and distributed environments. Through
extensive experiments, we show that recurrent cohort
analysis when implemented using the proposed access
methods is up to six orders faster than one implemented
as a layer on top of a database in a single-node
setting, and two orders faster than one implemented
using Spark SQL in a distributed setting.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Arora:2018:HIP,
author = "Akhil Arora and Sakshi Sinha and Piyush Kumar and
Arnab Bhattacharya",
title = "{HD-index}: pushing the scalability-accuracy boundary
for approximate {kNN} search in high-dimensional
spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "906--919",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204034",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nearest neighbor searching of large databases in
high-dimensional spaces is inherently difficult due to
the curse of dimensionality. A flavor of approximation
is, therefore, necessary to practically solve the
problem of nearest neighbor search. In this paper, we
propose a novel yet simple indexing scheme, HD-Index,
to solve the problem of approximate k-nearest neighbor
queries in massive high-dimensional databases. HD-Index
consists of a set of novel hierarchical structures
called RDB-trees built on Hilbert keys of database
objects. The leaves of the RDB-trees store distances of
database objects to reference objects, thereby allowing
efficient pruning using distance filters. In addition
to triangular inequality, we also use Ptolemaic
inequality to produce better lower bounds. Experiments
on massive (up to billion scale) high-dimensional (up
to 1000+) datasets show that HD-Index is effective,
efficient, and scalable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ahmad:2018:LSL,
author = "Yousuf Ahmad and Omar Khattab and Arsal Malik and
Ahmad Musleh and Mohammad Hammoud and Mucahid Kutlu and
Mostafa Shehata and Tamer Elsayed",
title = "{LA3}: a scalable link- and locality-aware linear
algebra-based graph analytics system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "8",
pages = "920--933",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3204028.3204035",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 29 08:31:56 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents LA3, a scalable distributed system
for graph analytics. LA3 couples a vertex-based
programming model with a highly optimized linear
algebra-based engine. It translates any vertex-centric
program into an iteratively executed sparse
matrix-vector multiplication (SpMV). To reduce
communication and enhance scalability, the adjacency
matrix representing an input graph is partitioned into
locality-aware 2D tiles distributed across multiple
processes. Alongside, three major optimizations are
incorporated to preclude redundant computations and
minimize communication. First, the link-based structure
of the input graph is exploited to classify vertices
into different types. Afterwards, vertices of special
types are factored out of the main loop of the graph
application to avoid superfluous computations. We refer
to this novel optimization as computation filtering.
Second, a communication filtering mechanism is involved
to optimize for the high sparsity of the input matrix
due to power-law distributions, common in real-world
graphs. This optimization ensures that each process
receives only the messages that pertain to non-zero
entries in its tiles, substantially reducing
communication traffic since most tiles are highly
sparse. Lastly, a pseudo-asynchronous computation and
communication optimization is proposed, whereby
processes progress and communicate asynchronously,
consume messages as soon as they become available, and
block otherwise. We implemented and extensively tested
LA3 on private and public clouds. Results show that LA3
outperforms six related state-of-the-art and popular
distributed graph analytics systems by an average of
10X.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2018:TSE,
author = "Dongxiang Zhang and Mengting Ding and Dingyu Yang and
Yi Liu and Ju Fan and Heng Tao Shen",
title = "Trajectory simplification: an experimental study and
quality analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "934--946",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213885",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ubiquitousness of GPS sensors in smart-phones,
vehicles and wearable devices has enabled the
collection of massive volumes of trajectory data from
tracing moving objects. Consequently, an unprecedented
scale of timestamped GPS data has been generated and
posed an urgent demand for an effective storage
mechanism for trajectory databases. The mainstream
compression technique is called trajectory
simplification, that finds a subsequence to approximate
the original trajectory and attempts to minimize the
information loss under a distance measure. Even though
various simplification algorithms have been proposed in
the past decades, there still lacks a thorough
comparison to cover all the state-of-the-art algorithms
and evaluate their quality using datasets in
diversified motion patterns. Hence, it still remains a
challenge for GPS data collectors to determine a proper
algorithm in a concrete application. In addition,
almost the entire line of previous methods uses
error-based metrics to evaluate the compression
quality, while ignoring their usability in supporting
spatio-temporal queries on top of the reduced database.
To bridge these gaps, we conduct so far the most
comprehensive evaluation on trajectory simplification
techniques. We compare the performance of 25 algorithms
in total using five real datasets in different motion
patterns. According to the experimental findings, we
present useful guidance for the selection or
development of effective trajectory simplification
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Antenucci:2018:CBE,
author = "Dolan Antenucci and Michael Cafarella",
title = "Constraint-based explanation and repair of
filter-based transformations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "947--960",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213886",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts often need to transform an existing
dataset, such as with filtering, into a new dataset for
downstream analysis. Even the most trivial of mistakes
in this phase can introduce bias and lead to the
formation of invalid conclusions. For example, consider
a researcher identifying subjects for trials of a new
statin drug. She might identify patients with a high
dietary cholesterol intake as a population likely to
benefit from the drug, however, selection of these
individuals could bias the test population to those
with a generally unhealthy lifestyle, thereby
compromising the analysis. Reducing the potential for
bias in the dataset transformation process can minimize
the need to later engage in the tedious, time-consuming
process of trying to eliminate bias while preserving
the target dataset. We propose a novel interaction
model for explain-and-repair data transformation
systems, in which users interactively define
constraints for transformation code and the resultant
data. The system satisfies these constraints as far as
possible, and provides an explanation for any problems
encountered. We present an algorithm that yields
filter-based transformation code satisfying user
constraints. We implemented and evaluated a prototype
of this architecture, Emeril, using both synthetic and
real-world datasets. Our approach finds solutions 34\%
more often and 77\% more quickly than the previous
state-of-the-art solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2018:SSQ,
author = "Xiaolan Wang and Aaron Feng and Behzad Golshan and
Alon Halevy and George Mihaila and Hidekazu Oiwa and
Wang-Chiew Tan",
title = "Scalable semantic querying of text",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "961--974",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213887",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present the Koko system that takes declarative
information extraction to a new level by incorporating
advances in natural language processing techniques in
its extraction language. Koko is novel in that its
extraction language simultaneously supports conditions
on the surface of the text and on the structure of the
dependency parse tree of sentences, thereby allowing
for more refined extractions. Koko also supports
conditions that are forgiving to linguistic variation
of expressing concepts and allows to aggregate evidence
from the entire document in order to filter
extractions. To scale up, Koko exploits a
multi-indexing scheme and heuristics for efficient
extractions. We extensively evaluate Koko over publicly
available text corpora. We show that Koko indices take
up the smallest amount of space, are notably faster and
more effective than a number of prior indexing schemes.
Finally, we demonstrate Koko's scalability on a corpus
of 5 million Wikipedia articles.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bellomarini:2018:VSD,
author = "Luigi Bellomarini and Emanuel Sallinger and Georg
Gottlob",
title = "The {Vadalog} system: datalog-based reasoning for
knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "975--987",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213888",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over the past years, there has been a resurgence of
Datalog-based systems in the database community as well
as in industry. In this context, it has been recognized
that to handle the complex knowledge-based scenarios
encountered today, such as reasoning over large
knowledge graphs, Datalog has to be extended with
features such as existential quantification. Yet,
Datalog-based reasoning in the presence of existential
quantification is in general undecidable. Many efforts
have been made to define decidable fragments. Warded
Datalog+/- is a very promising one, as it captures
PTIME complexity while allowing ontological reasoning.
Yet so far, no implementation of Warded Datalog+/- was
available. In this paper we present the Vadalog system,
a Datalog-based system for performing complex logic
reasoning tasks, such as those required in advanced
knowledge graphs. The Vadalog system is Oxford's
contribution to the VADA research programme, a joint
effort of the universities of Oxford, Manchester and
Edinburgh and around 20 industrial partners. As the
main contribution of this paper, we illustrate the
first implementation of Warded Datalog+/-, a
high-performance Datalog+/- system utilizing an
aggressive termination control strategy. We also
provide a comprehensive experimental evaluation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Medya:2018:NND,
author = "Sourav Medya and Sayan Ranu and Jithin Vachery and
Ambuj Singh",
title = "Noticeable network delay minimization via node
upgrades",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "988--1001",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213889",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In several domains, the flow of data is governed by an
underlying network. Reduction of delays in end-to-end
data flow is an important network optimization task.
Reduced delays enable shorter travel times for vehicles
in road networks, faster information flow in social
networks, and increased rate of packets in
communication networks. While techniques for network
delay minimization have been proposed, they fail to
provide any noticeable reduction in individual data
flows. Furthermore, they treat all nodes as equally
important, which is often not the case in real-world
networks. In this paper, we incorporate these practical
aspects and propose a network design problem where the
goal is to perform k network upgrades such that it
maximizes the number of flows in the network with a
noticeable reduction in delay. We show that the problem
is NP-hard, APX-hard, and non-submodular. We overcome
these computational challenges by designing an
importance sampling based algorithm with provable
quality guarantees. Through extensive experiments on
real and synthetic data sets, we establish that
importance sampling imparts up to 1000 times speed-up
over the greedy approach, and provides up to 70 times
the improvement achieved by the state-of-the-art
technique.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Palkar:2018:EEE,
author = "Shoumik Palkar and James Thomas and Deepak Narayanan
and Pratiksha Thaker and Rahul Palamuttam and Parimajan
Negi and Anil Shanbhag and Malte Schwarzkopf and Holger
Pirk and Saman Amarasinghe and Samuel Madden and Matei
Zaharia",
title = "Evaluating end-to-end optimization for data analytics
applications in {Weld}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "1002--1015",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213890",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern analytics applications use a diverse mix of
libraries and functions. Unfortunately, there is no
optimization across these libraries, resulting in
performance penalties as high as an order of magnitude
in many applications. To address this problem, we
proposed Weld, a common runtime for existing data
analytics libraries that performs key physical
optimizations such as pipelining under existing,
imperative library APIs. In this work, we further
develop the Weld vision by designing an automatic
adaptive optimizer for Weld applications, and
evaluating its impact on realistic data science
workloads. Our optimizer eliminates multiple forms of
overhead that arise when composing imperative libraries
like Pandas and NumPy, and uses lightweight
measurements to make data-dependent decisions at
run-time in ad-hoc workloads where no statistics are
available, with sub-second overhead. We also evaluate
which optimizations have the largest impact in practice
and whether Weld can be integrated into libraries
incrementally. Our results are promising: using our
optimizer, Weld accelerates data science workloads by
up to 23X on one thread and 80X on eight threads, and
its adaptive optimizations provide up to a 3.75X
speedup over rule-based optimization. Moreover, Weld
provides benefits if even just 4--5 operators in a
library are ported to use it. Our results show that
common runtime designs like Weld may be a viable
approach to accelerate analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Muller:2018:ISE,
author = "Magnus M{\"u}ller and Guido Moerkotte and Oliver
Kolb",
title = "Improved selectivity estimation by combining knowledge
from sampling and synopses",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "1016--1028",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213882",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Estimating selectivities remains a critical task in
query processing. Optimizers rely on the accuracy of
selectivities when generating execution plans and, in
approximate query answering, estimated selectivities
affect the quality of the result. Many systems maintain
synopses, e.g., histograms, and, in addition, provide
sampling facilities. In this paper, we present a novel
approach to combine knowledge from synopses and
sampling for the purpose of selectivity estimation for
conjunctive queries. We first show how to extract
information from synopses and sampling such that they
are mutually consistent. In a second step, we show how
to combine them and decide on an admissible selectivity
estimate. We compare our approach to state-of-the-art
methods and evaluate the strengths and limitations of
each approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2018:EAA,
author = "Kai Han and Keke Huang and Xiaokui Xiao and Jing Tang
and Aixin Sun and Xueyan Tang",
title = "Efficient algorithms for adaptive influence
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "1029--1040",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213883",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a social network $G$, the influence maximization
(IM) problem seeks a set $S$ of $k$ seed nodes in $G$
to maximize the expected number of nodes activated via
an influence cascade starting from $S$. Although a lot
of algorithms have been proposed for IM, most of them
only work under the non-adaptive setting, i.e., when
all $k$ seed nodes are selected before we observe how
they influence other users. In this paper, we study the
adaptive IM problem, where we select the $k$ seed nodes
in batches of equal size $b$, such that the choice of
the $i$-th batch can be made after the influence
results of the first $ i - 1$ batches are observed. We
propose the first practical algorithms for adaptive IM
with an approximation guarantee of $ 1 - \exp (\xi -
1)$ for $ b = 1$ and $ 1 - \exp (\xi - 1 + 1 / e)$ for
$ b > 1$, where $ \xi $ is any number in $ (0, 1)$. Our
approach is based on a novel AdaptGreedy framework
instantiated by non-adaptive IM algorithms, and its
performance can be substantially improved if the
non-adaptive IM algorithm has a small expected
approximation error. However, no current non-adaptive
IM algorithms provide such a desired property.
Therefore, we further propose a non-adaptive IM
algorithm called EPIC, which not only has the same
worst-case performance bounds with that of the
state-of-the-art non-adaptive IM algorithms, but also
has a reduced expected approximation error. We also
provide a theoretical analysis to quantify the
performance gain brought by instantiating AdaptGreedy
using EPIC, compared with a naive approach using the
existing IM algorithms. Finally, we use real social
networks to evaluate the performance of our approach
through extensive experiments, and the experimental
results strongly corroborate the superiorities of
our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Breslow:2018:MFF,
author = "Alex D. Breslow and Nuwan S. Jayasena",
title = "{Morton} filters: faster, space-efficient cuckoo
filters via biasing, compression, and decoupled logical
sparsity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "1041--1055",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213884",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Approximate set membership data structures (ASMDSs)
are ubiquitous in computing. They trade a tunable,
often small, error rate ($ \epsilon $) for large space
savings. The canonical ASMDS is the Bloom filter, which
supports lookups and insertions but not deletions in
its simplest form. Cuckoo filters (CFs), a recently
proposed class of ASMDSs, add deletion support and
often use fewer bits per item for equal $ \epsilon $.
This work introduces the Morton filter (MF), a novel
ASMDS that introduces several key improvements to CFs.
Like CFs, MFs support lookups, insertions, and
deletions, but improve their respective throughputs by
1.3x to 2.5x, 0.9x to 15.5x, and 1.3x to 1.6x. MFs
achieve these improvements by (1) introducing a
compressed format that permits a logically sparse
filter to be stored compactly in memory, (2) leveraging
succinct embedded metadata to prune unnecessary memory
accesses, and (3) heavily biasing insertions to use a
single hash function. With these optimizations,
lookups, insertions, and deletions often only require
accessing a single hardware cache line from the filter.
These improvements are not at a loss in space
efficiency, as MFs typically use comparable to slightly
less space than CFs for the same $ \epsilon $.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bi:2018:OPA,
author = "Fei Bi and Lijun Chang and Xuemin Lin and Wenjie
Zhang",
title = "An optimal and progressive approach to online search
of top-$k$ influential communities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "9",
pages = "1056--1068",
month = may,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3213880.3213881",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 30 09:26:43 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Community search over large graphs is a fundamental
problem in graph analysis. Recent studies propose to
compute top-$k$ influential communities, where each
reported community not only is a cohesive subgraph but
also has a high influence value. The existing
approaches to the problem of top-$k$ influential
community search can be categorized as index-based
algorithms and online search algorithms without
indexes. The index-based algorithms, although being
very efficient in conducting community searches, need
to pre-compute a special-purpose index and only work
for one built-in vertex weight vector. In this paper,
we investigate online search approaches and propose an
instance-optimal algorithm LocalSearch whose time
complexity is linearly proportional to the size of the
smallest subgraph that a correct algorithm needs to
access without indexes. In addition, we also propose
techniques to make LocalSearch progressively compute
and report the communities in decreasing influence
value order such that $k$ does not need to be specified.
Moreover, we extend our framework to the general case
of top-$k$ influential community search regarding other
cohesiveness measures. Extensive empirical studies on
real graphs demonstrate that our algorithms outperform
the existing online search algorithms by several orders
of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meister:2018:EAT,
author = "Andreas Meister and Guido Moerkotte and Gunter Saake",
title = "Errata for {``Analysis of two existing and one new
dynamic programming algorithm for the generation of
optimal bushy join trees without cross products''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1069--1070",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231756",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In the published version of EnumerateCmp in the
Section 3.3 on Page 936 [1], see also Algorithm 1, a
small error is included in Line 5. In the first call of
EnumerateCsgRec, too many nodes $ (X \cup N) $ will be
excluded for the emission of complements, leading to
the fact that, in general, not all complements will be
emitted correctly.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Park:2018:DSB,
author = "Noseong Park and Mahmoud Mohammadi and Kshitij Gorde
and Sushil Jajodia and Hongkyu Park and Youngmin Kim",
title = "Data synthesis based on generative adversarial
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1071--1083",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231757",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Privacy is an important concern for our society where
sharing data with partners or releasing data to the
public is a frequent occurrence. Some of the techniques
that are being used to achieve privacy are to remove
identifiers, alter quasi-identifiers, and perturb
values. Unfortunately, these approaches suffer from two
limitations. First, it has been shown that private
information can still be leaked if attackers possess
some background knowledge or other information sources.
Second, they do not take into account the adverse
impact these methods will have on the utility of the
released data. In this paper, we propose a method that
meets both requirements. Our method, called table-GAN,
uses generative adversarial networks (GANs) to
synthesize fake tables that are statistically similar
to the original table yet do not incur information
leakage. We show that the machine learning models
trained using our synthetic tables exhibit performance
that is similar to that of models trained using the
original table for unknown testing cases. We call this
property model compatibility. We believe that
anonymization/perturbation/synthesis methods without
model compatibility are of little value. We used four
real-world datasets from four different domains for our
experiments and conducted in-depth comparisons with
state-of-the-art anonymization, perturbation, and
generation techniques. Throughout our experiments, only
our method consistently shows balance between privacy
level and model compatibility.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lockard:2018:CDS,
author = "Colin Lockard and Xin Luna Dong and Arash Einolghozati
and Prashant Shiralkar",
title = "{CERES}: distantly supervised relation extraction from
the semi-structured web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1084--1096",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231758",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The web contains countless semi-structured websites,
which can be a rich source of information for
populating knowledge bases. Existing methods for
extracting relations from the DOM trees of
semi-structured webpages can achieve high precision and
recall only when manual annotations for each website
are available. Although there have been efforts to
learn extractors from automatically generated labels,
these methods are not sufficiently robust to succeed in
settings with complex schemas and information-rich
websites. In this paper we present a new method for
automatic extraction from semi-structured websites
based on distant supervision. We automatically generate
training labels by aligning an existing knowledge base
with a website and leveraging the unique structural
characteristics of semi-structured websites. We then
train a classifier based on the potentially noisy and
incomplete labels to predict new relation instances.
Our method can compete with annotation-based techniques
in the literature in terms of extraction quality. A
large-scale experiment on over 400,000 pages from
dozens of multi-lingual long-tail websites harvested
1.25 million facts at a precision of 90\%.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nazi:2018:EEI,
author = "Azade Nazi and Bolin Ding and Vivek Narasayya and
Surajit Chaudhuri",
title = "Efficient estimation of inclusion coefficient using
hyperloglog sketches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1097--1109",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231759",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficiently estimating the inclusion coefficient ---
the fraction of values of one column that are contained
in another column --- is useful for tasks such as data
profiling and foreign-key detection. We present a new
estimator, BML, for inclusion coefficient based on
Hyperloglog sketches that results in significantly
lower error compared to the state-of-the art approach
that uses Bottom-k sketches. We evaluate the error of
the BML estimator using experiments on industry
benchmarks such as TPC-H and TPC-DS, and several
real-world databases. As an independent contribution,
we show how Hyperloglog sketches can be maintained
incrementally with data deletions using only a constant
amount of additional memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fier:2018:SSJ,
author = "Fabian Fier and Nikolaus Augsten and Panagiotis Bouros
and Ulf Leser and Johann-Christoph Freytag",
title = "Set similarity joins on {MapReduce}: an experimental
survey",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1110--1122",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231760",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Set similarity joins, which compute pairs of similar
sets, constitute an important operator primitive in a
variety of applications, including applications that
must process large amounts of data. To handle these
data volumes, several distributed set similarity join
algorithms have been proposed. Unfortunately, little is
known about the relative performance, strengths and
weaknesses of these techniques. Previous comparisons
are limited to a small subset of relevant algorithms,
and the large differences in the various test setups
make it hard to draw overall conclusions. In this paper
we survey ten recent, distributed set similarity join
algorithms, all based on the MapReduce paradigm. We
empirically compare the algorithms in a uniform test
environment on twelve datasets that expose different
characteristics and represent a broad range of
applications. Our experiments yield a surprising
result: All algorithms in our test fail to scale for at
least one dataset and are sensitive to long sets,
frequent set elements, low similarity thresholds, or a
combination thereof. Interestingly, some algorithms
even fail to handle the small datasets that can easily
be processed in a non-distributed setting. Our analytic
investigation of the algorithms pinpoints the reasons
for the poor performance and targeted experiments
confirm our analytic findings. Based on our
investigation, we suggest directions for future
research in the area.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2018:PSH,
author = "Bailu Ding and Sudipto Das and Wentao Wu and Surajit
Chaudhuri and Vivek Narasayya",
title = "{Plan Stitch}: harnessing the best of many plans",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1123--1136",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231761",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query performance regression due to the query
optimizer selecting a bad query execution plan is a
major pain point in production workloads. Commercial
DBMSs today can automatically detect and correct such
query plan regressions by storing previously-executed
plans and reverting to a previous plan which is still
valid and has the least execution cost. Such
reversion-based plan correction has relatively low risk
of plan regression since the decision is based on
observed execution costs. However, this approach
ignores potentially valuable information of efficient
subplans collected from other previously-executed
plans. In this paper, we propose a novel technique,
Plan Stitch, that automatically and opportunistically
combines efficient subplans of previously-executed
plans into a valid new plan, which can be cheaper than
any individual previously-executed plan. We implement
Plan Stitch on top of Microsoft SQL Server. Our
experiments on TPC-DS benchmark and three real-world
customer workloads show that plans obtained via Plan
Stitch can reduce execution cost significantly, with a
reduction of up to two orders of magnitude in execution
cost when compared to reverting to the cheapest
previously-executed plan.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2018:FES,
author = "Sheng Wang and Tien Tuan Anh Dinh and Qian Lin and
Zhongle Xie and Meihui Zhang and Qingchao Cai and Gang
Chen and Beng Chin Ooi and Pingcheng Ruan",
title = "{Forkbase}: an efficient storage engine for blockchain
and forkable applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1137--1150",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231762",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Existing data storage systems offer a wide range of
functionalities to accommodate an equally diverse range
of applications. However, new classes of applications
have emerged, e.g., blockchain and collaborative
analytics, featuring data versioning, fork semantics,
tamper-evidence or any combination thereof. They
present new opportunities for storage systems to
efficiently support such applications by embedding the
above requirements into the storage. In this paper, we
present ForkBase, a storage engine designed for
blockchain and forkable applications. By integrating
core application properties into the storage, ForkBase
not only delivers high performance but also reduces
development effort. The storage manages multiversion
data and supports two variants of fork semantics which
enable different fork workflows. ForkBase is fast and
space efficient, due to a novel index class that
supports efficient queries as well as effective
detection of duplicate content across data objects,
branches and versions. We demonstrate ForkBase's
performance using three applications: a blockchain
platform, a wiki engine and a collaborative analytics
application. We conduct extensive experimental
evaluation against respective state-of-the-art
solutions. The results show that ForkBase achieves
superior performance while significantly lowering the
development effort.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ammar:2018:EAD,
author = "Khaled Ammar and M. Tamer {\"O}zsu",
title = "Experimental analysis of distributed graph systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1151--1164",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231764",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper evaluates eight parallel graph processing
systems: Hadoop, HaLoop, Vertica, Giraph, GraphLab
(PowerGraph), Blogel, Flink Gelly, and GraphX (SPARK)
over four very large datasets (Twitter, World Road
Network, UK 200705, and ClueWeb) using four workloads
(PageRank, WCC, SSSP and K-hop). The main objective is
to perform an independent scale-out study by
experimentally analyzing the performance, usability,
and scalability (using up to 128 machines) of these
systems. In addition to performance results, we discuss
our experiences in using these systems and suggest some
system tuning heuristics that lead to better
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{He:2018:TDE,
author = "Yeye He and Xu Chu and Kris Ganjam and Yudian Zheng
and Vivek Narasayya and Surajit Chaudhuri",
title = "Transform-data-by-example {(TDE)}: an extensible
search engine for data transformations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1165--1177",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231766",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today, business analysts and data scientists
increasingly need to clean, standardize and transform
diverse data sets, such as name, address, date time,
and phone number, before they can perform analysis.
This process of data transformation is an important
part of data preparation, and is known to be difficult
and time-consuming for end-users. Traditionally,
developers have dealt with these longstanding
transformation problems using custom code libraries.
They have built vast varieties of custom logic for name
parsing and address standardization, etc., and shared
their source code in places like GitHub. Data
transformation would be a lot easier for end-users if
they can discover and reuse such existing
transformation logic. We developed
Transform-Data-by-Example ( TDE ), which works like a
search engine for data transformations. TDE ``indexes''
vast varieties of transformation logic in source code,
DLLs, web services and mapping tables, so that users
only need to provide a few input/output examples to
demonstrate a desired transformation, and TDE can
interactively find relevant functions to synthesize new
programs consistent with all examples. Using an index
of 50K functions crawled from GitHub and Stackoverflow,
TDE can already handle many common transformations not
currently supported by existing systems. On a benchmark
with over 200 transformation tasks, TDE generates
correct transformations for 72\% tasks, which is
considerably better than other systems evaluated. A
beta version of TDE for Microsoft Excel is available
via Office store. Part of the TDE technology also ships
in Microsoft Power BI.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{OKeeffe:2018:FRE,
author = "Dan O'Keeffe and Theodoros Salonidis and Peter
Pietzuch",
title = "{Frontier}: resilient edge processing for the
{Internet of Things}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1178--1191",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231767",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In an edge deployment model, Internet-of-Things (IoT)
applications, e.g. for building automation or video
surveillance, must process data locally on IoT devices
without relying on permanent connectivity to a cloud
backend. The ability to harness the combined resources
of multiple IoT devices for computation is influenced
by the quality of wireless network connectivity. An
open challenge is how practical edge-based IoT
applications can be realised that are robust to changes
in network bandwidth between IoT devices, due to
interference and intermittent connectivity. We present
Frontier, a distributed and resilient edge processing
platform for IoT devices. The key idea is to express
data-intensive IoT applications as continuous
data-parallel streaming queries and to improve query
throughput in an unreliable wireless network by
exploiting network path diversity: a query includes
operator replicas at different IoT nodes, which
increases possible network paths for data. Frontier
dynamically routes stream data to operator replicas
based on network path conditions. Nodes probe path
throughput and use backpressure stream routing to
decide on transmission rates, while exploiting multiple
operator replicas for data-parallelism. If a node loses
network connectivity, a transient disconnection
recovery mechanism reprocesses the lost data. Our
experimental evaluation of Frontier shows that network
path diversity improves throughput by $ 1.3 \times $--$
2.8 \times $ for different IoT applications, while
being resilient to intermittent network connectivity.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Haynes:2018:LDV,
author = "Brandon Haynes and Amrita Mazumdar and Armin Alaghi
and Magdalena Balazinska and Luis Ceze and Alvin
Cheung",
title = "{LightDB}: a {DBMS} for virtual reality video",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1192--1205",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231768",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present the data model, architecture, and
evaluation of LightDB, a database management system
designed to efficiently manage virtual, augmented, and
mixed reality (VAMR) video content. VAMR video differs
from its two-dimensional counterpart in that it is
spherical with periodic angular dimensions, is
nonuniformly and continuously sampled, and applications
that consume such videos often have demanding latency
and throughput requirements. To address these
challenges, LightDB treats VAMR video data as a
logically-continuous six-dimensional light field.
Furthermore, LightDB supports a rich set of operations
over light fields, and automatically transforms
declarative queries into executable physical plans. We
have implemented a prototype of LightDB and, through
experiments with VAMR applications in the literature,
we find that LightDB offers up to $ 4 \times $
throughput improvements compared with prior work.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{McKenna:2018:OEH,
author = "Ryan McKenna and Gerome Miklau and Michael Hay and
Ashwin Machanavajjhala",
title = "Optimizing error of high-dimensional statistical
queries under differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1206--1219",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231769",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differentially private algorithms for answering sets
of predicate counting queries on a sensitive database
have many applications. Organizations that collect
individual-level data, such as statistical agencies and
medical institutions, use them to safely release
summary tabulations. However, existing techniques are
accurate only on a narrow class of query workloads, or
are extremely slow, especially when analyzing more than
one or two dimensions of the data. In this work we
propose HDMM, a new differentially private algorithm
for answering a workload of predicate counting queries,
that is especially effective for higher-dimensional
datasets. HDMM represents query workloads using an
implicit matrix representation and exploits this
compact representation to efficiently search (a subset
of) the space of differentially private algorithms for
one that answers the input query workload with high
accuracy. We empirically show that HDMM can efficiently
answer queries with lower error than state-of-the-art
techniques on a variety of low and high dimensional
datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2018:MBM,
author = "Yu Liu and Hantian Zhang and Luyuan Zeng and Wentao Wu
and Ce Zhang",
title = "{MLbench}: benchmarking machine learning services
against human experts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1220--1232",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231770",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern machine learning services and systems are
complicated data systems --- the process of designing
such systems is an art of compromising between
functionality, performance, and quality. Providing
different levels of system supports for different
functionalities, such as automatic feature engineering,
model selection and ensemble, and hyperparameter
tuning, could improve the quality, but also introduce
additional cost and system complexity. In this paper,
we try to facilitate the process of asking the
following type of questions: How much will the users
lose if we remove the support of functionality x from a
machine learning service? Answering this type of
questions using existing datasets, such as the UCI
datasets, is challenging. The main contribution of this
work is a novel dataset, MLBench, harvested from Kaggle
competitions. Unlike existing datasets, MLBench
contains not only the raw features for a machine
learning task, but also those used by the winning teams
of Kaggle competitions. The winning features serve as a
baseline of best human effort that enables multiple
ways to measure the quality of machine learning
services that cannot be supported by existing datasets,
such as relative ranking on Kaggle and relative
accuracy compared with best-effort systems. We then
conduct an empirical study using MLBench to understand
example machine learning services from Amazon and
Microsoft Azure, and showcase how MLBench enables a
comparative study revealing the strength and weakness
of these existing machine learning services
quantitatively and systematically. The full version of
this paper can be found at {\tt
arxiv.org/abs/1707.09562}.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2018:MCL,
author = "Lu Chen and Chengfei Liu and Rui Zhou and Jianxin Li
and Xiaochun Yang and Bin Wang",
title = "Maximum co-located community search in large scale
social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1233--1246",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231755",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The problem of k-truss search has been well defined
and investigated to find the highly correlated user
groups in social networks. But there is no previous
study to consider the constraint of users' spatial
information in k-truss search, denoted as co-located
community search in this paper. The co-located
community can serve many real applications. To search
the maximum co-located communities efficiently, we
first develop an efficient exact algorithm with several
pruning techniques. After that, we further develop an
approximation algorithm with adjustable accuracy
guarantees and explore more effective pruning rules,
which can reduce the computational cost significantly.
To accelerate the real-time efficiency, we also devise
a novel quadtree based index to support the efficient
retrieval of users in a region and optimise the search
regions with regards to the given query region.
Finally, we verify the performance of our proposed
algorithms and index using five real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zalipynis:2018:CDF,
author = "Ramon Antonio Rodriges Zalipynis",
title = "{ChronosDB}: distributed, file based, geospatial array
{DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1247--1261",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231754",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An array DBMS streamlines large N-d array management.
A large portion of such arrays originates from the
geospatial domain. The arrays often natively come as
raster files while standalone command line tools are
one of the most popular ways for processing these
files. Decades of development and feedback resulted in
numerous feature-rich, elaborate, free and
quality-assured tools optimized mostly for a single
machine. ChronosDB partially delegates in situ data
processing to such tools and offers a formal N-d array
data model to abstract from the files and the tools.
ChronosDB readily provides a rich collection of array
operations at scale and outperforms SciDB by up to $ 75
\times $ on average.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Macke:2018:ASR,
author = "Stephen Macke and Yiming Zhang and Silu Huang and
Aditya Parameswaran",
title = "Adaptive sampling for rapidly matching histograms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1262--1275",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231753",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In exploratory data analysis, analysts often have a
need to identify histograms that possess a specific
distribution, among a large class of candidate
histograms, e.g., find countries whose income
distribution is most similar to that of Greece. This
distribution could be a new one that the user is
curious about, or a known distribution from an existing
histogram visualization. At present, this process of
identification is brute-force, requiring the manual
generation and evaluation of a large number of
histograms. We present FastMatch: an end-to-end
approach for interactively retrieving the histogram
visualizations most similar to a user-specified target,
from a large collection of histograms. The primary
technical contribution underlying FastMatch is a
probabilistic algorithm, HistSim, a theoretically sound
sampling-based approach to identify the top-$k$ closest
histograms under $ l_1 $ distance. While HistSim can be
used independently, within FastMatch we couple HistSim
with a novel system architecture that is aware of
practical considerations, employing asynchronous
block-based sampling policies. FastMatch obtains
near-perfect accuracy with up to $ 35 \times $ speedup
over approaches that do not use sampling on several
real-world datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Asudeh:2018:LSJ,
author = "Abolfazl Asudeh and Azade Nazi and Jees Augustine and
Saravanan Thirumuruganathan and Nan Zhang and Gautam
Das and Divesh Srivastava",
title = "Leveraging similarity joins for signal
reconstruction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1276--1288",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231752",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Signal reconstruction problem (SRP) is an important
optimization problem where the objective is to identify
a solution to an underdetermined system of linear
equations that is closest to a given prior. It has a
substantial number of applications in diverse areas
including network traffic engineering, medical image
reconstruction, acoustics, astronomy and many more.
Most common approaches for SRP do not scale to large
problem sizes. In this paper, we propose a dual
formulation of this problem and show how adapting
database techniques developed for scalable similarity
joins provides a significant speedup. Extensive
experiments on real-world and synthetic data show that
our approach produces a significant speedup of up to
20x over competing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yu:2018:SHC,
author = "Xiangyao Yu and Yu Xia and Andrew Pavlo and Daniel
Sanchez and Larry Rudolph and Srinivas Devadas",
title = "{Sundial}: harmonizing concurrency control and caching
in a distributed {OLTP} database management system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1289--1302",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231763",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed transactions suffer from poor performance
due to two major limiting factors. First, distributed
transactions suffer from high latency because each of
their accesses to remote data incurs a long network
delay. Second, this high latency increases the
likelihood of contention among distributed
transactions, leading to high abort rates and low
performance. We present Sundial, an in-memory
distributed optimistic concurrency control protocol
that addresses these two limitations. First, to reduce
the transaction abort rate, Sundial dynamically
determines the logical order among transactions at
runtime, based on their data access patterns. Sundial
achieves this by applying logical leases to each data
element, which allows the database to dynamically
calculate a transaction's logical commit timestamp.
Second, to reduce the overhead of remote data accesses,
Sundial allows the database to cache remote data in a
server's local main memory and maintains cache
coherence. With logical leases, Sundial integrates
concurrency control and cache coherence into a simple
unified protocol. We evaluate Sundial against
state-of-the-art distributed concurrency control
protocols. Sundial outperforms the next-best protocol
by up to 57\% under high contention. Sundial's caching
scheme improves performance by up to $ 4.6 \times $ in
workloads with high access skew.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mai:2018:CSP,
author = "Luo Mai and Kai Zeng and Rahul Potharaju and Le Xu and
Steve Suh and Shivaram Venkataraman and Paolo Costa and
Terry Kim and Saravanan Muthukrishnan and Vamsi Kuppa
and Sudheer Dhulipalla and Sriram Rao",
title = "{Chi}: a scalable and programmable control plane for
distributed stream processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "10",
pages = "1303--1316",
month = jun,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3231751.3231765",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 26 16:31:24 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream-processing workloads and modern shared cluster
environments exhibit high variability and
unpredictability. Combined with the large parameter
space and the diverse set of user SLOs, this makes
modern streaming systems very challenging to statically
configure and tune. To address these issues, in this
paper we investigate a novel control-plane design, Chi,
which supports continuous monitoring and feedback, and
enables dynamic re-configuration. Chi leverages the key
insight of embedding control-plane messages in the
data-plane channels to achieve a low-latency and
flexible control plane for stream-processing systems.
Chi introduces a new reactive programming model and
design mechanisms to asynchronously execute control
policies, thus avoiding global synchronization. We show
how this allows us to easily implement a wide spectrum
of control policies targeting different use cases
observed in production. Large-scale experiments using
production workloads from a popular cloud provider
demonstrate the flexibility and efficiency of our
approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Thomas:2018:CES,
author = "Anthony Thomas and Arun Kumar",
title = "A comparative evaluation of systems for scalable
linear algebra-based analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2168--2182",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275367",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The growing use of statistical and machine learning
(ML) algorithms to analyze large datasets has given
rise to new systems to scale such algorithms. But
implementing new scalable algorithms in low-level
languages is a painful process, especially for
enterprise and scientific users. To mitigate this
issue, a new breed of systems expose high-level bulk
linear algebra (LA) primitives that are scalable. By
composing such LA primitives, users can write analysis
algorithms in a higher-level language, while the system
handles scalability issues. But there is little work on
a unified comparative evaluation of the scalability,
efficiency, and effectiveness of such ``scalable LA
systems.'' We take a major step towards filling this
gap. We introduce a suite of LA-specific tests based on
our analysis of the data access and communication
patterns of LA workloads and their use cases. Using our
tests, we perform a comprehensive empirical comparison
of a few popular scalable LA systems: MADlib, MLlib,
SystemML, ScaLAPACK, SciDB, and TensorFlow using both
synthetic data and a large real-world dataset. Our
study has revealed several scalability bottlenecks,
unusual performance trends, and even bugs in some
systems. Our findings have already led to improvements
in SystemML, with other systems' developers also
expressing interest. All of our code and data scripts
are available for download at
https://adalabucsd.github.io/slab.html.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karthik:2018:CPL,
author = "Srinivas Karthik and Jayant R. Haritsa and Sreyash
Kenkre and Vinayaka Pandit",
title = "A concave path to low-overhead robust query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2183--2195",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275368",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To address the classical selectivity estimation
problem in database systems, a radically different
query processing technique called PlanBouquet was
proposed in 2014. In this approach, the estimation
process is completely abandoned and replaced with a
calibrated selectivity discovery mechanism. The
beneficial outcome is that provable guarantees are
obtained on worst-case execution performance, thereby
facilitating robust query processing. An improved
version of PlanBouquet, called SpillBound (SB), which
significantly accelerates the selectivity discovery
process, and provides platform-independent performance
guarantees, was presented two years ago.
Notwithstanding its benefits, a limitation of
SpillBound is that its guarantees are predicated on
expending enormous preprocessing efforts during query
compilation, making it suitable only for canned queries
that are invoked repeatedly. In this paper, we address
this limitation by leveraging the fact that plan cost
functions typically exhibit concave down behavior with
regard to predicate selectivities. Specifically, we
design FrugalSpillBound, which provably achieves
extremely attractive tradeoffs between the performance
guarantees and the compilation overheads. For instance,
relaxing the performance guarantee by a factor of two
typically results in at least two orders of magnitude
reduction in the overheads. Further, when empirically
evaluated on benchmark OLAP queries, the decrease in
overheads is even greater, often more than three orders
of magnitude. Therefore, FrugalSpillBound substantively
extends robust query processing towards supporting
ad-hoc queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wen:2018:ISE,
author = "Yuhao Wen and Xiaodan Zhu and Sudeepa Roy and Jun
Yang",
title = "Interactive summarization and exploration of top
aggregate query answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2196--2208",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275369",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present a system for summarization and interactive
exploration of high-valued aggregate query answers to
make a large set of possible answers more informative
to the user. Our system outputs a set of clusters on
the high-valued query answers showing their common
properties such that the clusters are diverse as much
as possible to avoid repeating information, and cover a
certain number of top original answers as indicated by
the user. Further, the system facilitates interactive
exploration of the query answers by helping the user
(i) choose combinations of parameters for clustering,
(ii) inspect the clusters as well as the elements they
contain, and (iii) visualize how changes in parameters
affect clustering. We define optimization problems,
study their complexity, explore properties of the
solutions investigating the semi-lattice structure on
the clusters, and propose efficient algorithms and
optimizations to achieve these goals. We evaluate our
techniques experimentally and discuss our prototype
with a graphical user interface that facilitates this
interactive exploration. A user study is conducted to
evaluate the usability of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kersten:2018:EYA,
author = "Timo Kersten and Viktor Leis and Alfons Kemper and
Thomas Neumann and Andrew Pavlo and Peter Boncz",
title = "Everything you always wanted to know about compiled
and vectorized queries but were afraid to ask",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2209--2222",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275370",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The query engines of most modern database systems are
either based on vectorization or data-centric code
generation. These two state-of-the-art query processing
paradigms are fundamentally different in terms of
system structure and query execution code. Both
paradigms were used to build fast systems. However,
until today it is not clear which paradigm yields
faster query execution, as many implementation-specific
choices obstruct a direct comparison of architectures.
In this paper, we experimentally compare the two models
by implementing both within the same test system. This
allows us to use for both models the same query
processing algorithms, the same data structures, and
the same parallelization framework to ultimately create
an apples-to-apples comparison. We find that both are
efficient, but have different strengths and weaknesses.
Vectorization is better at hiding cache miss latency,
whereas data-centric compilation requires fewer CPU
instructions, which benefits cache-resident workloads.
Besides raw, single-threaded performance, we also
investigate SIMD as well as multi-core parallelization
and different hardware architectures. Finally, we
analyze qualitative differences as a guide for system
architects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2018:DTK,
author = "Junyang Gao and Pankaj K. Agarwal and Jun Yang",
title = "Durable top-$k$ queries on temporal data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2223--2235",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275371",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many datasets have a temporal dimension and contain a
wealth of historical information. When using such data
to make decisions, we often want to examine not only
the current snapshot of the data but also its history.
For example, given a result object of a snapshot query,
we can ask for its ``durability,'' or intuitively, how
long (or how often) it was valid in the past. This
paper considers durable top-k queries, which look for
objects whose values were among the top k for at least
some fraction of the times during a given
interval---e.g., stocks that were among the top 20 most
heavily traded for at least 80\% of the trading days
during the last quarter of 2017. We present a
comprehensive suite of techniques for solving this
problem, ranging from exact algorithms where k is fixed
in advance, to approximate methods that work for any k
and are able to exploit workload and data
characteristics to improve accuracy while capping index
cost. We show that our methods vastly outperform
baseline and previous methods using both real and
synthetic datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Linardi:2018:SVL,
author = "Michele Linardi and Themis Palpanas",
title = "Scalable, variable-length similarity search in data
series: the {ULISSE} approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2236--2248",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275372",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data series similarity search is an important
operation and at the core of several analysis tasks and
applications related to data series collections.
Despite the fact that data series indexes enable fast
similarity search, all existing indexes can only answer
queries of a single length (fixed at index construction
time), which is a severe limitation. In this work, we
propose ULISSE, the first data series index structure
designed for answering similarity search queries of
variable length. Our contribution is two-fold. First,
we introduce a novel representation technique, which
effectively and succinctly summarizes multiple
sequences of different length (irrespective of
Z-normalization). Based on the proposed index, we
describe efficient algorithms for approximate and exact
similarity search, combining disk based index visits
and in-memory sequential scans. We experimentally
evaluate our approach using several synthetic and real
datasets. The results show that ULISSE is several times
(and up to orders of magnitude) more efficient in terms
of both space and time cost, when compared to competing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sauer:2018:FLS,
author = "Caetano Sauer and Goetz Graefe and Theo H{\"a}rder",
title = "{FineLine}: log-structured transactional storage and
recovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2249--2262",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275373",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recovery is an intricate aspect of transaction
processing architectures. In its traditional
implementation, recovery requires the management of two
persistent data stores---a write-ahead log and a
materialized database---which must be carefully
orchestrated to maintain transactional consistency.
Furthermore, the design and implementation of recovery
algorithms have deep ramifications into almost every
component of the internal system architecture, from
concurrency control to buffer management and access
path implementation. Such complexity not only incurs
high costs for development, testing, and training, but
also unavoidably affects system performance,
introducing overheads and limiting scalability. This
paper proposes a novel approach for transactional
storage and recovery called FineLine. It simplifies the
implementation of transactional database systems by
eliminating the log-database duality and maintaining
all persistent data in a single, log-structured data
structure. This approach not only provides more
efficient recovery with less overhead, but also
decouples the management of persistent data from
in-memory access paths. As such, it blurs the lines
that separate in-memory from disk-based database
systems, providing the efficiency of the former with
the reliability of the latter.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rahman:2018:IMH,
author = "Protiva Rahman and Courtney Hebert and Arnab Nandi",
title = "{ICARUS}: minimizing human effort in iterative data
completion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "11",
number = "13",
pages = "2263--2276",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275366.3275374",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Oct 11 16:22:00 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An important step in data preparation involves dealing
with incomplete datasets. In some cases, the missing
values are unreported because they are characteristics
of the domain and are known by practitioners. Due to
this nature of the missing values, imputation and
inference methods do not work and input from domain
experts is required. A common method for experts to
fill missing values is through rules. However, for
large datasets with thousands of missing data points,
it is laborious and time consuming for a user to make
sense of the data and formulate effective completion
rules. Thus, users need to be shown subsets of the data
that will have the most impact in completing missing
fields. Further, these subsets should provide the user
with enough information to make an update. Choosing
subsets that maximize the probability of filling in
missing data from a large dataset is computationally
expensive. To address these challenges, we present
Icarus, which uses a heuristic algorithm to show the
user small subsets of the database in the form of a
matrix. This allows the user to iteratively fill in
data by applying suggested rules based on their direct
edits to the matrix. The suggested rules amplify the
users' input to multiple missing fields by using the
database schema to infer hierarchies. Simulations show
Icarus has an average improvement of 50\% across three
datasets over the baseline system. Further, in-person
user studies demonstrate that naive users can fill in
68\% of missing data within an hour, while manual rule
specification spans weeks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kim:2018:LIW,
author = "Sunghwan Kim and Taesung Lee and Seung-won Hwang and
Sameh Elnikety",
title = "List intersection for web search: algorithms, cost
models, and optimizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "1--13",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275537",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper studies the optimization of list
intersection, especially in the context of the matching
phase of search engines. Given a user query, we
intersect the postings lists corresponding to the query
keywords to generate the list of documents matching all
keywords. Since the speed of list intersection depends
on the algorithm, hardware, and list lengths and their
correlations, none of the existing intersection algorithms
outperforms the others in every scenario. Therefore, we
develop a cost-based approach in which we identify a
search space, spanning existing algorithms and their
combinations. We propose a cost model to estimate the
cost of the algorithms with their combinations, and use
the cost model to search for the lowest-cost algorithm.
The resulting plan is usually a combination of 2-way
algorithms, outperforming conventional 2-way and $k$-way
algorithms. The proposed approach is more general than
designing a specific algorithm, as the cost models can
be adapted to different hardware. We validate the cost
model experimentally on two different CPUs, and show
that the cost model closely estimates the actual cost.
Using both real and synthetic datasets, we show that
the proposed cost-based optimizer outperforms the
state-of-the-art alternatives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Whittaker:2018:ICC,
author = "Michael Whittaker and Joseph M. Hellerstein",
title = "Interactive checks for coordination avoidance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "14--27",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275538",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Strongly consistent distributed systems are easy to
reason about but face fundamental limitations in
availability and performance. Weakly consistent systems
can be implemented with very high performance but place
a burden on the application developer to reason about
complex interleavings of execution. Invariant
confluence provides a formal framework for
understanding when we can get the best of both worlds.
An invariant confluent object can be efficiently
replicated with no coordination needed to preserve its
invariants. However, actually determining whether or
not an object is invariant confluent is challenging. In
this paper, we establish conditions under which a
commonly used sufficient condition for invariant
confluence is both necessary and sufficient, and we use
this condition to design (a) a general-purpose
interactive invariant confluence decision procedure and
(b) a novel sufficient condition that can be checked
automatically. We then take a step beyond invariant
confluence and introduce a generalization of invariant
confluence, called segmented invariant confluence, that
allows us to replicate non-invariant confluent objects
with a small amount of coordination. We implemented
these formalisms in a prototype called Lucy and found
that our decision procedures efficiently handle common
real-world workloads including foreign keys, rollups,
escrow transactions, and more. We also found that
segmented invariant confluent replication can deliver
up to an order of magnitude more throughput than
linearizable replication for low contention workloads
and comparable throughput for medium to high contention
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qin:2018:PPF,
author = "Jianbin Qin and Chuan Xiao",
title = "Pigeonring: a principle for faster thresholded
similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "28--42",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275539",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The pigeonhole principle states that if n items are
contained in m boxes, then at least one box has no more
than n/m items. It is utilized to solve many data
management problems, especially for thresholded
similarity searches. Despite many pigeonhole
principle-based solutions proposed in the last few
decades, the condition stated by the principle is weak.
It only constrains the number of items in a single box.
By organizing the boxes in a ring, we propose a new
principle, called the pigeonring principle, which
constrains the number of items in multiple boxes and
yields stronger conditions. To utilize the new
principle, we focus on problems defined in the form of
identifying data objects whose similarities or
distances to the query are constrained by a threshold.
Many solutions to these problems utilize the pigeonhole
principle to find candidates that satisfy a filtering
condition. By the new principle, stronger filtering
conditions can be established. We show that the
pigeonhole principle is a special case of the new
principle. This suggests that all the pigeonhole
principle-based solutions are possible to be
accelerated by the new principle. A universal filtering
framework is introduced to encompass the solutions to
these problems based on the new principle. Besides, we
discuss how to quickly find candidates specified by the
new principle. The implementation requires only minor
modifications on top of existing pigeonhole
principle-based algorithms. Experimental results on
real datasets demonstrate the applicability of the new
principle as well as the superior performance of the
algorithms based on the new principle.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sariyuce:2018:LAH,
author = "Ahmet Erdem Sariy{\"u}ce and C. Seshadhri and Ali
Pinar",
title = "Local algorithms for hierarchical dense subgraph
discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "43--56",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275540",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Finding the dense regions of a graph and relations
among them is a fundamental problem in network
analysis. Core and truss decompositions reveal dense
subgraphs with hierarchical relations. The incremental
nature of algorithms for computing these decompositions
and the need for global information at each step of the
algorithm hinders scalable parallelization and
approximations since the densest regions are not
revealed until the end. In a previous work, Lu et al.
proposed to iteratively compute the $h$-indices of
neighbor vertex degrees to obtain the core numbers and
prove that the convergence is obtained after a finite
number of iterations. This work generalizes the
iterative $h$-index computation for truss decomposition
as well as nucleus decomposition which leverages
higher-order structures to generalize core and truss
decompositions. In addition, we prove convergence
bounds on the number of iterations. We present a
framework of local algorithms to obtain the core,
truss, and nucleus decompositions. Our algorithms are
local, parallel, offer high scalability, and enable
approximations to explore time and quality trade-offs.
Our shared-memory implementation verifies the
efficiency, scalability, and effectiveness of our local
algorithms on real-world networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2018:CED,
author = "Jingru Yang and Ju Fan and Zhewei Wei and Guoliang Li
and Tongyu Liu and Xiaoyong Du",
title = "Cost-effective data annotation using game-based
crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "57--70",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275541",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large-scale data annotation is indispensable for many
applications, such as machine learning and data
integration. However, existing annotation solutions
either incur expensive cost for large datasets or
produce noisy results. This paper introduces a
cost-effective annotation approach, and focuses on the
labeling rule generation problem that aims to generate
high-quality rules to largely reduce the labeling cost
while preserving quality. To address the problem, we
first generate candidate rules, and then devise a
game-based crowdsourcing approach CROWDGAME to select
high-quality rules by considering coverage and
precision. CROWDGAME employs two groups of crowd
workers: one group answers rule validation tasks
(whether a rule is valid) to play a role of rule
generator, while the other group answers tuple checking
tasks (whether the annotated label of a data tuple is
correct) to play a role of rule refuter. We let the two
groups play a two-player game: rule generator
identifies high-quality rules with large coverage and
precision, while rule refuter tries to refute its
opponent rule generator by checking some tuples that
provide enough evidence to reject rules covering the
tuples. This paper studies the challenges in CROWDGAME.
The first is to balance the trade-off between coverage
and precision. We define the loss of a rule by
considering the two factors. The second is rule
precision estimation. We utilize Bayesian estimation to
combine both rule validation and tuple checking tasks.
The third is to select crowdsourcing tasks to fulfill
the game-based framework for minimizing the loss. We
introduce a minimax strategy and develop efficient task
selection algorithms. We conduct experiments on entity
matching and relation extraction, and the results show
that our method outperforms state-of-the-art
solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2018:OAL,
author = "Enhui Huang and Liping Peng and Luciano {Di Palma} and
Ahmed Abdelkafi and Anna Liu and Yanlei Diao",
title = "Optimization for active learning-based interactive
database exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "1",
pages = "71--84",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3275536.3275542",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:47 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is an increasing gap between fast growth of data
and limited human ability to comprehend data.
Consequently, there has been a growing demand of data
management tools that can bridge this gap and help the
user retrieve high-value content from data more
effectively. In this work, we aim to build interactive
data exploration as a new database service, using an
approach called ``explore-by-example''. In particular,
we cast the explore-by-example problem in a principled
``active learning'' framework, and bring the properties
of important classes of database queries to bear on the
design of new algorithms and optimizations for active
learning-based database exploration. These new
techniques allow the database system to overcome a
fundamental limitation of traditional active learning,
i.e., the slow convergence problem. Evaluation results
using real-world datasets and user interest patterns
show that our new system significantly outperforms
state-of-the-art active learning techniques and data
exploration systems in accuracy while achieving desired
efficiency for interactive performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bleifuss:2018:ECN,
author = "Tobias Bleifu{\ss} and Leon Bornemann and Theodore
Johnson and Dmitri V. Kalashnikov and Felix Naumann and
Divesh Srivastava",
title = "Exploring change: a new dimension of data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "85--98",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282496",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data and metadata in datasets experience many
different kinds of change. Values are inserted, deleted
or updated; rows appear and disappear; columns are
added or repurposed, etc. In such a dynamic situation,
users might have many questions related to changes in
the dataset, for instance which parts of the data are
trustworthy and which are not? Users will wonder: How
many changes have there been in the recent minutes,
days or years? What kind of changes were made at which
points of time? How dirty is the data? Is data
cleansing required? The fact that data changed can hint
at different hidden processes or agendas: a frequently
crowd-updated city name may be controversial; a person
whose name has been recently changed may be the target
of vandalism; and so on. We show various use cases that
benefit from recognizing and exploring such change. We
envision a system and methods to interactively explore
such change, addressing the variability dimension of
big data challenges. To this end, we propose a model to
capture change and the process of exploring dynamic
data to identify salient changes. We provide
exploration primitives along with motivational examples
and measures for the volatility of data. We identify
technical challenges that need to be addressed to make
our vision a reality, and propose directions of future
work for the data management community.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ghosh:2018:FSS,
author = "Bishwamittra Ghosh and Mohammed Eunus Ali and Farhana
M. Choudhury and Sajid Hasan Apon and Timos Sellis and
Jianxin Li",
title = "The flexible socio spatial group queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "99--111",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282497",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A socio spatial group query finds a group of users who
possess strong social connections with each other and
have the minimum aggregate spatial distance to a
meeting point. Existing studies limit to either finding
the best group of a fixed size for a single meeting
location, or a single group of a fixed size w.r.t.
multiple locations. However, it is highly desirable to
consider multiple locations in a real-life scenario in
order to organize impromptu activities of groups of
various sizes. In this paper, we propose Top k Flexible
Socio Spatial Group Query (Top k-FSSGQ) to find the top
k groups w.r.t. multiple POIs where each group follows
the minimum social connectivity constraints. We devise
a ranking function to measure the group score by
combining social closeness, spatial distance, and group
size, which provides the flexibility of choosing groups
of different sizes under different constraints. To
effectively process the Top k-FSSGQ, we first develop
an Exact approach that ensures early termination of the
search based on the derived upper bounds. We prove that
the problem is NP-hard, hence we first present a
heuristic based approximation algorithm to effectively
select members in intermediate solution groups based on
the social connectivity of the users. Later we design a
Fast Approximate approach based on the relaxed social
and spatial bounds, and connectivity constraint
heuristic. Experimental studies have verified the
effectiveness and efficiency of our proposed approaches
on real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Echihabi:2018:LHD,
author = "Karima Echihabi and Kostas Zoumpatianos and Themis
Palpanas and Houda Benbrahim",
title = "The {Lernaean Hydra} of data series similarity search:
an experimental evaluation of the state of the art",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "112--127",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282498",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Increasingly large data series collections are
becoming commonplace across many different domains and
applications. A key operation in the analysis of data
series collections is similarity search, which has
attracted lots of attention and effort over the past
two decades. Even though several relevant approaches
have been proposed in the literature, none of the
existing studies provides a detailed evaluation against
the available alternatives. The lack of comparative
results is further exacerbated by the non-standard use
of terminology, which has led to confusion and
misconceptions. In this paper, we provide definitions
for the different flavors of similarity search that
have been studied in the past, and present the first
systematic experimental evaluation of the efficiency of
data series similarity search techniques. Based on the
experimental results, we describe the strengths and
weaknesses of each approach and give recommendations
for the best approach to use under typical use cases.
Finally, by identifying the shortcomings of each
method, our findings lay the ground for solid further
developments in the field.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2018:RML,
author = "Wei Wang and Jinyang Gao and Meihui Zhang and Sheng
Wang and Gang Chen and Teck Khim Ng and Beng Chin Ooi
and Jie Shao and Moaz Reyad",
title = "{Rafiki}: machine learning as an analytics service
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "128--140",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282499",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big data analytics is gaining massive momentum in the
last few years. Applying machine learning models to big
data has become an implicit requirement or an
expectation for most analysis tasks, especially on
high-stakes applications. Typical applications include
sentiment analysis against reviews for analyzing
on-line products, image classification in food logging
applications for monitoring user's daily intake, and
stock movement prediction. Extending traditional
database systems to support the above analysis is
intriguing but challenging. First, it is almost
impossible to implement all machine learning models in
the database engines. Second, expert knowledge is
required to optimize the training and inference
procedures in terms of efficiency and effectiveness,
which imposes heavy burden on the system users. In this
paper, we develop and present a system, called Rafiki,
to provide the training and inference service of
machine learning models. Rafiki provides distributed
hyper-parameter tuning for the training service, and
online ensemble modeling for the inference service
which trades off between latency and accuracy.
Experimental results confirm the efficiency,
effectiveness, scalability and usability of Rafiki.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Subotic:2018:AIS,
author = "Pavle Suboti{\'c} and Herbert Jordan and Lijun Chang
and Alan Fekete and Bernhard Scholz",
title = "Automatic index selection for large-scale datalog
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "141--153",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282500",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Datalog has been applied to several use cases that
require very high performance on large rulesets and
factsets. It is common to create indexes for relations
to improve search performance. However, the existing
indexing schemes either require manual index selection
or result in insufficient performance on very large
tasks. In this paper, we propose an automatic scheme to
select indexes. We automatically create the minimum
number of indexes to speed up all the searches in a
given Datalog program. We have integrated our indexing
scheme into an open-source Datalog engine SOUFFL{\'E}.
We obtain performance on a par with what users have
accepted from hand-optimized Datalog programs running
on state-of-the-art Datalog engines, while we do not
require the effort of manual index selection. Extensive
experiments on large real Datalog programs demonstrate
that our indexing scheme results in considerable
speedups (up to 2x) and significantly less memory usage
(up to 6x) compared with other automated index
selections.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Song:2018:SLF,
author = "Shuang Song and Xu Liu and Qinzhe Wu and Andreas
Gerstlauer and Tao Li and Lizy K. John",
title = "Start late, finish early: a distributed graph
processing system with redundancy reduction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "154--168",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282501",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph processing systems are important in the big data
domain. However, processing graphs in parallel often
introduces redundant computations in existing
algorithms and models. Prior work has proposed
techniques to optimize redundancies for out-of-core
graph systems, rather than distributed graph systems.
In this paper, we study various state-of-the-art
distributed graph systems and observe root causes for
these pervasively existing redundancies. To reduce
redundancies without sacrificing parallelism, we
further propose SLFE, a distributed graph processing
system, designed with the principle of ``start late or
finish early''. SLFE employs a novel preprocessing
stage to obtain a graph's topological knowledge with
negligible overhead. SLFE's redundancy-aware
vertex-centric computation model can then utilize such
knowledge to reduce the redundant computations at
runtime. SLFE also provides a set of APIs to improve
programmability. Our experiments on an 8-machine
high-performance cluster show that SLFE outperforms all
well-known distributed graph processing systems with
the inputs of real-world graphs, yielding up to 75x
speedup. Moreover, SLFE outperforms two
state-of-the-art shared memory graph systems on a
high-end machine with up to 1644x speedup. SLFE's
redundancy-reduction schemes are generally applicable
to other vertex-centric graph processing systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2018:IOC,
author = "Bailu Ding and Lucja Kot and Johannes Gehrke",
title = "Improving optimistic concurrency control through
transaction batching and operation reordering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "2",
pages = "169--182",
month = oct,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3282495.3282502",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 2 18:29:48 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "OLTP systems can often improve throughput by batching
transactions and processing them as a group. Batching
has been used for optimizations such as message packing
and group commits; however, there is little research on
the benefits of a holistic approach to batching across
a transaction's entire life cycle. In this paper, we
present a framework to incorporate batching at multiple
stages of transaction execution for OLTP systems based
on optimistic concurrency control. Storage batching
enables reordering of transaction reads and writes at
the storage layer, reducing conflicts on the same
object. Validator batching enables reordering of
transactions before validation, reducing conflicts
between transactions. Dependencies between transactions
make transaction reordering a non-trivial problem, and
we propose several efficient and practical algorithms
that can be customized to various transaction
precedence policies such as reducing tail latency. We
also show how to reorder transactions with a
thread-aware policy in multi-threaded OLTP architecture
without a centralized validator. In-depth experiments
on a research prototype, an open-source OLTP system, and
a production OLTP system show that our techniques
increase transaction throughput by up to 2.2x and
reduce their tail latency by up to 71\% compared with
the state-of-the-art systems on workloads with high
data contention.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xie:2018:QLC,
author = "Ting Xie and Varun Chandola and Oliver Kennedy",
title = "Query log compression for workload analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "183--196",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291265",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Analyzing database access logs is a key part of
performance tuning, intrusion detection, benchmark
development, and many other database administration
tasks. Unfortunately, it is common for production
databases to deal with millions or more queries each
day, so these logs must be summarized before they can
be used. Designing an appropriate summary encoding
requires trading off between conciseness and
information content. For example: simple workload
sampling may miss rare, but high impact queries. In
this paper, we present LOGR, a lossy log compression
scheme suitable for use in many automated log analytics
tools, as well as for human inspection. We formalize
and analyze the space/fidelity trade-off in the context
of a broader family of ``pattern'' and ``pattern
mixture'' log encodings to which LOGR belongs. We show
through a series of experiments that LOGR compressed
encodings can be created efficiently, come with
provable information-theoretic bounds on their
accuracy, and outperform state-of-the-art log summarization
strategies.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ali:2018:MTC,
author = "Mohammed Eunus Ali and Shadman Saqib Eusuf and Kaysar
Abdullah and Farhana M. Choudhury and J. Shane
Culpepper and Timos Sellis",
title = "The maximum trajectory coverage query in spatial
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "197--209",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291266",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the widespread use of GPS-enabled mobile devices,
an unprecedented amount of trajectory data has become
available from various sources such as Bikely,
GPS-wayPoints, and Uber. The rise of smart
transportation services and recent breakthroughs in
autonomous vehicles increase our reliance on trajectory
data in a wide variety of applications. Supporting
these services in emerging platforms requires more
efficient query processing in trajectory databases. In
this paper, we propose two new coverage queries for
trajectory databases: (i) k Best Facility Trajectory
Search ( k BFT); and (ii) k Best Coverage Facility
Trajectory Search ( k BCovFT). We propose a novel index
structure, the Trajectory Quadtree (TQ-tree) that
utilizes a quadtree to hierarchically organize
trajectories into different nodes, and then applies a
z-ordering to further organize the trajectories by
spatial locality inside each node. This structure is
highly effective in pruning the trajectory search
space, which is of independent interest. By exploiting
the TQ-tree, we develop a divide-and-conquer approach
to efficiently process a k BFT query. To solve the k
BCovFT, which is a non-submodular NP-hard problem, we
propose a greedy approximation. We evaluate our
algorithms through an extensive experimental study on
several real datasets, and demonstrate that our
algorithms outperform baselines by two to three orders
of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2018:TLO,
author = "Chenggang Wu and Alekh Jindal and Saeed Amizadeh and
Hiren Patel and Wangchao Le and Shi Qiao and Sriram
Rao",
title = "Towards a learning optimizer for shared clouds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "210--222",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291267",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimizers are notorious for inaccurate cost
estimates, leading to poor performance. The root of the
problem lies in inaccurate cardinality estimates, i.e.,
the size of intermediate (and final) results in a query
plan. These estimates also determine the resources
consumed in modern shared cloud infrastructures. In
this paper, we present CARDLEARNER, a machine learning
based approach to learn cardinality models from
previous job executions and use them to predict the
cardinalities in future jobs. The key intuition in our
approach is that shared cloud workloads are often
recurring and overlapping in nature, and so we could
learn cardinality models for overlapping subgraph
templates. We discuss various learning approaches and
show how learning a large number of smaller models
results in high accuracy and explainability. We further
present an exploration technique to avoid learning bias
by considering alternate join orders and learning
cardinality models over them. We describe the feedback
loop to apply the learned models back to future job
executions. Finally, we show a detailed evaluation of
our models (up to 5 orders of magnitude less error),
query plans (60\% applicability), performance (up to
100\% faster, 3x fewer resources), and exploration
(optimal in few 10s of executions).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Varma:2018:SAW,
author = "Paroma Varma and Christopher R{\'e}",
title = "{Snuba}: automating weak supervision to label training
data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "223--236",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291268",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As deep learning models are applied to increasingly
diverse problems, a key bottleneck is gathering enough
high-quality training labels tailored to each task.
Users therefore turn to weak supervision, relying on
imperfect sources of labels like pattern matching and
user-defined heuristics. Unfortunately, users have to
design these sources for each task. This process can be
time consuming and expensive: domain experts often
perform repetitive steps like guessing optimal
numerical thresholds and developing informative text
patterns. To address these challenges, we present
Snuba, a system to automatically generate heuristics
using a small labeled dataset to assign training labels
to a large, unlabeled dataset in the weak supervision
setting. Snuba generates heuristics that each labels
the subset of the data it is accurate for, and
iteratively repeats this process until the heuristics
together label a large portion of the unlabeled data.
We develop a statistical measure that guarantees the
iterative process will automatically terminate before
it degrades training label quality. Snuba automatically
generates heuristics in under five minutes and performs
up to 9.74 F1 points better than the best known
user-defined heuristics developed over many days. In
collaborations with users at research labs, Stanford
Hospital, and on open source datasets, Snuba
outperforms other automated approaches like
semi-supervised learning by up to 14.35 F1 points.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Asudeh:2018:OSR,
author = "Abolfazl Asudeh and H. V. Jagadish and Gerome Miklau
and Julia Stoyanovich",
title = "On obtaining stable rankings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "237--250",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291269",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Decision making is challenging when there is more than
one criterion to consider. In such cases, it is common
to assign a goodness score to each item as a weighted
sum of its attribute values and rank them accordingly.
Clearly, the ranking obtained depends on the weights
used for this summation. Ideally, one would want the
ranked order not to change if the weights are changed
slightly. We call this property stability of the
ranking. A consumer of a ranked list may trust the
ranking more if it has high stability. A producer of a
ranked list prefers to choose weights that result in a
stable ranking, both to earn the trust of potential
consumers and because a stable ranking is intrinsically
likely to be more meaningful. In this paper, we develop
a framework that can be used to assess the stability of
a provided ranking and to obtain a stable ranking
within an ``acceptable'' range of weight values (called
``the region of interest''). We address the case where
the user cares about the rank order of the entire set
of items, and also the case where the user cares only
about the top- k items. Using a geometric
interpretation, we propose algorithms that produce
stable rankings. In addition to theoretical analyses,
we conduct extensive experiments on real datasets that
validate our proposal.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ji:2018:PTB,
author = "Shuping Ji and Hans-Arno Jacobsen",
title = "{PS}-tree-based efficient {Boolean} expression
matching for high-dimensional and dense workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "251--264",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291270",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Boolean expression matching is an important function
for many applications. However, existing solutions
still suffer from limitations when applied to
high-dimensional and dense workloads. To overcome these
limitations, in this paper, we design a data structure
called PS-Tree that can efficiently index subscriptions
in one dimension. By dividing predicates into disjoint
predicate spaces, PS-Tree achieves high matching
performance and good expressiveness. Based on PS-Tree,
we first propose a Boolean expression matching
algorithm PSTBloom. By efficiently filtering out a
large proportion of unmatching subscriptions, PSTBloom
achieves high matching performance, especially for
high-dimensional workloads. PSTBloom also achieves fast
index construction and a small memory footprint.
Compared with state-of-the-art methods, comprehensive
experiments show that PSTBloom reduces matching time,
index construction time and memory usage by up to 84\%,
78\% and 94\%, respectively. Although PSTBloom is
effective for many workload distributions, dense
workloads represent new challenges to PSTBloom and
other algorithms. To effectively handle dense
workloads, we further propose the PSTHash algorithm,
which divides subscriptions into disjoint
multidimensional predicate spaces. This organization
prunes partially matching subscriptions efficiently.
Comprehensive experiments on both synthetic and
real-world datasets show that PSTHash improves the
matching performance by up to 92\% for dense
workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2018:SMR,
author = "Yizhou Yan and Lei Cao and Samuel Madden and Elke A.
Rundensteiner",
title = "{SWIFT}: mining representative patterns from large
event streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "265--277",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Event streams generated by smart devices common in
modern Internet of Things applications must be
continuously mined to monitor the behavior of the
underlying system. In this work, we propose a stream
pattern mining system for supporting online IoT
applications. First, to solve the pattern explosion
problem of existing stream pattern mining strategies,
we now design pattern semantics that continuously
produce a compact set of patterns that maximally
compresses the dynamic data streams, called MDL-based
Representative Patterns (MRP). We then design a
one-pass SWIFT approach that continuously mines the
up-to-date MRP pattern set for each stream window upon
the arrival or expiration of individual events. We show
that SWIFT is guaranteed to select the update operation
for each individual incoming event that leads to the
most compact encoding of the sequence in the current
window. We further enhance SWIFT to support batch
updates, called B-SWIFT. B-SWIFT adopts a lazy update
strategy that guarantees that only the minimal number
of operations are conducted to process an incoming
event batch for MRP pattern mining. Evaluation by our
industry lighting lab collaborator demonstrates that
SWIFT successfully solves their use cases and finds
more representative patterns than the alternative
approaches adapting the state-of-the-art static
representative pattern mining methods. Our experimental
study confirms that SWIFT outperforms the best existing
method up to 50\% in the compactness of produced
pattern encodings, while providing a 4 orders of
magnitude speedup.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{C:2018:SSS,
author = "Paul Suganthan G. C. and Adel Ardalan and AnHai Doan
and Aditya Akella",
title = "{Smurf}: self-service string matching using random
forests",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "278--291",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291272",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We argue that more attention should be devoted to
developing self-service string matching (SM) solutions,
which lay users can easily use. We show that Falcon, a
self-service entity matching (EM) solution, can be
applied to SM and is more accurate than current
self-service SM solutions. However, Falcon often asks
lay users to label many string pairs (e.g., 770-1050 in
our experiments). This is expensive, can significantly
compound labeling mistakes, and takes a long time. We
developed Smurf, a self-service SM solution that
reduces the labeling effort by 43-76\%, yet achieves
comparable F$_1$ accuracy. The key to make Smurf
possible is a novel solution to efficiently execute a
random forest (that Smurf learns via active learning
with the lay user) over two sets of strings. This
solution uses RDBMS-style plan optimization to reuse
computations across the trees in the forest. As such,
Smurf significantly advances self-service SM and raises
interesting future directions for self-service EM and
scalable random forest execution over structured
data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Liu:2018:CSD,
author = "Feilong Liu and Ario Salmasi and Spyros Blanas and
Anastasios Sidiropoulos",
title = "Chasing similarity: distribution-aware aggregation
scheduling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "292--306",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291273",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Parallel aggregation is a ubiquitous operation in data
analytics that is expressed as GROUP BY in SQL, reduce
in Hadoop, or segment in TensorFlow. Parallel
aggregation starts with an optional local
pre-aggregation step and then repartitions the
intermediate result across the network. While local
pre-aggregation works well for low-cardinality
aggregations, the network communication cost remains
significant for high-cardinality aggregations even
after local pre-aggregation. The problem is that the
repartition-based algorithm for high-cardinality
aggregation does not fully utilize the network. In this
work, we first formulate a mathematical model that
captures the performance of parallel aggregation. We
prove that finding optimal aggregation plans from a
known data distribution is NP-hard, assuming the Small
Set Expansion conjecture. We propose GRASP, a GReedy
Aggregation Scheduling Protocol that decomposes
parallel aggregation into phases. GRASP is
distribution-aware as it aggregates the most similar
partitions in each phase to reduce the transmitted data
size in subsequent phases. In addition, GRASP takes the
available network bandwidth into account when
scheduling aggregations in each phase to maximize
network utilization. The experimental evaluation on
real data shows that GRASP outperforms
repartition-based aggregation by 3.5x and LOOM by
2.0x.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bater:2018:SES,
author = "Johes Bater and Xi He and William Ehrich and Ashwin
Machanavajjhala and Jennie Rogers",
title = "{Shrinkwrap}: efficient {SQL} query processing in
differentially private data federations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "3",
pages = "307--320",
month = nov,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3291264.3291274",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jan 18 05:54:04 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A private data federation is a set of autonomous
databases that share a unified query interface offering
in-situ evaluation of SQL queries over the union of the
sensitive data of its members. Owing to privacy
concerns, these systems do not have a trusted data
collector that can see all their data and their member
databases cannot learn about individual records of
other engines. Federations currently achieve this goal
by evaluating queries obliviously using secure
multiparty computation. This hides the intermediate
result cardinality of each query operator by
exhaustively padding it. With cascades of such
operators, this padding accumulates to a blow-up in the
output size of each operator and a proportional loss in
query performance. Hence, existing private data
federations do not scale well to complex SQL queries
over large datasets. We introduce Shrinkwrap, a private
data federation that offers data owners a
differentially private view of the data held by others
to improve their performance over oblivious query
processing. Shrinkwrap uses computational differential
privacy to minimize the padding of intermediate query
results, achieving up to a 35X performance improvement
over oblivious query processing. When the query needs
differentially private output, Shrinkwrap provides a
trade-off between result accuracy and query evaluation
performance.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gill:2018:SPP,
author = "Gurbinder Gill and Roshan Dathathri and Loc Hoang and
Keshav Pingali",
title = "A study of partitioning policies for graph analytics
on large-scale distributed platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "321--334",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297754",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed-memory clusters are used for in-memory
processing of very large graphs with billions of nodes
and edges. This requires partitioning the graph among
the machines in the cluster. When a graph is
partitioned, a node in the graph may be replicated on
several machines, and communication is required to keep
these replicas synchronized. Good partitioning policies
attempt to reduce this synchronization overhead while
keeping the computational load balanced across
machines. A number of recent studies have looked at
ways to control replication of nodes, but these studies
are not conclusive because they were performed on small
clusters with eight to sixteen machines, did not
consider work-efficient data-driven algorithms, or did
not optimize communication for the partitioning
strategies they studied. This paper presents an
experimental study of partitioning strategies for
work-efficient graph analytics applications on large
KNL and Skylake clusters with up to 256 machines using
the Gluon communication runtime which implements
partitioning-specific communication optimizations.
Evaluation results show that although simple
partitioning strategies like Edge-Cuts perform well on
a small number of machines, an alternative partitioning
strategy called Cartesian Vertex-Cut (CVC) performs
better at scale even though paradoxically it has a
higher replication factor and performs more
communication than Edge-Cut partitioning does. Results
from communication micro-benchmarks resolve this
paradox by showing that communication overhead depends
not only on communication volume but also on the
communication pattern among the partitions. These
experiments suggest that high-performance graph
analytics systems should support multiple partitioning
strategies, like Gluon does, as no single graph
partitioning strategy is best for all cluster sizes.
For such systems, a decision tree for selecting a good
partitioning strategy based on characteristics of the
computation and the cluster is presented.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kumar:2018:UDG,
author = "K. Ashwin Kumar and Petros Efstathopoulos",
title = "Utility-driven graph summarization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "335--347",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297755",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A lot of the large datasets analyzed today represent
graphs. In many real-world applications, summarizing
large graphs is beneficial (or necessary) so as to
reduce a graph's size and, thus, achieve a number of
benefits, including but not limited to (1) significant
speed-up for graph algorithms, (2) graph storage space
reduction, (3) faster network transmission, (4)
improved data privacy, (5) more effective graph
visualization, etc. During the summarization process,
potentially useful information is removed from the
graph (nodes and edges are removed or transformed).
Consequently, one important problem with graph
summarization is that, although it reduces the size of
the input graph, it also adversely affects and reduces
its utility. The key question that we pose in this
paper is, can we summarize and compress a graph while
ensuring that its utility or usefulness does not drop
below a certain user-specified utility threshold? We
explore this question and propose a novel iterative
utility-driven graph summarization approach. During
iterative summarization, we incrementally keep track of
the utility of the graph summary. This enables a user
to query a graph summary that is conditioned on a
user-specified utility value. We present both
exhaustive and scalable approaches for implementing our
proposed solution. Our experimental results on
real-world graph datasets show the effectiveness of our
proposed approach. Finally, through multiple real-world
applications we demonstrate the practicality of our
notion of utility of the computed graph summary.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kara:2018:CCS,
author = "Kaan Kara and Ken Eguro and Ce Zhang and Gustavo
Alonso",
title = "{ColumnML}: column-store machine learning with
on-the-fly data transformation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "348--361",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297756",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ability to perform machine learning (ML) tasks in
a database management system (DBMS) provides the data
analyst with a powerful tool. Unfortunately,
integration of ML into a DBMS is challenging for
reasons varying from differences in execution model to
data layout requirements. In this paper, we assume a
column-store main-memory DBMS, optimized for online
analytical processing, as our initial system. On this
system, we explore the integration of
coordinate-descent based methods working natively on
columnar format to train generalized linear models. We
use a cache-efficient, partitioned stochastic
coordinate descent algorithm providing linear
throughput scalability with the number of cores while
preserving convergence quality, up to 14 cores in our
experiments. Existing column oriented DBMS rely on
compression and even encryption to store data in
memory. When those features are considered, the
performance of a CPU based solution suffers. Thus, in
the paper we also show how to exploit hardware
acceleration as part of a hybrid CPU+FPGA system to
provide on-the-fly data transformation combined with an
FPGA-based coordinate-descent engine. The resulting
system is a column-store DBMS with its important
features preserved (e.g., data compression) that offers
high performance machine learning capabilities.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2018:CED,
author = "Yanying Li and Haipei Sun and Boxiang Dong and Hui
(Wendy) Wang",
title = "Cost-efficient data acquisition on online data
marketplaces for correlation analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "362--375",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297757",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Incentivized by the enormous economic profits, the
data marketplace platform has been proliferated
recently. In this paper, we consider the data
marketplace setting where a data shopper would like to
buy data instances from the data marketplace for
correlation analysis of certain attributes. We assume
that the data in the marketplace is dirty and not free.
The goal is to find the data instances from a large
number of datasets in the marketplace whose join result
not only is of high-quality and rich join
informativeness, but also delivers the best correlation
between the requested attributes. To achieve this goal,
we design DANCE, a middleware that provides the desired
data acquisition service. DANCE consists of two phases:
(1) In the off-line phase, it constructs a two-layer
join graph from samples. The join graph includes the
information of the datasets in the marketplace at both
schema and instance levels; (2) In the online phase, it
searches for the data instances that satisfy the
constraints of data quality, budget, and join
informativeness, while maximizing the correlation of
source and target attribute sets. We prove that the
complexity of the search problem is NP-hard, and design
a heuristic algorithm based on Markov chain Monte Carlo
(MCMC). Experiment results on two benchmark and one
real datasets demonstrate the efficiency and
effectiveness of our heuristic data acquisition
algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dolatshah:2018:CCL,
author = "Mohamad Dolatshah and Mathew Teoh and Jiannan Wang and
Jian Pei",
title = "Cleaning crowdsourced labels using oracles for
statistical classification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "376--389",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297758",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Nowadays, crowdsourcing is being widely used to
collect training data for solving classification
problems. However, crowdsourced labels are often noisy,
and there is a performance gap between classification
with noisy labels and classification with ground-truth
labels. In this paper, we consider how to apply
oracle-based label cleaning to reduce the gap. We
propose TARS, a label-cleaning advisor that can provide
two pieces of valuable advice for data scientists when
they need to train or test a model using noisy labels.
Firstly, in the model testing stage, given a test
dataset with noisy labels, and a classification model,
TARS can use the test data to estimate how well the
model will perform w.r.t. ground-truth labels.
Secondly, in the model training stage, given a training
dataset with noisy labels, and a classification
algorithm, TARS can determine which label should be
sent to an oracle to clean such that the model can be
improved the most. For the first advice, we propose an
effective estimation technique, and study how to
compute confidence intervals to bound its estimation
error. For the second advice, we propose a novel
cleaning strategy along with two optimization
techniques, and illustrate that it is superior to the
existing cleaning strategies. We evaluate TARS on both
simulated and real-world datasets. The results show
that (1) TARS can use noisy test data to accurately
estimate a model's true performance for various
evaluation metrics; and (2) TARS can improve the model
accuracy by a larger margin than the existing cleaning
strategies, for the same cleaning budget.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lissandrini:2018:BMM,
author = "Matteo Lissandrini and Martin Brugnara and Yannis
Velegrakis",
title = "Beyond macrobenchmarks: microbenchmark-based graph
database evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "390--403",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297759",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite the increasing interest in graph databases
their requirements and specifications are not yet fully
understood by everyone, leading to a great deal of
variation in the supported functionalities and the
achieved performances. In this work, we provide a
comprehensive study of the existing graph database
systems. We introduce a novel microbenchmarking
framework that provides insights on their performance
that go beyond what macro-benchmarks can offer. The
framework includes the largest set of queries and
operators so far considered. The graph database systems
are evaluated on synthetic and real data, from
different domains, and at scales much larger than any
previous work. The framework is materialized as an
open-source suite and is easily extended to new
datasets, systems, and queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Balegas:2018:IIP,
author = "Valter Balegas and S{\'e}rgio Duarte and Carla
Ferreira and Rodrigo Rodrigues and Nuno
Pregui{\c{c}}a",
title = "{IPA}: invariant-preserving applications for weakly
consistent replicated databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "404--418",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297760",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "It is common to use weakly consistent replication to
achieve high availability and low latency at a global
scale. In this setting, concurrent updates may lead to
states where application invariants do not hold. Some
systems coordinate the execution of (conflicting)
operations to avoid invariant violations, leading to
high latency and reduced availability for those
operations. This problem is worsened by the difficulty
in identifying precisely which operations conflict. In
this paper we propose a novel approach to preserve
application invariants without coordinating the
execution of operations. The approach consists of
modifying operations in a way that application
invariants are maintained in the presence of concurrent
updates. When no conflicting updates occur, the
modified operations present their original semantics.
Otherwise, we use sensible and deterministic conflict
resolution policies that preserve the invariants of the
application. To implement this approach, we developed a
static analysis, IPA, that identifies conflicting
operations and proposes the necessary modifications to
operations. Our analysis shows that IPA can avoid
invariant violations in many applications, including
typical database applications. Our evaluation reveals
that the offline static analysis runs fast enough for
being used with large applications. The overhead
introduced in the modified operations is low and it
leads to lower latency and higher throughput when
compared with other approaches that enforce
invariants.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abuzaid:2018:DRI,
author = "Firas Abuzaid and Peter Kraft and Sahaana Suri and
Edward Gan and Eric Xu and Atul Shenoy and Asvin
Ananthanarayan and John Sheu and Erik Meijer and Xi Wu
and Jeff Naughton and Peter Bailis and Matei Zaharia",
title = "{DIFF}: a relational interface for large-scale data
explanation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "419--432",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297761",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A range of explanation engines assist data analysts by
performing feature selection over increasingly
high-volume and high-dimensional data, grouping and
highlighting commonalities among data points. While
useful in diverse tasks such as user behavior
analytics, operational event processing, and root cause
analysis, today's explanation engines are designed as
standalone data processing tools that do not
interoperate with traditional, SQL-based analytics
workflows; this limits the applicability and
extensibility of these engines. In response, we propose
the DIFF operator, a relational aggregation operator
that unifies the core functionality of these engines
with declarative relational query processing. We
implement both single-node and distributed versions of
the DIFF operator in MB SQL, an extension of MacroBase,
and demonstrate how DIFF can provide the same semantics
as existing explanation engines while capturing a broad
set of production use cases in industry, including at
Microsoft and Facebook. Additionally, we illustrate how
this declarative approach to data explanation enables
new logical and physical query optimizations. We
evaluate these optimizations on several real-world
production applications, and find that DIFF in MB SQL
can outperform state-of-the-art engines by up to an
order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Basat:2018:SFI,
author = "Ran {Ben Basat} and Roy Friedman and Rana Shahout",
title = "Stream frequency over interval queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "433--445",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297762",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Stream frequency measurements are fundamental in many
data stream applications such as financial data
trackers, intrusion-detection systems, and network
monitoring. Typically, recent data items are more
relevant than old ones, a notion we can capture through
a sliding window abstraction. This paper considers a
generalized sliding window model that supports stream
frequency queries over an interval given at query time.
This enables drill-down queries, in which we can
examine the behavior of the system in finer and finer
granularities. For this model, we asymptotically
improve the space bounds of existing work, reduce the
update and query time to a constant, and provide
deterministic solutions. When evaluated over real
Internet packet traces, our fastest algorithm processes
items 90--250 times faster, serves queries at least 730
times quicker and consumes at least 40\% less space
than the best known method.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xin:2018:HHO,
author = "Doris Xin and Stephen Macke and Litian Ma and Jialin
Liu and Shuchen Song and Aditya Parameswaran",
title = "{HELIX}: holistic optimization for accelerating
iterative machine learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "4",
pages = "446--460",
month = dec,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.14778/3297753.3297763",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning workflow development is a process of
trial-and-error: developers iterate on workflows by
testing out small modifications until the desired
accuracy is achieved. Unfortunately, existing machine
learning systems focus narrowly on model training---a
small fraction of the overall development time---and
neglect to address iterative development. We propose
Helix, a machine learning system that optimizes the
execution across iterations --- intelligently caching
and reusing, or recomputing intermediates as
appropriate. Helix captures a wide variety of
application needs within its Scala DSL, with succinct
syntax defining unified processes for data
preprocessing, model specification, and learning. We
demonstrate that the reuse problem can be cast as a
Max-Flow problem, while the caching problem is NP-Hard.
We develop effective lightweight heuristics for the
latter. Empirical evaluation shows that Helix is not
only able to handle a wide variety of use cases in one
unified workflow but also much faster, providing run
time reductions of up to 19x over state-of-the-art
systems, such as DeepDive or KeystoneML, on four
real-world applications in natural language processing,
computer vision, social and natural sciences.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fu:2019:FAN,
author = "Cong Fu and Chao Xiang and Changxu Wang and Deng Cai",
title = "Fast approximate nearest neighbor search with the
navigating spreading-out graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "461--474",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303754",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Approximate nearest neighbor search (ANNS) is a
fundamental problem in databases and data mining. A
scalable ANNS algorithm should be both memory-efficient
and fast. Some early graph-based approaches have shown
attractive theoretical guarantees on search time
complexity, but they all suffer from the problem of
high indexing time complexity. Recently, some
graph-based methods have been proposed to reduce
indexing complexity by approximating the traditional
graphs; these methods have achieved revolutionary
performance on million-scale datasets. Yet, they still
can not scale to billion-node databases. In this paper,
to further improve the search-efficiency and
scalability of graph-based methods, we start by
introducing four aspects: (1) ensuring the connectivity
of the graph; (2) lowering the average out-degree of
the graph for fast traversal; (3) shortening the search
path; and (4) reducing the index size. Then, we propose
a novel graph structure called Monotonic Relative
Neighborhood Graph (MRNG) which guarantees very low
search complexity (close to logarithmic time). To
further lower the indexing complexity and make it
practical for billion-node ANNS problems, we propose a
novel graph structure named Navigating Spreading-out
Graph (NSG) by approximating the MRNG. The NSG takes
the four aspects into account simultaneously. Extensive
experiments show that NSG outperforms all the existing
algorithms significantly. In addition, NSG shows
superior performance in the E-commercial scenario of
Taobao (Alibaba Group) and has been integrated into
their billion-scale search engine.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:DRF,
author = "Qi Wang and Torsten Suel",
title = "Document reordering for faster intersection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "475--487",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303755",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A lot of research has studied how to optimize inverted
index structures in search engines through suitable
reassignment of document identifiers. This approach was
originally proposed to allow for better compression of
the index, but subsequent work showed that it can also
result in significant speed-ups for conjunctive queries
and even certain types of disjunctive top-k algorithms.
However, we do not have a good understanding of why
this happens, and how we could directly optimize an
index for query processing speed. As a result, existing
techniques attempt to optimize for size, and treat
speed increases as a welcome side-effect. In this
paper, we take an initial but important step towards
understanding and modeling speed increases due to
document reordering. We define the problem of
minimizing the cost of queries given an inverted index
and a query distribution, relate it to work on adaptive
set intersection, and show that it is fundamentally
different from that of minimizing compressed index
size. We then propose a heuristic algorithm for finding
a document reordering that minimizes query processing
costs under suitable cost models. Our experiments show
significant increases in the speed of intersections
over state-of-the-art reordering techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2019:CCS,
author = "Xiaofei Zhang and M. Tamer {\"O}zsu",
title = "Correlation constraint shortest path over large
multi-relation graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "488--501",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303756",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Multi-relation graphs intuitively capture the
heterogeneous correlations among real-world entities by
allowing multiple types of relationships to be
represented as entity-connecting edges, i.e., two
entities could be correlated with more than one type of
relationship. This is important in various applications
such as social network analysis, ecology, and
bio-informatics. Existing studies on these graphs
usually consider an edge label constraint perspective,
where each edge contains only one label and each edge
is considered independently. For example, there are
lines of research focusing on reachability between two
vertices under a set of edge label constraints, or
finding paths whose consecutive edge labels satisfy a
user-specified logical expression. This is too
restricted in real graphs, and in this work, we define
a generic correlation constraint on multi-relation
graphs from the perspective of vertex correlations,
where a correlation can be defined recursively.
Specifically, we formalize and investigate the shortest
path problem over large multi-relation graphs in the
presence of both necessity and denial constraints,
which have various real applications. We show that it
is nontrivial to apply conventional graph traversal
algorithms (e.g., BFS or DFS) to address the challenge.
To effectively reduce the search space, we propose a
Hybrid Relation Encoding method, a.k.a. HyRE, to encode
both topological and relation information in a compact
way. We conduct extensive experiments over large
real-world graphs to validate the effectiveness and
efficiency of the proposed solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lang:2019:POF,
author = "Harald Lang and Thomas Neumann and Alfons Kemper and
Peter Boncz",
title = "Performance-optimal filtering: {Bloom} overtakes
{Cuckoo} at high throughput",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "502--515",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303757",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We define the concept of performance-optimal filtering
to indicate the Bloom or Cuckoo filter configuration
that best accelerates a particular task. While the
space-precision tradeoff of these filters has been well
studied, we show how to pick a filter that maximizes
the performance for a given workload. This choice might
be ``suboptimal'' relative to traditional
space-precision metrics, but it will lead to better
performance in practice. In this paper, we focus on
high-throughput filter use cases, aimed at avoiding CPU
work, e.g., a cache miss, a network message, or a local
disk I/O --- events that can happen at rates of
millions to hundreds of millions per second. Besides the
false-positive rate and memory footprint of the filter,
performance optimality has to take into account the
absolute cost of the filter lookup as well as the saved
work per lookup that filtering avoids; while the actual
rate of negative lookups in the workload determines
whether using a filter improves overall performance at
all. In the course of the paper, we introduce new
filter variants, namely the register-blocked and
cache-sectorized Bloom filters. We present new
implementation techniques and perform an extensive
evaluation on modern hardware platforms, including the
wide-SIMD Skylake-X and Knights Landing. This
experimentation shows that in high-throughput
situations, the lower lookup cost of blocked Bloom
filters allows them to overtake Cuckoo filters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeuch:2019:AES,
author = "Steffen Zeuch and Bonaventura {Del Monte} and Jeyhun
Karimov and Clemens Lutz and Manuel Renz and Jonas
Traub and Sebastian Bre{\ss} and Tilmann Rabl and
Volker Markl",
title = "Analyzing efficient stream processing on modern
hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "516--530",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303758",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern Stream Processing Engines (SPEs) process large
data volumes under tight latency constraints. Many SPEs
execute processing pipelines using message passing on
shared-nothing architectures and apply a
partition-based scale-out strategy to handle
high-velocity input streams. Furthermore, many
state-of-the-art SPEs rely on a Java Virtual Machine to
achieve platform independence and speed up system
development by abstracting from the underlying
hardware. In this paper, we show that taking the
underlying hardware into account is essential to
exploit modern hardware efficiently. To this end, we
conduct an extensive experimental analysis of current
SPEs and SPE design alternatives optimized for modern
hardware. Our analysis highlights potential bottlenecks
and reveals that state-of-the-art SPEs are not capable
of fully exploiting current and emerging hardware
trends, such as multi-core processors and high-speed
networks. Based on our analysis, we describe a set of
design changes to the common architecture of SPEs to
scale-up on modern hardware. We show that the
single-node throughput can be increased by up to two
orders of magnitude compared to state-of-the-art SPEs
by applying specialized code generation, fusing
operators, batch-style parallelization strategies, and
optimized windowing. This speedup allows for deploying
typical streaming applications on a single or a few
nodes instead of large clusters.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Luo:2019:EDI,
author = "Chen Luo and Michael J. Carey",
title = "Efficient data ingestion and query processing for
{LSM}-based storage systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "531--543",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303759",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, the Log Structured Merge (LSM) tree
has been widely adopted by NoSQL and NewSQL systems for
its superior write performance. Despite its popularity,
however, most existing work has focused on LSM-based
key--value stores with only a single LSM-tree;
auxiliary structures, which are critical for supporting
ad-hoc queries, have received much less attention. In
this paper, we focus on efficient data ingestion and
query processing for general-purpose LSM-based storage
systems. We first propose and evaluate a series of
optimizations for efficient batched point lookups,
significantly improving the range of applicability of
LSM-based secondary indexes. We then present several
new and efficient maintenance strategies for LSM-based
storage systems. Finally, we have implemented and
experimentally evaluated the proposed techniques in the
context of the Apache AsterixDB system, and we present
the results here.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chrysogelos:2019:HEH,
author = "Periklis Chrysogelos and Manos Karpathiotakis and Raja
Appuswamy and Anastasia Ailamaki",
title = "{HetExchange}: encapsulating heterogeneous {CPU--GPU}
parallelism in {JIT} compiled engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "544--556",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303760",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern server hardware is increasingly heterogeneous
as hardware accelerators, such as GPUs, are used
together with multicore CPUs to meet the computational
demands of modern data analytics workloads.
Unfortunately, query parallelization techniques used by
analytical database engines are designed for
homogeneous multicore servers, where query plans are
parallelized across CPUs to process data stored in
cache coherent shared memory. Thus, these techniques
are unable to fully exploit available heterogeneous
hardware, where one needs to exploit task-parallelism
of CPUs and data-parallelism of GPUs for processing
data stored in a deep, non-cache-coherent memory
hierarchy with widely varying access latencies and
bandwidth. In this paper, we introduce HetExchange --- a
parallel query execution framework that encapsulates
the heterogeneous parallelism of modern
multi-CPU-multi-GPU servers and enables the
parallelization of (pre-)existing sequential relational
operators. In contrast to the interpreted nature of
traditional Exchange, HetExchange is designed to be
used in conjunction with JIT compiled engines in order
to allow a tight integration with the proposed
operators and generation of efficient code for
heterogeneous hardware. We validate the applicability
and efficiency of our design by building a prototype
that can operate over both CPUs and GPUs, and enables
its operators to be parallelism- and
data-location-agnostic. In doing so, we show that
efficiently exploiting CPU-GPU parallelism can provide
2.8x and 6.4x improvement in performance compared to
state-of-the-art CPU-based and GPU-based DBMS.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Atzeni:2019:MMS,
author = "Paolo Atzeni and Luigi Bellomarini and Paolo Papotti
and Riccardo Torlone",
title = "Meta-mappings for schema mapping reuse",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "557--569",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303761",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The definition of mappings between heterogeneous
schemas is a critical activity of any database
application. Existing tools provide high level
interfaces for the discovery of correspondences between
elements of schemas, but schema mappings need to be
manually specified every time from scratch, even if the
scenario at hand is similar to one that has already
been addressed. The problem is that schema mappings are
precisely defined over a pair of schemas and cannot
directly be reused on different scenarios. We tackle
this challenge by generalizing schema mappings as
meta-mappings: formalisms that describe transformations
between generic data structures called meta-schemas. We
formally characterize schema mapping reuse and explain
how meta-mappings are able to: (i) capture enterprise
knowledge from previously defined schema mappings and
(ii) use this knowledge to suggest new mappings. We
develop techniques to infer meta-mappings from existing
mappings, to organize them into a searchable
repository, and to leverage the repository to propose
to users mappings suitable for their needs. We study
effectiveness and efficiency in an extensive evaluation
over real-world scenarios and show that our system can
infer, store, and search millions of meta-mappings in
seconds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2019:EEG,
author = "Lijie Xu and Tian Guo and Wensheng Dou and Wei Wang
and Jun Wei",
title = "An experimental evaluation of garbage collectors on
big data applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "570--583",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303762",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Popular big data frameworks, ranging from Hadoop
MapReduce to Spark, rely on garbage-collected
languages, such as Java and Scala. Big data
applications are especially sensitive to the
effectiveness of garbage collection (i.e., GC), because
they usually process a large volume of data objects
that lead to heavy GC overhead. Lacking in-depth
understanding of GC performance has impeded performance
improvement in big data applications. In this paper, we
conduct the first comprehensive evaluation on three
popular garbage collectors, i.e., Parallel, CMS, and
G1, using four representative Spark applications. By
thoroughly investigating the correlation between these
big data applications' memory usage patterns and the
collectors' GC patterns, we obtain many findings about
GC inefficiencies. We further propose empirical
guidelines for application developers, and insightful
optimization strategies for designing big-data-friendly
garbage collectors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Guo:2019:AOC,
author = "Jinwei Guo and Peng Cai and Jiahao Wang and Weining
Qian and Aoying Zhou",
title = "Adaptive optimistic concurrency control for
heterogeneous workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "584--596",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303763",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Optimistic concurrency control (OCC) protocols
validate whether a transaction has conflicts with other
concurrent transactions after this transaction
completes its execution. In this work, we demonstrate
that the validation phase has a great influence on the
performance of modern in-memory database systems,
especially under heterogeneous workloads. The cost of
validating operations in a transaction is determined by
two main factors. The first factor is the operation
type. An OCC protocol would take much less cost on
validating a single-record read operation than
validating a key-range scan operation. The second
factor is the workload type. Existing schemes in OCC
variants for validating key-range scan perform
differently under various workloads. Although various
validation schemes share the same goal of guaranteeing
a transaction schedule to be serializable, there are
remarkable differences between the costs they
introduced. These observations motivate us to design an
optimistic concurrency control which can choose a
low-cost validation scheme at runtime, referred to as
adaptive optimistic concurrency control (AOCC). First,
at transaction-level granularity, AOCC can assign a
validation method to a transaction according to the
features of its operations. Furthermore, for each
operation in a transaction, the validation method is
selected according to not only the number of accessed
records but also the instant characteristics of
workloads. Experimental results show that AOCC has good
performance and scalability under heterogeneous
workloads mixed with point accesses and predicate
queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lin:2019:MTC,
author = "Yu-Shan Lin and Shao-Kan Pi and Meng-Kai Liao and
Ching Tsai and Aaron Elmore and Shan-Hung Wu",
title = "{MgCrab}: transaction crabbing for live migration in
deterministic database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "597--610",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303764",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent deterministic database systems have achieved
high scalability and high availability in distributed
environments given OLTP workloads. However, modern OLTP
applications usually have changing workloads or access
patterns, so how to make the resource provisioning
elastic to the changing workloads becomes an important
design goal for a deterministic database system. Live
migration, which moves the specified data from a source
machine to a destination node while continuously
serving the incoming transactions, is a key technique
required for the elasticity. In this paper, we present
MgCrab, a live migration technique for a deterministic
database system, that leverages the determinism to
maintain the consistency of data on the source and
destination nodes at very low cost during a migration
period. We implement MgCrab on an open-source database
system. Extensive experiments were conducted and the
results demonstrate the effectiveness of MgCrab.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Maiyya:2019:UCA,
author = "Sujaya Maiyya and Faisal Nawab and Divyakant Agrawal
and Amr {El Abbadi}",
title = "Unifying consensus and atomic commitment for effective
cloud data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "5",
pages = "611--623",
month = jan,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3303753.3303765",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Feb 27 14:03:31 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See errata \cite{Maiyya:2021:EUC}.",
abstract = "Data storage in the Cloud needs to be scalable and
fault-tolerant. Atomic commitment protocols such as Two
Phase Commit (2PC) provide ACID guarantees for
transactional access to sharded data and help in
achieving scalability. Whereas consensus protocols such
as Paxos consistently replicate data across different
servers and provide fault tolerance. Cloud based
datacenters today typically treat the problems of
scalability and fault-tolerance disjointedly. In this
work, we propose a unification of these two different
paradigms into one framework called Consensus and
Commitment (C\&C) framework. The C\&C framework can
model existing and well known data management protocols
as well as propose new ones. We demonstrate the
advantages of the C\&C framework by developing a new
atomic commitment protocol, Paxos Atomic Commit (PAC),
which integrates commitment with recovery in a
Paxos-like manner. We also instantiate commit protocols
from the C\&C framework catered to different Cloud data
management techniques. In particular, we propose a
novel protocol, Generalized PAC (G-PAC) that integrates
atomic commitment and fault tolerance in a cloud
paradigm involving both sharding and replication of
data. We compare the performance of G-PAC with a
Spanner-like protocol, where 2PC is used at the logical
data level and Paxos is used for consistent replication
of logical data. The experimental results highlight the
benefits of combining consensus along with commitment
into a single integrated protocol.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2019:ATC,
author = "Chenggang Wu and Vikram Sreekanti and Joseph M.
Hellerstein",
title = "Autoscaling tiered cloud storage in {Anna}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "624--638",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311881",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we describe how we extended a
distributed key--value store called Anna into an
autoscaling, multi-tier service for the cloud. In its
extended form, Anna is designed to overcome the narrow
cost-performance limitations typical of current cloud
storage systems. We describe three key aspects of
Anna's new design: multi-master selective replication
of hot keys, a vertical tiering of storage layers with
different cost-performance tradeoffs, and horizontal
elasticity of each tier to add and remove nodes in
response to load dynamics. Anna's policy engine uses
these mechanisms to balance service-level objectives
around cost, latency and fault tolerance. Experimental
results explore the behavior of Anna's mechanisms and
policy, exhibiting orders of magnitude efficiency
improvements over both commodity cloud KVS services and
research systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dignos:2019:SST,
author = "Anton Dign{\"o}s and Boris Glavic and Xing Niu and
Michael B{\"o}hlen and Johann Gamper",
title = "Snapshot semantics for temporal multiset relations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "639--652",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311882",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Snapshot semantics is widely used for evaluating
queries over temporal data: temporal relations are seen
as sequences of snapshot relations, and queries are
evaluated at each snapshot. In this work, we
demonstrate that current approaches for snapshot
semantics over interval-timestamped multiset relations
are subject to two bugs regarding snapshot aggregation
and bag difference. We introduce a novel temporal data
model based on K-relations that overcomes these bugs
and prove it to correctly encode snapshot semantics.
Furthermore, we present an efficient implementation of
our model as a database middleware and demonstrate
experimentally that our approach is competitive with
native implementations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kwashie:2019:CEE,
author = "Selasi Kwashie and Lin Liu and Jixue Liu and Markus
Stumptner and Jiuyong Li and Lujing Yang",
title = "{Certus}: an effective entity resolution approach with
graph differential dependencies {(GDDs)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "653--666",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311883",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity resolution (ER) is the problem of accurately
identifying multiple, differing, and possibly
contradicting representations of unique real-world
entities in data. It is a challenging and fundamental
task in data cleansing and data integration. In this
work, we propose graph differential dependencies (GDDs)
as an extension of the recently developed graph entity
dependencies (which are formal constraints for graph
data) to enable approximate matching of values.
Furthermore, we investigate a special discovery of GDDs
for ER by designing an algorithm for generating a
non-redundant set of GDDs in labelled data. Then, we
develop an effective ER technique, Certus, that employs
the learned GDDs for improving the accuracy of ER
results. We perform extensive empirical evaluation of
our proposals on five real-world ER benchmark datasets
and a proprietary database to test their effectiveness
and efficiency. The results from the experiments show
the discovery algorithm and Certus are efficient; and
more importantly, GDDs significantly improve the
precision of ER without considerable trade-off of
recall.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2019:EEA,
author = "Kai Han and Fei Gui and Xiaokui Xiao and Jing Tang and
Yuntian He and Zongmai Cao and He Huang",
title = "Efficient and effective algorithms for clustering
uncertain graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "667--680",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311884",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We consider the edge uncertainty in an undirected
graph and study the k-median (resp. k-center)
problems, where the goal is to partition the graph
nodes into k clusters such that the average (resp.
minimum) connection probability between each node and
its cluster's center is maximized. We analyze the
hardness of these problems, and propose algorithms that
provide considerably improved approximation guarantees
than the existing studies do. Specifically, our
algorithms offer (1 - 1/e)-approximations for the
k-median problem and (OPTck)-approximations for the
k-center problem, where OPTck is the optimal objective
function value for k-center. In addition, our
algorithms incorporate several non-trivial
optimizations that significantly enhance their
practical efficiency. Extensive experimental results
demonstrate that our algorithms considerably outperform
the existing methods on both computation efficiency and
the quality of clustering results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zou:2019:PMD,
author = "Jia Zou and Arun Iyengar and Chris Jermaine",
title = "{Pangea}: monolithic distributed storage for data
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "681--694",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311885",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Storage and memory systems for modern data analytics
are heavily layered, managing shared persistent data,
cached data, and nonshared execution data in separate
systems such as a distributed file system like HDFS, an
in-memory file system like Alluxio, and a computation
framework like Spark. Such layering introduces
significant performance and management costs. In this
paper we propose a single system called Pangea that can
manage all data---both intermediate and long-lived
data, and their buffer/caching, data placement
optimization, and failure recovery---all in one
monolithic distributed storage system, without any
layering. We present a detailed performance evaluation
of Pangea and show that its performance compares
favorably with several widely used layered systems such
as Spark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2019:SMD,
author = "Zhiwei Fan and Jianqiao Zhu and Zuyu Zhang and Aws
Albarghouthi and Paraschos Koutris and Jignesh M.
Patel",
title = "Scaling-up in-memory {Datalog} processing: observations
and techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "695--708",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311886",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recursive query processing has experienced a recent
resurgence, as a result of its use in many modern
application domains, including data integration, graph
analytics, security, program analysis, networking and
decision making. Due to the large volumes of data being
processed, several research efforts across multiple
communities have explored how to scale up recursive
queries, typically expressed in Datalog. Our experience
with these tools indicate that their performance does
not translate across domains---e.g., a tool designed
for large-scale graph analytics does not exhibit the
same performance on program-analysis tasks, and vice
versa. Starting from the above observation, we make the
following two contributions. First, we perform a
detailed experimental evaluation comparing a number of
state-of-the-art Datalog systems on a wide spectrum of
graph analytics and program-analysis tasks, and
summarize the pros and cons of existing techniques.
Second, we design and implement our own general-purpose
Datalog engine, called RecStep, on top of a parallel
single-node relational system. We outline the
techniques we applied on RecStep, as well as the
contribution of each technique to the overall
performance. Using RecStep as a baseline, we
demonstrate that it generally out-performs
state-of-the-art parallel Datalog engines on complex
and large-scale Datalog evaluation, by a 4--6X margin.
An additional insight from our work is that it is
possible to build a high-performance Datalog system on
top of a relational engine, an idea that has been
dismissed in past work.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Archer:2019:CAL,
author = "Aaron Archer and Kevin Aydin and Mohammad Hossein
Bateni and Vahab Mirrokni and Aaron Schild and Ray Yang
and Richard Zhuang",
title = "Cache-aware load balancing of data center
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "6",
pages = "709--723",
month = feb,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3311880.3311887",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Our deployment of cache-aware load balancing in the
Google web search backend reduced cache misses by $
\approx $0.5x, contributing to a double-digit
percentage increase in the throughput of our serving
clusters by relieving a bottleneck. This innovation has
benefited all production workloads since 2015, serving
billions of queries daily. A load balancer forwards
each query to one of several identical serving
replicas. The replica pulls each term's postings list
into RAM from flash, either locally or over the
network. Flash bandwidth is a critical bottleneck,
motivating an application-directed RAM cache on each
replica. Sending the same term reliably to the same
replica would increase the chance it hits cache, and
avoid polluting the other replicas' caches. However,
most queries contain multiple terms and we have to send
the whole query to one replica, so it is not possible
to achieve a perfect partitioning of terms to replicas.
We solve this via a voting scheme, whereby the load
balancer conducts a weighted vote by the terms in each
query, and sends the query to the winning replica. We
develop a multi-stage scalable algorithm to learn these
weights. We first construct a large-scale term-query
graph from logs and apply a distributed balanced graph
partitioning algorithm to cluster each term to a
preferred replica. This yields a good but simplistic
initial voting table, which we then iteratively refine
via cache simulation to capture feedback effects.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Borkowski:2019:MCR,
author = "Michael Borkowski and Christoph Hochreiner and Stefan
Schulte",
title = "Minimizing cost by reducing scaling operations in
distributed stream processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "724--737",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317316",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Elastic distributed stream processing systems are able
to dynamically adapt to changes in the workload. Often,
these systems react to the rate of incoming data, or to
the level of resource utilization, by scaling up or
down. The goal is to optimize the system's resource
usage, thereby reducing its operational cost. However,
such scaling operations consume resources on their own,
introducing a certain overhead of resource usage, and
therefore cost, for every scaling operation. In
addition, migrations caused by scaling operations
inevitably lead to brief processing gaps. Therefore, an
excessive number of scaling operations should be
avoided. We approach this problem by preventing
unnecessary scaling operations and over-compensating
reactions to short-term changes in the workload. This
allows to maintain elasticity, while also minimizing
the incurred overhead cost of scaling operations. To
achieve this, we use advanced filtering techniques from
the field of signal processing to pre-process raw
system measurements, thus mitigating superfluous
scaling operations. We perform a real-world testbed
evaluation verifying the effects, and provide a
break-even cost analysis to show the economic
feasibility of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2019:PPB,
author = "Yinjun Wu and Abdussalam Alawini and Daniel Deutch and
Tova Milo and Susan Davidson",
title = "{ProvCite}: provenance-based data citation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "738--751",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317317",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As research products expand to include structured
datasets, the challenge arises of how to automatically
generate citations to the results of arbitrary queries
against such datasets. Previous work explored this
problem in the context of conjunctive queries and views
using a Rewriting-Based Model (RBM). However, an
increasing number of scientific queries are aggregate,
e.g. statistical summaries of the underlying data, for
which the RBM cannot be easily extended. In this paper,
we show how a Provenance-Based Model (PBM) can be
leveraged to (1) generate citations to conjunctive as
well as aggregate queries and views; (2) associate
citations with individual result tuples to enable
arbitrary subsets of the result set to be cited
(fine-grained citations); and (3) be optimized to
return citations in acceptable time. Our implementation
of PBM in ProvCite shows that it not only handles a
larger class of queries and views than RBM, but can
outperform it when restricted to conjunctive views in
some cases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2019:DCF,
author = "Wenfei Fan and Ping Lu and Chao Tian and Jingren
Zhou",
title = "Deducing certain fixes to graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "752--765",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317318",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper proposes to deduce certain fixes to graphs
G based on data quality rules $\Sigma$ and ground truth
$\Gamma$ (i.e., validated attribute values and entity
matches). We fix errors detected by $\Sigma$ in G such
that the fixes are assured correct as long as $\Sigma$
and $\Gamma$ are correct. We deduce certain fixes in two
paradigms. (a) We interact with users and
``incrementally'' fix errors online. Whenever users
pick a small set V$_0$ of nodes in G, we fix all errors
pertaining to V$_0$ and accumulate ground truth in the
process. (b) Based on accumulated $\Gamma$, we repair the
entire graph G offline; while this may not correct all
errors in G, all fixes are guaranteed certain. We
develop techniques for deducing certain fixes. (1) We
define data quality rules to support conditional
functional dependencies, recursively defined keys and
negative rules on graphs, such that we can deduce fixes
by combining data repairing and object identification.
(2) We show that deducing certain fixes is
Church--Rosser, i.e., the deduction converges at the
same fixes regardless of the order of rules applied.
(3) We establish the complexity of three fundamental
problems associated with certain fixes. (4) We provide
(parallel) algorithms for deducing certain fixes online
and offline, and guarantee to reduce running time when
given more processors. Using real-life and synthetic
data, we experimentally verify the effectiveness and
scalability of our methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ceccarello:2019:SCC,
author = "Matteo Ceccarello and Andrea Pietracaprina and Geppino
Pucci",
title = "Solving $k$-center clustering (with outliers) in
{MapReduce} and streaming, almost as accurately as
sequentially",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "766--778",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317319",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Center-based clustering is a fundamental primitive for
data analysis and becomes very challenging for large
datasets. In this paper, we focus on the popular
$k$-center variant which, given a set S of points from some
metric space and a parameter $k < |S|$, requires to
identify a subset of k centers in S minimizing the
maximum distance of any point of S from its closest
center. A more general formulation, introduced to deal
with noisy datasets, features a further parameter z and
allows up to z points of S (outliers) to be disregarded
when computing the maximum distance from the centers.
We present coreset-based 2-round MapReduce algorithms
for the above two formulations of the problem, and a
1-pass Streaming algorithm for the case with outliers.
For any fixed $\epsilon > 0$, the algorithms
yield solutions whose approximation ratios are a mere
additive term $\epsilon$ away from those achievable
by the best known polynomial-time sequential
algorithms, a result that substantially improves upon
the state of the art. Our algorithms are rather simple
and adapt to the intrinsic complexity of the dataset,
captured by the doubling dimension D of the metric
space. Specifically, our analysis shows that the
algorithms become very space-efficient for the
important case of small (constant) D. These theoretical
results are complemented with a set of experiments on
real-world and synthetic datasets of up to over a
billion points, which show that our algorithms yield
better quality solutions over the state of the art
while featuring excellent scalability, and that they
also lend themselves to sequential implementations much
faster than existing ones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:EED,
author = "Xiaolan Wang and Alexandra Meliou",
title = "{Explain$3$D}: explaining disagreements in disjoint
datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "779--792",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317320",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data plays an important role in applications, analytic
processes, and many aspects of human activity. As data
grows in size and complexity, we are met with an
imperative need for tools that promote understanding
and explanations over data-related operations. Data
management research on explanations has focused on the
assumption that data resides in a single dataset, under
one common schema. But the reality of today's data is
that it is frequently unintegrated, coming from
different sources with different schemas. When
different datasets provide different answers to
semantically similar questions, understanding the
reasons for the discrepancies is challenging and cannot
be handled by the existing single-dataset solutions. In
this paper, we propose explain3D, a framework for
explaining the disagreements across disjoint datasets
(3D). Explain3D focuses on identifying the reasons for
the differences in the results of two semantically
similar queries operating on two datasets with
potentially different schemas. Our framework leverages
the queries to perform a semantic mapping across the
relevant parts of their provenance; discrepancies in
this mapping point to causes of the queries'
differences. Exploiting the queries gives explain3D an
edge over traditional schema matching and record
linkage techniques, which are query-agnostic. Our work
makes the following contributions: (1) We formalize the
problem of deriving optimal explanations for the
differences of the results of semantically similar
queries over disjoint datasets. Our optimization
problem considers two types of explanations,
provenance-based and value-based, defined over an
evidence mapping, which makes our solution
interpretable. (2) We design a 3-stage framework for
solving the optimal explanation problem. (3) We develop
a smart-partitioning optimizer that improves the
efficiency of the framework by orders of magnitude. (4)
We experiment with real-world and synthetic data to
demonstrate that explain3D can derive precise
explanations efficiently, and is superior to
alternative methods based on integration techniques and
single-dataset explanation frameworks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Won:2019:DDS,
author = "Youjip Won and Sundoo Kim and Juseong Yun and Dam
Quang Tuan and Jiwon Seo",
title = "{DASH}: database shadowing for mobile {DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "793--806",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317321",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this work, we propose Database Shadowing, or DASH,
which is a new crash recovery technique for SQLite
DBMS. DASH is a hybrid mixture of classical shadow
paging and logging. DASH addresses four major issues in
the current SQLite journal modes: the performance and
write amplification issues of the rollback mode and the
storage space requirement and tail latency issues of
the WAL mode. DASH exploits two unique characteristics
of SQLite: the database files are small and the
transactions are entirely serialized. DASH consists of
three key ingredients Aggregate Update, Atomic Exchange
and Version Reset. Aggregate Update eliminates the
redundant write overhead and the requirement to
maintain multiple snapshots both of which are inherent
in the out-of-place update. Atomic Exchange resolves
the overhead of updating the locations of individual
database pages exploiting order-preserving nature of
the metadata update operation in modern filesystem.
Version Reset makes the result of the Atomic Exchange
durable without relying on expensive filesystem
journaling. The salient aspect of DASH lies in its
simplicity and compatibility with the legacy. DASH does
not require any modifications in the underlying
filesystem or the database organization. It requires
only 451 LOC to implement. In Cyclomatic Complexity
score, which represents software complexity, DASH
renders 33\% lower (simpler) mark than PERSIST and WAL
modes of SQLite. We implement DASH for SQLite on
Android and extensively evaluate it on widely used
smartphone devices. DASH yields 4x performance gain
over PERSIST mode (default journaling mode). Compared
to WAL mode (the fastest journaling mode), DASH uses
only 2.5\% of the storage space on average. The
transaction latency of DASH at 99.9\% is one fourth of
that of WAL mode.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:AGL,
author = "Zeke Wang and Kaan Kara and Hantian Zhang and Gustavo
Alonso and Onur Mutlu and Ce Zhang",
title = "Accelerating generalized linear models with
{MLWeaving}: a one-size-fits-all system for
any-precision learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "807--821",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317322",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Learning from the data stored in a database is an
important function increasingly available in relational
engines. Methods using lower precision input data are
of special interest given their overall higher
efficiency. However, in databases, these methods have a
hidden cost: the quantization of the real value into a
smaller number is an expensive step. To address this
issue, we present MLWeaving, a data structure and
hardware acceleration technique intended to speed up
learning of generalized linear models over low
precision data. MLWeaving provides a compact in-memory
representation that enables the retrieval of data at
any level of precision. MLWeaving also provides a
highly efficient implementation of stochastic gradient
descent on FPGAs and enables the dynamic tuning of
precision, instead of using a fixed precision level
during learning. Experimental results show that
MLWeaving converges up to 16x faster than
low-precision implementations of first-order methods on
CPUs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jankov:2019:DRC,
author = "Dimitrije Jankov and Shangyu Luo and Binhang Yuan and
Zhuhua Cai and Jia Zou and Chris Jermaine and Zekai J.
Gao",
title = "Declarative recursive computation on an {RDBMS}: or,
why you should use a database for distributed machine
learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "7",
pages = "822--835",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3317315.3317323",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 20 17:32:19 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A number of popular systems, most notably Google's
TensorFlow, have been implemented from the ground up to
support machine learning tasks. We consider how to make
a very small set of changes to a modern relational
database management system (RDBMS) to make it suitable
for distributed learning computations. Changes include
adding better support for recursion, and optimization
and execution of very large compute plans. We also show
that there are key advantages to using an RDBMS as a
machine learning platform. In particular, learning
based on a database management system allows for
trivial scaling to large data sets and especially large
models, where different computational units operate on
different parts of a model that may be too large to fit
into RAM.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ghandeharizadeh:2019:DIE,
author = "Shahram Ghandeharizadeh and Hieu Nguyen",
title = "Design, implementation, and evaluation of write-back
policy with cache augmented data stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "836--849",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324302",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Cache Augmented Data Store (CADS) architecture
extends a persistent data store with an in-memory cache
manager. It is widely deployed to support
read-intensive workloads. However, its write-around and
write-through policies prevent the caching tier from
absorbing write load. This means the data store layer
must scale to process writes even when the extra
capacity is not needed for read load. We address this
limitation by devising a write-back technique to enable
the caching layer to process both reads and writes.
This technique preserves ACID transactions. We present
a client side implementation of write-back and evaluate
it using the YCSB, BG, and TPC-C benchmarks. In
addition, we compare our write-back with (a) write-back
policy of a data store such as MongoDB and (b)
write-back policy of a host-side cache such as
Flashcache.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nguyen:2019:UGE,
author = "Thanh Tam Nguyen and Matthias Weidlich and Hongzhi Yin
and Bolong Zheng and Quoc Viet Hung Nguyen and Bela
Stantic",
title = "User guidance for efficient fact checking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "850--863",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324303",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Web constitutes a valuable source of information.
In recent years, it fostered the construction of
large-scale knowledge bases, such as Freebase, YAGO,
and DBpedia. The open nature of the Web, with content
potentially being generated by everyone, however, leads
to inaccuracies and misinformation. Construction and
maintenance of a knowledge base thus has to rely on
fact checking, an assessment of the credibility of
facts. Due to an inherent lack of ground truth
information, such fact checking cannot be done in a
purely automated manner, but requires human
involvement. In this paper, we propose a comprehensive
framework to guide users in the validation of facts,
striving for a minimisation of the invested effort. Our
framework is grounded in a novel probabilistic model
that combines user input with automated credibility
inference. Based thereon, we show how to guide users in
fact checking by identifying the facts for which
validation is most beneficial. Moreover, our framework
includes techniques to reduce the manual effort
invested in fact checking by determining when to stop
the validation and by supporting efficient batching
strategies. We further show how to handle fact checking
in a streaming setting. Our experiments with three
real-world datasets demonstrate the efficiency and
effectiveness of our framework: A knowledge base of
high quality, with a precision of above 90\%, is
constructed with only a half of the validation effort
required by baseline techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ke:2019:DCR,
author = "Xiangyu Ke and Arijit Khan and Leroy Lim Hong Quan",
title = "An in-depth comparison of $s$--$t$ reliability
algorithms over uncertain graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "864--876",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324304",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Uncertain, or probabilistic, graphs have been
increasingly used to represent noisy linked data in
many emerging applications, and have recently attracted
the attention of the database research community. A
fundamental problem on uncertain graphs is the s-t
reliability, which measures the probability that a
target node t is reachable from a source node s in a
probabilistic (or uncertain) graph, i.e., a graph where
every edge is assigned a probability of existence. Due
to the inherent complexity of the s-t reliability
estimation problem (\#P-hard), various sampling and
indexing based efficient algorithms were proposed in
the literature. However, since they have not been
thoroughly compared with each other, it is not clear
whether the later algorithm outperforms the earlier
ones. More importantly, the comparison framework,
datasets, and metrics were often not consistent (e.g.,
different convergence criteria were employed to find
the optimal number of samples) across these works. We
address this serious concern by re-implementing six
state-of-the-art s-t reliability estimation methods in
a common system and code base, using several medium and
large-scale, real-world graph datasets, identical
evaluation metrics, and query workloads. Through our
systematic and in-depth analysis of experimental
results, we report surprising findings, such as many
follow-up algorithms can actually be several orders of
magnitude inefficient, less accurate, and more memory
intensive compared to the ones that were proposed
earlier. We conclude by discussing our recommendations
on the road ahead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2019:DSP,
author = "Wenfei Fan and Chunming Hu and Muyang Liu and Ping Lu
and Qiang Yin and Jingren Zhou",
title = "Dynamic scaling for parallel graph computations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "877--890",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324305",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper studies scaling out/in to cope with load
surges. Given a graph G that is vertex-partitioned and
distributed across n processors, it is to add (resp.
remove) k processors and re-distribute G across n + k
(resp. n - k) processors such that the load among
the processors is balanced, and its replication factor
and migration cost are minimized. We show that this
tri-criteria optimization problem is intractable, even
when k is a constant and when either load balancing or
minimum migration is not required. Nonetheless, we
propose two parallel solutions to dynamic scaling. One
consists of approximation algorithms by extending
consistent hashing. Given a load balancing factor above
a lower bound, the algorithms guarantee provable bounds
on both replication factor and migration cost. The
other is a generic scaling scheme. Given any existing
vertex-partitioner VP of users' choice, it adaptively
scales VP in and out such that it incurs minimum
migration cost, and ensures balance and replication
factors within a bound relative to that of VP. Using
real-life and synthetic graphs, we experimentally
verify the efficiency, effectiveness and scalability of
the solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2019:TTR,
author = "Dongsheng Li and Yiming Zhang and Jinyan Wang and
Kian-Lee Tan",
title = "{TopoX}: topology refactorization for efficient graph
partitioning and processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "891--905",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324306",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional graph partitioning methods attempt to both
minimize communication cost and guarantee load
balancing in computation. However, the skewed degree
distribution of natural graphs makes it difficult to
simultaneously achieve the two objectives. This paper
proposes topology refactorization (TR), a
topology-aware method allowing graph-parallel systems
to separately handle the two objectives:
refactorization is mainly focused on reducing
communication cost, and partitioning is mainly targeted
for balancing the load. TR transforms a skewed graph
into a more communication-efficient topology through
fusion and fission, where the fusion operation
organizes a set of neighboring low-degree vertices into
a super-vertex, and the fission operation splits a
high-degree vertex into a set of sibling sub-vertices.
Based on TR, we design an efficient graph-parallel
system (TopoX) which pipelines refactorization with
partitioning to both reduce communication cost and
balance computation load. Prototype evaluation shows
that TopoX outperforms state-of-the-art PowerLyra by up
to 78.5\% (from 37.2\%) on real-world graphs and is
significantly faster than other graph-parallel systems,
while only introducing small refactorization overhead
and memory consumption.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Avdiukhin:2019:MDB,
author = "Dmitrii Avdiukhin and Sergey Pupyrev and Grigory
Yaroslavtsev",
title = "Multi-dimensional balanced graph partitioning via
projected gradient descent",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "906--919",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324307",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Motivated by performance optimization of large-scale
graph processing systems that distribute the graph
across multiple machines, we consider the balanced
graph partitioning problem. Compared to most of the
previous work, we study the multi-dimensional variant
in which balance according to multiple weight functions
is required. As we demonstrate by experimental
evaluation, such multi-dimensional balance is essential
for achieving performance improvements for typical
distributed graph processing workloads. We propose a
new scalable technique for the multidimensional
balanced graph partitioning problem. It is based on
applying randomized projected gradient descent to a
non-convex continuous relaxation of the objective. We
show how to implement the new algorithm efficiently in
both theory and practice utilizing various approaches
for the projection step. Experiments with large-scale
graphs containing up to hundreds of billions of edges
indicate that our algorithm has superior performance
compared to the state of the art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2019:EDS,
author = "Lei Cao and Yizhou Yan and Samuel Madden and Elke A.
Rundensteiner and Mathan Gopalsamy",
title = "Efficient discovery of sequence outlier patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "920--932",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324308",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
  abstract =     "Modern Internet of Things (IoT) applications
generate massive amounts of time-stamped data, much of
it in the form of discrete, symbolic sequences. In this
work, we present a new system called TOP that deTects
Outlier Patterns from these sequences. To solve the
fundamental limitation of existing pattern mining
semantics that miss outlier patterns hidden inside of
larger frequent patterns, TOP offers new pattern
semantics based on contextual patterns that distinguish
the independent occurrence of a pattern from its
occurrence as part of its super-pattern. We present
efficient algorithms for the mining of this new class
of contextual patterns. In particular, in contrast to
the bottom-up strategy for state-of-the-art pattern
mining techniques, our top-down Reduce strategy piggy
backs pattern detection with the detection of the
context in which a pattern occurs. Our approach
achieves linear time complexity in the length of the
input sequence. Effective optimization techniques such
as context-driven search space pruning and inverted
index-based outlier pattern detection are also proposed
to further speed up contextual pattern mining. Our
experimental evaluation demonstrates the effectiveness
of TOP at capturing meaningful outlier patterns in
several real-world IoT use cases. We also demonstrate
the efficiency of TOP, showing it to be up to 2 orders
of magnitude faster than adapting state-of-the-art
mining to produce this new class of contextual outlier
patterns, allowing us to scale outlier pattern mining
to large sequence datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bogatov:2019:CEO,
author = "Dmytro Bogatov and George Kollios and Leonid Reyzin",
title = "A comparative evaluation of order-revealing encryption
schemes and secure range-query protocols",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "8",
pages = "933--947",
month = apr,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3324301.3324309",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database query evaluation over encrypted data can
allow database users to maintain the privacy of their
data while outsourcing data processing.
Order-Preserving Encryption (OPE) and Order-Revealing
Encryption (ORE) were designed to enable efficient
query execution, but provide only partial privacy. More
private protocols, based on Searchable Symmetric
Encryption (SSE), Oblivious RAM (ORAM) or custom
encrypted data structures, have also been designed. In
this paper, we develop a framework to provide the first
comprehensive comparison among a number of range query
protocols that ensure varying levels of privacy of user
data. We evaluate five ORE-based and five generic range
query protocols. We analyze and compare them both
theoretically and experimentally and measure their
performance over database indexing and query
evaluation. We report not only execution time but also
I/O performance, communication amount, and usage of
cryptographic primitive operations. Our comparison
reveals some interesting insights concerning the
relative security and performance of these approaches
in database settings.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Orakzai:2019:HFM,
author = "Faisal Orakzai and Toon Calders and Torben Bach
Pedersen",
title = "$ k / 2$-hop: fast mining of convoy patterns with
effective pruning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "948--960",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329773",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increase of devices equipped with location
sensors, mining spatio-temporal data for interesting
behavioral patterns has gained attention in recent
years. One of such well-known patterns is the convoy
pattern which can be used, e.g., to find groups of
people moving together in public transport or to
prevent traffic jams. A convoy consists of at least m
objects moving together for at least k consecutive time
instants where m and k are user-defined parameters.
Convoy mining is an expensive task and existing
sequential algorithms do not scale to real-life dataset
sizes. Existing sequential as well as parallel
algorithms require a complex set of data-dependent
parameters which are hard to set and tune. Therefore,
in this paper, we propose a new fast exact sequential
convoy pattern mining algorithm ``k/2-hop'' that is
free of data-dependent parameters. The proposed
algorithm processes the data corresponding to a few
specific key timestamps at each step and quickly prunes
objects with no possibility of forming a convoy. Thus,
only a very small portion of the complete dataset is
considered for mining convoys. Our experimental results
show that k/2-hop outperforms existing sequential as
well as parallel convoy pattern mining algorithms by
orders of magnitude, and scales to larger datasets
which existing algorithms fail on.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2019:BAD,
author = "Ji Sun and Zeyuan Shang and Guoliang Li and Dong Deng
and Zhifeng Bao",
title = "Balance-aware distributed string similarity-based
query processing system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "961--974",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329774",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data analysts spend more than 80\% of time on data
cleaning and integration in the whole process of data
analytics due to data errors and inconsistencies.
Similarity-based query processing is an important way
to tolerate the errors and inconsistencies. However,
similarity-based query processing is rather costly and
traditional database cannot afford such expensive
requirement. In this paper, we develop a distributed
in-memory similarity-based query processing system
called Dima. Dima supports four core similarity
operations, i.e., similarity selection, similarity
join, top- k selection and top- k join. Dima extends
SQL for users to easily invoke these similarity-based
operations in their data analysis tasks. To avoid
expensive data transmission in a distributed
environment, we propose balance-aware signatures where
two records are similar if they share common
signatures, and we can adaptively select the signatures
to balance the workload. Dima builds signature-based
global indexes and local indexes to support similarity
operations. Since Spark is one of the widely adopted
distributed in-memory computing systems, we have
seamlessly integrated Dima into Spark and developed
effective query optimization techniques in Spark. To
the best of our knowledge, this is the first
full-fledged distributed in-memory system that can
support complex similarity-based query processing on
large-scale datasets. We have conducted extensive
experiments on four real-world datasets. Experimental
results show that Dima outperforms state-of-the-art
studies by 1--3 orders of magnitude and has good
scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ruan:2019:FGS,
author = "Pingcheng Ruan and Gang Chen and Tien Tuan Anh Dinh
and Qian Lin and Beng Chin Ooi and Meihui Zhang",
title = "Fine-grained, secure and efficient data provenance on
blockchain systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "975--988",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329775",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The success of Bitcoin and other cryptocurrencies
brings enormous interest to blockchains. A blockchain
system implements a tamper-evident ledger for recording
transactions that modify some global states. The system
captures entire evolution history of the states. The
management of that history, also known as data
provenance or lineage, has been studied extensively in
database systems. However, querying data history in
existing blockchains can only be done by replaying all
transactions. This approach is applicable to
large-scale, offline analysis, but is not suitable for
online transaction processing. We present LineageChain,
a fine-grained, secure and efficient provenance system
for blockchains. LineageChain exposes provenance
information to smart contracts via simple and elegant
interfaces, thereby enabling a new class of blockchain
applications whose execution logics depend on
provenance information at runtime. LineageChain
captures provenance during contract execution, and
efficiently stores it in a Merkle tree. LineageChain
provides a novel skip list index designed for
supporting efficient provenance query processing. We
have implemented LineageChain on top of Hyperledger and
a blockchain-optimized storage system called ForkBase.
Our extensive evaluation of LineageChain demonstrates
its benefits to the new class of blockchain
applications, its efficient query, and its small
storage overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Choi:2019:PTK,
author = "Dalsu Choi and Chang-Sup Park and Yon Dohn Chung",
title = "Progressive top-$k$ subarray query processing in array
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "989--1001",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329776",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Unprecedented amounts of multidimensional array data
are currently being generated in many fields. These
multidimensional array data naturally and efficiently
fit into the array data model, and many array
management systems based on the array data model have
appeared. Accordingly, the requirement for data
exploration methods for large multidimensional array
data has also increased. In this paper, we propose a
method for efficient top- k subarray query processing
in array databases, which is one of the most important
query types for exploring multidimensional data. First,
we define novel top- k query models for array
databases: overlap-allowing and disjoint top- k
subarray queries. Second, we propose a suite of top- k
subarray query processing methods, called PPTS and
extend them to distributed processing. Finally, we
present the results of extensive experiments using real
datasets from an array database, which show that our
proposed methods outperform existing na{\"\i}ve
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hoffmann:2019:MLC,
author = "Moritz Hoffmann and Andrea Lattuada and Frank
McSherry",
title = "{Megaphone}: latency-conscious state migration for
distributed streaming dataflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "1002--1015",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329777",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We design and implement Megaphone, a data migration
mechanism for stateful distributed dataflow engines
with latency objectives. When compared to existing
migration mechanisms, Megaphone has the following
differentiating characteristics: (i) migrations can be
subdivided to a configurable granularity to avoid
latency spikes, and (ii) migrations can be prepared
ahead of time to avoid runtime coordination. Megaphone
is implemented as a library on an unmodified timely
dataflow implementation, and provides an operator
interface compatible with its existing APIs. We
evaluate Megaphone on established benchmarks with
varying amounts of state and observe that compared to
na{\"\i}ve approaches Megaphone reduces service
latencies during reconfiguration by orders of magnitude
without significantly increasing steady-state
overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tam:2019:ADR,
author = "Nguyen Thanh Tam and Matthias Weidlich and Bolong
Zheng and Hongzhi Yin and Nguyen Quoc Viet Hung and
Bela Stantic",
title = "From anomaly detection to rumour detection using data
streams of social platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "1016--1029",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329778",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Social platforms became a major source of rumours.
While rumours can have severe real-world implications,
their detection is notoriously hard: Content on social
platforms is short and lacks semantics; it spreads
quickly through a dynamically evolving network; and
without considering the context of content, it may be
impossible to arrive at a truthful interpretation.
Traditional approaches to rumour detection, however,
exploit solely a single content modality, e.g., social
media posts, which limits their detection accuracy. In
this paper, we cope with the aforementioned challenges
by means of a multi-modal approach to rumour detection
that identifies anomalies in both, the entities (e.g.,
users, posts, and hashtags) of a social platform and
their relations. Based on local anomalies, we show how
to detect rumours at the network level, following a
graph-based scan approach. In addition, we propose
incremental methods, which enable us to detect rumours
using streaming data of social platforms. We illustrate
the effectiveness and efficiency of our approach with a
real-world dataset of 4M tweets with more than 1000
rumours.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gupta:2019:OIT,
author = "Peeyush Gupta and Yin Li and Sharad Mehrotra and Nisha
Panwar and Shantanu Sharma and Sumaya Almanee",
title = "{Obscure}: information-theoretic oblivious and
verifiable aggregation queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "1030--1043",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329779",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite extensive research on cryptography, secure and
efficient query processing over outsourced data remains
an open challenge. We develop communication-efficient
and information-theoretically secure algorithms for
privacy-preserving aggregation queries using
multi-party computation (MPC). Specifically, query
processing techniques over secret-shared data
outsourced by single or multiple database owners are
developed. These algorithms allow a user to execute
queries on the secret-shared database and also prevent
the network and the (adversarial) clouds from learning the
user's queries, results, or the database. We further
develop (non-mandatory) privacy-preserving result
verification algorithms that detect malicious
behaviors, and experimentally validate the efficiency
of our approach over large datasets, the size of which
prior approaches to secret-sharing or MPC systems have
not scaled to.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Dutt:2019:SER,
author = "Anshuman Dutt and Chi Wang and Azade Nazi and Srikanth
Kandula and Vivek Narasayya and Surajit Chaudhuri",
title = "Selectivity estimation for range predicates using
lightweight models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "9",
pages = "1044--1057",
month = may,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3329772.3329780",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:01 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimizers depend on selectivity estimates of
query predicates to produce a good execution plan. When
a query contains multiple predicates, today's
optimizers use a variety of assumptions, such as
independence between predicates, to estimate
selectivity. While such techniques have the benefit of
fast estimation and small memory footprint, they often
incur large selectivity estimation errors. In this
work, we reconsider selectivity estimation as a
regression problem. We explore application of neural
networks and tree-based ensembles to the important
problem of selectivity estimation of multi-dimensional
range predicates. While their straightforward
application does not outperform even simple baselines,
we propose two simple yet effective design choices,
i.e., regression label transformation and feature
engineering, motivated by the selectivity estimation
context. Through extensive empirical evaluation across
a variety of datasets, we show that the proposed models
deliver both highly accurate estimates as well as fast
estimation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yuan:2019:CSP,
author = "Ye Yuan and Xiang Lian and Guoren Wang and Yuliang Ma
and Yishu Wang",
title = "Constrained shortest path query in a large
time-dependent graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1058--1070",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339491",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The constrained shortest path (CSP) query over static
graphs has been extensively studied, since it has wide
applications in transportation networks,
telecommunication networks, etc. Such networks are
dynamic and evolve over time, being modeled as
time-dependent graphs. Therefore, in this paper, we
study the CSP query over a large time-dependent graph.
Specifically, we study the point CSP (PCSP) query and
interval CSP (ICSP) query. We formally prove that it is
NP-complete to process a PCSP query and at least
EXPSPACE to answer an ICSP query. We propose
approximate sequential algorithms to answer the PCSP
and ICSP queries efficiently. We also develop parallel
algorithms for the queries that guarantee to scale with
big time-dependent graphs. Using real-life graphs, we
experimentally verify the efficiency and scalability of
our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2019:FTC,
author = "Lingyang Chu and Zhefeng Wang and Jian Pei and Yanyan
Zhang and Yu Yang and Enhong Chen",
title = "Finding theme communities from database networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1071--1084",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339492",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a database network where each vertex is
associated with a transaction database, we are
interested in finding theme communities. Here, a theme
community is a cohesive subgraph such that a common
pattern is frequent in all transaction databases
associated with the vertices in the subgraph. Finding
all theme communities from a database network enjoys
many novel applications. However, it is challenging
since even counting the number of all theme communities
in a database network is \#P-hard. Inspired by the
observation that a theme community shrinks when the
length of the pattern increases, we investigate several
properties of theme communities and develop TCFI, a
scalable algorithm that uses these properties to
effectively prune the patterns that cannot form any
theme community. We also design TC-Tree, a scalable
algorithm that decomposes and indexes theme communities
efficiently. Retrieving a ranked list of theme
communities from a TC-Tree of hundreds of millions of
theme communities takes less than 1 second. Extensive
experiments and a case study demonstrate the
effectiveness and scalability of TCFI and TC-Tree in
discovering and querying meaningful theme communities
from large database networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pan:2019:RSB,
author = "James J. Pan and Guoliang Li and Juntao Hu",
title = "{Ridesharing}: simulator, benchmark, and evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1085--1098",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339493",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Ridesharing is becoming a popular mode of
transportation with profound effects on the industry.
Recent algorithms for vehicle-to-customer matching have
been developed; yet cross-study evaluations of their
performance and applicability to real-world ridesharing
are lacking. Evaluation is complicated by the online
and real-time nature of the ridesharing problem. In
this paper, we develop a simulator for evaluating
ridesharing algorithms, and we provide a set of
benchmarks to test a wide range of scenarios
encountered in the real world. These scenarios include
different road networks, different numbers of vehicles,
larger scales of customer requests, and others. We
apply the benchmarks to several state-of-the-art search
and join based ridesharing algorithms to demonstrate
the usefulness of the simulator and the benchmarks. We
find quickly-computable heuristics outperforming other
more complex methods, primarily due to faster
computation speed. Our work points the direction for
designing and evaluating future ridesharing
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lai:2019:DSM,
author = "Longbin Lai and Zhu Qing and Zhengyi Yang and Xin Jin
and Zhengmin Lai and Ran Wang and Kongzhang Hao and
Xuemin Lin and Lu Qin and Wenjie Zhang and Ying Zhang
and Zhengping Qian and Jingren Zhou",
title = "Distributed subgraph matching on timely dataflow",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1099--1112",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339494",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently there emerge many distributed algorithms that
aim at solving subgraph matching at scale. Existing
algorithm-level comparisons failed to provide a
systematic view of distributed subgraph matching mainly
due to the intertwining of strategy and optimization.
In this paper, we identify four strategies and three
general-purpose optimizations from representative
state-of-the-art algorithms. We implement the four
strategies with the optimizations based on the common
Timely dataflow system for systematic strategy-level
comparison. Our implementation covers all
representative algorithms. We conduct extensive
experiments for both unlabelled matching and labelled
matching to analyze the performance of distributed
subgraph matching under various settings, which is
finally summarized as a practical guide.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qiao:2019:HDS,
author = "Shi Qiao and Adrian Nicoara and Jin Sun and Marc
Friedman and Hiren Patel and Jaliya Ekanayake",
title = "Hyper dimension shuffle: efficient data repartition at
petabyte scale in {SCOPE}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1113--1125",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339495",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In distributed query processing, data shuffle is one
of the most costly operations. We examined scaling
limitations to data shuffle that current systems and
the research literature do not solve. As the number of
input and output partitions increases, na{\"\i}ve
shuffling will result in high fan-out and fan-in. There
are practical limits to fan-out, as a consequence of
limits on memory buffers, network ports and I/O
handles. There are practical limits to fan-in because
it multiplies the communication errors due to faults in
commodity clusters impeding progress. Existing
solutions that limit fan-out and fan-in do so at the
cost of scaling quadratically in the number of nodes in
the data flow graph. This dominates the costs of
shuffling large datasets. We propose a novel algorithm
called Hyper Dimension Shuffle that we have introduced
in production in SCOPE, Microsoft's internal big data
analytics system. Hyper Dimension Shuffle is inspired
by the divide and conquer concept, and utilizes a
recursive partitioner with intermediate aggregations.
It yields quasilinear complexity of the shuffling graph
with tight guarantees on fan-out and fan-in. We
demonstrate how it avoids the shuffling graph blow-up
of previous algorithms to shuffle at petabyte-scale
efficiently on both synthetic benchmarks and real
applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cormode:2019:ARQ,
author = "Graham Cormode and Tejas Kulkarni and Divesh
Srivastava",
title = "Answering range queries under local differential
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1126--1138",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339496",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Counting the fraction of a population having an input
within a specified interval i.e. a range query, is a
fundamental data analysis primitive. Range queries can
also be used to compute other core statistics such as
quantiles, and to build prediction models. However,
frequently the data is subject to privacy concerns when
it is drawn from individuals, and relates for example
to their financial, health, religious or political
status. In this paper, we introduce and analyze methods
to support range queries under the local variant of
differential privacy [23], an emerging standard for
privacy-preserving data analysis. The local model
requires that each user releases a noisy view of her
private data under a privacy guarantee. While many
works address the problem of range queries in the
trusted aggregator setting, this problem has not been
addressed specifically under untrusted aggregation
(local DP) model even though many primitives have been
developed recently for estimating a discrete
distribution. We describe and analyze two classes of
approaches for range queries, based on hierarchical
histograms and the Haar wavelet transform. We show that
both have strong theoretical accuracy guarantees on
variance. In practice, both methods are fast and
require minimal computation and communication
resources. Our experiments show that the wavelet
approach is most accurate in high privacy settings,
while the hierarchical approach dominates for weaker
privacy requirements.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:VPB,
author = "Kai Wang and Xuemin Lin and Lu Qin and Wenjie Zhang
and Ying Zhang",
title = "Vertex priority based butterfly counting for
large-scale bipartite networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1139--1152",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339497",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Bipartite networks are of great importance in many
real-world applications. In bipartite networks,
butterfly (i.e., a complete $2 \times 2$ biclique) is the
smallest non-trivial cohesive structure and plays a key
role. In this paper, we study the problem of efficiently
counting the number of butterflies in bipartite
networks. The most advanced techniques are based on
enumerating wedges which is the dominant cost of
counting butterflies. Nevertheless, the existing
algorithms cannot efficiently handle large-scale
bipartite networks. This becomes a bottleneck in
large-scale applications. In this paper, instead of the
existing layer-priority-based techniques, we propose a
vertex-priority-based paradigm BFC-VP to enumerate much
fewer wedges; this leads to a significant improvement
of the time complexity of the state-of-the-art
algorithms. In addition, we present cache-aware
strategies to further improve the time efficiency while
theoretically retaining the time complexity of BFC-VP.
Moreover, we also show that our proposed techniques can
work efficiently in external and parallel contexts. Our
extensive empirical studies demonstrate that the
proposed techniques can speed up the state-of-the-art
techniques by up to two orders of magnitude for the
real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2019:BVS,
author = "Yang Cao and Wenfei Fan and Tengfei Yuan",
title = "Block as a value for {SQL} over {NoSQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1153--1166",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339498",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper presents Zidian, a middleware for
key--value (KV) stores to speed up SQL query evaluation
over NoSQL. As opposed to common practice that takes a
tuple id or primary key as key and the entire tuple as
value, Zidian proposes a block-as-a-value model BaaV.
BaaV represents a relation as keyed blocks ( k, B ),
where k is a key of a block (a set) B of partial
tuples. We extend relational algebra to BaaV. We show
that under BaaV, Zidian substantially reduces data
access and communication cost. We provide
characterizations (sufficient and necessary conditions)
for (a) result-preserving queries, i.e., queries
covered by available BaaV stores, (b) scan-free
queries, i.e., queries that can be evaluated without
scanning any table, and (c) bounded queries, i.e.,
queries that can be answered by accessing a bounded
amount of data. We show that in parallel processing,
Zidian guarantees (a) no scans for scan-free queries,
(b) bounded communication cost for bounded queries; and
(c) parallel scalability, i.e., speed up when adding
processors. Moreover, Zidian can be plugged into
existing SQL-over-NoSQL systems and retains horizontal
scalability. Using benchmark and real-life data, we
empirically verify that Zidian improves existing
SQL-over-NoSQL systems by 2 orders of magnitude on
average.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tangwongsan:2019:OGO,
author = "Kanat Tangwongsan and Martin Hirzel and Scott
Schneider",
title = "Optimal and general out-of-order sliding-window
aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1167--1180",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339499",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Sliding-window aggregation derives a user-defined
summary of the most-recent portion of a data stream.
For in-order streams, each window change can be handled
in O (1) time even when the aggregation operator is not
invertible. But streaming data often arrive inherently
out-of-order, e.g., due to clock drifts and
communication delays. For such streams, prior work
resorted to latency-prone buffering or spent O (log n )
time for every window change, where n is the
instantaneous window size. This paper presents FiBA, a
novel real-time sliding window aggregation algorithm
that optimally handles streams of varying degrees of
out-of-orderness. FiBA is as general as the
state-of-the-art and supports variable-sized windows.
An insert or evict takes amortized O (log d ) time,
where d is the distance of the change to the window's
boundary. This means O (1) time for in-order arrivals
and nearly O (1) time for slightly out-of-order
arrivals, tending to O (log n ) time for the most
severely out-of-order arrivals. We also prove a
matching lower bound, showing optimality. At its heart,
the algorithm combines and extends finger searching,
lazy rebalancing, and position-aware partial
aggregates. Further, FiBA can answer range queries that
aggregate subwindows for window sharing. Finally, our
experiments show that FiBA performs well in practice
and conforms to the theoretical findings, with
significantly higher throughput than O (log n )
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tang:2019:CTR,
author = "Bo Tang and Kyriakos Mouratidis and Man Lung Yiu and
Zhenyu Chen",
title = "Creating top ranking options in the continuous option
and preference space",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1181--1194",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339500",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Top- k queries are extensively used to retrieve the k
most relevant options (e.g., products, services,
accommodation alternatives, etc) based on a weighted
scoring function that captures user preferences. In
this paper, we take the viewpoint of a business owner
who plans to introduce a new option to the market, with
a certain type of clientele in mind. Given a target
region in the consumer spectrum, we determine what
attribute values the new option should have, so that it
ranks among the top- k for any user in that region. Our
methodology can also be used to improve an existing
option, at the minimum modification cost, so that it
ranks consistently high for an intended type of
customers. This is the first work on competitive option
placement where no distinct user(s) are targeted, but a
general clientele type, i.e., a continuum of possible
preferences. Here also lies our main challenge (and
contribution), i.e., dealing with the interplay between
two continuous spaces: the targeted region in the
preference spectrum, and the option domain (where the
new option will be placed). At the core of our
methodology lies a novel and powerful interlinking
between the two spaces. Our algorithms offer exact
answers in practical response times, even for the
largest of the standard benchmark datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ma:2019:OBE,
author = "Hanchao Ma and Morteza Alipourlangouri and Yinghui Wu
and Fei Chiang and Jiaxing Pi",
title = "Ontology-based entity matching in attributed graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1195--1207",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339501",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Keys for graphs incorporate the topology and value
constraints needed to uniquely identify entities in a
graph. They have been studied to support object
identification, knowledge fusion, and social network
reconciliation. Existing key constraints identify
entities as the matches of a graph pattern by subgraph
isomorphism, which enforce label equality on node
types. These constraints can be too restrictive to
characterize structures and node labels that are
syntactically different but semantically equivalent. We
propose a new class of key constraints, Ontological
Graph Keys (OGKs) that extend conventional graph keys
by ontological subgraph matching between entity labels
and an external ontology. We show that the implication
and validation problems for OGKs are each NP-complete.
To reduce the entity matching cost, we also provide an
algorithm to compute a minimal cover for OGKs. We then
study the entity matching problem with OGKs, and a
practical variant with a budget on the matching cost.
We develop efficient algorithms to perform entity
matching based on a (budgeted) Chase procedure. Using
real-world graphs, we experimentally verify the
efficiency and accuracy of OGK-based entity matching.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chen:2019:RTD,
author = "Lu Chen and Yunjun Gao and Ziquan Fang and Xiaoye Miao
and Christian S. Jensen and Chenjuan Guo",
title = "Real-time distributed co-movement pattern detection on
streaming trajectories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1208--1220",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339502",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the widespread deployment of mobile devices with
positioning capabilities, increasingly massive volumes
of trajectory data are being collected that capture the
movements of people and vehicles. This data enables
co-movement pattern detection, which is important in
applications such as trajectory compression and
future-movement prediction. Existing co-movement
pattern detection studies generally consider historical
data and thus propose offline algorithms. However,
applications such as future movement prediction need
real-time processing over streaming trajectories. Thus,
we investigate real-time distributed co-movement
pattern detection over streaming trajectories. Existing
off-line methods assume that all data is available when
the processing starts. Nevertheless, in a streaming
setting, unbounded data arrives in real time, making
pattern detection challenging. To this end, we propose
a framework based on Apache Flink, which is designed
for efficient distributed streaming data processing.
The framework encompasses two phases: clustering and
pattern enumeration. To accelerate the clustering, we
use a range join based on two-layer indexing, and
provide techniques that eliminate unnecessary
verifications. To perform pattern enumeration
efficiently, we present two methods FBA and VBA that
utilize id-based partitioning. When coupled with bit
compression and candidate-based enumeration techniques,
we reduce the enumeration cost from exponential to
linear. Extensive experiments offer insight into the
efficiency of the proposed framework and its
constituent techniques compared with existing
methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tan:2019:IIB,
author = "Jian Tan and Tieying Zhang and Feifei Li and Jie Chen
and Qixing Zheng and Ping Zhang and Honglin Qiao and
Yue Shi and Wei Cao and Rui Zhang",
title = "{iBTune}: individualized buffer tuning for large-scale
cloud databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "10",
pages = "1221--1234",
month = jun,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3339490.3339503",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Tuning the buffer size appropriately is critical to
the performance of a cloud database, since memory is
usually the resource bottleneck. For large-scale
databases supporting heterogeneous applications,
configuring the individual buffer sizes for a
significant number of database instances presents a
scalability challenge. Manual optimization is neither
efficient nor effective, and even not feasible for
large cloud clusters, especially when the workload may
dynamically change on each instance. The difficulty
lies in the fact that each database instance requires a
different buffer size that is highly individualized,
subject to the constraint of the total buffer memory
space. It is imperative to resort to algorithms that
automatically orchestrate the buffer pool tuning for
the entire database instances. To this end, we design
iBTune that has been deployed for more than 10,000
OLTP cloud database instances in our production system.
Specifically, it leverages the information from similar
workloads to find out the tolerable miss ratio of each
instance. Then, it utilizes the relationship between
miss ratios and allocated memory sizes to individually
optimize the target buffer pool sizes. To provide a
guaranteed level of service level agreement (SLA), we
design a pairwise deep neural network that uses
features from measurements on pairs of instances to
predict the upper bounds of the request response times.
A target buffer pool size can be adjusted only when the
predicted response time upper bound is in a safe limit.
The successful deployment on a production environment,
which safely reduces the memory footprint by more than
17\% compared to the original system that relies on
manual configurations, demonstrates the effectiveness
of our solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Whittaker:2019:OTI,
author = "Michael Whittaker and Nick Edmonds and Sandeep Tata
and James B. Wendt and Marc Najork",
title = "Online template induction for machine-generated
emails",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1235--1248",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342264",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In emails, information abounds. Whether it be a bill
reminder, a hotel confirmation, or a shipping
notification, our emails contain useful bits of
information that enable a number of applications. Most
of this email traffic is machine-generated, sent from a
business to a human. These business-to-consumer emails
are typically instantiated from a set of email
templates, and discovering these templates is a key
step in enabling a variety of intelligent experiences.
Existing email information extraction systems typically
separate information extraction into two steps: an
offline template discovery process (called template
induction) that is periodically run on a sample of
emails, and an online email annotation process that
applies discovered templates to emails as they arrive.
Since information extraction requires an email's
template to be known, any delay in discovering a newly
created template causes missed extractions, lowering
the overall extraction coverage. In this paper, we
present a novel system called Crusher that discovers
templates completely online, reducing template
discovery delay from a week (for the existing
MapReduce-based batch system) to minutes. Furthermore,
Crusher has a resource consumption footprint that is
significantly smaller than the existing batch system.
We also report on the surprising lesson we learned that
conventional stream processing systems do not present a
good framework on which to build Crusher. Crusher
delivers an order of magnitude more throughput than a
prototype built using a stream processing engine. We
hope that these lessons help designers of stream
processing systems accommodate a broader range of
applications like online template induction in the
future.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:QSP,
author = "Yong Wang and Guoliang Li and Nan Tang",
title = "Querying shortest paths on time dependent road
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1249--1261",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342265",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "For real-world time dependent road networks (TDRNs),
answering shortest path-based route queries and plans
in real-time is highly desirable by many industrial
applications. Unfortunately, traditional (Dijkstra- or
A*-like) algorithms are computationally expensive
for such tasks on TDRNs. Naturally, indexes are needed
to meet the real-time constraint required by real
applications. In this paper, we propose a novel
height-balanced tree-structured index, called
TD-G-tree, which supports fast route queries over
TDRNs. The key idea is to use hierarchical graph
partitioning to split a road network into hierarchical
partitions. This will produce a balanced tree, where
each tree node corresponds to a partition and each
parent-child relationship corresponds to a partition
and its sub-partition. We then compute and index time
dependent shortest paths (TDSPs) only for borders
(i.e., vertices whose edges are cut by a partition).
Based on TD-G-tree, we devise efficient algorithms to
support TDSP queries, as well as time-interval based
route planning, for computing optimal solutions through
dynamic programming and chronological
divide-and-conquer. Extensive experiments on real-world
datasets show that our method significantly outperforms
existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fariha:2019:EDQ,
author = "Anna Fariha and Alexandra Meliou",
title = "Example-driven query intent discovery: abductive
reasoning using semantic similarity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1262--1275",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342266",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Traditional relational data interfaces require precise
structured queries over potentially complex schemas.
These rigid data retrieval mechanisms pose hurdles for
non-expert users, who typically lack language expertise
and are unfamiliar with the details of the schema.
Query by Example (QBE) methods offer an alternative
mechanism: users provide examples of their intended
query output and the QBE system needs to infer the
intended query. However, these approaches focus on the
structural similarity of the examples and ignore the
richer context present in the data. As a result, they
typically produce queries that are too general, and
fail to capture the user's intent effectively. In this
paper, we present SQuID, a system that performs
semantic similarity-aware query intent discovery. Our
work makes the following contributions: (1) We design
an end-to-end system that automatically formulates
select-project-join queries in an open-world setting,
with optional group-by aggregation and intersection
operators; a much larger class than prior QBE
techniques. (2) We express the problem of query intent
discovery using a probabilistic abduction model, that
infers a query as the most likely explanation of the
provided examples. (3) We introduce the notion of an
abduction-ready database, which precomputes semantic
properties and related statistics, allowing SQuID to
achieve real-time performance. (4) We present an
extensive empirical evaluation on three real-world
datasets, including user-intent case studies,
demonstrating that SQuID is efficient and effective,
and outperforms machine learning methods, as well as
the state-of-the-art in the related query reverse
engineering problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2019:AVQ,
author = "Qi Zhou and Joy Arulraj and Shamkant Navathe and
William Harris and Dong Xu",
title = "Automated verification of query equivalence using
satisfiability modulo theories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1276--1288",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342267",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database-as-a-service offerings enable users to
quickly create and deploy complex data processing
pipelines. In practice, these pipelines often exhibit
significant overlap of computation due to redundant
execution of certain sub-queries. It is challenging for
developers and database administrators to manually
detect overlap across queries since they may be
distributed across teams, organization roles, and
geographic locations. Thus, we require automated
cloud-scale tools for identifying equivalent queries to
minimize computation overlap. State-of-the-art
algebraic approaches to automated verification of query
equivalence suffer from two limitations. First, they
are unable to model the semantics of widely-used SQL
features, such as complex query predicates and
three-valued logic. Second, they have a computationally
intensive verification procedure. These limitations
restrict their efficacy and efficiency in cloud-scale
database-as-a-service offerings. This paper makes the
case for an alternate approach to determining query
equivalence based on symbolic representation. The key
idea is to effectively transform a wide range of SQL
queries into first order logic formulae and then use
satisfiability modulo theories to efficiently verify
their equivalence. We have implemented this symbolic
representation-based approach in EQUITAS. Our
evaluation shows that EQUITAS proves the semantic
equivalence of a larger set of query pairs compared to
algebraic approaches and reduces the verification time
by 27X. We also demonstrate that on a set of 17,461
real-world SQL queries, it automatically identifies
redundant execution across 11\% of the queries. Our
symbolic-representation based technique is currently
deployed on Alibaba's MaxCompute database-as-a-service
platform.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2019:TUF,
author = "Pengfei Xu and Jiaheng Lu",
title = "Towards a unified framework for string similarity
joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1289--1302",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342268",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A similarity join aims to find all similar pairs
between two collections of records. Established
algorithms utilise different similarity measures,
either syntactic or semantic, to quantify the
similarity between two records. However, when records
are similar in forms of a mixture of syntactic and
semantic relations, utilising a single measure becomes
inadequate to disclose the real similarity between
records, and hence unable to obtain high-quality join
results. In this paper, we study a unified framework to
find similar records by combining multiple similarity
measures. To achieve this goal, we first develop a new
similarity framework that unifies the existing three
kinds of similarity measures simultaneously, including
syntactic (typographic) similarity, synonym-based
similarity, and taxonomy-based similarity. We then
theoretically prove that finding the maximum unified
similarity between two strings is generally NP-hard,
and furthermore develop an approximate algorithm which
runs in polynomial time with a non-trivial
approximation guarantee. To support efficient string
joins based on our unified similarity measure, we adopt
the filter-and-verification framework and propose a new
signature structure, called pebble, which can be
simultaneously adapted to handle multiple similarity
measures. The salient feature of our approach is that,
it can judiciously select the best pebble signatures
and the overlap thresholds to maximise the filtering
power. Extensive experiments show that our methods are
capable of finding similar records having mixed types
of similarity relations, while exhibiting high
efficiency and scalability for similarity joins. The
implementation can be downloaded at
https://github.com/HY-UDBMS/AU-Join.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yoon:2019:NEF,
author = "Susik Yoon and Jae-Gil Lee and Byung Suk Lee",
title = "{NETS}: extremely fast outlier detection from a data
stream via set-based processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1303--1315",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342269",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "This paper addresses the problem of efficiently
detecting outliers from a data stream as old data
points expire from and new data points enter the window
incrementally. The proposed method is based on a newly
discovered characteristic of a data stream that the
change in the locations of data points in the data
space is typically very insignificant. This observation
has led to the finding that the existing distance-based
outlier detection algorithms perform excessive
unnecessary computations that are repetitive and/or
canceling out the effects. Thus, in this paper, we
propose a novel set-based approach to detecting
outliers, whereby data points at similar locations are
grouped and the detection of outliers or inliers is
handled at the group level. Specifically, a new
algorithm NETS is proposed to achieve a remarkable
performance improvement by realizing set-based early
identification of outliers or inliers and taking
advantage of the ``net effect'' between expired and new
data points. Additionally, NETS is capable of achieving
the same efficiency even for a high-dimensional data
stream through two-level dimensional filtering.
Comprehensive experiments using six real-world data
streams show 5 to 25 times faster processing time than
state-of-the-art algorithms with comparable memory
consumption. We assert that NETS opens a new
possibility to real-time data stream outlier
detection.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2019:SST,
author = "Yi Lu and Xiangyao Yu and Samuel Madden",
title = "{STAR}: scaling transactions through asymmetric
replication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1316--1329",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342270",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we present STAR, a new distributed
in-memory database with asymmetric replication. By
employing a single-node non-partitioned architecture
for some replicas and a partitioned architecture for
other replicas, STAR is able to efficiently run both
highly partitionable workloads and workloads that
involve cross-partition transactions. The key idea is a
new phase-switching algorithm where the execution of
single-partition and cross-partition transactions is
separated. In the partitioned phase, single-partition
transactions are run on multiple machines in parallel
to exploit more concurrency. In the single-master
phase, mastership for the entire database is switched
to a single designated master node, which can execute
these transactions without the use of expensive
coordination protocols like two-phase commit. Because
the master node has a full copy of the database, this
phase-switching can be done at negligible cost. Our
experiments on two popular benchmarks (YCSB and TPC-C)
show that high availability via replication can coexist
with fast serializable transaction execution in
distributed in-memory databases, with STAR
outperforming systems that employ conventional
concurrency control and replication algorithms by up to
one order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2019:SD,
author = "Yuliang Li and Aaron Feng and Jinfeng Li and Saran
Mumick and Alon Halevy and Vivian Li and Wang-Chiew
Tan",
title = "Subjective databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1330--1343",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Online users are constantly seeking experiences, such
as a hotel with clean rooms and a lively bar, or a
restaurant for a romantic rendezvous. However,
e-commerce search engines only support queries
involving objective attributes such as location, price,
and cuisine, and any experiential data is relegated to
text reviews. In order to support experiential queries,
a database system needs to model subjective data. Users
should be able to pose queries that specify subjective
experiences using their own words, in addition to
conditions on the usual objective attributes. This
paper introduces OpineDB, a subjective database system
that addresses these challenges. We introduce a data
model for subjective databases. We describe how OpineDB
translates subjective queries against the subjective
database schema, which is done by matching the user
query phrases to the underlying schema. We also show
how the experiential conditions specified by the user
can be combined and the results aggregated and ranked.
We demonstrate that subjective databases satisfy user
needs more effectively and accurately than alternative
techniques through experiments with real data of hotel
and restaurant reviews.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2019:FRD,
author = "Xuguang Ren and Junhu Wang and Wook-Shin Han and
Jeffrey Xu Yu",
title = "Fast and robust distributed subgraph enumeration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1344--1356",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342272",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the subgraph enumeration problem under
distributed settings. Existing solutions either suffer
from severe memory crisis or rely on large indexes,
which makes them impractical for very large graphs.
Most of them follow a synchronous model where the
performance is often bottlenecked by the machine with
the worst performance. Motivated by this, in this
paper, we propose RADS, a Robust Asynchronous
Distributed Subgraph enumeration system. RADS first
identifies results that can be found using
single-machine algorithms. This strategy not only
improves the overall performance but also reduces
network communication and memory cost. Moreover, RADS
employs a novel region-grouped multi-round expand
verify \& filter framework which does not need to
shuffle and exchange the intermediate results, nor does
it need to replicate a large part of the data graph in
each machine. This feature not only reduces network
communication cost and memory usage, but also allows us
to adopt simple strategies for memory control and load
balancing, making it more robust. Several optimization
strategies are also used in RADS to further improve the
performance. Our experiments verified the superiority
of RADS to state-of-the-art subgraph enumeration
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fu:2019:EEL,
author = "Fangcheng Fu and Jiawei Jiang and Yingxia Shao and Bin
Cui",
title = "An experimental evaluation of large scale {GBDT}
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1357--1370",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342273",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Gradient boosting decision tree (GBDT) is a
widely-used machine learning algorithm in both data
analytic competitions and real-world industrial
applications. Further, driven by the rapid increase in
data volume, efforts have been made to train GBDT in a
distributed setting to support large-scale workloads.
However, we find it surprising that the existing
systems manage the training dataset in different ways,
but none of them have studied the impact of data
management. To that end, this paper aims to study the
pros and cons of different data management methods
regarding the performance of distributed GBDT. We first
introduce a quadrant categorization of data management
policies based on data partitioning and data storage.
Then we conduct an in-depth systematic analysis and
summarize the advantageous scenarios of the quadrants.
Based on the analysis, we further propose a novel
distributed GBDT system named Vero, which adopts the
unexplored composition of vertical partitioning and
row-store and suits for many large-scale cases. To
validate our analysis empirically, we implement
different quadrants in the same code base and compare
them under extensive workloads, and finally compare
Vero with other state-of-the-art systems over a wide
range of datasets. Our theoretical and experimental
results provide a guideline on choosing a proper data
management policy for a given workload.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kotsogiannis:2019:PDP,
author = "Ios Kotsogiannis and Yuchao Tao and Xi He and Maryam
Fanaeepour and Ashwin Machanavajjhala and Michael Hay
and Gerome Miklau",
title = "{PrivateSQL}: a differentially private {SQL} query
engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1371--1384",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342274",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy is considered a de facto standard
for private data analysis. However, the definition and
much of the supporting literature applies to flat
tables. While there exist variants of the definition
and specialized algorithms for specific types of
relational data (e.g. graphs), there isn't a general
privacy definition for multi-relational schemas with
constraints, and no system that permits accurate
differentially private answering of SQL queries while
imposing a fixed privacy budget across all queries
posed by the analyst. This work presents PrivateSQL, a
first-of-its-kind end-to-end differentially private
relational database system. PrivateSQL allows an
analyst to query data stored in a standard database
management system using a rich class of SQL counting
queries. PrivateSQL adopts a novel generalization of
differential privacy to multi-relational data that
takes into account constraints in the schema like
foreign keys, and allows the data owner to flexibly
specify entities in the schema that need privacy.
PrivateSQL ensures a fixed privacy loss across all the
queries posed by the analyst by answering queries on
private synopses generated from several views over the
base relation that are tuned to have low error on a
representative query workload. We experimentally
evaluate PrivateSQL on a real-world dataset and a
workload of more than 3,600 queries. We show that for
50\% of the queries PrivateSQL offers at least 1,000x
better error rates than solutions adapted from prior
work.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amiri:2019:CCA,
author = "Mohammad Javad Amiri and Divyakant Agrawal and Amr {El
Abbadi}",
title = "{CAPER}: a cross-application permissioned blockchain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1385--1398",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342275",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite recent intensive research, existing blockchain
systems do not adequately address all the
characteristics of distributed applications. In
particular, distributed applications collaborate with
each other following service level agreements (SLAs) to
provide different services. While collaboration between
applications, e.g., cross-application transactions,
should be visible to all applications, the internal
data of each application, e.g, internal transactions,
might be confidential. In this paper, we introduce
CAPER, a permissioned blockchain system to support both
internal and cross-application transactions of
collaborating distributed applications. In CAPER, the
blockchain ledger is formed as a directed acyclic graph
where each application accesses and maintains only its
own view of the ledger including its internal and all
cross-application transactions. CAPER also introduces
three consensus protocols to globally order
cross-application transactions between applications
with different internal consensus protocols. The
experimental results reveal the efficiency of CAPER in
terms of performance and scalability.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Koliousis:2019:CSD,
author = "Alexandros Koliousis and Pijika Watcharapichat and
Matthias Weidlich and Luo Mai and Paolo Costa and Peter
Pietzuch",
title = "{Crossbow}: scaling deep learning with small batch
sizes on multi-{GPU} servers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1399--1412",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342276",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deep learning models are trained on servers with many
GPUs, and training must scale with the number of GPUs.
Systems such as TensorFlow and Caffe2 train models with
parallel synchronous stochastic gradient descent: they
process a batch of training data at a time, partitioned
across GPUs, and average the resulting partial
gradients to obtain an updated global model. To fully
utilise all GPUs, systems must increase the batch size,
which hinders statistical efficiency. Users tune
hyper-parameters such as the learning rate to
compensate for this, which is complex and
model-specific. We describe Crossbow, a new
single-server multi-GPU system for training deep
learning models that enables users to freely choose
their preferred batch size---however small---while
scaling to multiple GPUs. Crossbow uses many parallel
model replicas and avoids reduced statistical
efficiency through a new synchronous training method.
We introduce SMA, a synchronous variant of model
averaging in which replicas independently explore the
solution space with gradient descent, but adjust their
search synchronously based on the trajectory of a
globally-consistent average model. Crossbow achieves
high hardware efficiency with small batch sizes by
potentially training multiple model replicas per GPU,
automatically tuning the number of replicas to maximise
throughput. Our experiments show that Crossbow improves
the training time of deep learning models on an 8-GPU
server by 1.3--4X compared to TensorFlow.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Feng:2019:FAA,
author = "Kaiyu Feng and Gao Cong and Christian S. Jensen and
Tao Guo",
title = "Finding attribute-aware similar regions for data
analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1414--1426",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342277",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the proliferation of mobile devices and
location-based services, increasingly massive volumes
of geo-tagged data are becoming available. This data
typically also contains non-location information. We
study how to use such information to characterize a
region and then how to find a region of the same size
and with the most similar characteristics. This
functionality enables a user to identify regions that
share characteristics with a user-supplied region that
the user is familiar with and likes. More specifically,
we formalize and study a new problem called the
attribute-aware similar region search ( ASRS ) problem.
We first define so-called composite aggregators that
are able to express aspects of interest in terms of the
information associated with a user-supplied region.
When applied to a region, an aggregator captures the
region's relevant characteristics. Next, given a query
region and a composite aggregator, we propose a novel
algorithm called DS-Search to find the most similar
region of the same size. Unlike any previous work on
region search, DS-Search repeatedly discretizes and
splits regions until a split region either satisfies a
drop condition or it is guaranteed to not contribute to
the result. In addition, we extend DS-Search to solve
the ASRS problem approximately. Finally, we report on
extensive empirical studies that offer insight into the
efficiency and effectiveness of the paper's
proposals.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tang:2019:IQP,
author = "Dixin Tang and Zechao Shang and Aaron J. Elmore and
Sanjay Krishnan and Michael J. Franklin",
title = "Intermittent query processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1427--1441",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342278",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many applications ingest data in an intermittent, yet
largely predictable, pattern. Existing systems tend to
ignore how data arrives when making decisions about how
to update (or refresh) an ongoing query. To address
this shortcoming we propose a new query processing
paradigm, Intermittent Query Processing (IQP), that
bridges query execution and policies, to determine when
to update results and how much resources to allocate
for ensuring fast query updates. Here, for a query the
system provides an initial result that is to be
refreshed when policy dictates, such as after a defined
number of new records arrive or a time interval
elapses. In between intermittent data arrivals, IQP
inactivates query execution by selectively releasing
some resources occupied in normal execution that will
be least helpful (for future refreshes) according to
the arrival patterns for new records. We present an IQP
prototype based on PostgreSQL that selectively persists
the state associated with query operators to allow for
fast query updates while constraining resource
consumption. Our experiments show that for several
application scenarios IQP greatly lowers query
processing latency compared to batch systems, and
largely reduces memory consumption with comparable
latency compared to a state-of-the-art incremental view
maintenance technique.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Budiu:2019:HTC,
author = "Mihai Budiu and Parikshit Gopalan and Lalith Suresh
and Udi Wieder and Han Kruiger and Marcos K. Aguilera",
title = "{Hillview}: a trillion-cell spreadsheet for big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1442--1457",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342279",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hillview is a distributed spreadsheet for browsing
very large datasets that cannot be handled by a single
machine. As a spreadsheet, Hillview provides a high
degree of interactivity that permits data analysts to
explore information quickly along many dimensions while
switching visualizations on a whim. To provide the
required responsiveness, Hillview introduces
visualization sketches, or vizketches, as a simple idea
to produce compact data visualizations. Vizketches
combine algorithmic techniques for data summarization
with computer graphics principles for efficient
rendering. While simple, vizketches are effective at
scaling the spreadsheet by parallelizing computation,
reducing communication, providing progressive
visualizations, and offering precise accuracy
guarantees. Using Hillview running on eight servers, we
can navigate and visualize datasets of tens of billions
of rows and trillions of cells, much beyond the
published capabilities of competing systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wei:2019:EFD,
author = "Ziheng Wei and Sebastian Link",
title = "Embedded functional dependencies and data-completeness
tailored database design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1458--1470",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342626",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We establish a robust schema design framework for data
with missing values. The framework is based on the new
notion of an embedded functional dependency, which is
independent of the interpretation of missing values,
able to express completeness and integrity requirements
on application data, and capable of capturing many
redundant data value occurrences. We establish
axiomatic and algorithmic foundations for reasoning
about embedded functional dependencies. These
foundations allow us to establish generalizations of
Boyce-Codd and Third normal forms that do not permit
any redundancy in any future application data, or
minimize their redundancy across dependency-preserving
decompositions, respectively. We show how to transform
any given schema into application schemata that meet
given completeness and integrity requirements and the
conditions of the generalized normal forms. Data over
those application schemata are therefore fit for
purpose by design. Extensive experiments with benchmark
schemata and data illustrate our framework, and the
effectiveness and efficiency of our algorithms, but
also provide quantified insight into database schema
design trade-offs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fan:2019:OVG,
author = "Hua Fan and Wojciech Golab",
title = "{Ocean Vista}: gossip-based visibility control for
speedy geo-distributed transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1471--1484",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342627",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Providing ACID transactions under conflicts across
globally distributed data is the Everest of transaction
processing protocols. Transaction processing in this
scenario is particularly costly due to the high latency
of cross-continent network links, which inflates
concurrency control and data replication overheads. To
mitigate the problem, we introduce Ocean Vista --- a
novel distributed protocol that guarantees strict
serializability. We observe that concurrency control
and replication address different aspects of resolving
the visibility of transactions, and we address both
concerns using a multi-version protocol that tracks
visibility using version watermarks and arrives at
correct visibility decisions using efficient gossip.
Gossiping the watermarks enables asynchronous
transaction processing and acknowledging transaction
visibility in batches in the concurrency control and
replication protocols, which improves efficiency under
high cross-datacenter network delays. In particular,
Ocean Vista can process conflicting transactions in
parallel, and supports efficient write-quorum /
read-one access using one round trip in the common
case. We demonstrate experimentally in a
multi-data-center cloud environment that our design
outperforms a leading distributed transaction
processing engine (TAPIR) more than 10-fold in terms of
peak throughput, albeit at the cost of additional
latency for gossip. The latency penalty is generally
bounded by one wide area network (WAN) round trip time
(RTT), and in the best case (i.e., under light load)
our system nearly breaks even with TAPIR by committing
transactions in around one WAN RTT.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:INF,
author = "Xikui Wang and Michael J. Carey",
title = "An {IDEA}: an ingestion framework for data enrichment
in {asterixDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1485--1498",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342628",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Big Data today is being generated at an unprecedented
rate from various sources such as sensors,
applications, and devices, and it often needs to be
enriched based on other reference information to
support complex analytical queries. Depending on the
use case, the enrichment operations can be compiled
code, declarative queries, or machine learning models
with different complexities. For enrichments that will
be frequently used in the future, it can be
advantageous to push their computation into the
ingestion pipeline so that they can be stored (and
queried) together with the data. In some cases, the
referenced information may change over time, so the
ingestion pipeline should be able to adapt to such
changes to guarantee the currency and/or correctness of
the enrichment results. In this paper, we present a new
data ingestion framework that supports data ingestion
at scale, enrichments requiring complex operations, and
adaptiveness to reference data changes. We explain how
this framework has been built on top of Apache
AsterixDB and investigate its performance at scale
under various workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karyakin:2019:DMP,
author = "Alexey Karyakin and Kenneth Salem",
title = "{DimmStore}: memory power optimization for database
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1499--1512",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342629",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Memory can consume a substantial amount of power in
database servers, yet memory power has received
considerably less attention than CPU power. Memory
power consumption is also highly non-proportional.
Thus, memory power becomes even more significant in the
common case in which a database server is either not
completely busy or not completely full. In this paper,
we study the application of two memory power
optimization techniques --- rank-aware allocation and
rate-based layout --- to database systems. By
concentrating memory load, rather than spreading it out
evenly, these techniques create and exploit memory
idleness to achieve power savings. We have implemented
these techniques in a prototype database system called
DimmStore. DimmStore is part of a memory power testbed
which includes customized hardware with direct power
measurement capabilities, allowing us to measure the
techniques' effectiveness. We use the testbed to
empirically characterize the power saving opportunities
provided by these techniques, as well as their
performance impact, under YCSB and TPC-C workloads.
Under simple YCSB workloads, power savings ranged up to
50\%, depending on load and space utilization, with
little performance impact. Savings were smaller, but
still significant, for TPC-C, which has more complex
data locality characteristics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yan:2019:GAS,
author = "Cong Yan and Alvin Cheung",
title = "Generating application-specific data layouts for
in-memory databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1513--1525",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342630",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database applications are often developed with
object-oriented languages while using relational
databases as the backend. To accelerate these
applications, developers would manually design
customized data structures to store data in memory, and
ways to utilize such data structures to answer queries.
Doing so is brittle and requires a lot of effort.
Alternatively, developers might automate the process by
using relational physical design tools to create
materialized views and indexes instead. However, the
characteristics of object-oriented database
applications are often distinct enough from traditional
database applications such that classical relational
query optimization techniques often cannot speed up
queries that arise from such applications, as our
experiments show. To address this, we build Chestnut, a
data layout generator for in-memory object-oriented
database applications. Given a memory budget, Chestnut
generates customized in-memory data layouts and query
plans to answer queries written using a subset of the
Rails API, a common framework for building
object-oriented database applications. Chestnut differs
from traditional query optimizers and physical
designers in two ways. First, Chestnut automatically
generates data layouts that are customized for the
application after analyzing their queries, hence
Chestnut-generated data layouts are designed to be
efficient to answer queries from such applications.
Second, Chestnut uses a novel enumeration and
verification-based algorithm to generate query plans
that use such data layouts, rather than rule-based
approaches as in traditional query optimizers. We
evaluated Chestnut on four open-source Rails database
applications. The result shows that it can reduce
average query processing time by over 3.6X (and up to
42X), as compared to other in-memory relational
database engines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hai:2019:RPT,
author = "Rihan Hai and Christoph Quix",
title = "Rewriting of plain {SO} tgds into nested tgds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1526--1538",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342631",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Schema mappings express the relationships between
sources in data interoperability scenarios and can be
expressed in various formalisms. Source-to-target
tuple-generating dependencies (s-t tgds) can be easily
used for data transformation or query rewriting tasks.
Second-order tgds (SO tgds) are more expressive as they
can also represent the composition and inversion of s-t
tgds. Yet, the expressive power of SO tgds comes with
the problem of undecidability for some reasoning tasks.
Nested tgds and plain SO tgds are mapping languages
that are between s-t tgds and SO tgds in terms of
expressivity, and their properties have been studied in
the recent years. Nested tgds are less expressive than
plain SO tgds, but the logical equivalence problem for
nested tgds is decidable. However, a detailed
characterization of plain SO tgds that have an
equivalent nested tgd is missing. In this paper, we
present an algorithmic solution for translating plain
SO tgds into nested tgds. The algorithm computes one or
more nested tgds, if a given plain SO tgd is
rewritable. Furthermore, we are able to give a detailed
characterization of those plain SO tgds for which an
equivalent nested tgd exists, based on the structural
properties of the source predicates and Skolem
functions in the plain SO tgd. In the evaluation, we
show that our algorithm covers a larger subset of plain
SO tgds than previous approaches and that a rewriting
can be computed efficiently although the algorithm has
the exponential complexity.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nathan:2019:BMD,
author = "Senthil Nathan and Chander Govindarajan and Adarsh
Saraf and Manish Sethi and Praveen Jayachandran",
title = "Blockchain meets database: design and implementation
of a blockchain relational database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1539--1552",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342632",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we design and implement the first-ever
decentralized replicated relational database with
blockchain properties that we term blockchain
relational database. We highlight several similarities
between features provided by blockchain platforms and a
replicated relational database, although they are
conceptually different, primarily in their trust model.
Motivated by this, we leverage the rich features,
decades of research and optimization, and available
tooling in relational databases to build a blockchain
relational database. We consider a permissioned
blockchain model of known, but mutually distrustful
organizations each operating their own database
instance that are replicas of one another. The replicas
execute transactions independently and engage in
decentralized consensus to determine the commit order
for transactions. We design two approaches, the first
where the commit order for transactions is agreed upon
prior to executing them, and the second where
transactions are executed without prior knowledge of
the commit order while the ordering happens in
parallel. We leverage serializable snapshot isolation
(SSI) to guarantee that the replicas across nodes
remain consistent and respect the ordering determined
by consensus, and devise a new variant of SSI based on
block height for the latter approach. We implement our
system on PostgreSQL and present detailed performance
experiments analyzing both approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kunft:2019:IRO,
author = "Andreas Kunft and Asterios Katsifodimos and Sebastian
Schelter and Sebastian Bre{\ss} and Tilmann Rabl and
Volker Markl",
title = "An intermediate representation for optimizing machine
learning pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1553--1567",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342633",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning (ML) pipelines for model training and
validation typically include preprocessing, such as
data cleaning and feature engineering, prior to
training an ML model. Preprocessing combines relational
algebra and user-defined functions (UDFs), while model
training uses iterations and linear algebra. Current
systems are tailored to either of the two. As a
consequence, preprocessing and ML steps are optimized
in isolation. To enable holistic optimization of ML
training pipelines, we present Lara, a declarative
domain-specific language for collections and matrices.
Lara's intermediate representation (IR) reflects on
the complete program, i.e., UDFs, control flow, and
both data types. Two views on the IR enable diverse
optimizations. Monads enable operator pushdown and
fusion across type and loop boundaries. Combinators
provide the semantics of domain-specific operators and
optimize data access and cross-validation of ML
algorithms. Our experiments on preprocessing pipelines
and selected ML algorithms show the effects of our
proposed optimizations on dense and sparse data, which
achieve speedups of up to an order of magnitude.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2019:ARD,
author = "Yuanwei Fang and Chen Zou and Andrew A. Chien",
title = "Accelerating raw data analysis with the {ACCORDA}
software and hardware architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1568--1582",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342634",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The data science revolution and growing popularity of
data lakes make efficient processing of raw data
increasingly important. To address this, we propose the
ACCelerated Operators for Raw Data Analysis (ACCORDA)
architecture. By extending the operator interface
(subtype with encoding) and employing a uniform runtime
worker model, ACCORDA integrates data transformation
acceleration seamlessly, enabling a new class of
encoding optimizations and robust high-performance raw
data processing. Together, these key features preserve
the software system architecture, empowering
state-of-art heuristic optimizations to drive flexible
data encoding for performance. ACCORDA derives
performance from its software architecture, but depends
critically on the acceleration of the Unstructured Data
Processor (UDP) that is integrated into the
memory-hierarchy, and accelerates data transformation
tasks by 16x-21x (parsing, decompression) to as much as
160x (deserialization) compared to an x86 core. We
evaluate ACCORDA using TPC-H queries on tabular data
formats, exercising raw data properties such as parsing
and data conversion. The ACCORDA system achieves
2.9x-13.2x speedups when compared to SparkSQL, reducing
raw data processing overhead to a geomean of 1.2x
(20\%). In doing so, ACCORDA robustly matches or
outperforms prior systems that depend on caching loaded
data, while computing on raw, unloaded data. This
performance benefit is robust across format complexity,
query predicates, and selectivity (data statistics).
ACCORDA's encoding-extended operator interface unlocks
aggressive encoding-oriented optimizations that deliver
80\% average performance increase over the 7 affected
TPC-H queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Siddique:2019:CST,
author = "A. B. Siddique and Ahmed Eldawy and Vagelis
Hristidis",
title = "Comparing synopsis techniques for approximate spatial
data analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1583--1596",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342635",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The increasing amount of spatial data calls for new
scalable query processing techniques. One of the
techniques that are getting attention is data synopsis,
which summarizes the data using samples or histograms
and computes an approximate answer based on the
synopsis. This general technique is used in selectivity
estimation, clustering, partitioning, load balancing,
and visualization, among others. This paper
experimentally studies four spatial data synopsis
techniques for three common data analysis problems,
namely, selectivity estimation, k-means clustering, and
spatial partitioning. We run an extensive experimental
evaluation on both real and synthetic datasets of up to
2.7 billion records to study the trade-offs between the
synopsis methods and their applicability in big spatial
data analysis. For each of the three problems, we
compare with baseline techniques that operate on the
whole dataset and evaluate the synopsis generation
time, the time for computing an approximate answer on
the synopsis, and the accuracy of the result. We
present our observations about when each synopsis
technique performs best.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{El-Hindi:2019:BSD,
author = "Muhammad El-Hindi and Carsten Binnig and Arvind Arasu
and Donald Kossmann and Ravi Ramamurthy",
title = "{BlockchainDB}: a shared database on blockchains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1597--1609",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342636",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper we present BlockchainDB, which leverages
blockchains as a storage layer and introduces a
database layer on top that extends blockchains by
classical data management techniques (e.g., sharding)
as well as a standardized query interface to facilitate
the adoption of blockchains for data sharing use cases.
We show that by introducing the additional database
layer, we are able to improve the performance and
scalability when using blockchains for data sharing and
also massively decrease the complexity for
organizations intending to use blockchains for data
sharing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jia:2019:ETS,
author = "Ruoxi Jia and David Dao and Boxin Wang and Frances Ann
Hubis and Nezihe Merve Gurel and Bo Li and Ce Zhang and
Costas Spanos and Dawn Song",
title = "Efficient task-specific data valuation for nearest
neighbor algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1610--1623",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342637",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a data set D containing millions of data points
and a data consumer who is willing to pay for \$X to
train a machine learning (ML) model over D, how should
we distribute this \$X to each data point to reflect
its ``value''? In this paper, we define the ``relative
value of data'' via the Shapley value, as it uniquely
possesses properties with appealing real-world
interpretations, such as fairness, rationality and
decentralizability. For general, bounded utility
functions, the Shapley value is known to be challenging
to compute: to get Shapley values for all N data
points, it requires O (2$^N$ ) model evaluations for
exact computation and O ( N log N ) for ( \epsilon,
\delta )-approximation. In this paper, we focus on one
popular family of ML models relying on K -nearest
neighbors ( K NN). The most surprising result is that
for unweighted K NN classifiers and regressors, the
Shapley value of all N data points can be computed,
exactly, in O ( N log N ) time --- an exponential
improvement on computational complexity! Moreover, for
( \epsilon, \delta )-approximation, we are able to
develop an algorithm based on Locality Sensitive
Hashing (LSH) with only sublinear complexity O ( N$^{h
(\epsilon, K)}$ log N ) when \epsilon is not too small
and K is not too large. We empirically evaluate our
algorithms on up to 10 million data points and even our
exact algorithm is up to three orders of magnitude
faster than the baseline approximation algorithm. The
LSH-based approximation algorithm can accelerate the
value calculation process even further. We then extend
our algorithm to other scenarios such as (1) weighed K
NN classifiers, (2) different data points are clustered
by different data curators, and (3) there are data
analysts providing computation who also require proper
valuation. Some of these extensions, although also
being improved exponentially, are less practical for
exact computation (e.g., O ( N$^K$ ) complexity for
weighted K NN). We thus propose a Monte Carlo
approximation algorithm, which is O ( N (log N )$^2$
/(log K )$^2$ ) times more efficient than the baseline
approximation algorithm.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Saxena:2019:DID,
author = "Hemant Saxena and Lukasz Golab and Ihab F. Ilyas",
title = "Distributed implementations of dependency discovery
algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1624--1636",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342638",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We analyze the problem of discovering dependencies
from distributed big data. Existing (non-distributed)
algorithms focus on minimizing computation by pruning
the search space of possible dependencies. However,
distributed algorithms must also optimize communication
costs, especially in shared-nothing settings, leading
to a more complex optimization space. To understand
this space, we introduce six primitives shared by
existing dependency discovery algorithms, corresponding
to data processing steps separated by communication
barriers. Through case studies, we show how the
primitives allow us to analyze the design space and
develop communication-optimized implementations.
Finally, we support our analysis with an experimental
evaluation on real datasets.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zamanian:2019:RDH,
author = "Erfan Zamanian and Xiangyao Yu and Michael Stonebraker
and Tim Kraska",
title = "Rethinking database high availability with {RDMA}
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1637--1650",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342639",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Highly available database systems rely on data
replication to tolerate machine failures. Both classes
of existing replication algorithms, active-passive and
active-active, were designed in a time when network was
the dominant performance bottleneck. In essence, these
techniques aim to minimize network communication
between replicas at the cost of incurring more
processing redundancy; a trade-off that suitably fitted
the conventional wisdom of distributed database design.
However, the emergence of next-generation networks with
high throughput and low latency calls for revisiting
these assumptions. In this paper, we first make the
case that in modern RDMA-enabled networks, the
bottleneck has shifted to CPUs, and therefore the
existing network-optimized replication techniques are
no longer optimal. We present Active-Memory
Replication, a new high availability scheme that
efficiently leverages RDMA to completely eliminate the
processing redundancy in replication. Using
Active-Memory, all replicas dedicate their processing
power to executing new transactions, as opposed to
performing redundant computation. Active-Memory
maintains high availability and correctness in the
presence of failures through an efficient RDMA-based
undo-logging scheme. Our evaluation against
active-passive and active-active schemes shows that
Active-Memory is up to a factor of 2 faster than the
second-best protocol on RDMA-based networks.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bressan:2019:MFM,
author = "Marco Bressan and Stefano Leucci and Alessandro
Panconesi",
title = "{Motivo}: fast motif counting via succinct color
coding and adaptive sampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1651--1663",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342640",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The randomized technique of color coding is behind
state-of-the-art algorithms for estimating graph motif
counts. Those algorithms, however, are not yet capable
of scaling well to very large graphs with billions of
edges. In this paper we develop novel tools for the
``motif counting via color coding'' framework. As a
                 result, our new algorithm, MOTIVO, scales to much
                 larger graphs while at the same time providing more
accurate motif counts than ever before. This is
achieved thanks to two types of improvements. First, we
design new succinct data structures for fast color
coding operations, and a biased coloring trick that
trades accuracy versus resource usage. These
optimizations drastically reduce the resource
requirements of color coding. Second, we develop an
adaptive motif sampling strategy, based on a fractional
set cover problem, that breaks the additive
approximation barrier of standard sampling. This gives
multiplicative approximations for all motifs at once,
allowing us to count not only the most frequent motifs
but also extremely rare ones. To give an idea of the
                 improvements, in 40 minutes MOTIVO counts 7-node
                 motifs on a graph with 65M nodes and 1.8B edges; this
is 30 and 500 times larger than the state of the art,
respectively in terms of nodes and edges. On the
accuracy side, in one hour MOTIVO produces accurate
                 counts of \approx 10,000 distinct 8-node motifs on
                 graphs where state-of-the-art algorithms fail even to
find the second most frequent motif. Our method
requires just a high-end desktop machine. These results
show how color coding can bring motif mining to the
realm of truly massive graphs using only ordinary
hardware.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Poddar:2019:AED,
author = "Rishabh Poddar and Tobias Boelter and Raluca Ada
Popa",
title = "{Arx}: an encrypted database using semantically secure
encryption",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1664--1678",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342641",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In recent years, encrypted databases have emerged as a
promising direction that provides data confidentiality
without sacrificing functionality: queries are executed
on encrypted data. However, many practical proposals
rely on a set of weak encryption schemes that have been
shown to leak sensitive data. In this paper, we propose
Arx, a practical and functionally rich database system
that encrypts the data only with semantically secure
encryption schemes. We show that Arx supports real
applications such as ShareLaTeX with a modest
performance overhead.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2019:EKG,
author = "Junyang Gao and Xian Li and Yifan Ethan Xu and
Bunyamin Sisman and Xin Luna Dong and Jun Yang",
title = "Efficient knowledge graph accuracy evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1679--1691",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342642",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Estimation of the accuracy of a large-scale knowledge
graph (KG) often requires humans to annotate samples
from the graph. How to obtain statistically meaningful
estimates for accuracy evaluation while keeping human
annotation costs low is a problem critical to the
development cycle of a KG and its practical
applications. Surprisingly, this challenging problem
has largely been ignored in prior research. To address
the problem, this paper proposes an efficient sampling
and evaluation framework, which aims to provide quality
accuracy evaluation with strong statistical guarantee
while minimizing human efforts. Motivated by the
properties of the annotation cost function observed in
practice, we propose the use of cluster sampling to
reduce the overall cost. We further apply weighted and
two-stage sampling as well as stratification for better
sampling designs. We also extend our framework to
enable efficient incremental evaluation on evolving KG,
introducing two solutions based on stratified sampling
and a weighted variant of reservoir sampling. Extensive
experiments on real-world datasets demonstrate the
effectiveness and efficiency of our proposed solution.
Compared to baseline approaches, our best solutions can
provide up to 60\% cost reduction on static KG
evaluation and up to 80\% cost reduction on evolving KG
evaluation, without loss of evaluation quality.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Mhedhbi:2019:OSQ,
author = "Amine Mhedhbi and Semih Salihoglu",
title = "Optimizing subgraph queries by combining binary and
worst-case optimal joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1692--1704",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342643",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of optimizing subgraph queries
using the new worst-case optimal join plans. Worst-case
optimal plans evaluate queries by matching one query
vertex at a time using multi-way intersections. The
core problem in optimizing worst-case optimal plans is
to pick an ordering of the query vertices to match. We
design a cost-based optimizer that (i) picks efficient
query vertex orderings for worst-case optimal plans;
and (ii) generates hybrid plans that mix traditional
binary joins with worst-case optimal style multiway
intersections. Our cost metric combines the cost of
binary joins with a new cost metric called
intersection-cost. The plan space of our optimizer
contains plans that are not in the plan spaces based on
tree decompositions from prior work. In addition to our
optimizer, we describe an adaptive technique that
changes the orderings of the worst-case optimal
subplans during query execution. We demonstrate the
effectiveness of the plans our optimizer picks and the
effectiveness of the adaptive technique through
extensive experiments. Our optimizer is integrated into
the Graphflow DBMS.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marcus:2019:NLQ,
author = "Ryan Marcus and Parimarjan Negi and Hongzi Mao and Chi
Zhang and Mohammad Alizadeh and Tim Kraska and Olga
Papaemmanouil and Nesime Tatbul",
title = "{Neo}: a learned query optimizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1705--1718",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342644",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimization is one of the most challenging
problems in database systems. Despite the progress made
over the past decades, query optimizers remain
extremely complex components that require a great deal
of hand-tuning for specific workloads and datasets.
Motivated by this shortcoming and inspired by recent
advances in applying machine learning to data
management challenges, we introduce Neo ( Neural
Optimizer ), a novel learning-based query optimizer
that relies on deep neural networks to generate query
executions plans. Neo bootstraps its query optimization
model from existing optimizers and continues to learn
from incoming queries, building upon its successes and
learning from its failures. Furthermore, Neo naturally
adapts to underlying data patterns and is robust to
estimation errors. Experimental results demonstrate
that Neo, even when bootstrapped from a simple
optimizer like PostgreSQL, can learn a model that
offers similar performance to state-of-the-art
commercial optimizers, and in some cases even surpass
them.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2019:EAD,
author = "Yixiang Fang and Kaiqiang Yu and Reynold Cheng and
Laks V. S. Lakshmanan and Xuemin Lin",
title = "Efficient algorithms for densest subgraph discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1719--1732",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342645",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Densest subgraph discovery (DSD) is a fundamental
problem in graph mining. It has been studied for
decades, and is widely used in various areas, including
network science, biological analysis, and graph
databases. Given a graph G, DSD aims to find a subgraph
D of G with the highest density (e.g., the number of
edges over the number of vertices in D ). Because DSD
is difficult to solve, we propose a new solution
paradigm in this paper. Our main observation is that
the densest subgraph can be accurately found through a
k -core (a kind of dense subgraph of G ), with
theoretical guarantees. Based on this intuition, we
develop efficient exact and approximation solutions for
DSD. Moreover, our solutions are able to find the
densest subgraphs for a wide range of graph density
definitions, including clique-based- and general
pattern-based density. We have performed extensive
experimental evaluation on both real and synthetic
datasets. Our results show that our algorithms are up
to four orders of magnitude faster than existing
approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marcus:2019:PSD,
author = "Ryan Marcus and Olga Papaemmanouil",
title = "Plan-structured deep neural network models for query
performance prediction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1733--1746",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342646",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query performance prediction, the task of predicting a
query's latency prior to execution, is a challenging
problem in database management systems. Existing
approaches rely on features and performance models
engineered by human experts, but often fail to capture
the complex interactions between query operators and
input relations, and generally do not adapt naturally
to workload characteristics and patterns in query
execution plans. In this paper, we argue that deep
learning can be applied to the query performance
prediction problem, and we introduce a novel neural
network architecture for the task: a plan-structured
neural network. Our neural network architecture matches
                 the structure of any optimizer-selected query execution
                 plan and predicts its latency with high accuracy, while
                 eliminating the need for human-crafted input features.
A number of optimizations are also proposed to reduce
training overhead without sacrificing effectiveness. We
evaluated our techniques on various workloads and we
demonstrate that our approach can out-perform the
state-of-the-art in query performance prediction.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ren:2019:SSL,
author = "Kun Ren and Dennis Li and Daniel J. Abadi",
title = "{SLOG}: serializable, low-latency, geo-replicated
transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1747--1761",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342647",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "For decades, applications deployed on a world-wide
scale have been forced to give up at least one of (1)
strict serializability (2) low latency writes (3) high
transactional throughput. In this paper we discuss
SLOG: a system that avoids this tradeoff for workloads
which contain physical region locality in data access.
SLOG achieves high-throughput, strictly serializable
ACID transactions at geo-replicated distance and scale
for all transactions submitted across the world, all
the while achieving low latency for transactions that
initiate from a location close to the home region for
data they access. Experiments find that SLOG can reduce
latency by more than an order of magnitude relative to
state-of-the-art strictly serializable geo-replicated
database systems such as Spanner and Calvin, while
maintaining high throughput under contention.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Paparrizos:2019:GET,
author = "John Paparrizos and Michael J. Franklin",
title = "{GRAIL}: efficient time-series representation
learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "11",
pages = "1762--1777",
month = jul,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3342263.3342648",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The analysis of time series is becoming increasingly
prevalent across scientific disciplines and industrial
applications. The effectiveness and the scalability of
time-series mining techniques critically depend on
design choices for three components responsible for (i)
representing; (ii) comparing; and (iii) indexing time
series. Unfortunately, these components have to date
been investigated and developed independently, often
resulting in mutually incompatible methods. The lack of
a unified approach has hindered progress towards fast
and accurate analytics over massive time-series
collections. To address this major drawback, we present
GRAIL, a generic framework to learn compact time-series
representations that preserve the properties of a
user-specified comparison function. Given the
comparison function, GRAIL (i) extracts landmark time
series using clustering; (ii) optimizes necessary
parameters; and (iii) exploits approximations for
kernel methods to construct representations in linear
time and space by expressing each time series as a
combination of the landmark time series. We extensively
evaluate GRAIL for querying, classification,
clustering, sampling, and visualization of time series.
For these tasks, methods leveraging GRAIL's
representations are significantly faster and at least
as accurate as state-of-the-art methods operating over
the raw time series. GRAIL shows promise as a new
primitive for highly accurate, yet scalable,
time-series analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Damasio:2019:GGA,
author = "Guilherme Damasio and Spencer Bryson and Vincent
Corvinelli and Parke Godfrey and Piotr Mierzejewski and
Jaroslaw Szlichta and Calisto Zuzarte",
title = "{GALO}: guided automated learning for
re-optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1778--1781",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352064",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query performance problem determination is usually
performed manually in consultation with experts through
the analysis of query plans. However, this is an
excessively time consuming, human error-prone, and
costly process. GALO is a novel system that automates
this process. The tool automatically learns recurring
problem patterns in query plans over workloads in an
offline learning phase to build a knowledge base of
plan rewrite remedies. GALO's knowledge base is built
on RDF and SPARQL, which is well-suited for
manipulating and querying over SQL query plans, which
are graphs themselves. It then uses the knowledge base
online to re-optimize queries queued for execution to
improve performance, often quite dramatically.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tian:2019:SGS,
author = "Yuanyuan Tian and Wen Sun and Sui Jun Tong and En
Liang Xu and Mir Hamid Pirahesh and Wei Zhao",
title = "Synergistic graph and {SQL} analytics inside {IBM
Db2}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1782--1785",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352065",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To meet the challenge of analyzing rapidly growing
graph and network data created by modern applications,
a large number of specialized graph databases have
emerged, such as Neo4j, JanusGraph, and Sqlg. At the
same time, RDBMSs and SQL continue to support
mission-critical business analytics. However, real-life
analytical applications seldom contain only one type of
analytics. They are often made of heterogeneous
workloads, including SQL, machine learning, graph, and
other analytics. In particular, SQL and graph analytics
are usually accompanied together in one analytical
workload. This means that graph and SQL analytics need
to be synergistic with each other. Unfortunately, most
existing graph databases are standalone and cannot
easily integrate with relational databases. In
addition, as a matter of fact, many graph data (data
about relationships between objects or people) are
already prevalent in relational databases, although
they are not explicitly stored as graphs. Performing
graph analytics on these relational graph data today
                 requires exporting large amounts of data to the
                 specialized graph databases. A natural question arises:
can SQL and graph analytics be performed
synergistically in a same system? In this demo, we
present such a working system called IBM Db2 Graph. Db2
Graph is an in-DBMS graph query approach. It is
implemented as a layer inside an experimental IBM
Db2TM, and thus can support synergistic graph and SQL
analytics efficiently. Db2 Graph employs a graph
overlay approach to expose a graph view of the
relational data. This approach flexibly retrofits graph
queries to existing graph data stored in relational
tables. We use an example scenario on health insurance
claim analysis to demonstrate how Db2 Graph is used to
support synergistic graph and SQL analytics inside
Db2.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2019:CDC,
author = "Xiaoou Ding and Hongzhi Wang and Jiaxuan Su and Zijue
Li and Jianzhong Li and Hong Gao",
title = "{Cleanits}: a data cleaning system for industrial time
series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1786--1789",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352066",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The great amount of time series generated by machines
has enormous value in intelligent industry. Knowledge
can be discovered from high-quality time series, and
used for production optimization and anomaly detection
in industry. However, the original sensors data always
contain many errors. This requires a sophisticated
cleaning strategy and a well-designed system for
industrial data cleaning. Motivated by this, we
introduce Cleanits, a system for industrial time series
cleaning. It implements an integrated cleaning strategy
for detecting and repairing three kinds of errors in
industrial time series. We develop reliable data
cleaning algorithms, considering features of both
industrial time series and domain knowledge. We
demonstrate Cleanits with two real datasets from power
plants. The system detects and repairs multiple dirty
data precisely, and improves the quality of industrial
time series effectively. Cleanits has a friendly
interface for users, and result visualization along
with logs are available during each cleaning process.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2019:IIT,
author = "Yipeng Zhang and Zhifeng Bao and Songsong Mo and
Yuchen Li and Yanghao Zhou",
title = "{ITAA}: an intelligent trajectory-driven outdoor
advertising deployment assistant",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1790--1793",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352067",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we demonstrate an Intelligent
Trajectory-driven outdoor Advertising deployment
Assistant (ITAA), which assists users to find an
optimal strategy for outdoor advertising (ad)
                 deployment. The challenge is how to measure the
                 influence of ads on moving trajectories, and
                 optimizing the placement of ads among billboards to
                 maximize the influence has been proven NP-hard.
Therefore, we develop a framework based on two
trajectory-driven influence models. ITAA is built upon
this framework with a user-friendly UI. It serves both
ad companies and their customers. We enhance the
interpretability to improve the user's understanding of
the influence of ads. The interactive function of ITAA
is made interpretable and easy to engage.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Qian:2019:SHL,
author = "Kun Qian and Lucian Popa and Prithviraj Sen",
title = "{SystemER}: a human-in-the-loop system for explainable
entity resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1794--1797",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352068",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Entity Resolution (ER) is the task of identifying
different representations of the same real-world
object. To achieve scalability and the desired level of
quality, the typical ER pipeline includes multiple
steps that may involve low-level coding and extensive
human labor. We present SystemER, a tool for learning
explainable ER models that reduces the human labor all
throughout the stages of the ER pipeline. SystemER
achieves explainability by learning rules that not only
perform a given ER task but are human-comprehensible;
this provides transparency into the learning process,
and further enables verification and customization of
the learned model by the domain experts. By leveraging
a human in the loop and active learning, SystemER also
ensures that a small number of labeled examples is
sufficient to learn high-quality ER models. SystemER is
a full-fledged tool that includes an easy to use
interface, support for both flat files and
semi-structured data, and scale-out capabilities by
distributing computation via Apache Spark.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huynh:2019:BEF,
author = "Viet-Phi Huynh and Paolo Papotti",
title = "{Buckle}: evaluating fact checking algorithms built on
knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1798--1801",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352069",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fact checking is the task of determining if a given
claim holds. Several algorithms have been developed to
check facts with reference information in the form of
knowledge bases. We demonstrate BUCKLE, an open-source
benchmark for comparing and evaluating fact checking
algorithms in a level playing field across a range of
scenarios. The demo is centered around three main
lessons. To start, we show how, by changing the
properties of the training and test facts, it is
possible to influence significantly the performance of
the algorithms. We then show the role of the reference
data. Finally, we discuss the performance for
algorithms designed on different principles and
assumptions, as well as approaches that address the
link prediction task in knowledge bases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Gao:2019:QSE,
author = "Peng Gao and Xusheng Xiao and Zhichun Li and Kangkook
Jee and Fengyuan Xu and Sanjeev R. Kulkarni and Prateek
Mittal",
title = "A query system for efficiently investigating complex
attack behaviors for enterprise security",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1802--1805",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352070",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The need for countering Advanced Persistent Threat
(APT) attacks has led to the solutions that
ubiquitously monitor system activities in each
enterprise host, and perform timely attack
investigation over the monitoring data for uncovering
the attack sequence. However, existing general-purpose
query systems lack explicit language constructs for
expressing key properties of major attack behaviors,
and their semantics-agnostic design often produces
inefficient execution plans for queries. To address
these limitations, we build Aiql, a novel query system
that is designed with novel types of domain-specific
optimizations to enable efficient attack investigation.
Aiql provides (1) a domain-specific data model and
storage for storing the massive system monitoring data,
(2) a domain-specific query language, Attack
Investigation Query Language (Aiql) that integrates
critical primitives for expressing major attack
behaviors, and (3) an optimized query engine based on
the characteristics of the data and the semantics of
the query to efficiently schedule the execution. We
have deployed Aiql in NEC Labs America comprising 150
hosts. In our demo, we aim to show the complete usage
scenario of Aiql by (1) performing an APT attack in a
controlled environment, and (2) using Aiql to
investigate such attack by querying the collected
system monitoring data that contains the attack traces.
The audience will have the option to perform the APT
attack themselves under our guidance, and interact with
the system and investigate the attack via issuing
queries and checking the query results through our web
UI.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Miao:2019:CEO,
author = "Zhengjie Miao and Qitian Zeng and Chenjie Li and Boris
Glavic and Oliver Kennedy and Sudeepa Roy",
title = "{CAPE}: explaining outliers by counterbalancing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1806--1809",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352071",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration we showcase Cape, a system that
explains surprising aggregation outcomes. In contrast
to previous work, which relies exclusively on
provenance, Cape explains outliers in aggregation
queries through related outliers in the opposite
direction that provide counterbalance. The foundation
of our approach are aggregate regression patterns
(ARPs) that describe coarse-grained trends in the data.
We define outliers as deviations from such patterns and
present an efficient algorithm to find counterbalances
explaining outliers. In the demonstration, the audience
can run aggregation queries over real world datasets,
identify outliers of interest in the result of such
queries, and browse the patterns and explanations
returned by Cape.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ramachandra:2019:BAI,
author = "Karthik Ramachandra and Kwanghyun Park",
title = "{BlackMagic}: automatic inlining of scalar {UDFs} into
{SQL} queries with {Froid}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1810--1813",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352072",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Relational DBMSs allow users to extend the standard
declarative SQL language surface using User Defined
Functions (UDFs) that implement custom behavior. While
UDFs offer many advantages, it is well-known amongst
practitioners that they can cause severe degradation in
query performance. This degradation is due to the fact
that state-of-the-art query optimizers treat UDFs as
black boxes and do not reason about them during
optimization. We demonstrate Froid, a framework for
optimizing UDFs by opening up this black box and
exposing its underlying operations to the query
optimizer. It achieves this by systematically
translating the entire body of an imperative
multi-statement UDF into a single relational algebraic
expression. Thereby, any query invoking this UDF is
transformed into a query with a nested sub-query that
is semantically equivalent to the UDF. We then leverage
existing sub-query optimization techniques and thereby
get efficient, set-oriented, parallel query plans as
opposed to inefficient, iterative, serial execution of
UDFs. We demonstrate the benefits of Froid including
performance gains of up to multiple orders of magnitude
on real workloads. Froid is available as a feature of
Microsoft SQL Server 2019 called ``Scalar UDF
Inlining''.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Berg:2019:PPD,
author = "Lukas Berg and Tobias Ziegler and Carsten Binnig and
Uwe R{\"o}hm",
title = "{ProgressiveDB}: progressive data analytics as a
middleware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1814--1817",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352073",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "ProgressiveDB transforms any standard SQL database
into a progressive database capable of continuous,
approximate query processing. It introduces a few small
extensions to the SQL query language that allow clients
to express progressive analytical queries. These
extensions are processed in the ProgressiveDB
middleware that sits between a database application and
the underlying database providing interactive query
processing as well as query steering capabilities to
the user. In our demo, we show how this system allows a
database application with a graphical user interface to
interact with different backends, while providing the
user with immediate feedback during exploratory data
exploration of an on-time flight database.
ProgressiveDB also supports efficient query steering by
providing a new technique, called progressive views,
which allows the intermediate results of one
progressive query to be shared and reused by multiple
concurrent progressive queries with refined scope.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kara:2019:DHT,
author = "Kaan Kara and Zeke Wang and Ce Zhang and Gustavo
Alonso",
title = "{doppioDB 2.0}: hardware techniques for improved
integration of machine learning into databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1818--1821",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352074",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database engines are starting to incorporate machine
learning (ML) functionality as part of their
repertoire. Machine learning algorithms, however, have
very different characteristics than those of relational
operators. In this demonstration, we explore the
challenges that arise when integrating generalized
linear models into a database engine and how to
incorporate hardware accelerators into the execution, a
tool now widely used for ML workloads. The demo
explores two complementary alternatives: (1) how to
train models directly on compressed/encrypted
column-stores using a specialized coordinate descent
engine, and (2) how to use a bitwise weaving index for
stochastic gradient descent on low precision input
data. We present these techniques as implemented in our
prototype database doppioDB 2.0 and show how the new
functionality can be used from SQL.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pahins:2019:CSV,
author = "Cicero A. L. Pahins and Behrooz Omidvar-Tehrani and
Sihem Amer-Yahia and Val{\'e}rie Siroux and Jean-Louis
Pepin and Jean-Christian Borel and Jo{\~a}o L. D.
Comba",
title = "{COVIZ}: a system for visual formation and exploration
of patient cohorts",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1822--1825",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352075",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate COVIZ, an interactive system to
visually form and explore patient cohorts. COVIZ
seamlessly integrates visual cohort formation and
exploration, making it a single destination for
hypothesis generation. COVIZ is easy to use by medical
experts and offers many features: (1) It provides the
ability to isolate patient demographics (e.g., their
age group and location), health markers (e.g., their
body mass index), and treatments (e.g., Ventilation for
respiratory problems), and hence facilitates cohort
formation; (2) It summarizes the evolution of
treatments of a cohort into health trajectories, and
lets medical experts explore those trajectories; (3) It
guides them in examining different facets of a cohort
and generating hypotheses for future analysis; (4)
Finally, it provides the ability to compare the
statistics and health trajectories of multiple cohorts
at once. COVIZ relies on QDS, a novel data structure
that encodes and indexes various data distributions to
enable their efficient retrieval. Additionally, COVIZ
visualizes air quality data in the regions where
patients live to help with data interpretations. We
demonstrate two key scenarios, ecological scenario and
case cross-over scenario. A video demonstration of
COVIZ is accessible via http://bit.ly/video-coviz.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Franke:2019:PTF,
author = "Martin Franke and Ziad Sehili and Erhard Rahm",
title = "{PRIMAT}: a toolbox for fast privacy-preserving
matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1826--1829",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352076",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Privacy-preserving record linkage (PPRL) is
increasingly demanded in real-world applications, e.g.,
in the health-care domain, to combine person-related
data for data analysis while preserving the privacy of
individuals. However, the adoption of PPRL is hampered
by the absence of easy-to-use and powerful PPRL tools
covering the entire PPRL process. We therefore
demonstrate Primat, a flexible and scalable tool that
enables the definition and application of tailored PPRL
workflows as well as the comparative evaluation of
different PPRL methods. We introduce the main
requirements for PPRL tools and discuss previous tool
efforts that do not fully meet the requirements and
have not been applied in practice. By contrast, Primat
covers the whole PPRL life-cycle and improves
applicability by providing various components for data
owners and the central linkage to be executed by a
trusted linkage unit.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Marcus:2019:NFR,
author = "Ryan Marcus and Chi Zhang and Shuai Yu and Geoffrey
Kao and Olga Papaemmanouil",
title = "{NashDB}: fragmentation, replication, and provisioning
using economic methods",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1830--1833",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352077",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern elastic computing systems allow applications to
scale up and down automatically, increasing capacity
for workload spikes and ensuring cost savings during
lulls in activity. Adapting database management systems
to work on top of such elastic infrastructure is not a
trivial task, and requires a deep understanding of the
sophisticated interplay between data fragmentation,
replica allocation, and cluster provisioning. This
demonstration showcases NashDB, an end-to-end method
for addressing these concerns in an automatic way.
NashDB relies on economic models to maximize query
performance while staying within a user's budget. This
demonstration will (1) allow audience members to see
how NashDB handles shifting workloads in an adaptive
way, and (2) allow audience members to test NashDB
themselves by constructing synthetic workloads and
seeing how NashDB adapts a cluster to them in real
time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sabek:2019:FAS,
author = "Ibrahim Sabek and Mashaal Musleh and Mohamed F.
Mokbel",
title = "{Flash} in action: scalable spatial data analysis using
{Markov} logic networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1834--1837",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352078",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The current explosion in spatial data raises the need
for efficient spatial analysis tools to extract useful
information from such data. However, existing tools are
neither generic nor scalable when dealing with big
spatial data. This demo presents Flash; a framework for
generic and scalable spatial data analysis, with a
special focus on spatial probabilistic graphical
modelling (SPGM). Flash exploits Markov Logic Networks
(MLN) to express SPGM as a set of declarative logical
rules. In addition, it provides spatial variations of
the scalable RDBMS-based learning and inference
techniques of MLN to efficiently perform SPGM
predictions. To show Flash effectiveness, we
demonstrate three applications that use Flash in their
SPGM: (1) Bird monitoring, (2) Safety analysis, and (3)
Land use change tracking.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kuhring:2019:CBO,
author = "Lucas Kuhring and Zsolt Istv{\'a}n",
title = "{I} can't believe it's not (only) software!: bionic
distributed storage for {Parquet} files",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1838--1841",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352079",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "There is a steady increase in the size of data stored
and processed as part of data science applications,
leading to bottlenecks and inefficiencies at various
layers of the stack. One way of reducing such
bottlenecks and increasing energy efficiency is by
tailoring the underlying distributed storage solution
to the application domain, using resources more
efficiently. We explore this idea in the context of a
popular column-oriented storage format used in big data
workloads, namely Apache Parquet. Our prototype uses an
FPGA-based storage node that offers high bandwidth data
deduplication and a companion software library that
exposes an API for Parquet file access. This way the
storage node remains general purpose and could be
shared by applications from different domains, while,
at the same time, benefiting from deduplication well
suited to Apache Parquet files and from selective reads
of columns in the file. In this demonstration we show,
on the one hand, that by relying on the FPGA's dataflow
processing model, it is possible to implement in-line
deduplication without increasing latencies
significantly or reducing throughput. On the other
hand, we highlight the benefits of implementing the
application-specific aspects in a software library
instead of FPGA circuits and how this enables, for
instance, regular data science frameworks running in
Python to access the data on the storage node and to
offload filtering operations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Choi:2019:VVI,
author = "Hyewon Choi and Erkang Zhu and Arsala Bangash and
Ren{\'e}e J. Miller",
title = "{VISE}: vehicle image search engine with traffic
camera",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1842--1845",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352080",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present VISE, or Vehicle Image Search Engine, to
support the fast search of similar vehicles from
low-resolution traffic camera images. VISE can be used
to trace and locate vehicles for applications such as
police investigations when high-resolution footage is
not available. Our system consists of three components:
an interactive user-interface for querying and browsing
identified vehicles; a scalable search engine for fast
similarity search on millions of visual objects; and an
image processing pipeline that extracts feature vectors
of objects from video frames. We use transfer learning
technique to integrate state-of-the-art Convolutional
Neural Networks with two different refinement methods
to achieve high retrieval accuracy. We also use an
efficient high-dimensional nearest neighbor search
index to enable fast retrieval speed. In the demo, our
system will offer users an interactive experience
exploring a large database of traffic camera images
that is growing in real time at 200K frames per day.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goldberg:2019:WSF,
author = "Stephan Goldberg and Tova Milo and Slava Novgorodov
and Kathy Razmadze",
title = "{WiClean}: a system for fixing {Wikipedia} interlinks
using revision history patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1846--1849",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352081",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present WiClean, a Wikipedia plug-in that supports
the identification and cleaning of certain types of
errors in Wikipedia interlinks. The system mines update
patterns in Wikipedia revision logs, identifies the
common time frames in which they occur, and employs
them to signal incomplete/inconsistent updates and
suggests corrections. We demonstrate the effectiveness
of WiClean in identifying actual errors in a variety of
Wikipedia entity types, interactively employing the
VLDB'19 audience as editors to correct the identified
errors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Roy:2019:SHC,
author = "Abhishek Roy and Alekh Jindal and Hiren Patel and
Ashit Gosalia and Subru Krishnan and Carlo Curino",
title = "{SparkCruise}: handsfree computation reuse in
{Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1850--1853",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352082",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Interactive data analytics is often inundated with
common computations across multiple queries. These
redundancies result in poor query performance and
higher overall cost for the interactive query sessions.
Obviously, reusing these common computations could lead
to cost savings. However, it is difficult for the users
to manually detect and reuse the common computations in
their fast moving interactive sessions. In the paper,
we propose to demonstrate SparkCruise, a computation
reuse system that automatically selects the most useful
common computations to materialize based on the past
query workload. SparkCruise materializes these
computations as part of query processing, so the users
can continue with their query processing just as before
and computation reuse is automatically applied in the
background --- all without any modifications to the
Spark code. We will invite the audience to play with
several scenarios, such as workload redundancy insights
and pay-as-you-go materialization, highlighting the
utility of SparkCruise.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sandha:2019:DDM,
author = "Sandeep Singh Sandha and Wellington Cabrera and
Mohammed Al-Kateb and Sanjay Nair and Mani Srivastava",
title = "In-database distributed machine learning:
demonstration using {Teradata SQL} engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1854--1857",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352083",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Machine learning has enabled many interesting
applications and is extensively being used in big data
systems. The popular approach --- training machine
learning models in frameworks like Tensorflow, Pytorch
and Keras --- requires movement of data from database
engines to analytical engines, which adds an excessive
overhead on data scientists and becomes a performance
bottleneck for model training. In this demonstration,
we give a practical exhibition of a solution for the
enablement of distributed machine learning natively
inside database engines. During the demo, the audience
will interactively use Python APIs in Jupyter Notebooks
to train multiple linear regression models on synthetic
regression datasets and neural network models on vision
and sensory datasets directly inside Teradata SQL
Engine.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2019:SLS,
author = "Zhao Li and Xia Chen and Xuming Pan and Pengcheng Zou
and Yuchen Li and Guoxian Yu",
title = "{SHOAL}: large-scale hierarchical taxonomy via
graph-based query coalition in e-commerce",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1858--1861",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352084",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "E-commerce taxonomy plays an essential role in online
retail business. Existing taxonomy of e-commerce
platforms organizes items into an ontology structure.
However, the ontology-driven approach is subject to
costly manual maintenance and often does not capture
user's search intention, particularly when user
searches by her personalized needs rather than a
universal definition of the items. Observing that
search queries can effectively express user's
intention, we present a novel large-Scale Hierarchical
taxOnomy via grAph based query coaLition (SHOAL) to
bridge the gap between item taxonomy and user search
intention. SHOAL organizes hundreds of millions of
items into a hierarchical topic structure. Each topic
that consists of a cluster of items denotes a
conceptual shopping scenario, and is tagged with
easy-to-interpret descriptions extracted from search
queries. Furthermore, SHOAL establishes correlation
between categories of ontology-driven taxonomy, and
offers opportunities for explainable recommendation.
The feedback from domain experts shows that SHOAL
achieves a precision of 98\% in terms of placing items
into the right topics, and the result of an online A/B
test demonstrates that SHOAL boosts the Click Through
Rate (CTR) by 5\%. SHOAL has been deployed in Alibaba
and supports millions of searches for online shopping
per day.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Xu:2019:DMD,
author = "Min Xu and Tianhao Wang and Bolin Ding and Jingren
Zhou and Cheng Hong and Zhicong Huang",
title = "{DPSAaS}: multi-dimensional data sharing and analytics
as services under local differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1862--1865",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352085",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Differential privacy has emerged as the de facto
standard for privacy definitions, and been used by,
e.g., Apple, Google, Uber, and Microsoft, to collect
sensitive information about users and to build
privacy-preserving analytics engines. However, most of
such advanced privacy-protection techniques are not
accessible to mid-size companies and app developers in
the cloud. We demonstrate a lightweight middleware
DPSAaS, which provides differentially private
data-sharing-and-analytics functionality as cloud
services. We focus on multi-dimensional analytical
(MDA) queries under local differential privacy (LDP) in
this demo. MDA queries against a fact table have
predicates on (categorical or ordinal) dimensions and
aggregate one or more measures. In the absence of a
trusted agent, sensitive dimensions and measures are
encoded in a privacy-preserving way locally using our
LDP data sharing service, before being sent to the data
collector. The data collector estimates the answers to
MDA queries from the encoded data, using our data
analytics service. We will highlight the design
decisions of DPSAaS and twists made to LDA algorithms
to fit the design, in order to smoothly connect DPSAaS
to the data processing platform and analytics engines,
and to facilitate efficient large-scale processing.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cao:2019:PPS,
author = "Yang Cao and Yonghui Xiao and Li Xiong and Liquan Bai
and Masatoshi Yoshikawa",
title = "{PriSTE}: protecting spatiotemporal event privacy in
continuous location-based services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1866--1869",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352086",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Location privacy-preserving mechanisms (LPPMs) have
been extensively studied for protecting a user's
location in location-based services. However, when
user's perturbed locations are released continuously,
existing LPPMs may not protect users' sensitive
spatiotemporal event, such as ``visited hospital in the
last week'' or ``regularly commuting between location 1
and location 2 every morning and afternoon'' (it is
easy to infer that locations 1 and 2 may be home and
office). In this demonstration, we demonstrate PriSTE
for protecting spatiotemporal event privacy in
continuous location release. First, to raise users'
awareness of such a new privacy goal, we design an
interactive tool to demonstrate how accurate an
adversary could infer a secret spatiotemporal event
from a sequence of locations or even LPPM-protected
locations. The attendees can find that some
spatiotemporal events are quite risky and even these
state-of-the-art LPPMs do not always protect
spatiotemporal event privacy. Second, we demonstrate
how a user can use PriSTE to automatically or manually
convert an LPPM for location privacy into one
protecting spatiotemporal event privacy in continuous
location-based services. Finally, we visualize the
trade-off between privacy and utility so that users can
choose appropriate privacy parameters in different
application scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Deutch:2019:DOS,
author = "Daniel Deutch and Evgeny Marants and Yuval
Moskovitch",
title = "{Datalignment}: ontology schema alignment through
{Datalog} containment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1870--1873",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352087",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We focus on the problem of aligning ontology
relations, namely finding relation names that
correspond to the same or related concepts. Such
alignment is a prerequisite to the integration of the
multiple available Knowledge Bases many of which
include similar concepts, differently termed. We
propose a novel approach for this problem, by
leveraging association rules --- originally mined in
order to enrich the ontological content. Here, we treat
the rules as Datalog programs and look for
bounded-depth sub-programs that are contained in (or
equivalent to) each other. Heads of such programs
intuitively correspond to related concepts, and we
propose them as candidates for alignment. The candidate
alignments require further verification by experts; to
this end we accompany each aligned pair with
explanations based on the provenance of each relation
according to its sub-program. We have implemented our
novel solution in a system called Datalignment. We
propose to demonstrate Datalignment, presenting the
aligned pairs that it finds, and the computed
explanations, in context of real-life Knowledge
Bases.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ge:2019:IIH,
author = "Congcong Ge and Yunjun Gao and Xiaoye Miao and Lu Chen
and Christian S. Jensen and Ziyuan Zhu",
title = "{IHCS}: an integrated hybrid cleaning system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1874--1877",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352088",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data cleaning is a prerequisite to subsequent data
analysis, and is known to often be time-consuming and
labor-intensive. We present IHCS, a hybrid data
cleaning system that integrates error detection and
repair to contend effectively with multiple error
types. In a preprocessing step that precedes the data
cleaning, IHCS formats an input dataset to be cleaned,
and transforms applicable data quality rules into a
unified format. Then, an MLN index structure is formed
according to the unified rules, enabling IHCS to handle
multiple error types simultaneously. During the
cleaning, IHCS first tackles abnormalities through an
abnormal group process, and then, it generates multiple
data versions based on the MLN index. Finally, IHCS
eliminates conflicting values across the multiple
versions, and derives the final unified clean data. A
visual interface enables cleaning process monitoring
and cleaning result analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Costa:2019:CGB,
author = "Constantinos Costa and Xiaoyu Ge and Panos K.
Chrysanthis",
title = "{CAPRIO}: graph-based integration of indoor and
outdoor data for path discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1878--1881",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352089",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recently, navigation and localization systems have
emerged to support queries like the shortest distance
in either indoor or outdoor with additional
constraints. These systems, however, neither combine
the indoor and outdoor information nor consider the
external natural conditions like the weather that one
may face across an outdoor path. In this demonstration
paper we present CAPRIO, which proposes and implements
a novel graph representation that integrates indoor and
outdoor information to discover paths that personalize
outdoor exposure while minimizing the overall path
length. We also demonstrate how unifying the graph
algorithms for indoor and outdoor navigation enables
significant optimizations that would not be possible
otherwise.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wu:2019:HAS,
author = "Yingjun Wu and Jia Yu and Yuanyuan Tian and Richard
Sidle and Ronald Barber",
title = "{HERMIT} in action: succinct secondary indexing
mechanism via correlation exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1882--1885",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352090",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database administrators construct secondary indexes on
data tables to accelerate query processing in
relational database management systems (RDBMSs). These
indexes are built on top of the most frequently queried
columns according to the data statistics.
Unfortunately, maintaining multiple secondary indexes
in the same database can be extremely space consuming,
causing significant performance degradation due to the
potential exhaustion of memory space. However, we find
that there indeed exist many opportunities to save
storage space by exploiting column correlations. We
recently introduced Hermit, a succinct secondary
indexing mechanism for modern RDBMSs. Hermit
judiciously leverages the rich soft functional
dependencies hidden among columns to prune out
redundant structures for indexed key access. Instead of
building a complete index that stores every single
entry in the key columns, Hermit navigates any incoming
key access queries to an existing index built on the
correlated columns. This is achieved through the Tiered
Regression Search Tree (TRS-Tree), a succinct,
ML-enhanced data structure that performs fast curve
fitting to adaptively and dynamically capture both
column correlations and outliers. In this
demonstration, we showcase Hermit's appealing
characteristics. We not only demonstrate that Hermit
can significantly reduce space consumption with limited
performance overhead in terms of query response time
and index maintenance time, but also explain in detail
the rationale behind Hermit's high efficiency using
interactive online query processing examples.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Loudet:2019:DSH,
author = {Julien Loudet and Iulian Sandu-Popa and Luc Bouganim},
title = {{DISPERS}: securing highly distributed queries on
personal data management systems},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1886--1889},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352091},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Personal Data Management Systems (PDMS) advance at a
rapid pace allowing us to integrate all our personal
data in a single place and use it for our benefit and
for the benefit of the community. This leads to a
significant paradigm shift since personal data become
massively distributed and opens an important question:
how to query this massively distributed data in an
efficient, pertinent and privacy preserving way? This
demonstration proposes a fully-distributed PDMS called
DISPERS, built on top of SEP2P, allowing users to
securely and efficiently share and query their personal
data. The demonstration platform graphically
illustrates the query execution in details, showing
that DISPERS leads to maximal system security with low
and scalable overhead. Attendees are welcome to
challenge the security provided by DISPERS using the
proposed hacking tools.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Akhter:2019:SFS,
author = {Adil Akhter and Marios Fragkoulis and Asterios
Katsifodimos},
title = {Stateful functions as a service in action},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1890--1893},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352092},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {In the serverless model, users upload application code
to a cloud platform and the cloud provider undertakes
the deployment, execution and scaling of the
application, relieving users from all operational
aspects. Although very popular, current serverless
offerings offer poor support for the management of
local application state, the main reason being that
managing state and keeping it consistent at large scale
is very challenging. As a result, the serverless model
is inadequate for executing stateful, latency-sensitive
applications. In this paper we present a high-level
programming model for developing stateful functions and
deploying them in the cloud. Our programming model
allows functions to retain state as well as call other
functions. In order to deploy stateful functions in a
cloud infrastructure, we translate functions and their
data exchanges into a stateful dataflow graph. With
this paper we aim at demonstrating that using a
modified version of an open-source dataflow engine as a
runtime for stateful functions, we can deploy scalable
and stateful services in the cloud with surprisingly
low latency and high throughput.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Ordookhanians:2019:DKO,
author = "Allen Ordookhanians and Xin Li and Supun Nakandala and
Arun Kumar",
title = "Demonstration of {Krypton}: optimized {CNN} inference
for occlusion-based deep {CNN} explanations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1894--1897",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352093",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this demonstration, we present Krypton, a system
for accelerating occlusion-based deep convolutional
neural network (CNN) explanation workloads. Driven by
the success of CNNs in image understanding tasks, there
is growing adoption of CNNs in various domains,
including high stakes applications such as radiology.
However, users of such applications often seek an
``explanation'' for why a CNN predicted a certain
label. One of the most widely used approaches for
explaining CNN predictions is the occlusion-based
explanation (OBE) method. This approach is
computationally expensive due to the large number of
re-inference requests produced. Krypton reduces the
runtime of OBE by up to 35x by enabling incremental and
approximate inference optimizations that are inspired
by classical database query optimization techniques. We
allow the audience to interactively diagnose CNN
predictions from several use cases, including radiology
and natural images. A short video of our demonstration
can be found here: https://youtu.be/1OWddbd4n6Y",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Miao:2019:LVE,
author = {Zhengjie Miao and Andrew Lee and Sudeepa Roy},
title = {{LensXPlain}: visualizing and explaining contributing
subsets for aggregate query answers},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1898--1901},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352094},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {In this demonstration, we will present LensXPlain, an
interactive system to help users understand answers of
aggregate queries by providing meaningful explanations.
Given a SQL group-by query and a question from a user
`` why output o is high/low '', or `` why output o$_1$
is higher/lower than o$_2$ '', LensXPlain helps users
explore the results and find subsets of tuples captured
by predicates that contributed the most toward such
observations. The contributions are measured either by
intervention (if the contributing tuples are removed,
the values or the ratios in the user question change in
the opposite direction), or by aggravation (if the
query is restricted to the contributing tuples, the
observations change more in the same direction).
LensXPlain uses ensemble learning for recommending
useful attributes in explanations, and employs a suite
of optimizations to enable explanation generation and
refinement at an interactive speed. In the
demonstration, the audience can run aggregation queries
over real world datasets, browse the answers using a
graphical user interface, ask questions on
unexpected/interesting query results with simple
visualizations, and explore and refine explanations
returned by LensXPlain.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Zhang:2019:JDL,
author = "Yi Zhang and Zachary G. Ives",
title = "{Juneau}: data lake management for {Jupyter}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1902--1905",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352095",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In collaborative settings such as multi-investigator
laboratories, data scientists need improved tools to
manage not their data records but rather their data
sets and data products, to facilitate both provenance
tracking and data (and code) reuse within their data
lakes and file systems. We demonstrate the Juneau
System, which extends computational notebook software
(Jupyter Notebook) as an instrumentation and data
management point for overseeing and facilitating
improved dataset usage, through capabilities for
indexing, searching, and recommending ``complementary''
data sources, previously extracted machine learning
features, and additional training data. This
demonstration focuses on how we help the user find
related datasets via search.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hasani:2019:AEA,
author = {Sona Hasani and Faezeh Ghaderi and Shohedul Hasan and
Saravanan Thirumuruganathan and Abolfazl Asudeh and
Nick Koudas and Gautam Das},
title = {{ApproxML}: efficient approximate ad-hoc {ML} models
through materialization and reuse},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1906--1909},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352096},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Machine learning (ML) has gained a pivotal role in
answering complex predictive analytic queries. Model
building for large scale datasets is one of the time
consuming parts of the data science pipeline. Often
data scientists are willing to sacrifice some accuracy
in order to speed up this process during the
exploratory phase. In this paper, we propose to
demonstrate ApproxML, a system that efficiently
constructs approximate ML models for new queries from
previously constructed ML models using the concepts of
model materialization and reuse. ApproxML supports a
variety of ML models such as generalized linear models
for supervised learning, and K-means and Gaussian
Mixture model for unsupervised learning.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Essertel:2019:FAL,
author = {Gr{\'e}gory Essertel and Ruby Y. Tahboub and Fei Wang
and James Decker and Tiark Rompf},
title = {{Flare \& Lantern}: efficiently swapping horses
midstream},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1910--1913},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352097},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Running machine learning (ML) workloads at scale is as
much a data management problem as a model engineering
problem. Big performance challenges exist when data
management systems invoke ML classifiers as
user-defined functions (UDFs) or when stand-alone ML
frameworks interact with data stores for data loading
and pre-processing (ETL). In particular, UDFs can be
precompiled or simply a black box for the data
management system and the data layout may be completely
different from the native layout, thus adding overheads
at the boundaries. In this demo, we will show how
bottlenecks between existing systems can be eliminated
when their engines are designed around runtime
compilation and native code generation, which is the
case for many state-of-the-art relational engines as
well as ML frameworks. We demonstrate an integration of
Flare (an accelerator for Spark SQL), and Lantern (an
accelerator for TensorFlow and PyTorch) that results in
a highly optimized end-to-end compiled data path,
switching between SQL and ML processing with negligible
overhead.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Martins:2019:TES,
author = {Ruben Martins and Jia Chen and Yanju Chen and Yu Feng
and Isil Dillig},
title = {{Trinity}: an extensible synthesis framework for data
science},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1914--1917},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352098},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {In this demo paper, we introduce Trinity, a
general-purpose framework that can be used to quickly
build domain-specific program synthesizers for
automating many tedious tasks that arise in data
science. We illustrate how Trinity can be used by three
different users: First, we show how end-users can use
Trinity's built-in synthesizers to automate data
wrangling tasks. Second, we show how advanced users can
easily extend existing synthesizers to support
additional functionalities. Third, we show how
synthesis experts can change the underlying search
engine in Trinity. Overall, this paper is intended to
demonstrate how users can quickly use, modify, and
extend the Trinity framework with the goal of
automating many tasks that are considered to be the
``janitor'' work of data science.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Huang:2019:PAA,
author = {Zhiqi Huang and Ryan McKenna and George Bissias and
Gerome Miklau and Michael Hay and Ashwin
Machanavajjhala},
title = {{PSynDB}: accurate and accessible private data
generation},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1918--1921},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352099},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Across many application domains, trusted parties who
collect sensitive information need mechanisms to safely
disseminate data. A favored approach is to generate
synthetic data: a dataset similar to the original,
hopefully retaining its statistical features, but one
that does not reveal the private information of
contributors to the data. We present PSynDB, a
web-based synthetic table generator that is built on
recent privacy technologies [10,11,15]. PSynDB
satisfies the formal guarantee of differential privacy
and generates synthetic tables with high accuracy for
tasks that the user specifies as important. PSynDB
allows users to browse expected error rates before
running the mechanism, a useful feature for making
important policy decisions, such as setting the privacy
loss budget. When the user has finished configuration,
the tool outputs a data synthesis program that can be
ported to a trusted environment. There it can be safely
executed on the private data to produce the private
synthetic dataset for broad dissemination.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Chandramouli:2019:FFI,
author = {Badrish Chandramouli and Dong Xie and Yinan Li and
Donald Kossmann},
title = {{FishStore}: fast ingestion and indexing of raw data},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1922--1925},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352100},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {The last decade has witnessed a huge increase in data
being ingested into the cloud from a variety of data
sources. The ingested data takes various forms such as
JSON, CSV, and binary formats. Traditionally, data is
either ingested into storage in raw form, indexed
ad-hoc using range indices, or cooked into
analytics-friendly columnar formats. None of these
solutions is able to handle modern requirements on
storage: making the data available immediately for
ad-hoc and streaming queries while ingesting at
extremely high throughputs. We demonstrate FishStore,
our open-source concurrent latch-free storage layer for
data with flexible schema. FishStore builds on recent
advances in parsing and indexing techniques, and is
based on multi-chain hash indexing of dynamically
registered predicated subsets of data. We find
predicated subset hashing to be a powerful primitive
that supports a broad range of queries on ingested data
and admits a higher performance (by up to an order of
magnitude) implementation than current alternatives.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Diao:2019:SMF,
author = {Yanlei Diao and Pawe{\l} Guzewicz and Ioana Manolescu
and Mirjana Mazuran},
title = {{Spade}: a modular framework for analytical
exploration of {RDF} graphs},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1926--1929},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352101},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {RDF data is complex; exploring it is hard, and can be
done through many different metaphors. We have
developed and propose to demonstrate Spade, a tool
helping users discover meaningful content of an RDF
graph by showing them the results of aggregation
(OLAP-style) queries automatically identified from the
data. Spade chooses aggregates that are visually
interesting, a property formally based on statistic
properties of the aggregation query results. While well
understood for relational data, such exploration raises
multiple challenges for RDF: facts, dimensions and
measures have to be identified (as opposed to known
beforehand); as there are more candidate aggregates,
assessing their interestingness can be very costly;
finally, ontologies bring novel specific challenges but
also novel opportunities, enabling ontology-driven
exploration from an aggregate initially proposed by the
system. Spade is a generic, extensible framework, which
we instantiated with: ( i ) novel methods for
enumerating candidate measures and dimensions in the
vast space of possibilities provided by an RDF graph; (
ii ) a set of aggregate interestingness functions; (
iii ) ontology-based interactive exploration, and ( iv
) efficient early-stop techniques for estimating the
interestingness of an aggregate query. The
demonstration will comprise interactive scenarios on a
variety of large, interesting RDF graphs.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Dsilva:2019:MRD,
author = {Joseph Vinish D'silva and Florestan {De Moor} and
Bettina Kemme},
title = {Making an {RDBMS} data scientist friendly: advanced
in-database interactive analytics with visualization
support},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1930--1933},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352102},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {We are currently witnessing the rapid evolution and
adoption of various data science frameworks that
function external to the database. Any support from
conventional RDBMS implementations for data science
applications has been limited to procedural paradigms
such as user-defined functions (UDFs) that lack
exploratory programming support. Therefore, the current
status quo is that during the exploratory phase, data
scientists usually use the database system as the
``data storage'' layer of the data science framework,
whereby the majority of computation and analysis is
performed outside the database, e.g., at the client
node. We demonstrate AIDA, an in-database framework for
data scientists. AIDA allows users to write interactive
Python code using a development environment such as a
Jupyter notebook. The actual execution itself takes
place inside the database (near-data), where a server
component of AIDA, that resides inside the embedded
Python interpreter of the RDBMS, manages the data sets
and computations. The demonstration will also show the
visualization capabilities of AIDA where the progress
of computation can be observed through live updates.
Our evaluations show that AIDA performs several times
faster compared to contemporary external data science
frameworks, but is much easier to use for exploratory
development compared to database UDFs.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Zaouk:2019:UNG,
author = {Khaled Zaouk and Fei Song and Chenghao Lyu and Arnab
Sinha and Yanlei Diao and Prashant Shenoy},
title = {{UDAO}: a next-generation unified data analytics
optimizer},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1934--1937},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352103},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Big data analytics systems today still lack the
ability to take user performance goals and budgetary
constraints, collectively referred to as
``objectives'', and automatically configure an analytic
job to achieve the objectives. This paper presents
UDAO, a unified data analytics optimizer that can
automatically determine the parameters of the runtime
system, collectively called a job configuration, for
general dataflow programs based on user objectives.
UDAO embodies key techniques including in-situ
modeling, which learns a model for each user objective
in the same computing environment as the job is run,
and multi-objective optimization, which computes a
Pareto optimal set of job configurations to reveal
tradeoffs between different objectives. Using
benchmarks developed based on industry needs, our
demonstration will allow the user to explore (1)
learned models to gain insights into how various
parameters affect user objectives; (2) Pareto frontiers
to understand interesting tradeoffs between different
objectives and how a configuration recommended by the
optimizer explores these tradeoffs; (3) end-to-end
benefits that UDAO can provide over default
configurations or those manually tuned by engineers.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Jo:2019:AFC,
author = {Saehan Jo and Immanuel Trummer and Weicheng Yu and
Xuezhi Wang and Cong Yu and Daniel Liu and Niyati
Mehta},
title = {{AggChecker}: a fact-checking system for text
summaries of relational data sets},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1938--1941},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352104},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {We demonstrate AggChecker, a novel tool for verifying
textual summaries of relational data sets. The system
automatically verifies natural language claims about
numerical aggregates against the underlying raw data.
The system incorporates a combination of natural
language processing, information retrieval, machine
learning, and efficient query processing strategies.
Each claim is translated into a semantically equivalent
SQL query and evaluated against the database. Our
primary goal is analogous to that of a spell-checker:
to identify erroneous claims and provide guidance in
correcting them. In this demonstration, we show that
our system enables users to verify text summaries much
more efficiently than a standard SQL interface.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Wang:2019:GIG,
author = "Hanzhang Wang and Phuong Nguyen and Jun Li and Selcuk
Kopru and Gene Zhang and Sanjeev Katariya and Sami
Ben-Romdhane",
title = "{GRANO}: interactive graph-based root cause analysis
for cloud-native distributed data platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1942--1945",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352105",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We demonstrate Grano, an end-to-end anomaly
detection and root cause analysis (or RCA for short)
system for cloud-native distributed data platform by
providing a holistic view of the system component
topology, alarms and application events. Grano
provides: a Detection Layer to process large amount of
time-series monitoring data to detect anomalies at
logical and physical system components; an Anomaly
Graph Layer with novel graph modeling and algorithms
for leveraging system topology data and detection
results to identify the root cause relevance at the
system component level; and an Application Layer that
automatically notifies on-call personnel and presents
real-time and on-demand RCA support through an
interactive graph interface. The system is deployed and
evaluated using eBay's production data to help on-call
personnel to shorten the identification of root cause
from hours to minutes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Frey:2019:DHB,
author = {Davide Frey and Marc X. Makkes and Pierre-Louis Roman
and Fran{\c{c}}ois Ta{\"\i}ani and Spyros Voulgaris},
title = {{Dietcoin}: hardening {Bitcoin} transaction
verification process for mobile devices},
journal = j-PROC-VLDB-ENDOWMENT,
volume = {12},
number = {12},
pages = {1946--1949},
month = aug,
year = {2019},
CODEN = {????},
DOI = {https://doi.org/10.14778/3352063.3352106},
ISSN = {2150-8097},
ISSN-L = {2150-8097},
bibdate = {Wed Oct 2 06:49:02 MDT 2019},
bibsource = {https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib},
abstract = {Distributed ledgers are among the most replicated data
repositories in the world. They offer data consistency,
immutability, and auditability, based on the assumption
that each participating node locally verifies their
entire content. Although their content, currently
extending up to a few hundred gigabytes, can be
accommodated by dedicated commodity hard disks,
downloading it, processing it, and storing it in
general-purpose desktop and laptop computers can prove
largely impractical. Even worse, this becomes a
prohibitive restriction for smartphones, mobile
devices, and resource-constrained IoT devices. In this
demo, we present an implementation of Dietcoin, a
Bitcoin protocol extension that allows nodes to perform
secure local verification of Bitcoin transactions with
small bandwidth and storage requirements. This demo
presents and benchmarks the main features of Dietcoin
that are important for today's cryptocurrencies and
smart contract systems, but are missing in the current
state-of-the-art: (i) allowing resource-constrained
devices to verify the correctness of selected blocks
locally without having to download the complete ledger;
(ii) enabling devices to join a blockchain quickly yet
securely, dropping bootstrap time from days down to a
matter of seconds; (iii) providing a generic solution
that can be applied to other distributed ledgers
secured with Proof-of-Work.},
acknowledgement = ack-nhfb,
ajournal = {Proc. VLDB Endowment},
fjournal = {Proceedings of the VLDB Endowment},
journal-URL = {http://portal.acm.org/citation.cfm?id=J1174},
}
@Article{Singla:2019:RLS,
author = "Samriddhi Singla and Ahmed Eldawy and Rami Alghamdi
and Mohamed F. Mokbel",
title = "{Raptor}: large scale analysis of big raster and
vector data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1950--1953",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352107",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the increase in amount of remote sensing data,
there have been efforts to efficiently process it to
help ecologists and geographers answer queries.
However, they often need to process this data in
combination with vector data, for example, city
boundaries. Existing efforts require one dataset to be
converted to the other representation, which is
extremely inefficient for large datasets. In this
demonstration, we focus on the zonal statistics
problem, which computes the statistics over a raster
layer for each polygon in a vector layer. We
demonstrate three approaches, vector-based,
raster-based, and raptor-based approaches. The latter
is a recent effort of combining raster and vector data
without a need of any conversion. This demo will allow
users to run their own queries in any of the three
methods and observe the differences in their
performance depending on different raster and vector
dataset sizes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rezig:2019:DCH,
author = "El Kindi Rezig and Lei Cao and Michael Stonebraker and
Giovanni Simonini and Wenbo Tao and Samuel Madden and
Mourad Ouzzani and Nan Tang and Ahmed K. Elmagarmid",
title = "{Data Civilizer 2.0}: a holistic framework for data
preparation and analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1954--1957",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352108",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data scientists spend over 80\% of their time (1)
parameter-tuning machine learning models and (2)
iterating between data cleaning and machine learning
model execution. While there are existing efforts to
support the first requirement, there is currently no
integrated workflow system that couples data cleaning
and machine learning development. The previous version
of Data Civilizer was geared towards data cleaning and
discovery using a set of pre-defined tools. In this
paper, we introduce Data Civilizer 2.0, an end-to-end
workflow system satisfying both requirements. In
addition, this system also supports a sophisticated
data debugger and a workflow visualization system. In
this demo, we will show how we used Data Civilizer 2.0
to help scientists at the Massachusetts General
Hospital build their cleaning and machine learning
pipeline on their 30TB brain activity dataset.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Spiegelberg:2019:TRE,
author = "Leonhard F. Spiegelberg and Tim Kraska",
title = "{Tuplex}: robust, efficient analytics when {Python}
rules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1958--1961",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352109",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Spark became the de facto industry standard as an
execution engine for data preparation, cleaning,
distributed machine learning, streaming, and
warehousing over raw data. However, with the success of
Python the landscape is shifting again; there is a
strong demand for tools which better integrate with the
Python landscape and do not have the impedance mismatch
like Spark. In this paper, we demonstrate Tuplex (short
for tuples and exceptions), a Python-native data
preparation framework that allows users to develop and
deploy pipelines faster and more robustly while
providing bare-metal execution times through code
compilation whenever possible.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Renggli:2019:EMC,
author = "Cedric Renggli and Frances Ann Hubis and Bojan Karlas
and Kevin Schawinski and Wentao Wu and Ce Zhang",
title = "{Ease.ml\slash ci} and {Ease.ml\slash meter} in
action: towards data management for statistical
generalization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1962--1965",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352110",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Developing machine learning (ML) applications is
similar to developing traditional software --- it is
often an iterative process in which developers navigate
within a rich space of requirements, design decisions,
implementations, empirical quality, and performance. In
traditional software development, software engineering
is the field of study which provides principled
guidelines for this iterative process. However, as of
today, the counterpart of ``software engineering for
ML'' is largely missing --- developers of ML
applications are left with powerful tools (e.g.,
TensorFlow and PyTorch) but little guidance regarding
the development lifecycle itself. In this paper, we
view the management of ML development life-cycles from
a data management perspective. We demonstrate two
closely related systems, ease.ml/ci and ease.ml/meter,
that provide some ``principled guidelines'' for ML
application development: ci is a continuous integration
engine for ML models and meter is a ``profiler'' for
controlling overfitting of ML models. Both systems
focus on managing the ``statistical generalization
power'' of datasets used for assessing the quality of
ML applications, namely, the validation set and the
test set. By demonstrating these two systems we hope to
spawn further discussions within our community on
building this new type of data management systems for
statistical generalization.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Han:2019:PRV,
author = "Xueran Han and Jun Chen and Jiaheng Lu and Yueguo Chen
and Xiaoyong Du",
title = "{PivotE}: revealing and visualizing the underlying
entity structures for exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1966--1969",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352111",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "A Web-scale knowledge graph (KG) typically contains
millions of entities and thousands of entity types. Due
to the lack of a pre-defined data schema such as the ER
model, entities in KGs are loosely coupled based on
their relationships, which brings challenges for
effective accesses of the KGs in a structured manner
like SPARQL. This demonstration presents an
entity-oriented exploratory search prototype system
that is able to support search and explore KGs in an
exploratory search manner, where local structures of
KGs can be dynamically discovered and utilized for
guiding users. The system applies a path-based ranking
method for recommending similar entities and their
relevant information as exploration pointers. The
interface is designed to assist users to investigate a
domain (particular type) of entities, as well as to
explore the knowledge graphs in various relevant
domains. The queries are dynamically formulated by
tracing the users' dynamic clicking (exploration)
behaviors. In this demonstration, we will show how our
system visualizes the underlying entity structures, as
well as explains the semantic correlations among them in
a unified interface, which not only assists users to
learn about the properties of entities in many aspects
but also guides them to further explore the information
space.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2019:SYA,
author = "Jiaheng Lu and Yuxing Chen and Herodotos Herodotou and
Shivnath Babu",
title = "Speedup your analytics: automatic parameter tuning for
databases and big data systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1970--1973",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352112",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database and big data analytics systems such as Hadoop
and Spark have a large number of configuration
parameters that control memory distribution, I/O
optimization, parallelism, and compression. Improper
parameter settings can cause significant performance
degradation and stability issues. However, regular
users and even expert administrators struggle to
understand and tune them to achieve good performance.
In this tutorial, we review existing approaches on
automatic parameter tuning for databases, Hadoop, and
Spark, which we classify into six categories:
rule-based, cost modeling, simulation-based,
experiment-driven, machine learning, and adaptive
tuning. We describe the foundations of different
automatic parameter tuning algorithms and present pros
and cons of each approach. We also highlight real-world
applications and systems, and identify research
challenges for handling cloud services, resource
heterogeneity, and real-time analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Meng:2019:TAC,
author = "Yu Meng and Jiaxin Huang and Jingbo Shang and Jiawei
Han",
title = "{TextCube}: automated construction and
multidimensional exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1974--1977",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352113",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Today's society is immersed in a wealth of text data,
ranging from news articles, to social media, research
literature, medical records, and corporate reports. A
grand challenge of data science and engineering is to
develop effective and scalable methods to extract
structures and knowledge from massive text data to
satisfy diverse applications, without extensive,
corpus-specific human annotations. In this tutorial, we
show that TextCube provides a critical information
organization structure that will satisfy such an
information need. We overview a set of recently
developed data-driven methods that facilitate automated
construction of TextCubes from massive, domain-specific
text corpora, and show that TextCubes so constructed
will enhance text exploration and analysis for various
applications. We focus on new TextCube construction
methods that are scalable, weakly-supervised,
domain-independent, language-agnostic, and effective
(i.e., generating quality TextCubes from large corpora
of various domains). We will demonstrate with real
datasets (including news articles, scientific
publications, and product reviews) on how TextCubes can
be constructed to assist multidimensional analysis of
massive text corpora.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Amer-Yahia:2019:EEO,
author = "Sihem Amer-Yahia and Senjuti Basu Roy",
title = "The ever evolving online labor market: overview,
challenges and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1978--1981",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352114",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The goal of this tutorial is to make the audience
aware of various discipline-specific research
activities that could be characterized to be part of
online labor markets and advocate for a unified
framework that is interdisciplinary in nature and
requires convergence of different research disciplines.
We will discuss how such a framework could bring
transformative effect on the nexus of humans,
technology, and the future of work.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sabek:2019:MLM,
author = "Ibrahim Sabek and Mohamed F. Mokbel",
title = "Machine learning meets big spatial data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1982--1985",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352115",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The proliferation in amounts of generated data has
propelled the rise of scalable machine learning
solutions to efficiently analyze and extract useful
insights from such data. Meanwhile, spatial data has
become ubiquitous, e.g., GPS data, with increasingly
sheer sizes in recent years. The applications of big
spatial data span a wide spectrum of interests
including tracking infectious disease, climate change
simulation, drug addiction, among others. Consequently,
major research efforts are exerted to support efficient
analysis and intelligence inside these applications by
either providing spatial extensions to existing machine
learning solutions or building new solutions from
scratch. In this 90-minute tutorial, we
comprehensively review the state-of-the-art work in the
intersection of machine learning and big spatial data.
We cover existing research efforts and challenges in
three major areas of machine learning, namely, data
analysis, deep learning and statistical inference, as
well as two advanced spatial machine learning tasks,
namely, spatial features extraction and spatial
sampling. We also highlight open problems and
challenges for future research in this area.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Nargesian:2019:DLM,
author = "Fatemeh Nargesian and Erkang Zhu and Ren{\'e}e J.
Miller and Ken Q. Pu and Patricia C. Arocena",
title = "Data lake management: challenges and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1986--1989",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352116",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The ubiquity of data lakes has created fascinating new
challenges for data management research. In this
tutorial, we review the state-of-the-art in data
management for data lakes. We consider how data lakes
are introducing new problems including dataset
discovery and how they are changing the requirements
for classic problems including data extraction, data
cleaning, data integration, data versioning, and
metadata management.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lakshmanan:2019:CFN,
author = "Laks V. S. Lakshmanan and Michael Simpson and
Saravanan Thirumuruganathan",
title = "Combating fake news: a data management and mining
perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1990--1993",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352117",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fake news is a major threat to global democracy
resulting in diminished trust in government, journalism
and civil society. The public popularity of social
media and social networks has caused a contagion of
fake news where conspiracy theories, disinformation and
extreme views flourish. Detection and mitigation of
fake news is one of the fundamental problems of our
times and has attracted widespread attention. While
fact checking websites such as snopes, politifact and
major companies such as Google, Facebook, and Twitter
have taken preliminary steps towards addressing fake
news, much more remains to be done. As an
interdisciplinary topic, various facets of fake news
have been studied by communities as diverse as machine
learning, databases, journalism, political science and
many more. The objective of this tutorial is two-fold.
First, we wish to familiarize the database community
with the efforts by other communities on combating fake
news. We provide a panoramic view of the
state-of-the-art of research on various aspects
including detection, propagation, mitigation, and
intervention of fake news. Next, we provide a concise
and intuitive summary of prior research by the database
community and discuss how it could be used to
counteract fake news. The tutorial covers research from
areas such as data integration, truth discovery and
fusion, probabilistic databases, knowledge graphs and
crowdsourcing from the lens of fake news. Effective
tools for addressing fake news could only be built by
leveraging the synergistic relationship between
database and other research communities. We hope that
our tutorial provides an impetus towards such synthesis
of ideas and the creation of new ones.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Anciaux:2019:PDS,
author = "Nicolas Anciaux and Luc Bouganim and Philippe Pucheral
and Iulian Sandu Popa and Guillaume Scerri",
title = "Personal database security and trusted execution
environments: a tutorial at the crossroads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1994--1997",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352118",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Smart disclosure initiatives and new regulations such
as GDPR in the EU increase the interest for Personal
Data Management Systems (PDMS) being provided to
individuals to preserve their entire digital life.
Consequently, the thorny issue of data security becomes
more and more prominent, but highly differs from
traditional privacy issues in outsourced corporate
databases. Concurrently, the emergence of Trusted
Execution Environments (TEE) changes the game in
privacy-preserving data management with novel security
models. This tutorial offers a global perspective of
the current state of work at the confluence of these
two rapidly growing areas. The goal is threefold: (1)
review and categorize PDMS solutions and identify
existing privacy threats and countermeasures; (2)
review new security models capitalizing on TEEs and
related privacy-preserving data management solutions
relevant to the personal context; (3) discuss new
challenges at the intersection of PDMS security and
TEE-based data management.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kessler:2019:SHG,
author = "Stephan Kessler and Jens Hoff and Johann-Christoph
Freytag",
title = "{SAP HANA} goes private: from privacy research to
privacy aware enterprise analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "1998--2009",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352119",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Over the last 20 years, the progress of information
technology has allowed many companies to generate,
integrate, store, and analyze data of unprecedented
size and complexity. In many cases, this data is
personal data and how it can be used is therefore
subject to laws that depend on the specific countries
and application domains. For example, the General Data
Protection Regulation (GDPR) introduced in the European
Union imposes strict rules on how personal data can be
processed. Analyzing personal data can create
tremendous value, but at the same time companies must
ensure that they remain legally compliant.
Unfortunately, existing systems offer only limited or
no support at all for processing personal data in a
privacy-aware manner. Approaches that have emerged from
the academic and industrial research environments need
to be integrated into large systems (like enterprise
systems) in a manageable and scalable way. In many IT
environments, it is also desirable and necessary to
combine and to integrate personal data with other
(non-personal) data in a seamless fashion. In this
paper, we present the first steps that SAP has taken to
provide its database management system SAP HANA with
privacy-enhanced processing capabilities, referred to
in the following as SAP HANA Data Anonymization.
Various goals on both the conceptual and technical
levels were followed with the aim of providing SAP
customers today with an integrated processing
environment for personal and non-personal data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Damasio:2019:GAL,
author = "Guilherme Damasio and Vincent Corvinelli and Parke
Godfrey and Piotr Mierzejewski and Alex Mihaylov and
Jaroslaw Szlichta and Calisto Zuzarte",
title = "Guided automated learning for query workload
re-optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2010--2021",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352120",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Query optimization is a hallmark of database systems.
When an SQL query runs more expensively than is viable
or warranted, determination of the performance issues
is usually performed manually in consultation with
experts through the analysis of a query's execution plan
(QEP). However, this is an excessively time consuming,
human error-prone, and costly process. GALO is a novel
system that automates this process. The tool
automatically learns recurring problem patterns in
query plans over workloads in an offline learning
phase, to build a knowledge base of plan-rewrite
remedies. It then uses the knowledge base online to
re-optimize queries often quite drastically. GALO's
knowledge base is built on RDF and SPARQL, W3C graph
database standards, which is well suited for
manipulating and querying over SQL query plans, which
are graphs themselves. GALO acts as a third-tier of
re-optimization, after query rewrite and cost-based
optimization, as a query plan rewrite. For generality,
the context of knowledge base problem patterns,
including table and column names, is abstracted with
canonical symbol labels. Since the knowledge base is
not tied to the context of supplied QEPs, table and
column names are matched automatically during the
re-optimization phase. Thus, problem patterns learned
over a particular query workload can be applied in
other query workloads. GALO's knowledge base is also an
invaluable tool for database experts to debug query
performance issues by tracking to known issues and
solutions as well as refining the optimizer with new
tuned techniques by the development team. We
demonstrate an experimental study of the effectiveness
of our techniques over synthetic TPC-DS and real IBM
client query workloads.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chattopadhyay:2019:PUS,
author = "Biswapesh Chattopadhyay and Priyam Dutta and Weiran
Liu and Ott Tinn and Andrew McCormick and Aniket
Mokashi and Paul Harvey and Hector Gonzalez and David
Lomax and Sagar Mittal and Roee Ebenstein and Nikita
Mikhaylin and Hung-ching Lee and Xiaoyan Zhao and Tony
Xu and Luis Perez and Farhad Shahmohammadi and Tran Bui
and Neil McKay and Selcuk Aya and Vera Lychagina and
Brett Elliott",
title = "{Procella}: unifying serving and analytical data at
{YouTube}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2022--2034",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352121",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Large organizations like YouTube are dealing with
exploding data volume and increasing demand for data
driven applications. Broadly, these can be categorized
as: reporting and dashboarding, embedded statistics in
pages, time-series monitoring, and ad-hoc analysis.
Typically, organizations build specialized
infrastructure for each of these use cases. This,
however, creates silos of data and processing, and
results in a complex, expensive, and harder to maintain
infrastructure. At YouTube, we solved this problem by
building a new SQL query engine --- Procella. Procella
implements a superset of capabilities required to
address all of the four use cases above, with high
scale and performance, in a single product. Today,
Procella serves hundreds of billions of queries per day
across all four workloads at YouTube and several other
Google product areas.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lu:2019:LET,
author = "Wei Lu and Zhanhao Zhao and Xiaoyu Wang and Haixiang
Li and Zhenmiao Zhang and Zhiyu Shui and Sheng Ye and
Anqun Pan and Xiaoyong Du",
title = "A lightweight and efficient temporal database
management system in {TDSQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2035--2046",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352122",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Driven by the recent adoption of temporal expressions
into SQL:2011, extensions of temporal support in
conventional database management systems (a.k.a. DBMSs)
have re-emerged as a research hotspot. In this paper,
we present a lightweight yet efficient built-in
temporal implementation in Tencent's distributed
database management system, namely TDSQL. The novelty
of TDSQL's temporal implementation includes: (1) a new
temporal data model with the extension of SQL:2011, (2)
a built-in temporal implementation with various
optimizations, which are also applicable to other
DBMSs, and (3) a low-storage-consumption in which only
data changes are maintained. For the repeatability
purpose, we elaborate the integration of our proposed
techniques into MySQL. We conduct extensive experiments
on both real-life dataset and synthetic TPC benchmarks
by comparing TDSQL with other temporal databases. The
results show that TDSQL is lightweight and efficient.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sherkat:2019:NSE,
author = "Reza Sherkat and Colin Florendo and Mihnea Andrei and
Rolando Blanco and Adrian Dragusanu and Amit Pathak and
Pushkar Khadilkar and Neeraj Kulkarni and Christian
Lemke and Sebastian Seifert and Sarika Iyer and
Sasikanth Gottapu and Robert Schulze and Chaitanya
Gottipati and Nirvik Basak and Yanhong Wang and Vivek
Kandiyanallur and Santosh Pendap and Dheren Gala and
Rajesh Almeida and Prasanta Ghosh",
title = "Native store extension for {SAP HANA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2047--2058",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352123",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present an overview of SAP HANA's Native Store
Extension (NSE). This extension substantially increases
database capacity, allowing to scale far beyond
available system memory. NSE is based on a hybrid
in-memory and paged column store architecture composed
from data access primitives. These primitives enable
the processing of hybrid columns using the same
algorithms optimized for traditional HANA's in-memory
columns. Using only three key primitives, we fabricated
byte-compatible counterparts for complex memory
resident data structures (e.g. dictionary and
hash-index), compressed schemes (e.g. sparse and
run-length encoding), and exotic data types (e.g.
geo-spatial). We developed a new buffer cache which
optimizes the management of paged resources by smart
strategies sensitive to page type and access patterns.
The buffer cache integrates with HANA's new execution
engine that issues pipelined prefetch requests to
improve disk access patterns. A novel load unit
configuration, along with a unified persistence format,
allows the hybrid column store to dynamically switch
between in-memory and paged data access to balance
performance and storage economy according to
application demands while reducing Total Cost of
Ownership (TCO). A new partitioning scheme supports
load unit specification at table, partition, and column
level. Finally, a new advisor recommends optimal load
unit configurations. Our experiments illustrate the
performance and memory footprint improvements on
typical customer scenarios.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: AnalyticDB, Alibaba Cloud's real-time OLAP database (PVLDB 12(12)).
@Article{Zhan:2019:ART,
author = "Chaoqun Zhan and Maomeng Su and Chuangxian Wei and
Xiaoqiang Peng and Liang Lin and Sheng Wang and Zhe
Chen and Feifei Li and Yue Pan and Fang Zheng and
Chengliang Chai",
title = "{AnalyticDB}: real-time {OLAP} database system at
{Alibaba} cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2059--2070",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352124",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With data explosion in scale and variety, OLAP
databases play an increasingly important role in
serving real-time analysis with low latency (e.g.,
hundreds of milliseconds), especially when incoming
queries are complex and ad hoc in nature. Moreover,
these systems are expected to provide high query
concurrency and write throughput, and support queries
over structured and complex data types (e.g., JSON,
vector and texts). In this paper, we introduce
AnalyticDB, a real-time OLAP database system developed
at Alibaba. AnalyticDB maintains all-column indexes in
an asynchronous manner with acceptable overhead, which
provides low latency for complex ad-hoc queries. Its
storage engine extends hybrid row-column layout for
fast retrieval of both structured data and data of
complex types. To handle large-scale data with high
query concurrency and write throughput, AnalyticDB
decouples read and write access paths. To further
reduce query latency, novel storage-aware SQL optimizer
and execution engine are developed to fully utilize the
advantages of the underlying storage and indexes.
AnalyticDB has been successfully deployed on Alibaba
Cloud to serve numerous customers (both large and
small). It is capable of holding 100 trillion rows of
records, i.e., 10PB+ in size. At the same time, it is
able to serve 10m+ writes and 100k+ queries per second,
while completing complex queries within hundreds of
milliseconds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: tunable consistency levels in MongoDB replication (PVLDB 12(12)).
@Article{Schultz:2019:TCM,
author = "William Schultz and Tess Avitabile and Alyson Cabral",
title = "Tunable consistency in {MongoDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2071--2081",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352125",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed databases offer high availability but can
impose high costs on client applications in order to
maintain strong consistency at all times. MongoDB is a
document oriented database whose replication system
provides a variety of consistency levels allowing
client applications to select the trade-offs they want
to make when it comes to consistency and latency, at a
per operation level. In this paper we discuss the
tunable consistency models in MongoDB replication and
their utility for application developers. We discuss
how the MongoDB replication system's speculative
execution model and data rollback protocol help make
this spectrum of consistency levels possible. We also
present case studies of how these consistency levels
are used in real world applications, along with a
characterization of their performance benefits and
trade-offs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: TitAnt, Ant Financial's online real-time transaction fraud detection system.
@Article{Cao:2019:TOR,
author = "Shaosheng Cao and XinXing Yang and Cen Chen and Jun
Zhou and Xiaolong Li and Yuan Qi",
title = "{TitAnt}: online real-time transaction fraud detection
in {Ant Financial}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2082--2093",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352126",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the explosive growth of e-commerce and the
booming of e-payment, detecting online transaction
fraud in real time has become increasingly important to
Fintech business. To tackle this problem, we introduce
the TitAnt, a transaction fraud detection system
deployed in Ant Financial, one of the largest Fintech
companies in the world. The system is able to predict
online real-time transaction fraud in mere
milliseconds. We present the problem definition,
feature extraction, detection methods, implementation
and deployment of the system, as well as empirical
effectiveness. Extensive experiments have been
conducted on large real-world transaction data to show
the effectiveness and the efficiency of the proposed
system.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: AliGraph, Alibaba's graph neural network platform.
%%% Fix: percentage range "40\%-50\%" used a single hyphen; normalized to the
%%% en-dash form "40\%--50\%" to match the range style used elsewhere in this
%%% abstract ("4.12\%--17.19\%") and in this file's page ranges.
@Article{Zhu:2019:ACG,
author = "Rong Zhu and Kun Zhao and Hongxia Yang and Wei Lin and
Chang Zhou and Baole Ai and Yong Li and Jingren Zhou",
title = "{AliGraph}: a comprehensive graph neural network
platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2094--2105",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352127",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "An increasing number of machine learning tasks require
dealing with large graph datasets, which capture rich
and complex relationship among potentially billions of
elements. Graph Neural Network (GNN) becomes an
effective way to address the graph learning problem by
converting the graph data into a low dimensional space
while keeping both the structural and property
information to the maximum extent and constructing a
neural network for training and referencing. However,
it is challenging to provide an efficient graph storage
and computation capabilities to facilitate GNN training
and enable development of new GNN algorithms. In this
paper, we present a comprehensive graph neural network
system, namely AliGraph, which consists of distributed
graph storage, optimized sampling operators and runtime
to efficiently support not only existing popular GNNs
but also a series of in-house developed ones for
different scenarios. The system is currently deployed
at Alibaba to support a variety of business scenarios,
including product recommendation and personalized
search at Alibaba's E-Commerce platform. By conducting
extensive experiments on a real-world dataset with
492.90 million vertices, 6.82 billion edges and rich
attributes, AliGraph performs an order of magnitude
faster in terms of graph building (5 minutes vs hours
reported from the state-of-the-art PowerGraph
platform). At training, AliGraph runs 40\%--50\% faster
with the novel caching strategy and demonstrates around
12 times speed up with the improved runtime. In
addition, our in-house developed GNN models all
showcase their statistically significant superiorities
in terms of both effectiveness and efficiency (e.g.,
4.12\%--17.19\% lift by F1 scores).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: customizable scale-out fuzzy join (LSH-based), evaluated on Azure Databricks Spark.
@Article{Chen:2019:CSF,
author = "Zhimin Chen and Yue Wang and Vivek Narasayya and
Surajit Chaudhuri",
title = "Customizable and scalable fuzzy join for big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2106--2117",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352128",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Fuzzy join is an important primitive for data
cleaning. The ability to customize fuzzy join is
crucial to allow applications to address
domain-specific data quality issues such as synonyms
and abbreviations. While efficient indexing techniques
exist for single-node implementations of customizable
fuzzy join, the state-of-the-art scale-out techniques
do not support customization, and exhibit poor
performance and scalability characteristics. We
describe the design of a scale-out fuzzy join operator
that supports customization. We use a
locality-sensitive-hashing (LSH) based signature
scheme, and introduce optimizations that result in
significant speed up with negligible impact on recall.
We evaluate our implementation on the Azure Databricks
version of Spark using several real-world and synthetic
data sets. We observe speedups exceeding 50X compared
to the best-known prior scale-out technique, and close
to linear scalability with data size and number of
nodes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Research paper: QTune, query-aware database knob tuning via deep reinforcement learning (DS-DDPG).
@Article{Li:2019:QQA,
author = "Guoliang Li and Xuanhe Zhou and Shifu Li and Bo Gao",
title = "{QTune}: a query-aware database tuning system with
deep reinforcement learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2118--2130",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352129",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Database knob tuning is important to achieve high
performance (e.g., high throughput and low latency).
However, knob tuning is an NP-hard problem and existing
methods have several limitations. First, DBAs cannot
tune a lot of database instances on different
environments (e.g., different database vendors).
Second, traditional machine-learning methods either
cannot find good configurations or rely on a lot of
high-quality training examples which are rather hard to
obtain. Third, they only support coarse-grained tuning
(e.g., workload-level tuning) but cannot provide
fine-grained tuning (e.g., query-level tuning). To
address these problems, we propose a query-aware
database tuning system QTune with a deep reinforcement
learning (DRL) model, which can efficiently and
effectively tune the database configurations. QTune
first featurizes the SQL queries by considering rich
features of the SQL queries. Then QTune feeds the query
features into the DRL model to choose suitable
configurations. We propose a Double-State Deep
Deterministic Policy Gradient (DS-DDPG) model to enable
query-aware database configuration tuning, which
utilizes the actor-critic networks to tune the database
configurations based on both the query vector and
database states. QTune provides three database tuning
granularities: query-level, workload-level, and
cluster-level tuning. We deployed our techniques onto
three real database systems, and experimental results
show that QTune achieves high performance and
outperforms the state-of-the-art tuning methods.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Experience paper: sampling-based approximate query processing in Microsoft's production big-data clusters.
@Article{Kandula:2019:EAQ,
author = "Srikanth Kandula and Kukjin Lee and Surajit Chaudhuri
and Marc Friedman",
title = "Experiences with approximating queries in
{Microsoft}'s production big-data clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2131--2142",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352130",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With the rapidly growing volume of data, it is more
attractive than ever to leverage approximations to
answer analytic queries. Sampling is a powerful
technique which has been studied extensively from the
point of view of facilitating approximation. Yet, there
has been no large-scale study of effectiveness of
sampling techniques in big data systems. In this paper,
we describe an in-depth study of the sampling-based
approximation techniques that we have deployed in
Microsoft's big data clusters. We explain the choices
we made to implement approximation, identify the usage
cases, and study detailed data that sheds insight on
the usefulness of doing sampling based approximation.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: constant time recovery (ARIES + MVCC) in Azure SQL Database / SQL Server.
@Article{Antonopoulos:2019:CTR,
author = "Panagiotis Antonopoulos and Peter Byrne and Wayne Chen
and Cristian Diaconu and Raghavendra Thallam
Kodandaramaih and Hanuma Kodavalla and Prashanth
Purnananda and Adrian-Leonard Radu and Chaitanya
Sreenivas Ravella and Girish Mittur Venkataramanappa",
title = "Constant time recovery in {Azure SQL} database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2143--2154",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352131",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Azure SQL Database and the upcoming release of SQL
Server introduce a novel database recovery mechanism
that combines traditional ARIES recovery with
multi-version concurrency control to achieve database
recovery in constant time, regardless of the size of
user transactions. Additionally, our algorithm enables
continuous transaction log truncation, even in the
presence of long running transactions, thereby allowing
large data modifications using only a small, constant
amount of log space. These capabilities are
particularly important for any Cloud database service
given (a) the constantly increasing database sizes, (b)
the frequent failures of commodity hardware, (c) the
strict availability requirements of modern, global
applications and (d) the fact that software upgrades
and other maintenance tasks are managed by the Cloud
platform, introducing unexpected failures for the
users. This paper describes the design of our recovery
algorithm and demonstrates how it allowed us to improve
the availability of Azure SQL Database by guaranteeing
consistent recovery times of under 3 minutes for
99.999\% of recovery cases in production.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: Yugong, geo-distributed data and job placement across Alibaba's data centers.
@Article{Huang:2019:YGD,
author = "Yuzhen Huang and Yingjie Shi and Zheng Zhong and Yihui
Feng and James Cheng and Jiwei Li and Haochuan Fan and
Chao Li and Tao Guan and Jingren Zhou",
title = "{Yugong}: geo-distributed data and job placement at
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2155--2169",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352132",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Companies like Alibaba operate tens of data centers
(DCs) across geographically distributed locations.
These DCs collectively provide the storage space and
computing power for the company, storing EBs of data
and serving millions of batch analytics jobs every day.
In Alibaba, as our businesses grow, there are more and
more cross-DC dependencies caused by jobs reading data
from remote DCs. Consequently, the precious wide area
network bandwidth becomes a major bottleneck for
operating geo-distributed DCs at scale. In this paper,
we present Yugong --- a system that manages data
placement and job placement in Alibaba's
geo-distributed DCs, with the objective to minimize
cross-DC bandwidth usage. Yugong uses three methods,
namely project placement, table replication, and job
outsourcing, to address the issues of high bandwidth
consumption across the DCs. We give the details of
Yugong's design and implementation for the three
methods, and describe how it cooperates with other
systems (e.g., Alibaba's big data analytics platform
and cluster scheduler) to improve the productivity of
the DCs. We also report comprehensive performance
evaluation results, which validate the design of Yugong
and show that significant reduction in cross-DC
bandwidth usage has been achieved.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Evaluation paper: TPC-H comparison of cloud DBMS architectures (Redshift, Athena, Presto, Hive, Vertica) on AWS.
@Article{Tan:2019:CCD,
author = "Junjay Tan and Thanaa Ghanem and Matthew Perron and
Xiangyao Yu and Michael Stonebraker and David DeWitt
and Marco Serafini and Ashraf Aboulnaga and Tim
Kraska",
title = "Choosing a cloud {DBMS}: architectures and tradeoffs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2170--2182",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352133",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As analytic (OLAP) applications move to the cloud,
DBMSs have shifted from employing a pure shared-nothing
design with locally attached storage to a hybrid design
that combines the use of shared-storage (e.g., AWS S3)
with the use of shared-nothing query execution
mechanisms. This paper sheds light on the resulting
tradeoffs, which have not been properly identified in
previous work. To this end, it evaluates the TPC-H
benchmark across a variety of DBMS offerings running in
a cloud environment (AWS) on fast 10Gb+ networks,
specifically database-as-a-service offerings (Redshift,
Athena), query engines (Presto, Hive), and a
traditional cloud agnostic OLAP database (Vertica).
While these comparisons cannot be apples-to-apples in
all cases due to cloud configuration restrictions, we
nonetheless identify patterns and design choices that
are advantageous. These include prioritizing low-cost
object stores like S3 for data storage, using system
agnostic yet still performant columnar formats like ORC
that allow easy switching to other systems for
different workloads, and making features that benefit
subsequent runs like query precompilation and caching
remote data to faster storage optional rather than
required because they disadvantage ad hoc queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: S3, scalable in-memory skip-list index for Alibaba Cloud's RocksDB variant.
@Article{Zhang:2019:SSM,
author = "Jingtian Zhang and Sai Wu and Zeyuan Tan and Gang Chen
and Zhushi Cheng and Wei Cao and Yusong Gao and Xiaojie
Feng",
title = "{S3}: a scalable in-memory skip-list index for
key--value store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2183--2194",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352134",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Many new memory indexing structures have been proposed
and outperform current in-memory skip-list index
adopted by LevelDB, RocksDB and other key--value
systems. However, those new indexes cannot be easily
integrated with key--value systems, because most of
them do not consider how the data can be efficiently
flushed to disk. Some assumptions, such as fixed size
key and value, are unrealistic for real applications.
In this paper, we present S3, a scalable in-memory
skip-list index for the customized version of RocksDB
in Alibaba Cloud. S3 adopts a two-layer structure. In
the top layer, a cache-sensitive structure is used to
maintain a few guard entries to facilitate the search
over the skip-list. In the bottom layer, a semi-ordered
skip-list index is built to support highly concurrent
insertions and fast lookup and range query. To further
improve the performance, we train a neural model to
select guard entries intelligently according to the
data distribution and query distribution. Experiments
on multiple datasets show that S3 achieves a comparable
performance to other new memory indexing schemes, and
can replace current in-memory skip-list of LevelDB and
RocksDB to support huge volume of data.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: DDSketch, Datadog's mergeable relative-error quantile sketch.
%%% Fix: corrected the misspelling "mergeablility" -> "mergeability" in the
%%% abstract transcription.
@Article{Masson:2019:DFF,
author = "Charles Masson and Jee E. Rim and Homin K. Lee",
title = "{DDSketch}: a fast and fully-mergeable quantile sketch
with relative-error guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2195--2205",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352135",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Summary statistics such as the mean and variance are
easily maintained for large, distributed data streams,
but order statistics (i.e., sample quantiles) can only
be approximately summarized. There is extensive
literature on maintaining quantile sketches where the
emphasis has been on bounding the rank error of the
sketch while using little memory. Unfortunately, rank
error guarantees do not preclude arbitrarily large
relative errors, and this often occurs in practice when
the data is heavily skewed. Given the distributed
nature of contemporary large-scale systems, another
crucial property for quantile sketches is
mergeability, i.e., several combined sketches must be
as accurate as a single sketch of the same data. We
present the first fully-mergeable, relative-error
quantile sketching algorithm with formal guarantees.
The sketch is extremely fast and accurate, and is
currently being used by Datadog at a wide-scale.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: Tencent's distributed large-scale n-gram language model serving system (WeChat ASR).
@Article{Long:2019:DSL,
author = "Qiang Long and Wei Wang and Jinfu Deng and Song Liu
and Wenhao Huang and Fangying Chen and Sifan Liu",
title = "A distributed system for large-scale $n$-gram language
models at {Tencent}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2206--2217",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352136",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "n-gram language models are widely used in language
processing applications, e.g., automatic speech
recognition, for ranking the candidate word sequences
generated from the generator model, e.g., the acoustic
model. Large n-gram models typically give good ranking
results; however, they require a huge amount of memory
storage. While distributing the model across multiple
nodes resolves the memory issue, it nonetheless incurs
a great network communication overhead and introduces a
different bottleneck. In this paper, we present our
distributed system developed at Tencent with novel
optimization techniques for reducing the network
overhead, including distributed indexing, batching and
caching. They reduce the network requests and
accelerate the operation on each single node. We also
propose a cascade fault-tolerance mechanism which
adaptively switches to small n-gram models depending on
the severity of the failure. Experimental study on 9
automatic speech recognition (ASR) datasets confirms
that our distributed system scales to large models
efficiently, effectively and robustly. We have
successfully deployed it for Tencent's WeChat ASR with
the peak network traffic at the scale of 100 millions
of messages per minute.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Research paper: SiliconDB, morsel-driven query execution for heterogeneous multi-cores (Sparc M7).
@Article{Dursun:2019:MDQ,
author = "Kayhan Dursun and Carsten Binnig and Ugur Cetintemel
and Garret Swart and Weiwei Gong",
title = "A morsel-driven query execution engine for
heterogeneous multi-cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2218--2229",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Currently, we face the next major shift in processor
designs that arose from the physical limitations known
as the ``dark silicon effect''. Due to thermal
limitations and shrinking transistor sizes, multi-core
scaling is coming to an end. A major new direction that
hardware vendors are currently investigating involves
specialized and energy-efficient hardware accelerators
(e.g., ASICs) placed on the same die as the normal CPU
cores. In this paper, we present a novel query
processing engine called SiliconDB that targets such
heterogeneous processor environments. We leverage the
Sparc M7 platform to develop and test our ideas. Based
on the SSB benchmarks, as well as other micro
benchmarks, we compare the efficiency of SiliconDB with
existing execution strategies that make use of
co-processors (e.g., FPGAs, GPUs) and demonstrate
speed-up improvements of up to 2x.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: Smile, labeling + active-learning system for machine learning on EEG data at scale.
@Article{Cao:2019:SSS,
author = "Lei Cao and Wenbo Tao and Sungtae An and Jing Jin and
Yizhou Yan and Xiaoyu Liu and Wendong Ge and Adam Sah
and Leilani Battle and Jimeng Sun and Remco Chang and
Brandon Westover and Samuel Madden and Michael
Stonebraker",
title = "{Smile}: a system to support machine learning on {EEG}
data at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2230--2241",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352138",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In order to reduce the possibility of neural injury
from seizures and sidestep the need for a neurologist
to spend hours on manually reviewing the EEG recording,
it is critical to automatically detect and classify
``interictal-ictal continuum'' (IIC) patterns from EEG
data. However, the existing IIC classification
techniques are shown to be not accurate and robust
enough for clinical use because of the lack of high
quality labels of EEG segments as training data.
Obtaining high-quality labeled data is traditionally a
manual process by trained clinicians that can be
tedious, time-consuming, and error-prone. In this work,
we propose Smile, an industrial scale system that
provides an end-to-end solution to the IIC pattern
classification problem. The core components of Smile
include a visualization-based time series labeling
module and a deep-learning based active learning
module. The labeling module enables the users to
explore and label 350 million EEG segments (30TB) at
interactive speed. The multiple coordinated views allow
the users to examine the EEG signals from both time
domain and frequency domain simultaneously. The active
learning module first trains a deep neural network that
automatically extracts both the local features with
respect to each segment itself and the long term
dynamics of the EEG signals to classify IIC patterns.
Then leveraging the output of the deep learning model,
the EEG segments that can best improve the model are
selected and prompted to clinicians to label. This
process is iterated until the clinicians and the models
show high degree of agreement. Our initial experimental
results show that our Smile system allows the
clinicians to label the EEG segments at will with a
response time below 500 ms. The accuracy of the model
is progressively improved as more and more high quality
labels are acquired over time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industry-academia paper: current and redesigned graph update semantics in the Cypher language (Neo4j).
@Article{Green:2019:UGD,
author = "Alastair Green and Paolo Guagliardo and Leonid Libkin
and Tobias Lindaaker and Victor Marsault and Stefan
Plantikow and Martin Schuster and Petra Selmer and
Hannes Voigt",
title = "Updating graph databases with {Cypher}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2242--2254",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The paper describes the present and the future of
graph updates in Cypher, the language of the Neo4j
property graph database and several other products.
Update features include those with clear analogs in
relational databases, as well as those that do not
correspond to any relational operators. Moreover,
unlike SQL, Cypher updates can be arbitrarily
intertwined with querying clauses. After presenting the
current state of update features, we point out their
shortcomings, most notably violations of atomicity and
non-deterministic behavior of updates. These have not
been previously known in the Cypher community. We then
describe the industry-academia collaboration on
designing a revised set of Cypher update operations.
Based on discovered shortcomings of update features, a
number of possible solutions were devised. They were
presented to key Cypher users, who were given the
opportunity to comment on how update features are used
in real life, and on their preferences for proposed
fixes. As the result of the consultation, a new set of
update operations for Cypher were designed. Those led
to a streamlined syntax, and eliminated the unexpected
and problematic behavior that original Cypher updates
exhibited.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
%%% Industrial paper: adapting the TPC-C benchmark to MongoDB multi-document transactions.
%%% NOTE(review): page range 2254--2262 begins on the final page of the
%%% preceding entry (Green:2019:UGD, 2242--2254); this matches the publisher's
%%% records for the issue, but verify against the ACM DL before "fixing".
@Article{Kamsky:2019:ATC,
author = "Asya Kamsky",
title = "Adapting {TPC-C} benchmark to measure performance of
multi-document transactions in {MongoDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2254--2262",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "MongoDB is a popular distributed database that
supports replication, horizontal partitioning
(sharding), a flexible document schema and ACID
guarantees on the document level. While it is generally
grouped with ``NoSQL'' databases, MongoDB provides many
features similar to those of traditional RDBMS such as
secondary indexes, an ad hoc query language, support
for complex aggregations, and new as of version 4.0
multi-statement, multi-document ACID transactions. We
looked for a well understood OLTP workload benchmark to
use in our own system performance test suite to
establish a baseline of transaction performance to
enable flagging performance regressions, as well as
improvements as we continue to add new functionality.
While there exist many published and widely used
benchmarks for RDBMS OLTP workloads, there are none
specifically for document databases. This paper
describes the process of adapting an existing
traditional RDBMS benchmark to MongoDB query language
and transaction semantics to allow measuring
transaction performance. We chose to adapt the TPC-C
benchmark even though it assumes a relational database
schema and SQL, hence extensive changes had to be made
to stay consistent with MongoDB best practices. Our
goal did not include creating official TPC-C
certifiable results, however, every attempt was made to
stay consistent with the spirit of the original
benchmark specification as well as to be compliant to
all specification requirements where possible. We
discovered that following best practices for document
schema design achieves better performance than using
required normalized schema. All the source code used
and validation scripts are published in github to allow
the reader to recreate and verify our results.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2019:CND,
author = "Feifei Li",
title = "Cloud-native database systems at {Alibaba}:
opportunities and challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2263--2272",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cloud-native databases become increasingly important
for the era of cloud computing, due to the needs for
elasticity and on-demand usage by various applications.
These challenges from cloud applications present new
opportunities for cloud-native databases that cannot be
fully addressed by traditional on-premise enterprise
database systems. A cloud-native database leverages
software-hardware co-design to explore accelerations
offered by new hardware such as RDMA, NVM, kernel
bypassing protocols such as DPDK. Meanwhile, new design
architectures, such as shared storage, enable a
cloud-native database to decouple computation from
storage and provide excellent elasticity. For highly
concurrent workloads that require horizontal
scalability, a cloud-native database can leverage a
shared-nothing layer to provide distributed query and
transaction processing. Applications also require
cloud-native databases to offer high availability
through distributed consensus protocols. At Alibaba, we
have explored a suite of technologies to design
cloud-native database systems. Our storage engine,
X-Engine and PolarFS, improves both write and read
throughputs by using a LSM-tree design and self-adapted
separation of hot and cold data records. Based on these
efforts, we have designed and implemented POLARDB and
its distributed version POLARDB-X, which has
successfully supported the extreme transaction
workloads during the 2018 Global Shopping Festival on
November 11, 2018, and achieved commercial success on
Alibaba Cloud. We have also designed an OLAP system
called AnalyticDB (ADB in short) for enabling real-time
interactive data analytics for big data. We have
explored a self-driving database platform to achieve
autoscaling and intelligent database management. We
will report key technologies and lessons learned to
highlight the technical challenges and opportunities
for cloud-native database systems at Alibaba.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Boehm:2019:MME,
author = "Alexander Boehm",
title = "In-memory for the masses: enabling cost-efficient
deployments of in-memory data management platforms for
business applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2273--2275",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "With unrivaled performance, modern in-memory data
management platforms such as SAP HANA [5] enable the
creation of novel types of business applications. By
keeping all data in memory, applications may combine
both demanding transactional as well as complex
analytical workloads in the context of a single system.
While this excellent performance, data freshness, and
flexibility gain is highly desirable in a vast range of
modern business applications [6], the corresponding
large appetite for main memory has significant
implications on server sizing. Particularly, hardware
costs on premise as well as in the cloud are at risk to
increase significantly, driven by the high amount of
DRAM that needs to be provisioned potentially. In this
talk, we discuss a variety of challenges and
opportunities that arise when running business
applications in a cost-efficient manner on in-memory
database systems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hubail:2019:CAN,
author = "Murtadha {Al Hubail} and Ali Alsuliman and Michael
Blow and Michael Carey and Dmitry Lychagin and Ian
Maxon and Till Westmann",
title = "Couchbase analytics: {NoETL} for scalable {NoSQL} data
analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2275--2286",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Couchbase Server is a highly scalable
document-oriented database management system. With a
shared-nothing architecture, it exposes a fast
key--value store with a managed cache for
sub-millisecond data operations, indexing for fast
queries, and a powerful query engine for executing
declarative SQL-like queries. Its Query Service debuted
several years ago and supports high volumes of
low-latency queries and updates for JSON documents. Its
recently introduced Analytics Service complements the
Query Service. Couchbase Analytics, the focus of this
paper, supports complex analytical queries (e.g., ad
hoc joins and aggregations) over large collections of
JSON documents. This paper describes the Analytics
Service from the outside in, including its user model,
its SQL++ based query language, and its MPP-based
storage and query processing architecture. It also
briefly touches on the relationship of Couchbase
Analytics to Apache AsterixDB, the open source Big Data
management system at the core of Couchbase Analytics.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Colyer:2019:PS,
author = "Adrian Colyer",
title = "Performance in the spotlight",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2287--2289",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Performance in its various guises features prominently
in research evaluations, and rightly so. Without
adequate performance a system is not fit for purpose.
That doesn't necessarily mean we should pursue
performance at all costs though. In this talk we'll
explore a variety of additional evaluation criteria,
with a focus on those that are most important to
practitioners, and ask whether or not considering them
can open up interesting avenues of research.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Abouzied:2019:ILS,
author = "Azza Abouzied and Daniel J. Abadi and Kamil
Bajda-Pawlikowski and Avi Silberschatz",
title = "Integration of large-scale data processing systems and
traditional parallel database technology",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2290--2299",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In 2009 we explored the feasibility of building a
hybrid SQL data analysis system that takes the best
features from two competing technologies: large-scale
data processing systems (such as Google MapReduce and
Apache Hadoop) and parallel database management systems
(such as Greenplum and Vertica). We built a prototype,
HadoopDB, and demonstrated that it can deliver the high
SQL query performance and efficiency of parallel
database management systems while still providing the
scalability, fault tolerance, and flexibility of
large-scale data processing systems. Subsequently,
HadoopDB grew into a commercial product, Hadapt, whose
technology was eventually acquired by Teradata. In this
paper, we provide an overview of HadoopDB's original
design, and its evolution during the subsequent ten
years of research and development effort. We describe
how the project innovated both in the research lab, and
as a commercial product at Hadapt and Teradata. We then
discuss the current vibrant ecosystem of software
projects (most of which are open source) that continued
HadoopDB's legacy of implementing a systems level
integration of large-scale data processing systems and
parallel database technology.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cooper:2019:PSL,
author = "Brian F. Cooper and P. P. S. Narayan and Raghu
Ramakrishnan and Utkarsh Srivastava and Adam
Silberstein and Philip Bohannon and Hans-Arno Jacobsen
and Nick Puz and Daniel Weaver and Ramana Yerneni",
title = "{PNUTS} to {Sherpa}: lessons from {Yahoo!}'s cloud
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2300--2307",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352146",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we look back at the evolution of
Yahoo!'s geo-replicated cloud data store from a
research project called PNUTS to a globally deployed
production system called Sherpa, share some of the
lessons learned along the way, and finally, compare
PNUTS with current operational cloud stores.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Tan:2019:WPD,
author = "Wang-Chiew Tan",
title = "What {I} probably did right and what {I} think {I}
could have done better",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2308--2308",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352147",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "They say a lot of good things in life are not free.
Success is one of them. Successful research requires an
immense amount of hard work and dedication over a long
period of time. For better or worse, hard work alone
does not guarantee success. In my experience, success
is a marathon of hard work and some luck along the way.
What is often forgotten is that it is important to
enjoy the journey of hard work and appreciate many
experiences and relationships along the way. I am
deeply honored to receive the 2019 VLDB Women in
Database Research Award. In the talk, I will share with
you a retrospective of my journey so far, what I
probably did right along the way, and perhaps more
importantly, the many things I think I could have done
better as a computer scientist and especially a female
computer scientist.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Parameswaran:2019:EDS,
author = "Aditya Parameswaran",
title = "Enabling data science for the majority",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2309--2322",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352148",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite great strides in the generation, collection,
and processing of data at scale, data science is still
extremely inconvenient for the vast majority of the
population. The driving goal of our research, over the
past half decade, has been to make it easy for
individuals and teams---regardless of programming or
analysis expertise---manage, analyze, make sense of,
and draw insights from large datasets. In this article,
we reflect on a comprehensive suite of tools that we've
been building to empower everyone to perform data
science more efficiently and effortlessly, including
DataSpread, a scalable spreadsheet tool that combines
the benefits of spreadsheets and databases, and
ZenVisage, a visual exploration tool that accelerates
the discovery of trends or patterns. Our tools have
been developed in collaboration with experts in various
disciplines, including neuroscience, battery science,
genomics, astrophysics, and ad analytics. We will
discuss some of the key technical challenges underlying
the development of these tools, and how we addressed
them, drawing from ideas in multiple disciplines. In
the process, we will outline a research agenda for tool
development to empower everyone to tap into the hidden
potential in their datasets at scale.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Rekatsinas:2019:ODM,
author = "Theodoros Rekatsinas and Sudeepa Roy and Manasi Vartak
and Ce Zhang and Neoklis Polyzotis",
title = "Opportunities for data management research in the era
of horizontal {AI\slash ML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "12",
pages = "2323--2323",
month = aug,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3352063.3352149",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:02 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "AI/ML is becoming a horizontal technology: its
application is expanding to more domains, and its
integration touches more parts of the technology stack.
Given the strong dependence of ML on data, this
expansion creates a new space for applying data
management techniques. At the same time, the deeper
integration of ML in the technology stack provides more
touch points where ML can be used in data management
systems and vice versa. In this panel, we invite
researchers working in this domain to discuss this
emerging world and its implications on data-management
research. Among other topics, the discussion will touch
on the opportunities for interesting research, how we
can interact with other communities, what is the core
expertise we bring to the table, and how we can conduct
and evaluate this research effectively within our own
community. The goal of the panel is to nudge the
community to appreciate the opportunities in this new
world of horizontal AI/ML and to spur a discussion on
how we can shape an effective research agenda.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Barthels:2019:SCH,
author = "Claude Barthels and Ingo M{\"u}ller and Konstantin
Taranov and Gustavo Alonso and Torsten Hoefler",
title = "Strong consistency is not hard to get: two-phase
locking and two-phase commit on thousands of cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2325--2338",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358702",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Concurrency control is a cornerstone of distributed
database engines and storage systems. In pursuit of
scalability, a common assumption is that Two-Phase
Locking (2PL) and Two-Phase Commit (2PC) are not viable
solutions due to their communication overhead. Recent
results, however, have hinted that 2PL and 2PC might
not have such a bad performance. Nevertheless, there
has been no attempt to actually measure how a
state-of-the-art implementation of 2PL and 2PC would
perform on modern hardware. The goal of this paper is
to establish a baseline for concurrency control
mechanisms on thousands of cores connected through a
low-latency network. We develop a distributed lock
table supporting all the standard locking modes used in
database engines. We focus on strong consistency in the
form of strict serializability implemented through
strict 2PL, but also explore read-committed and
repeatable-read, two common isolation levels used in
many systems. We do not leverage any known
optimizations in the locking or commit parts of the
protocols. The surprising result is that, for TPC-C,
2PL and 2PC can be made to scale to thousands of cores
and hundreds of machines, reaching a throughput of over
21 million transactions per second with 9.5 million New
Order operations per second. Since most existing
relational database engines use some form of locking
for implementing concurrency control, our findings
provide a path for such systems to scale without having
to significantly redesign transaction management. To
achieve these results, our implementation relies on
Remote Direct Memory Access (RDMA). Today, this
technology is commonly available on both Infiniband as
well as Ethernet networks, making the results valid
across a wide range of systems and platforms, including
database appliances, data centers, and cloud
environments.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wei:2019:DRE,
author = "Ziheng Wei and Uwe Leck and Sebastian Link",
title = "Discovery and ranking of embedded uniqueness
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2339--2352",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358703",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data profiling is an enabler for efficient data
management and effective analytics. The discovery of
data dependencies is at the core of data profiling. We
conduct the first study on the discovery of embedded
uniqueness constraints (eUCs). These constraints
represent unique column combinations embedded in
complete fragments of incomplete data. We showcase
their implementation as filtered indexes, and their
application in integrity management and query
optimization. We show that the decision variant of
discovering a minimal eUC is NP-complete and
W[2]-complete. We characterize the maximum possible
solution size, and show which families of eUCs attain
that size. Despite the challenges, experiments with
real-world and synthetic benchmark data show that our
column(row)-efficient algorithms perform well with a
large number of columns(rows), and our hybrid algorithm
combines ideas from both. We show how to rank eUCs to
help identify relevant eUCs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chu:2019:ODB,
author = "Lingyang Chu and Yanyan Zhang and Yu Yang and Lanjun
Wang and Jian Pei",
title = "Online density bursting subgraph detection from
temporal graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2353--2365",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358704",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a temporal weighted graph that consists of a
potentially endless stream of updates, we are
interested in finding density bursting subgraphs (DBS
for short), where a DBS is a subgraph that accumulates
its density at the fastest speed. Online DBS detection
enjoys many novel applications. At the same time, it is
challenging since the time duration of a DBS can be
arbitrarily long but a limited size storage can buffer
only up to a certain number of updates. To tackle this
problem, we observe the critical decomposability of
DBSs and show that a DBS with a long time duration can
be decomposed into a set of indecomposable DBSs with
equal or larger burstiness. We further prove that the
time duration of an indecomposable DBS is upper bounded
and propose an efficient method TopkDBSOL to detect
indecomposable DBSs in an online manner. Extensive
experiments demonstrate the effectiveness, efficiency
and scalability of TopkDBSOL in detecting significant
DBSs from temporal graphs in real applications.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Holanda:2019:PII,
author = "Pedro Holanda and Mark Raasveldt and Stefan Manegold
and Hannes M{\"u}hleisen",
title = "Progressive indexes: indexing for interactive data
analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2366--2378",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358705",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Interactive exploration of large volumes of data is
increasingly common, as data scientists attempt to
extract interesting information from large opaque data
sets. This scenario presents a difficult challenge for
traditional database systems, as (1) nothing is known
about the query workload in advance, (2) the query
workload is constantly changing, and (3) the system
must provide interactive responses to the issued
queries. This environment is challenging for index
creation, as traditional database indexes require
upfront creation, hence a priori workload knowledge, to
be efficient. In this paper, we introduce Progressive
Indexing, a novel performance-driven indexing technique
that focuses on automatic index creation while
providing interactive response times to incoming
queries. Its design allows queries to have a limited
budget to spend on index creation. The indexing budget
is automatically tuned to each query before query
processing. This allows for systems to provide
interactive answers to queries during index creation
while being robust against various workload patterns
and data distributions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Hanai:2019:DEP,
author = "Masatoshi Hanai and Toyotaro Suzumura and Wen Jun Tan
and Elvis Liu and Georgios Theodoropoulos and Wentong
Cai",
title = "Distributed edge partitioning for trillion-edge
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2379--2392",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358706",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We propose Distributed Neighbor Expansion (Distributed
NE), a parallel and distributed graph partitioning
method that can scale to trillion-edge graphs while
providing high partitioning quality. Distributed NE is
based on a new heuristic, called parallel expansion,
where each partition is constructed in parallel by
greedily expanding its edge set from a single vertex in
such a way that the increase of the vertex cuts becomes
local minimal. We theoretically prove that the proposed
method has the upper bound in the partitioning quality.
The empirical evaluation with various graphs shows that
the proposed method produces higher-quality partitions
than the state-of-the-art distributed graph
partitioning algorithms. The performance evaluation
shows that the space efficiency of the proposed method
is an order-of-magnitude better than the existing
algorithms, keeping its time efficiency comparable. As
a result, Distributed NE can partition a trillion-edge
graph using only 256 machines within 70 minutes.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Athanassoulis:2019:OCL,
author = "Manos Athanassoulis and Kenneth S. B{\o}gh and Stratos
Idreos",
title = "Optimal column layout for hybrid workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2393--2407",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358707",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-intensive analytical applications need to support
both efficient reads and writes. However, what is
usually a good data layout for an update-heavy
workload, is not well-suited for a read-mostly one and
vice versa. Modern analytical data systems rely on
columnar layouts and employ delta stores to inject new
data and updates. We show that for hybrid workloads we
can achieve close to one order of magnitude better
performance by tailoring the column layout design to
the data and query workload. Our approach navigates the
possible design space of the physical layout: it
organizes each column's data by determining the number
of partitions, their corresponding sizes and ranges,
and the amount of buffer space and how it is allocated.
We frame these design decisions as an optimization
problem that, given workload knowledge and performance
requirements, provides an optimal physical layout for
the workload at hand. To evaluate this work, we build
an in-memory storage engine, Casper, and we show that
it outperforms state-of-the-art data layouts of
analytical systems for hybrid workloads. Casper
delivers up to 2.32x higher throughput for
update-intensive workloads and up to 2.14x higher
throughput for hybrid workloads. We further show how to
make data layout decisions robust to workload variation
by carefully selecting the input of the optimization.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sintos:2019:SDC,
author = "Stavros Sintos and Pankaj K. Agarwal and Jun Yang",
title = "Selecting data to clean for fact checking: minimizing
uncertainty vs. maximizing surprise",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "12",
number = "13",
pages = "2408--2421",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3358701.3358708",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Nov 26 07:21:38 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the optimization problem of selecting
numerical quantities to clean in order to fact-check
claims based on such data. Oftentimes, such claims are
technically correct, but they can still mislead for two
reasons. First, data may contain uncertainty and
errors. Second, data can be ``fished'' to advance
particular positions. In practice, fact-checkers cannot
afford to clean all data and must choose to clean what
``matters the most'' to checking a claim. We explore
alternative definitions of what ``matters the most'':
one is to ascertain claim qualities (by minimizing
uncertainty in these measures), while an alternative is
just to counter the claim (by maximizing the
probability of finding a counterargument). We show
whether the two objectives align with each other, with
important implications on when fact-checkers should
exercise care in selective data cleaning, to avoid
potential bias introduced by their desire to counter
claims. We develop efficient algorithms for solving the
various variants of the optimization problem, showing
significant improvements over naive solutions. The
problem is particularly challenging because the
objectives in the fact-checking context are complex,
non-linear functions over data. We obtain results that
generalize to a large class of functions, with
potential applications beyond fact-checking.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Chawla:2019:RMQ,
author = "Shuchi Chawla and Shaleen Deep and Paraschos Koutris
and Yifeng Teng",
title = "Revenue maximization for query pricing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "1--14",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357378",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Buying and selling of data online has increased
substantially over the last few years. Several
frameworks have already been proposed that study query
pricing in theory and practice. The key guiding
principle in these works is the notion of
arbitrage-freeness where the broker can set different
prices for different queries made to the dataset, but
must ensure that the pricing function does not provide
the buyers with opportunities for arbitrage. However,
little is known about revenue maximization aspect of
query pricing. In this paper, we study the problem
faced by a broker selling access to data with the goal
of maximizing her revenue. We show that this problem
can be formulated as a revenue maximization problem
with single-minded buyers and unlimited supply, for
which several approximation algorithms are known. We
perform an extensive empirical evaluation of the
performance of several pricing algorithms for the query
pricing problem on real-world instances. In addition to
previously known approximation algorithms, we propose
several new heuristics and analyze them both
theoretically and experimentally. Our experiments show
that algorithms with the best theoretical bounds are
not necessarily the best empirically. We identify
algorithms and heuristics that are both fast and also
provide consistently good performance when valuations
are drawn from a wide variety of distributions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shi:2019:RTP,
author = "Jieming Shi and Renchi Yang and Tianyuan Jin and
Xiaokui Xiao and Yin Yang",
title = "Realtime top-$k$ {Personalized PageRank} over large
graphs on {GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "15--28",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357379",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a graph G, a source node $s \in G$ and a positive
integer k, a top-k Personalized PageRank (PPR) query
returns the k nodes with the highest PPR values with
respect to s, where the PPR of a node v measures its
relevance from the perspective of source s. Top-k PPR
processing is a fundamental task in many important
applications such as web search, social networks, and
graph analytics. This paper aims to answer such a query
in realtime, i.e., within less than 100ms, on an
Internet-scale graph with billions of edges. This is
far beyond the current state of the art, due to the
immense computational cost of processing a PPR query.
We achieve this goal with a novel algorithm kPAR, which
utilizes the massive parallel processing power of GPUs.
The main challenge in designing a GPU-based PPR
algorithm lies in that a GPU is mainly a parallel
computation device, whereas PPR processing involves
graph traversals and value propagation operations,
which are inherently sequential and memory-bound.
Existing scalable PPR algorithms are mostly described
as single-thread CPU solutions that are resistant to
parallelization. Further, they usually involve complex
data structures which do not have efficient adaptations
on GPUs. kPAR overcomes these problems via both novel
algorithmic designs (namely, adaptive forward push and
inverted random walks) and system engineering (e.g.,
load balancing) to realize the potential of GPUs.
Meanwhile, kPAR provides rigorous guarantees on both
result quality and worst-case efficiency. Extensive
experiments show that kPAR is usually 10x faster than
parallel adaptations of existing methods. Notably, on a
billion-edge Twitter graph, kPAR answers a top-1000 PPR
query in 42.4 milliseconds.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:FLS,
author = "Sheng Wang and Zhifeng Bao and J. Shane Culpepper and
Timos Sellis and Xiaolin Qin",
title = "Fast large-scale trajectory clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "29--42",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357380",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In this paper, we study the problem of large-scale
trajectory data clustering, k-paths, which aims to
efficiently identify k ``representative'' paths in a
road network. Unlike traditional clustering approaches
that require multiple data-dependent hyperparameters,
k-paths can be used for visual exploration in
applications such as traffic monitoring, public transit
planning, and site selection. By combining map matching
with an efficient intermediate representation of
trajectories and a novel edge-based distance (EBD)
measure, we present a scalable clustering method to
solve k-paths. Experiments verify that we can cluster
millions of taxi trajectories in less than one minute,
achieving improvements of up to two orders of magnitude
over state-of-the-art solutions that solve similar
trajectory clustering problems.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Herodotou:2019:ADT,
author = "Herodotos Herodotou and Elena Kakoulli",
title = "Automating distributed tiered storage management in
cluster computing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "43--56",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357381",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data-intensive platforms such as Hadoop and Spark are
routinely used to process massive amounts of data
residing on distributed file systems like HDFS.
Increasing memory sizes and new hardware technologies
(e.g., NVRAM, SSDs) have recently led to the
introduction of storage tiering in such settings.
However, users are now burdened with the additional
complexity of managing the multiple storage tiers and
the data residing on them while trying to optimize
their workloads. In this paper, we develop a general
framework for automatically moving data across the
available storage tiers in distributed file systems.
Moreover, we employ machine learning for tracking and
predicting file access patterns, which we use to decide
when and which data to move up or down the storage
tiers for increasing system performance. Our approach
uses incremental learning to dynamically refine the
models with new file accesses, allowing them to
naturally adjust and adapt to workload changes over
time. Our extensive evaluation using realistic
workloads derived from Facebook and CMU traces compares
our approach with several other policies and showcases
significant benefits in terms of both workload
performance and cluster efficiency.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Jung:2019:AAD,
author = "Jinho Jung and Hong Hu and Joy Arulraj and Taesoo Kim
and Woonhak Kang",
title = "{APOLLO}: automatic detection and diagnosis of
performance regressions in database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "57--70",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357382",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The practical art of constructing database management
systems (DBMSs) involves a morass of trade-offs among
query execution speed, query optimization speed,
standards compliance, feature parity, modularity,
portability, and other goals. It is no surprise that
DBMSs, like all complex software systems, contain bugs
that can adversely affect their performance. The
performance of DBMSs is an important metric as it
determines how quickly an application can take in new
information and use it to make new decisions. Both
developers and users face challenges while dealing with
performance regression bugs. First, developers usually
find it challenging to manually design test cases to
uncover performance regressions since DBMS components
tend to have complex interactions. Second, users
encountering performance regressions are often unable
to report them, as the regression-triggering queries
could be complex and database-dependent. Third,
developers have to expend a lot of effort on localizing
the root cause of the reported bugs, due to the system
complexity and software development complexity. Given
these challenges, this paper presents the design of
Apollo, a toolchain for automatically detecting,
reporting, and diagnosing performance regressions in
DBMSs. We demonstrate that Apollo automates the
generation of regression-triggering queries, simplifies
the bug reporting process for users, and enables
developers to quickly pinpoint the root cause of
performance regressions. By automating the detection
and diagnosis of performance regressions, Apollo
reduces the labor cost of developing efficient DBMSs.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Owaida:2019:LLD,
author = "Muhsen Owaida and Gustavo Alonso and Laura Fogliarini
and Anthony Hock-Koon and Pierre-Etienne Melet",
title = "Lowering the latency of data processing pipelines
through {FPGA} based hardware acceleration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "1",
pages = "71--85",
month = sep,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3357377.3357383",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 2 06:49:03 MDT 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Web search engines often involve a complex pipeline of
processing stages including computing, scoring, and
ranking potential answers plus returning the sorted
results. The latency of such pipelines can be improved
by minimizing data movement, making stages faster, and
merging stages. The throughput is determined by the
stage with the smallest capacity and it can be improved
by allocating enough parallel resources to each stage.
In this paper we explore the possibility of employing
hardware acceleration (an FPGA) as a way to improve the
overall performance when computing answers to search
queries. With a real use case as a baseline and
motivation, we focus on accelerating the scoring
function implemented as a decision tree ensemble, a
common approach to scoring and classification in search
systems. Our solution uses a novel decision tree
ensemble implementation on an FPGA to: (1) increase the
number of entries that can be scored per unit of time,
and (2) provide a compact implementation that can be
combined with previous stages. The resulting system,
tested in Amazon F1 instances, significantly improves
the quality of the search results and improves
performance by two orders of magnitude over the
existing CPU based solution.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Cai:2019:MSS,
author = "Shaofeng Cai and Gang Chen and Beng Chin Ooi and
Jinyang Gao",
title = "Model slicing for supporting complex analytics with
elastic inference cost and resource constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "86--99",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364325",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deep learning models have been used to support
analytics beyond simple aggregation, where deeper and
wider models have been shown to yield great results.
These models consume a huge amount of memory and
computational operations. However, most of the
large-scale industrial applications are often
computational budget constrained. In practice, the peak
workload of inference service could be 10x higher than
the average cases, with the presence of unpredictable
extreme cases. Lots of computational resources could be
wasted during off-peak hours and the system may crash
when the workload exceeds system capacity. How to
support deep learning services with dynamic workload
cost-efficiently remains a challenging problem. In this
paper, we address the challenge with a general and
novel training scheme called model slicing, which
enables deep learning models to provide predictions
within the prescribed computational resource budget
dynamically. Model slicing could be viewed as an
elastic computation solution without requiring more
computational resources. Succinctly, each layer in the
model is divided into groups of contiguous block of
basic components (i.e. neurons in dense layers and
channels in convolutional layers), and then partially
ordered relation is introduced to these groups by
enforcing that groups participated in each forward pass
always starts from the first group to the
dynamically-determined rightmost group. Trained by
dynamically indexing the rightmost group with a single
parameter slice rate, the network is engendered to
build up group-wise and residual representation. Then
during inference, a sub-model with fewer groups can be
readily deployed for efficiency whose computation is
roughly quadratic to the width controlled by the slice
rate. Extensive experiments show that models trained
with model slicing can effectively support on-demand
workload with elastic inference cost.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Herlihy:2019:CCD,
author = "Maurice Herlihy and Barbara Liskov and Liuba Shrira",
title = "Cross-chain deals and adversarial commerce",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "100--113",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364326",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern distributed data management systems face a new
challenge: how can autonomous, mutually-distrusting
parties cooperate safely and effectively? Addressing
this challenge brings up questions familiar from
classical distributed systems: how to combine multiple
steps into a single atomic action, how to recover from
failures, and how to synchronize concurrent access to
data. Nevertheless, each of these issues requires
rethinking when participants are autonomous and
potentially adversarial. We propose the notion of a
cross-chain deal, a new way to structure complex
distributed computations that manage assets in an
adversarial setting. Deals are inspired by classical
atomic transactions, but are necessarily different, in
important ways, to accommodate the decentralized and
untrusting nature of the exchange. We describe novel
safety and liveness properties, along with two
alternative protocols for implementing cross-chain
deals in a system of independent blockchain ledgers.
One protocol, based on synchronous communication, is
fully decentralized, while the other, based on
semi-synchronous communication, requires a globally
shared ledger.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zois:2019:EMM,
author = "Vasileios Zois and Vassilis J. Tsotras and Walid A.
Najjar",
title = "Efficient main-memory top-$k$ selection for multicore
architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "114--127",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364327",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Efficient Top-$k$ query evaluation relies on practices
that utilize auxiliary data structures to enable early
termination. Such techniques were designed to trade-off
complex work in the buffer pool against costly access
to disk-resident data. Parallel in-memory Top-$k$
selection with support for early termination presents a
novel challenge because computation shifts higher up in
the memory hierarchy. In this environment, data scan
methods using SIMD instructions and multithreading
perform well despite requiring evaluation of the
complete dataset. Early termination schemes that favor
simplicity require random access to resolve score
ambiguity while those optimized for sequential access
incur too many object evaluations. In this work, we
introduce the concept of rank uncertainty, a measure of
work efficiency that enables classifying existing
solutions according to their potential for efficient
parallel in-memory Top-$k$ selection. We identify data
reordering and layering strategies as those having the
highest potential and provide practical guidelines on
how to adapt them for parallel in-memory execution
(creating the VTA and SLA approaches). In addition, we
show that the number of object evaluations can be
further decreased by combining data reordering with
angle space partitioning (introducing PTA). Our
extensive experimental evaluation on varying query
parameters using both synthetic and real data, showcase
that PTA exhibits between 2 and 4 orders of magnitude
better query latency, and throughput when compared to
prior work and our optimized algorithmic variants (i.e.
VTA, SLA).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Bottcher:2019:SGC,
author = "Jan B{\"o}ttcher and Viktor Leis and Thomas Neumann
and Alfons Kemper",
title = "Scalable garbage collection for in-memory {MVCC}
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "128--141",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364328",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "To support Hybrid Transaction and Analytical
Processing (HTAP), database systems generally rely on
Multi-Version Concurrency Control (MVCC). While MVCC
elegantly enables lightweight isolation of readers and
writers, it also generates outdated tuple versions,
which, eventually, have to be reclaimed. Surprisingly,
we have found that in HTAP workloads, this reclamation
of old versions, i.e., garbage collection, often
becomes the performance bottleneck. It turns out that
in the presence of long-running queries,
state-of-the-art garbage collectors are too
coarse-grained. As a consequence, the number of
versions grows quickly slowing down the entire system.
Moreover, the standard background cleaning approach
makes the system vulnerable to sudden spikes in
workloads. In this work, we propose a novel garbage
collection (GC) approach that prunes obsolete versions
eagerly. Its seamless integration into the transaction
processing keeps the GC overhead minimal and ensures
good scalability. We show that our approach handles
mixed workloads well and also speeds up pure OLTP
workloads like TPC-C compared to existing
state-of-the-art approaches.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2019:FDD,
author = "Bohua Yang and Dong Wen and Lu Qin and Ying Zhang and
Xubo Wang and Xuemin Lin",
title = "Fully dynamic depth-first search in directed graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "142--154",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364329",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Depth-first search (DFS) is a fundamental and
important algorithm in graph analysis. It is the basis
of many graph algorithms such as computing strongly
connected components, testing planarity, and detecting
biconnected components. The result of a DFS is normally
shown as a DFS-Tree. Given the frequent updates in many
real-world graphs (e.g., social networks and
communication networks), we study the problem of
DFS-Tree maintenance in dynamic directed graphs. In the
literature, most works focus on the DFS-Tree
maintenance problem in undirected graphs and directed
acyclic graphs. However, their methods cannot easily be
applied in the case of general directed graphs.
Motivated by this, we propose a framework and
corresponding algorithms for both edge insertion and
deletion in general directed graphs. We further give
several optimizations to speed up the algorithms. We
conduct extensive experiments on 12 real-world datasets
to show the efficiency of our proposed algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ma:2019:LMC,
author = "Chenhao Ma and Reynold Cheng and Laks V. S. Lakshmanan
and Tobias Grubenmann and Yixiang Fang and Xiaodong
Li",
title = "{LINC}: a motif counting algorithm for uncertain
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "155--168",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364330",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "In graph applications (e.g., biological and social
networks), various analytics tasks (e.g., clustering
and community search) are carried out to extract
insight from large and complex graphs. Central to these
tasks is the counting of the number of motifs, which
are graphs with a few nodes. Recently, researchers have
developed several fast motif counting algorithms. Most
of these solutions assume that graphs are
deterministic, i.e., the graph edges are certain to
exist. However, due to measurement and statistical
prediction errors, this assumption may not hold, and
hence the analysis quality can be affected. To address
this issue, we examine how to count motifs on uncertain
graphs, whose edges only exist probabilistically.
Particularly, we propose a solution framework that can
be used by existing deterministic motif counting
algorithms. We further propose an approximation
algorithm. Extensive experiments on real datasets show
that our algorithms are more effective and efficient
than existing solutions.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Eskandarian:2019:OOQ,
author = "Saba Eskandarian and Matei Zaharia",
title = "{ObliDB}: oblivious query processing for secure
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "169--183",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364331",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hardware enclaves such as Intel SGX are a promising
technology for improving the security of databases
outsourced to the cloud. These enclaves provide an
execution environment isolated from the hyper-visor/OS,
and encrypt data in RAM. However, for applications that
use large amounts of memory, including most databases,
enclaves do not protect against access pattern leaks,
which let attackers gain a large amount of information
about the data. Moreover, the na{\"\i}ve way to address
this issue, using Oblivious RAM (ORAM) primitives from
the security literature, adds substantial overhead. A
number of recent works explore trusted hardware
enclaves as a path toward secure, access-pattern
oblivious outsourcing of data storage and analysis.
While these works efficiently solve specific
subproblems (e.g. building secure indexes or running
analytics queries that always scan entire tables), no
prior work has supported oblivious query processing for
general query workloads on a DBMS engine with multiple
access methods. Moreover, applying these techniques
individually does not guarantee that an end-to-end
workload, such as a complex SQL query over multiple
tables, will be oblivious. In this paper, we introduce
ObliDB, an oblivious database engine design that is the
first system to provide obliviousness for general
database read workloads over multiple access methods.
ObliDB introduces a diverse array of new oblivious
physical operators to accelerate oblivious SQL queries,
giving speedups of up to an order of magnitude over
na{\"\i}ve ORAM. It supports a broad range of queries,
including aggregation, joins, insertions, deletions and
point queries. We implement ObliDB and show that, on
analytics workloads, ObliDB ranges from 1.1--19x faster
than Opaque, a previous oblivious, enclave-based system
designed only for analytics, and comes within 2.6x of
Spark SQL, which provides no security guarantees. In
addition, ObliDB supports point queries with 3--10ms
latency, which is comparable to index-only trusted
hardware systems, and runs over 7x faster than HIRB, a
previous encryption-based oblivious index system that
supports point queries.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ge:2019:SMP,
author = "Chang Ge and Ihab F. Ilyas and Florian Kerschbaum",
title = "Secure multi-party functional dependency discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "184--196",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364332",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data profiling is an important task to understand data
semantics and is an essential pre-processing step in
many tools. Due to privacy constraints, data is often
partitioned into silos, with different access control.
Discovering functional dependencies (FDs) usually
requires access to all data partitions to find
constraints that hold on the whole dataset. Simply
applying general secure multi-party computation
protocols incurs high computation and communication
cost. This paper formulates the FD discovery problem in
the secure multi-party scenario. We propose secure
constructions for validating candidate FDs, and present
efficient cryptographic protocols to discover FDs over
distributed partitions. Experimental results show that
our solution is practically efficient over non-secure
distributed FD discovery, and can significantly
outperform general purpose multi-party computation
frameworks. To the best of our knowledge, our work is
the first one to tackle this problem.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Wang:2019:VFM,
author = "Minmei Wang and Mingxun Zhou and Shouqian Shi and Chen
Qian",
title = "Vacuum filters: more space-efficient and faster
replacement for {Bloom} and cuckoo filters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "197--210",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364333",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We present vacuum filters, a type of data structures
to support approximate membership queries. Vacuum
filters cost the smallest space among all known AMQ
data structures and provide higher insertion and lookup
throughput in most situations. Hence they can be used
as the replacement of the widely used Bloom filters and
cuckoo filters. Similar to cuckoo filters, vacuum
filters also store item fingerprints in a table. The
memory-efficiency and throughput improvements are from
the innovation of a table insertion and fingerprint
eviction strategy that achieves both high load factor
and data locality without any restriction of the table
size. In addition, we propose a new update framework to
resolve two difficult problems for AMQ structures under
dynamics, namely duplicate insertions and set resizing.
The experiments show that vacuum filters can achieve
25\% less space in average and similar throughput
compared to cuckoo filters, and 15\% less space and $ >
10 \times $ throughput compared to Bloom filters, with
same false positive rates. AMQ data structures are
widely used in various layers of computer systems and
networks and are usually hosted in platforms where
memory is limited and precious. Hence the improvements
brought by vacuum filters can be considered
significant.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2019:SES,
author = "Yihan Sun and Guy E. Blelloch and Wan Shen Lim and
Andrew Pavlo",
title = "On supporting efficient snapshot isolation for hybrid
workloads with multi-versioned indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "2",
pages = "211--225",
month = oct,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3364324.3364334",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:12 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Modern data-driven applications require that databases
support fast analytical queries while undergoing rapid
updates---often referred to as Hybrid Transactional
Analytical Processing (HTAP). Achieving fast queries
and updates in a database management system (DBMS) is
challenging since optimizations to improve analytical
queries can cause overhead for updates. One solution is
to use snapshot isolation (SI) for multi-version
concurrency control (MVCC) to allow readers to make
progress regardless of concurrent writers. In this
paper, we propose the Parallel Binary Tree (P-Tree)
index structure to achieve SI and MVCC for multicore
in-memory HTAP DBMSs. At their core, P-Trees are based
on pure (immutable) data structures that use
path-copying for updates for fast multi-versioning.
They support tree nesting to improve OLAP performance
while still allowing for efficient updates. The data
structure also enables parallel algorithms for bulk
operations on indexes and their underlying tables. We
evaluate P-Trees on OLTP and OLAP benchmarks, and
compare them with state-of-the-art data structures and
DBMSs. Our experiments show that P-Trees outperform
many concurrent data structures for the YCSB workload,
and is 4--9x faster than existing DBMSs for analytical
queries, while also achieving reasonable throughput for
simultaneous transactional updates.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Fang:2019:IMV,
author = "Zhuhe Fang and Beilei Zheng and Chuliang Weng",
title = "Interleaved multi-vectorizing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "226--238",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368290",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "SIMD is an instruction set in mainstream processors,
which provides the data level parallelism to accelerate
the performance of applications. However, its
advantages diminish when applications suffer from heavy
cache misses. To eliminate cache misses in SIMD
vectorization, we present interleaved multi-vectorizing
(IMV) in this paper. It interleaves multiple execution
instances of vectorized code to hide memory access
latency with more computation. We also propose residual
vectorized states to solve the control flow divergence
in vectorization. IMV can make full use of the data
parallelism in SIMD and the memory level parallelism
through prefetching. It reduces cache misses, branch
misses and computation overhead to significantly speed
up the performance of pointer-chasing applications, and
it can be applied to executing entire query pipelines.
As experimental results show, IMV achieves up to 4.23X
and 3.17X better performance compared with the pure
scalar implementation and the pure SIMD vectorization,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Shetiya:2019:UOA,
author = "Suraj Shetiya and Abolfazl Asudeh and Sadia Ahmed and
Gautam Das",
title = "A unified optimization algorithm for solving
{``regret-minimizing representative''} problems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "239--251",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368291",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Given a database with numeric attributes, it is often
of interest to rank the tuples according to linear
scoring functions. For a scoring function and a subset
of tuples, the regret of the subset is defined as the
(relative) difference in scores between the top-1 tuple
of the subset and the top-1 tuple of the entire
database. Finding the regret-ratio minimizing set
(RRMS), i.e., the subset of a required size k that
minimizes the maximum regret-ratio across all possible
ranking functions, has been a well-studied problem in
recent years. This problem is known to be NP-complete
and there are several approximation algorithms for it.
Other NP-complete variants have also been investigated,
e.g., finding the set of size k that minimizes the
average regret ratio over all linear functions. Prior
works have designed customized algorithms for different
variants of the problem, and are unlikely to easily
generalize to other variants. In this paper we take a
different path towards tackling these problems. In
contrast to the prior, we propose a unified algorithm
for solving different problem variants. Unification is
done by localizing the customization to the design of
variant-specific subroutines or ``oracles'' that are
called by our algorithm. Our unified algorithm takes
inspiration from the seemingly unrelated problem of
clustering from data mining, and the corresponding
k-medoid algorithm. We make several innovative
contributions in designing our algorithm, including
various techniques such as linear programming, edge
sampling in graphs, volume estimation of
multi-dimensional convex polytopes, and several others.
We provide rigorous theoretical analysis, as well as
substantial experimental evaluations over real and
synthetic data sets to demonstrate the practical
feasibility of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kandula:2019:PDI,
author = "Srikanth Kandula and Laurel Orr and Surajit
Chaudhuri",
title = "Pushing data-induced predicates through joins in
big-data clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "252--265",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368292",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Using data statistics, we convert predicates on a
table into data induced predicates (diPs) that apply on
the joining tables. Doing so substantially speeds up
multi-relation queries because the benefits of
predicate pushdown can now apply beyond just the tables
that have predicates. We use diPs to skip data
exclusively during query optimization; i.e., diPs lead
to better plans and have no overhead during query
execution. We study how to apply diPs for complex query
expressions and how the usefulness of diPs varies with
the data statistics used to construct diPs and the data
distributions. Our results show that building diPs
using zone-maps which are already maintained in today's
clusters leads to sizable data skipping gains. Using a
new (slightly larger) statistic, 50\% of the queries in
the TPC-H, TPC-DS and JoinOrder benchmarks can skip at
least 33\% of the query input. Consequently, the median
query in a production big-data cluster finishes roughly
2x faster.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Pena:2019:DAE,
author = "Eduardo H. M. Pena and Eduardo C. de Almeida and Felix
Naumann",
title = "Discovery of approximate (and exact) denial
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "266--278",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368293",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Maintaining data consistency is known to be hard.
Recent approaches have relied on integrity constraints
to deal with the problem --- correct and complete
constraints naturally work towards data consistency.
State-of-the-art data cleaning frameworks have used the
formalism known as denial constraint (DC) to handle a
wide range of real-world constraints. Each DC expresses
a relationship between predicates that indicate which
combinations of attribute values are inconsistent. The
design of DCs, however, must keep pace with the
complexity of data and applications. The alternative to
designing DCs by hand is automatically discovering DCs
from data, which is computationally expensive due to
the large search space of DCs. To tackle this
challenging task, we present a novel algorithm to
efficiently discover DCs: DCFinder. The algorithm
combines data structures called position list indexes
with techniques based on predicate selectivity to
efficiently validate DC candidates. Because the
available data often contain errors, DCFinder is
especially designed to discover approximate DCs,
i.e., DCs that may partially hold. Our experimental
evaluation uses real and synthetic datasets and shows
that DCFinder outperforms all the existing approximate
DC discovery algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Yang:2019:DUC,
author = "Zongheng Yang and Eric Liang and Amog Kamsetty and
Chenggang Wu and Yan Duan and Xi Chen and Pieter Abbeel
and Joseph M. Hellerstein and Sanjay Krishnan and Ion
Stoica",
title = "Deep unsupervised cardinality estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "279--292",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368294",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cardinality estimation has long been grounded in
statistical tools for density estimation. To capture
the rich multivariate distributions of relational
tables, we propose the use of a new type of
high-capacity statistical model: deep autoregressive
models. However, direct application of these models
leads to a limited estimator that is prohibitively
expensive to evaluate for range or wildcard predicates.
To produce a truly usable estimator, we develop a Monte
Carlo integration scheme on top of autoregressive
models that can efficiently handle range queries with
dozens of dimensions or more. Like classical synopses,
our estimator summarizes the data without supervision.
Unlike previous solutions, we approximate the joint
data distribution without any independence assumptions.
Evaluated on real-world datasets and compared against
real systems and dominant families of techniques, our
estimator achieves single-digit multiplicative error at
tail, an up to 90x accuracy improvement over the second
best method, and is space- and runtime-efficient.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Ding:2019:FGI,
author = "Zeyu Ding and Yuxin Wang and Danfeng Zhang and Daniel
Kifer",
title = "Free gap information from the differentially private
sparse vector and noisy max mechanisms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "293--306",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368295",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Noisy Max and Sparse Vector are selection algorithms
for differential privacy and serve as building blocks
for more complex algorithms. In this paper we show that
both algorithms can release additional information for
free (i.e., at no additional privacy cost). Noisy Max
is used to return the approximate maximizer among a set
of queries. We show that it can also release for free
the noisy gap between the approximate maximizer and
runner-up. This free information can improve the
accuracy of certain subsequent counting queries by up
to 50\%. Sparse Vector is used to return a set of
queries that are approximately larger than a fixed
threshold. We show that it can adaptively control its
privacy budget (use less budget for queries that are
likely to be much larger than the threshold) in order
to increase the amount of queries it can process. These
results follow from a careful privacy analysis.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Sun:2019:EEL,
author = "Ji Sun and Guoliang Li",
title = "An end-to-end learning-based cost estimator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "307--319",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368296",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Cost and cardinality estimation is vital to query
optimizer, which can guide the query plan selection.
However, traditional empirical cost and cardinality
estimation techniques cannot provide high-quality
estimation, because they may not effectively capture
the correlation between multiple tables. Recently the
database community shows that the learning-based
cardinality estimation is better than the empirical
methods. However, existing learning-based methods have
several limitations. Firstly, they focus on estimating
the cardinality, but cannot estimate the cost.
Secondly, they are either too heavy or hard to
represent complicated structures, e.g., complex
predicates. To address these challenges, we propose an
effective end-to-end learning-based cost estimation
framework based on a tree-structured model, which can
estimate both cost and cardinality simultaneously. We
propose effective feature extraction and encoding
techniques, which consider both queries and physical
operations in feature extraction. We embed these
features into our tree-structured model. We propose an
effective method to encode string values, which can
improve the generalization ability for predicate
matching. As it is prohibitively expensive to enumerate
all string values, we design a pattern-based method,
which selects patterns to cover string values and
utilizes the patterns to embed string values. We
conducted experiments on real-world datasets and
experimental results showed that our method
outperformed baselines.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zeng:2019:LMD,
author = "Yuxiang Zeng and Yongxin Tong and Lei Chen",
title = "Last-mile delivery made practical: an efficient route
planning framework with theoretical guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "320--333",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368297",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Last-mile delivery (LMD) refers to the movement of
goods from transportation origins to the final
destinations. It has widespread applications such as
urban logistics, e-commerce, etc. One fundamental
problem in last-mile delivery is route planning, which
schedules multiple couriers' routes, i.e., sequences of
origins and destinations of the requests under certain
optimization objectives. Prior studies usually designed
heuristic solutions to two strongly NP-hard
optimization objectives: minimizing the makespan
(i.e., maximum travel time) of couriers and total
latency (i.e., waiting time) of requesters. There is
no algorithm with theoretical guarantees for either
optimization objective in practical cases. In this
paper, we propose a theoretically guaranteed solution
framework for both objectives. It achieves both
approximation ratios of $ 6 \rho $, where $ \rho $ is
the approximation ratio of a core operation, called $k$
LMD, which plans for one courier a route consisting of
$k$ requests. Leveraging a spatial index called
hierarchically separated tree, we further design an
efficient approximation algorithm for $k$ LMD with $
\rho = O(\log n)$, where $n$ is the number of requests.
Experimental results show that our approach outperforms
state-of-the-art methods by averagely 48.4\%--96.0\%
and 49.7\%--96.1\% for both objectives. Especially in
large-scale real datasets, our algorithm has $ 29.3
\times $--$ 108.9 \times $ shorter makespan and $ 20.2
\times $--$ 175.1 \times $ lower total latency than the
state-of-the-art algorithms.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kepe:2019:DPM,
author = "Tiago R. Kepe and Eduardo C. de Almeida and Marco A.
Z. Alves",
title = "Database processing-in-memory: an experimental study",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "334--347",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368298",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The rapid growth of ``big-data'' intensified the
problem of data movement when processing data
analytics: Large amounts of data need to move through
the memory up to the CPU before any computation takes
place. To tackle this costly problem,
Processing-in-Memory (PIM) inverts the traditional data
processing by pushing computation to memory with an
impact on performance and energy efficiency. In this
paper, we present an experimental study on processing
database SIMD operators in PIM compared to current x86
processor (i.e., using AVX512 instructions). We discuss
the execution time gap between those architectures.
However, this is the first experimental study, in the
database community, to discuss the trade-offs of
execution time and energy consumption between PIM and
x86 in the main query execution systems: materialized,
vectorized, and pipelined. We also discuss the results
of a hybrid query scheduling when interleaving the
execution of the SIMD operators between PIM and x86
processing hardware. In our results, the hybrid query
plan reduced the execution time by 45\%. It also
drastically reduced energy consumption by more than 2x
compared to hardware-specific query plans.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Leeka:2019:ISO,
author = "Jyoti Leeka and Kaushik Rajan",
title = "Incorporating super-operators in big-data query
optimizers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "348--361",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368299",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The cost of big-data analytics is dominated by shuffle
operations that induce multiple disk reads, writes and
network transfers. This paper proposes a new class of
optimization rules that are specifically aimed at
eliminating shuffles where possible. The rules
substitute multiple shuffle inducing operators (Join,
UnionAll, Spool, GroupBy) with a single streaming
operator which implements an entire sub-query. We call
such operators super-operators. A key challenge with
adding new rules that substitute sub-queries with
super-operators is that there are many variants of the
same sub-query that can be implemented via minor
modifications to the same super-operator. Adding each
as a separate rule leads to a search space explosion.
We propose several extensions to the query optimizer to
address this challenge. We propose a new abstract
representation for operator trees that captures all
possible sub-queries that a super-operator implements.
We propose a new rule matching algorithm that can
efficiently search for abstract operator trees. Finally
we extend the physical operator interface to introduce
new parametric super-operators. We implement our
changes in SCOPE, a state-of-the-art production
big-data optimizer used extensively at Microsoft. We
demonstrate that the proposed optimizations provide
significant reduction in both resource cost (average
1.7x) and latency (average 1.5x) on several production
queries, and do so without increasing optimization
time.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Li:2019:EPM,
author = "Conggai Li and Fan Zhang and Ying Zhang and Lu Qin and
Wenjie Zhang and Xuemin Lin",
title = "Efficient progressive minimum $k$-core search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "362--375",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368300",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "As one of the most representative cohesive subgraph
models, $k$-core model has recently received
significant attention in the literature. In this paper,
we investigate the problem of the minimum $k$-core
search: given a graph $G$, an integer $k$ and a set of
query vertices $ Q = \{ q \} $, we aim to find the
smallest $k$-core subgraph containing every query
vertex $ q \in Q$. It has been shown that this
problem is NP-hard with a huge search space, and it is
very challenging to find the optimal solution. There
are several heuristic algorithms for this problem, but
they rely on simple scoring functions and there is no
guarantee as to the size of the resulting subgraph,
compared with the optimal solution. Our empirical study
also indicates that the size of their resulting
subgraphs may be large in practice. In this paper, we
develop an effective and efficient progressive
algorithm, namely PSA, to provide a good trade-off
between the quality of the result and the search time.
Novel lower and upper bound techniques for the minimum
$k$-core search are designed. Our extensive experiments
on 12 real-life graphs demonstrate the effectiveness
and efficiency of the new techniques.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhu:2019:HNL,
author = "Hang Zhu and Zhihao Bai and Jialin Li and Ellis
Michael and Dan R. K. Ports and Ion Stoica and Xin
Jin",
title = "{Harmonia}: near-linear scalability for replicated
storage with in-network conflict detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "376--389",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368301",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Distributed storage employs replication to mask
failures and improve availability. However, these
systems typically exhibit a hard tradeoff between
consistency and performance. Ensuring consistency
introduces coordination overhead, and as a result the
system throughput does not scale with the number of
replicas. We present Harmonia, a replicated storage
architecture that exploits the capability of
new-generation programmable switches to obviate this
tradeoff by providing near-linear scalability without
sacrificing consistency. To achieve this goal, Harmonia
detects read-write conflicts in the network, which
enables any replica to serve reads for objects with no
pending writes. Harmonia implements this functionality
at line rate, thus imposing no performance overhead. We
have implemented a prototype of Harmonia on a cluster
of commodity servers connected by a Barefoot Tofino
switch, and have integrated it with Redis. We
demonstrate the generality of our approach by
supporting a variety of replication protocols,
including primary-backup, chain replication,
Viewstamped Replication, and NOPaxos. Experimental
results show that Harmonia improves the throughput of
these protocols by up to $ 10 \times $ for a
replication factor of $ 10 $, providing near-linear
scalability up to the limit of our testbed.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Walenz:2019:LSC,
author = "Brett Walenz and Stavros Sintos and Sudeepa Roy and
Jun Yang",
title = "Learning to sample: counting with complex queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "390--402",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368302",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We study the problem of efficiently estimating counts
for queries involving complex filters, such as
user-defined functions, or predicates involving
self-joins and correlated subqueries. For such queries,
traditional sampling techniques may not be applicable
due to the complexity of the filter preventing sampling
over joins, and sampling after the join may not be
feasible due to the cost of computing the full join.
The other natural approach of training and using an
inexpensive classifier to estimate the count instead of
the expensive predicate suffers from the difficulties
in training a good classifier and giving meaningful
confidence intervals. In this paper we propose a new
method of learning to sample where we combine the best
of both worlds by using sampling in two phases. First,
we use samples to learn a probabilistic classifier, and
then use the classifier to design a stratified sampling
method to obtain the final estimates. We theoretically
analyze algorithms for obtaining an optimal
stratification, and compare our approach with a suite
of natural alternatives like quantification learning,
weighted and stratified sampling, and other techniques
from the literature. We also provide extensive
experiments in diverse use cases using multiple real
and synthetic datasets to evaluate the quality,
efficiency, and robustness of our approach.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Echihabi:2019:RLH,
author = "Karima Echihabi and Kostas Zoumpatianos and Themis
Palpanas and Houda Benbrahim",
title = "Return of the {Lernaean Hydra}: experimental
evaluation of data series approximate similarity
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "3",
pages = "403--420",
month = nov,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3368289.3368303",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Dec 11 07:51:13 MST 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Data series are a special type of multidimensional
data present in numerous domains, where similarity
search is a key operation that has been extensively
studied in the data series literature. In parallel, the
multidimensional community has studied approximate
similarity search techniques. We propose a taxonomy of
similarity search techniques that reconciles the
terminology used in these two domains, we describe
modifications to data series indexing techniques
enabling them to answer approximate similarity queries
with quality guarantees, and we conduct a thorough
experimental evaluation to compare approximate
similarity search techniques under a unified framework,
on synthetic and real datasets in memory and on disk.
Although data series differ from generic
multidimensional vectors (series usually exhibit
correlation between neighboring values), our results
show that data series techniques answer approximate
queries with strong guarantees and an excellent
empirical performance, on data series and vectors
alike. These techniques outperform the state-of-the-art
approximate techniques for vectors when operating on
disk, and remain competitive in memory.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhou:2019:DDI,
author = "Xinjing Zhou and Lidan Shou and Ke Chen and Wei Hu and
Gang Chen",
title = "{DPTree}: differential indexing for persistent
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "421--434",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372717",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The emergence of persistent memory (PM) spurs on
redesigns of database system components to gain full
exploitation of the persistence and speed of the
hardware. One crucial component studied by researchers
is persistent indices. However, such studies to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karimov:2019:AAH,
author = "Jeyhun Karimov and Tilmann Rabl and Volker Markl",
title = "{AJoin}: ad-hoc stream joins at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "435--448",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372718",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The processing model of state-of-the-art stream
processing engines is designed to execute long-running
queries one at a time. However, with the advance of
cloud technologies and multi-tenant systems, multiple
users share the same cloud for stream query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Luo:2019:PSL,
author = "Chen Luo and Michael J. Carey",
title = "On performance stability in {LSM}-based storage
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "449--462",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372719",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "The Log-Structured Merge-Tree (LSM-tree) has been
widely adopted for use in modern NoSQL systems for its
superior write performance. Despite the popularity of
LSM-trees, they have been criticized for suffering from
write stalls and large performance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Peng:2019:TBT,
author = "You Peng and Ying Zhang and Xuemin Lin and Wenjie
Zhang and Lu Qin and Jingren Zhou",
title = "Towards bridging theory and practice: hop-constrained
$s$--$t$ simple path enumeration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "463--476",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372720",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Graph is a ubiquitous structure representing entities
and their relationships applied in many areas such as
social networks, web graphs, and biological networks.
One of the fundamental tasks in graph analytics is to
investigate the relations between two \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Zhang:2019:PDS,
author = "Yuhao Zhang and Arun Kumar",
title = "{Panorama}: a data system for unbounded vocabulary
querying over video",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "477--491",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372721",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Deep convolutional neural networks (CNNs) achieve
state-of-the-art accuracy for many computer vision
tasks. But using them for video monitoring applications
incurs high computational cost and inference latency.
Thus, recent works have studied how to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lakhotia:2019:PTS,
author = "Kartik Lakhotia and Rajgopal Kannan and Qing Dong and
Viktor Prasanna",
title = "Planting trees for scalable and efficient canonical
hub labeling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "492--505",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372722",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Hub labeling is widely used to improve the latency and
throughput of Point-to-Point Shortest Distance (PPSD)
queries in graph databases. However, constructing hub
labeling, even via the state-of-the-art Pruned Landmark
Labeling (PLL) algorithm is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lahoti:2019:OIF,
author = "Preethi Lahoti and Krishna P. Gummadi and Gerhard
Weikum",
title = "Operationalizing individual fairness with pairwise
fair representations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "506--518",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372723",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We revisit the notion of individual fairness proposed
by Dwork et al. A central challenge in operationalizing
their approach is the difficulty in eliciting a human
specification of a similarity metric. In this paper, we
propose an operationalization of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kakaraparthy:2019:ODL,
author = "Aarati Kakaraparthy and Jignesh M. Patel and Kwanghyun
Park and Brian P. Kroth",
title = "Optimizing databases by learning hidden parameters of
solid state drives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "519--532",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372724",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Solid State Drives (SSDs) are complex devices with
varying internal implementations, resulting in subtle
differences in behavior between devices. In this paper,
we demonstrate how a database engine can be optimized
for a particular device by learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Kang:2019:BOD,
author = "Daniel Kang and Peter Bailis and Matei Zaharia",
title = "{BlazeIt}: optimizing declarative aggregation and
limit queries for neural network-based video
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "533--546",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372725",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Recent advances in neural networks (NNs) have enabled
automatic querying of large volumes of video data with
high accuracy. While these deep NNs can produce
accurate annotations of an object's position and type
in video, they are computationally \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Huang:2019:JST,
author = "Dawei Huang and Dong Young Yoon and Seth Pettie and
Barzan Mozafari",
title = "Joins on samples: a theoretical guide for
practitioners",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "547--560",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372726",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Despite decades of research on AQP (approximate query
processing), our understanding of sample-based joins
has remained limited and, to some extent, even
superficial. The common belief in the community is that
joining random samples is futile. This \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Karagiannis:2019:MAK,
author = "Georgios Karagiannis and Immanuel Trummer and Saehan
Jo and Shubham Khandelwal and Xuezhi Wang and Cong Yu",
title = "Mining an ``anti-knowledge base'' from {Wikipedia}
updates with applications to fact checking and beyond",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "561--573",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372727",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "We introduce the problem of anti-knowledge mining. Our
goal is to create an {``anti-knowledge base''} that
contains factual mistakes. The resulting data can be
used for analysis, training, and benchmarking in the
research domain of automated fact checking. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Lersch:2019:EPM,
author = "Lucas Lersch and Xiangpeng Hao and Ismail Oukid and
Tianzheng Wang and Thomas Willhalm",
title = "Evaluating persistent memory range indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "4",
pages = "574--587",
month = dec,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.14778/3372716.3372728",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jan 8 18:50:37 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
abstract = "Persistent memory (PM) is fundamentally changing the
way database index structures are built by enabling
persistence, high performance, and (near) instant
recovery all on the memory bus. Prior work has proposed
many techniques to tailor index structure \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "http://portal.acm.org/citation.cfm?id=J1174",
}
@Article{Goldstein:2020:MBR,
author = "Jonathan Goldstein and Ahmed Abdelhamid and Mike
Barnett and Sebastian Burckhardt and Badrish
Chandramouli and Darren Gehring and Niel Lebeck and
Christopher Meiklejohn and Umar Farooq Minhas and Ryan
Newton and Rahee Ghosh Peshawaria and Tal Zaccai and
Irene Zhang",
title = "{A.M.B.R.O.S.I.A}: providing performant virtual
resiliency for distributed applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "588--601",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377370",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377370",
abstract = "When writing today's distributed programs, which
frequently span both devices and cloud services,
programmers are faced with complex decisions and coding
tasks around coping with failure, especially when these
distributed components are stateful. If \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ouyang:2020:ESP,
author = "Dian Ouyang and Long Yuan and Lu Qin and Lijun Chang
and Ying Zhang and Xuemin Lin",
title = "Efficient shortest path index maintenance on dynamic
road networks with theoretical guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "602--615",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377371",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377371",
abstract = "Computing the shortest path between two vertices is a
fundamental problem in road networks that is applied in
a wide variety of applications. To support efficient
shortest path query processing, a plethora of
index-based methods have been proposed in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Stehle:2020:PMP,
author = "Elias Stehle and Hans-Arno Jacobsen",
title = "{ParPaRaw}: massively parallel parsing of
delimiter-separated raw data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "616--628",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377372",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377372",
abstract = "Parsing is essential for a wide range of use cases,
such as stream processing, bulk loading, and in-situ
querying of raw data. Yet, the compute-intense step
often constitutes a major bottleneck in the data
ingestion pipeline, since parsing of inputs \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2020:OOC,
author = "Yihe Huang and William Qian and Eddie Kohler and
Barbara Liskov and Liuba Shrira",
title = "Opportunities for optimism in contended main-memory
multicore transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "629--642",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377373",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377373",
abstract = "Optimistic concurrency control, or OCC, can achieve
excellent performance on uncontended workloads for
main-memory transactional databases. Contention causes
OCC's performance to degrade, however, and recent
concurrency control designs, such as hybrid \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2020:PLF,
author = "Bolong Zheng and Xi Zhao and Lianggui Weng and Quoc
Viet Hung Nguyen and Hang Liu and Christian S. Jensen",
title = "{PM-LSH}: a fast and accurate {LSH} framework for
high-dimensional approximate {NN} search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "643--655",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377374",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377374",
abstract = "Nearest neighbor (NN) search in high-dimensional
spaces is inherently computationally expensive due to
the curse of dimensionality. As a well-known solution
to approximate NN search, locality-sensitive hashing
(LSH) is able to answer c-approximate NN (c-ANN) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2020:HMB,
author = "Yahui Sun and Jun Luo and Theodoros Lappas and Xiaokui
Xiao and Bin Cui",
title = "Hunting multiple bumps in graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "656--669",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377375",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377375",
abstract = "Bump hunting is an important approach to the
extraction of insights from Euclidean datasets.
Recently, it has been explored for graph datasets for
the first time, and a single bump is hunted in an
unweighted graph in this exploration. Here, we extend
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2020:HNE,
author = "Renchi Yang and Jieming Shi and Xiaokui Xiao and Yin
Yang and Sourav S. Bhowmick",
title = "Homogeneous network embedding for massive graphs via
reweighted personalized {PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "670--683",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377376",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377376",
abstract = "Given an input graph G and a node $ v \in G $,
homogeneous network embedding (HNE) maps the graph
structure in the vicinity of $v$ to a compact,
fixed-dimensional feature vector. This paper focuses on
HNE for massive graphs, e.g., with billions of edges.
On \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qahtan:2020:PFD,
author = "Abdulhakim Qahtan and Nan Tang and Mourad Ouzzani and
Yang Cao and Michael Stonebraker",
title = "Pattern functional dependencies for data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "684--697",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377377",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377377",
abstract = "Patterns (or regex-based expressions) are widely used
to constrain the format of a domain (or a column),
e.g., a Year column should contain only four digits,
and thus a value like ``1980-'' might be a typo.
Moreover, integrity constraints (ICs) defined
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Whang:2020:MMV,
author = "Joyce Jiyoung Whang and Rundong Du and Sangwon Jung
and Geon Lee and Barry Drake and Qingqing Liu and
Seonggoo Kang and Haesun Park",
title = "{MEGA}: multi-view semi-supervised clustering of
hypergraphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "698--711",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377378",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377378",
abstract = "Complex relationships among entities can be modeled
very effectively using hypergraphs. Hypergraphs model
real-world data by allowing a hyperedge to include two
or more entities. Clustering of hypergraphs enables us
to group the similar entities \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koumarelas:2020:MDD,
author = "Ioannis Koumarelas and Thorsten Papenbrock and Felix
Naumann",
title = "{MDedup}: duplicate detection with matching
dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "712--725",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377379",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377379",
abstract = "Duplicate detection is an integral part of data
cleaning and serves to identify multiple
representations of same real-world entities in
(relational) datasets. Existing duplicate detection
approaches are effective, but they are also hard to
parameterize \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tran:2020:PVU,
author = "Van-Dang Tran and Hiroyuki Kato and Zhenjiang Hu",
title = "Programmable view update strategies on relations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "726--739",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377380",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377380",
abstract = "View update is an important mechanism that allows
updates on a view by translating them into the
corresponding updates on the base relations. The
existing literature has shown the ambiguity of
translating view updates. To address this ambiguity, we
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kumar:2020:ADD,
author = "Avinash Kumar and Zuozhi Wang and Shengquan Ni and
Chen Li",
title = "{Amber}: a debuggable dataflow system based on the
actor model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "740--753",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377381",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377381",
abstract = "A long-running analytic task on big data often leaves
a developer in the dark without providing valuable
feedback about the status of the execution. In
addition, a failed job that needs to restart from
scratch can waste earlier computing resources. An
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schiavio:2020:DSO,
author = "Filippo Schiavio and Daniele Bonetta and Walter
Binder",
title = "Dynamic speculative optimizations for {SQL}
compilation in {Apache Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "754--767",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377382",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377382",
abstract = "Big-data systems have gained significant momentum, and
Apache Spark is becoming a de-facto standard for modern
data analytics. Spark relies on SQL query compilation
to optimize the execution performance of analytical
workloads on a variety of data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Khayati:2020:MGE,
author = "Mourad Khayati and Alberto Lerner and Zakhar Tymchenko
and Philippe Cudr{\'e}-Mauroux",
title = "Mind the gap: an experimental evaluation of imputation
of missing values techniques in time series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "5",
pages = "768--782",
month = jan,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3377369.3377383",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:27 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3377369.3377383",
abstract = "Recording sensor data is seldom a perfect process.
Failures in power, communication or storage can leave
occasional blocks of data missing, affecting not only
real-time monitoring but also compromising the quality
of near- and off-line data analysis. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mofrad:2020:GNA,
author = "Mohammad Hasanzadeh Mofrad and Rami Melhem and Yousuf
Ahmad and Mohammad Hammoud",
title = "{Graphite}: a {NUMA}-aware {HPC} system for graph
analytics based on a new {MPI * X} parallelism model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "783--797",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380751",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380751",
abstract = "In this paper, we propose a new parallelism model
denoted as MPI * X and suggest a linear algebra-based
graph analytics system, namely, Graphite, which
effectively employs it. MPI * X promotes thread-based
partitioning to distribute computation and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boer:2020:PIA,
author = "Naama Boer and Daniel Deutch and Nave Frost and Tova
Milo",
title = "Personal insights for altering decisions of tree-based
ensembles over time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "798--811",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380752",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380752",
abstract = "Machine Learning models are prevalent in critical
human-related decision making, such as resume filtering
and loan applications. Refused individuals naturally
ask what could change the decision, should they
reapply. This question is hard for the model \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Peng:2020:ABS,
author = "You Peng and Ying Zhang and Xuemin Lin and Lu Qin and
Wenjie Zhang",
title = "Answering billion-scale label-constrained reachability
queries within microsecond",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "812--825",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380753",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380753",
abstract = "In this paper, we study the problem of
label-constrained reachability (LCR) query which is
fundamental in many applications with directed
edge-label graphs. Although the classical reachability
query (i.e., reachability query without label
constraint) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2020:EER,
author = "Ruihong Huang and Shaoxu Song and Yunsu Lee and Jungho
Park and Soo-Hyung Kim and Sungmin Yi",
title = "Effective and efficient retrieval of structured
entities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "826--839",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380754",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380754",
abstract = "Structured entities are commonly abstracted, such as
from XML, RDF or hidden-web databases. Direct retrieval
of various structured entities is highly demanded in
data lakes, e.g., given a JSON object, to find the XML
entities that denote the same real-world \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sirin:2020:MAA,
author = "Utku Sirin and Anastasia Ailamaki",
title = "Micro-architectural analysis of {OLAP}: limitations
and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "840--853",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380755",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380755",
abstract = "Understanding micro-architectural behavior is
important for efficiently using hardware resources.
Recent work has shown that in-memory online transaction
processing (OLTP) systems severely underutilize their
core micro-architecture resources [29]. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2020:EEC,
author = "Yixiang Fang and Yixing Yang and Wenjie Zhang and
Xuemin Lin and Xin Cao",
title = "Effective and efficient community search over large
heterogeneous information networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "854--867",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380756",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380756",
abstract = "Recently, the topic of community search (CS) has
gained plenty of attention. Given a query vertex, CS
looks for a dense subgraph that contains it. Existing
studies mainly focus on homogeneous graphs in which
vertices are of the same type, and cannot be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gupta:2020:RGS,
author = "Suyash Gupta and Sajjad Rahnama and Jelle Hellings and
Mohammad Sadoghi",
title = "{ResilientDB}: global scale resilient blockchain
fabric",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "868--883",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380757",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380757",
abstract = "Recent developments in blockchain technology have
inspired innovative new designs in resilient
distributed and database systems. At their core, these
blockchain applications typically use Byzantine
fault-tolerant consensus protocols to maintain a common
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Funke:2020:DPQ,
author = "Henning Funke and Jens Teubner",
title = "Data-parallel query processing on non-uniform data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "884--897",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380758",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380758",
abstract = "Graphics processing units (GPUs) promise spectacular
performance advantages when used as database
coprocessors. Their massive compute capacity, however,
is often hampered by control flow divergence caused by
non-uniform data distributions. When data-parallel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Feng:2020:EMH,
author = "Zonghao Feng and Qiong Luo",
title = "Evaluating memory-hard proof-of-work algorithms on
three processors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "898--911",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380759",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380759",
abstract = "Most public blockchain systems, exemplified by
cryptocurrencies such as Ethereum and Monero, use
memory-hard proof-of-work (PoW) algorithms in consensus
protocols to maintain fair participation without a
trusted third party. The memory hardness, or the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2020:ASW,
author = "Seokki Lee and Bertram Lud{\"a}scher and Boris
Glavic",
title = "Approximate summaries for why and why-not provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "912--924",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380760",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380760",
abstract = "Why and why-not provenance have been studied
extensively in recent years. However, why-not
provenance and --- to a lesser degree --- why
provenance can be very large, resulting in severe
scalability and usability challenges. We introduce a
novel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2020:PAD,
author = "Hao Jiang and Chunwei Liu and Qi Jin and John
Paparrizos and Aaron J. Elmore",
title = "{PIDS}: attribute decomposition for improved
compression and query performance in columnar storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "925--938",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380761",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380761",
abstract = "We propose PIDS, Pattern Inference Decomposed Storage,
an innovative storage method for decomposing string
attributes in columnar stores. Using an unsupervised
approach, PIDS identifies common patterns in string
attributes from relational databases, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Asudeh:2020:DCP,
author = "Abolfazl Asudeh and H. V. Jagadish and You (Will) Wu
and Cong Yu",
title = "On detecting cherry-picked trendlines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "6",
pages = "939--952",
month = feb,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3380750.3380762",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Apr 2 10:51:28 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3380750.3380762",
abstract = "Poorly supported stories can be told based on data by
cherry-picking the data points included. While such
stories may be technically accurate, they are
misleading. In this paper, we build a system for
detecting cherry-picking, with a focus on trendlines
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ota:2020:DDD,
author = "Masayo Ota and Heiko M{\"u}ller and Juliana Freire and
Divesh Srivastava",
title = "Data-driven domain discovery for structured datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "953--967",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384346",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384346",
abstract = "The growing number of open datasets has created new
opportunities to derive insights and address important
societal problems. These data, however, often come with
little or no metadata, in particular about the types of
their attributes, thus greatly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shi:2020:RIF,
author = "Jieming Shi and Tianyuan Jin and Renchi Yang and
Xiaokui Xiao and Yin Yang",
title = "Realtime index-free single source {SimRank} processing
on web-scale graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "966--980",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384347",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384347",
abstract = "Given a graph $G$ and a node $ u \in G$, a single
source SimRank query evaluates the similarity between
$u$ and every node $ v \in G$. Existing approaches to
single source SimRank computation incur either long
query response time, or expensive pre-computation,
which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:DAR,
author = "Jiachuan Wang and Peng Cheng and Libin Zheng and Chao
Feng and Lei Chen and Xuemin Lin and Zheng Wang",
title = "Demand-aware route planning for shared mobility
services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "979--991",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384348",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384348",
abstract = "The dramatic development of shared mobility in food
delivery, ridesharing, and crowdsourced parcel delivery
has drawn great concerns. Specifically, shared mobility
refers to transferring or delivering more than one
passenger/package together when their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hilprecht:2020:DLD,
author = "Benjamin Hilprecht and Andreas Schmidt and Moritz
Kulessa and Alejandro Molina and Kristian Kersting and
Carsten Binnig",
title = "{DeepDB}: learn from data, not from queries!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "992--1005",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384349",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384349",
abstract = "The typical approach for learned DBMS components is to
capture the behavior by running a representative set of
queries and use the observations to train a machine
learning model. This workload-driven approach, however,
has two major downsides. First, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:DMU,
author = "Yuepeng Wang and Rushi Shah and Abby Criswell and Rong
Pan and Isil Dillig",
title = "Data migration using datalog program synthesis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1006--1019",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384350",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384350",
abstract = "This paper presents a new technique for migrating data
between different schemas. Our method expresses the
schema mapping as a Datalog program and automatically
synthesizes a Datalog program from simple input-output
examples to perform data migration. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2020:LTG,
author = "Xiaowei Zhu and Guanyu Feng and Marco Serafini and
Xiaosong Ma and Jiping Yu and Lei Xie and Ashraf
Aboulnaga and Wenguang Chen",
title = "{LiveGraph}: a transactional graph storage system with
purely sequential adjacency list scans",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1020--1034",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384351",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384351",
abstract = "The specific characteristics of graph workloads make
it hard to design a one-size-fits-all graph storage
system. Systems that support transactional updates use
data structures with poor data locality, which limits
the efficiency of analytical workloads \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2020:KKB,
author = "Xueling Lin and Haoyang Li and Hao Xin and Zijian Li
and Lei Chen",
title = "{KBPearl}: a knowledge base population system
supported by joint entity and relation linking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1035--1049",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384352",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384352",
abstract = "Nowadays, most openly available knowledge bases (KBs)
are incomplete, since they are not synchronized with
the emerging facts happening in the real world.
Therefore, knowledge base population (KBP) from
external data sources, which extracts knowledge
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:CUT,
author = "Tianyi Li and Ruikai Huang and Lu Chen and Christian
S. Jensen and Torben Bach Pedersen",
title = "Compression of uncertain trajectories in road
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1050--1063",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384353",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384353",
abstract = "Massive volumes of uncertain trajectory data are being
generated by GPS devices. Due to the limitations of GPS
data, these trajectories are generally uncertain. This
state of affairs renders it is attractive to be able to
compress uncertain trajectories \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shastri:2020:UBI,
author = "Supreeth Shastri and Vinay Banakar and Melissa
Wasserman and Arun Kumar and Vijay Chidambaram",
title = "Understanding and benchmarking the impact of {GDPR} on
database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1064--1077",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384354",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384354",
abstract = "The General Data Protection Regulation (GDPR) provides
new rights and protections to European people
concerning their personal data. We analyze GDPR from a
systems perspective, translating its legal articles
into a set of capabilities and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2020:LOP,
author = "Jihang Liu and Shimin Chen and Lujun Wang",
title = "{LB+Trees}: optimizing persistent index performance on
{$3$DXPoint} memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1078--1090",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384355",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384355",
abstract = "3DXPoint memory is the first commercially available
NVM solution targeting mainstream computer systems.
While 3DXPoint conforms to many assumptions about NVM
in previous studies, we observe a number of distinctive
features of 3DXPoint. For example, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lersch:2020:ELT,
author = "Lucas Lersch and Ivan Schreter and Ismail Oukid and
Wolfgang Lehner",
title = "Enabling low tail latency on multicore key-value
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1091--1104",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384356",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384356",
abstract = "Modern applications employ key-value stores (KVS) in
at least some point of their software stack, often as a
caching system or a storage manager. Many of these
applications also require a high degree of
responsiveness and performance predictability.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2020:PAA,
author = "Chunbin Lin and Etienne Boursier and Yannis
Papakonstantinou",
title = "{Plato}: approximate analytics over compressed time
series with tight deterministic error guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1105--1118",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384357",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384357",
abstract = "Plato provides fast approximate analytics on time
series, by precomputing and storing compressed time
series. Plato's key novelty is the delivery of tight
deterministic error guarantees for the linear algebra
operators over vectors\slash time series, the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gera:2020:TLG,
author = "Prasun Gera and Hyojong Kim and Piyush Sao and Hyesoon
Kim and David Bader",
title = "Traversing large graphs on {GPUs} with unified
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1119--1133",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384358",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384358",
abstract = "Due to the limited capacity of GPU memory, the
majority of prior work on graph applications on GPUs
has been restricted to graphs of modest sizes that fit
in memory. Recent hardware and software advances make
it possible to address much larger host \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ping:2020:SHQ,
author = "Haoyue Ping and Julia Stoyanovich and Benny
Kimelfeld",
title = "Supporting hard queries over probabilistic
preferences",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "7",
pages = "1134--1146",
month = mar,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3384345.3384359",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:13 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3384345.3384359",
abstract = "Preference analysis is widely applied in various
domains such as social choice and e-commerce. A
recently proposed framework augments the relational
database with a preference relation that represents
uncertain preferences in the form of statistical
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2020:DSH,
author = "Baotong Lu and Xiangpeng Hao and Tianzheng Wang and
Eric Lo",
title = "{Dash}: scalable hashing on persistent memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1147--1161",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389134",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389134",
abstract = "Byte-addressable persistent memory (PM) brings hash
tables the potential of low latency, cheap persistence
and instant recovery. The recent advent of Intel Optane
DC Persistent Memory Modules (DCPMM) further
accelerates this trend. Many new hash table designs
have been proposed, but most of them were based on
emulation and perform sub-optimally on real PM. They
were also piece-wise and partial solutions that
side-step many important properties, in particular good
scalability, high load factor and instant
recovery.\par
We present Dash, a holistic approach to building
dynamic and scalable hash tables on real PM hardware
with all the aforementioned properties. Based on Dash,
we adapted two popular dynamic hashing schemes
(extendible hashing and linear hashing). On a 24-core
machine with Intel Optane DCPMM, we show that compared
to state-of-the-art, Dash-enabled hash tables can
achieve up to $ \approx 3.9 \times $ higher performance
                 with up to over 90\% load factor and an instant recovery
time of 57ms regardless of data size.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ferragina:2020:PIF,
author = "Paolo Ferragina and Giorgio Vinciguerra",
title = "The {PGM-index}: a fully-dynamic compressed learned
index with provable worst-case bounds",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1162--1175",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389135",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389135",
abstract = "We present the first learned index that supports
predecessor, range queries and updates within provably
efficient time and space bounds in the worst case. In
the (static) context of just predecessor and range
queries these bounds turn out to be optimal. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ma:2020:DRC,
author = "Minghua Ma and Zheng Yin and Shenglin Zhang and Sheng
Wang and Christopher Zheng and Xinhao Jiang and Hanwen
Hu and Cheng Luo and Yilin Li and Nengjun Qiu and
Feifei Li and Changcheng Chen and Dan Pei",
title = "Diagnosing root causes of intermittent slow queries in
cloud databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1176--1189",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389136",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389136",
abstract = "With the growing market of cloud databases, careful
detection and elimination of slow queries are of great
importance to service stability. Previous studies focus
on optimizing the slow queries that result from
internal reasons (e.g., poorly-written \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2020:PEF,
author = "Xuhao Chen and Roshan Dathathri and Gurbinder Gill and
Keshav Pingali",
title = "{Pangolin}: an efficient and flexible graph mining
system on {CPU} and {GPU}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1190--1205",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389137",
abstract = "There is growing interest in graph pattern mining
(GPM) problems such as motif counting. GPM systems have
been developed to provide unified interfaces for
programming algorithms for these problems and for
running them on parallel systems. However, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dreseler:2020:QTH,
author = "Markus Dreseler and Martin Boissier and Tilmann Rabl
and Matthias Uflacker",
title = "Quantifying {TPC-H} choke points and their
optimizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1206--1220",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389138",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389138",
abstract = "TPC-H continues to be the most widely used benchmark
for relational OLAP systems. It poses a number of
challenges, also known as ``choke points'', which
database systems have to solve in order to achieve good
benchmark results. Examples include joins \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:EAC,
author = "Yuanbing Li and Xian Wu and Yifei Jin and Jian Li and
Guoliang Li",
title = "Efficient algorithms for crowd-aided categorization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1221--1233",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389139",
abstract = "We study the problem of utilizing human intelligence
to categorize a large number of objects. In this
problem, given a category hierarchy and a set of
objects, we can ask humans to check whether an object
belongs to a category, and our goal is to find
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:SVD,
author = "Shaowei Wang and Yuqiu Qian and Jiachun Du and Wei
Yang and Liusheng Huang and Hongli Xu",
title = "Set-valued data publication with local privacy: tight
error bounds and efficient mechanisms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1234--1247",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389140",
abstract = "Most user-generated data in online services are
presented as set-valued data, e.g., visited website
URLs, recently used Apps by a person, and etc. These
data are of great value to service providers, but also
bring privacy concerns if collected and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fegaras:2020:TAB,
author = "Leonidas Fegaras and Hasanuzzaman Noor",
title = "Translation of array-based loops to distributed
data-parallel programs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1248--1260",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389141",
abstract = "Large volumes of data generated by scientific
experiments and simulations come in the form of arrays,
while programs that analyze these data are frequently
expressed in terms of array operations in an
imperative, loop-based language. But, as datasets
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2020:IGP,
author = "Wenfei Fan and Muyang Liu and Chao Tian and Ruiqi Xu
and Jingren Zhou",
title = "Incrementalization of graph partitioning algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1261--1274",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389142",
abstract = "This paper studies incremental graph partitioning.
Given a (vertex-cut or edge-cut) partition $ C(G) $ of
a graph $G$ and updates $ \Delta G$ to $G$, it is to
compute changes $ \Delta O$ to $ C(G)$, yielding a
partition of the updated graph such that (a) the new
partition is load-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ko:2020:OIS,
author = "Shao-Heng Ko and Hsu-Chao Lai and Hong-Han Shuai and
Wang-Chien Lee and Philip S. Yu and De-Nian Yang",
title = "Optimizing item and subgroup configurations for
social-aware {VR} shopping",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1275--1289",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389143",
abstract = "Shopping in VR malls has been regarded as a paradigm
shift for E-commerce, but most of the conventional VR
shopping platforms are designed for a single user. In
this paper, we envisage a scenario of VR group
shopping, which brings major advantages over \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Savvides:2020:ECP,
author = "Savvas Savvides and Darshika Khandelwal and Patrick
Eugster",
title = "Efficient confidentiality-preserving data analytics
over symmetrically encrypted datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1290--1303",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389144",
abstract = "In the past decade, cloud computing has emerged as an
economical and practical alternative to in-house
datacenters. But due to security concerns, many
enterprises are still averse to adopting third party
clouds. To mitigate these concerns, several \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gill:2020:SMG,
author = "Gurbinder Gill and Roshan Dathathri and Loc Hoang and
Ramesh Peri and Keshav Pingali",
title = "Single machine graph analytics on massive datasets
using {Intel Optane DC Persistent Memory}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "8",
pages = "1304--1318",
month = apr,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3389133.3389145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 5 14:01:14 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3389133.3389145",
abstract = "Intel Optane DC Persistent Memory (Optane PMM) is a
new kind of byte-addressable memory with higher density
and lower cost than DRAM. This enables the design of
affordable systems that support up to 6TB of randomly
accessible memory. In this paper, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zakhary:2020:ACA,
author = "Victor Zakhary and Divyakant Agrawal and Amr {El
Abbadi}",
title = "Atomic commitment across blockchains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1319--1331",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397231",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397231",
abstract = "The recent adoption of blockchain technologies and
open permissionless networks suggest the importance of
peer-to-peer atomic cross-chain transaction protocols.
Users should be able to atomically exchange tokens and
assets without depending on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mathew:2020:HSM,
author = "Ajit Mathew and Changwoo Min",
title = "{HydraList}: a scalable in-memory index using
asynchronous updates and partial replication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1332--1345",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397232",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397232",
abstract = "Increased capacity of main memory has led to the rise
of in-memory databases. With disk access eliminated,
efficiency of index structures has become critical for
performance in these systems. An ideal index structure
should exhibit high performance for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Davis:2020:EMP,
author = "A. Jesse Jiryu Davis and Max Hirschhorn and Judah
Schvimer",
title = "Extreme modelling in practice",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1346--1358",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397233",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397233",
abstract = "Formal modelling is a powerful tool for developing
complex systems. At MongoDB, we use TLA$^+$ to model
and verify multiple aspects of several systems.
Ensuring conformance between a specification and its
implementation can add value to any specification;
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lyu:2020:MBS,
author = "Bingqing Lyu and Lu Qin and Xuemin Lin and Ying Zhang
and Zhengping Qian and Jingren Zhou",
title = "Maximum biclique search at billion scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1359--1372",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397234",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397234",
abstract = "Maximum biclique search, which finds the biclique with
the maximum number of edges in a bipartite graph, is a
fundamental problem with a wide spectrum of
applications in different domains, such as E-Commerce,
social analysis, web services, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chepurko:2020:AAR,
author = "Nadiia Chepurko and Ryan Marcus and Emanuel Zgraggen
and Raul Castro Fernandez and Tim Kraska and David
Karger",
title = "{ARDA}: automatic relational data augmentation for
machine learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1373--1387",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397235",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397235",
abstract = "Automatic machine learning (AML) is a family of
techniques to automate the process of training
predictive models, aiming to both improve performance
and make machine learning more accessible. While many
recent works have focused on aspects of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alkowaileet:2020:LBT,
author = "Wail Y. Alkowaileet and Sattam Alsubaiee and Michael
J. Carey",
title = "An {LSM}-based tuple compaction framework for {Apache
AsterixDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1388--1400",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397236",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397236",
abstract = "Document database systems store self-describing
semi-structured records, such as JSON, ``as-is''
without requiring the users to pre-define a schema.
This provides users with the flexibility to change the
structure of incoming records without worrying
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shraga:2020:ACD,
author = "Roee Shraga and Avigdor Gal and Haggai Roitman",
title = "{ADnEV}: cross-domain schema matching using deep
similarity matrix adjustment and evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1401--1415",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397237",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397237",
abstract = "Schema matching is a process that serves in
integrating structured and semi-structured data. Being
a handy tool in multiple contemporary business and
commerce applications, it has been investigated in the
fields of databases, AI, Semantic Web, and data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2020:QPP,
author = "Xuanhe Zhou and Ji Sun and Guoliang Li and Jianhua
Feng",
title = "Query performance prediction for concurrent queries
using graph embedding",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1416--1428",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397238",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397238",
abstract = "Query performance prediction is vital to many database
tasks (e.g., database monitoring and query scheduling).
Existing methods focus on predicting the performance
for a single query but cannot effectively predict the
performance for concurrent queries, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alquraan:2020:SNZ,
author = "Ahmed Alquraan and Alex Kogan and Virendra J. Marathe
and Samer Al-Kiswany",
title = "Scalable, near-zero loss disaster recovery for
distributed data stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1429--1442",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397239",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397239",
abstract = "This paper presents a new Disaster Recovery (DR)
system, called Slogger, that differs from prior works
in two principle ways: (i) Slogger enables DR for a
linearizable distributed data store, and (ii) Slogger
adopts the continuous backup approach that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2020:VAN,
author = "Kejing Lu and Hongya Wang and Wei Wang and Mineichi
Kudo",
title = "{VHP}: approximate nearest neighbor search via virtual
hypersphere partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1443--1455",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397240",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397240",
abstract = "Locality sensitive hashing (LSH) is a widely practiced
$c$-approximate nearest neighbor ($c$-ANN) search
algorithm in high dimensional spaces. The
state-of-the-art LSH based algorithm searches an
unbounded and irregular space to identify candidates,
which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2020:IFS,
author = "Hyunjoon Kim and Seunghwan Min and Kunsoo Park and
Xuemin Lin and Seok-Hee Hong and Wook-Shin Han",
title = "{IDAR}: fast supergraph search using {DAG}
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1456--1468",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397241",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397241",
abstract = "Supergraph search is one of fundamental graph query
processing problems in many application domains. Given
a query graph and a set of data graphs, supergraph
search is to find all the data graphs contained in the
query graph as subgraphs. In existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Seleznova:2020:GEU,
author = "Mariia Seleznova and Behrooz Omidvar-Tehrani and Sihem
Amer-Yahia and Eric Simon",
title = "Guided exploration of user groups",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1469--1482",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397242",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397242",
abstract = "Finding a set of users of interest serves several
applications in behavioral analytics. Often times,
identifying users requires to explore the data and
gradually choose potential targets. This is a special
case of Exploratory Data Analysis (EDA), an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gong:2020:IID,
author = "Long Gong and Huayi Wang and Mitsunori Ogihara and Jun
Xu",
title = "{iDEC}: indexable distance estimating codes for
approximate nearest neighbor search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1483--1497",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397243",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397243",
abstract = "Approximate Nearest Neighbor (ANN) search is a
fundamental algorithmic problem, with numerous
applications in many areas of computer science. In this
work, we propose indexable distance estimating codes
(iDEC), a new solution framework to ANN that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bian:2020:EAB,
author = "Song Bian and Qintian Guo and Sibo Wang and Jeffrey Xu
Yu",
title = "Efficient algorithms for budgeted influence
maximization on massive social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1498--1510",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397244",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397244",
abstract = "Given a social network G, a cost associated with each
node, and a budget B, the budgeted influence
maximization (BIM) problem aims to find a set S of
nodes, denoted as the seed set, that maximizes the
expected number of influenced users under the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Prateek:2020:MTK,
author = "Arneish Prateek and Arijit Khan and Akshit Goyal and
Sayan Ranu",
title = "Mining Top-$k$ pairs of correlated subgraphs in a
large network",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1511--1524",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397245",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397245",
abstract = "We investigate the problem of correlated subgraphs
mining (CSM) where the goal is to identify pairs of
subgraph patterns that frequently co-occur in proximity
within a single graph. Correlated subgraph patterns are
different from frequent subgraphs due \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Buchnik:2020:FHT,
author = "Yehonatan Buchnik and Roy Friedman",
title = "{FireLedger}: a high throughput blockchain consensus
protocol",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1525--1539",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397246",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397246",
abstract = "Blockchains are distributed secure ledgers to which
transactions are issued continuously and each block of
transactions is tightly coupled to its predecessors.
Permissioned blockchains place special emphasis on
transactions throughput. In this paper we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:PEF,
author = "Kefei Wang and Jian Liu and Feng Chen",
title = "Put an elephant into a fridge: optimizing cache
efficiency for in-memory key--value stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1540--1554",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397247",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397247",
abstract = "In today's data centers, memory-based key-value
systems, such as Memcached and Redis, play an
indispensable role in providing high-speed data
services. The rapidly growing capacity and quickly
falling price of DRAM memory in the past years have
enabled \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pedersen:2020:ASR,
author = "Simon Aagaard Pedersen and Bin Yang and Christian S.
Jensen",
title = "Anytime stochastic routing with hybrid learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1555--1567",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397248",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397248",
abstract = "Increasingly massive volumes of vehicle trajectory
data hold the potential to enable higher-resolution
traffic services than hitherto possible. We use
trajectory data to create a high-resolution, uncertain
road-network graph, where edges are associated
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2020:UED,
author = "Qizhen Zhang and Yifan Cai and Xinyi Chen and
Sebastian Angel and Ang Chen and Vincent Liu and Boon
Thau Loo",
title = "Understanding the effect of data center resource
disaggregation on production {DBMSs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1568--1581",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397249",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397249",
abstract = "Resource disaggregation is a new architecture for data
centers in which resources like memory and storage are
decoupled from the CPU, managed independently, and
connected through a high-speed network. Recent work has
shown that although disaggregated \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tziavelis:2020:OAR,
author = "Nikolaos Tziavelis and Deepak Ajwani and Wolfgang
Gatterbauer and Mirek Riedewald and Xiaofeng Yang",
title = "Optimal algorithms for ranked enumeration of answers
to full conjunctive queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1582--1597",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397250",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397250",
abstract = "We study ranked enumeration of join-query results
according to very general orders defined by selective
dioids. Our main contribution is a framework for ranked
enumeration over a class of dynamic programming
problems that generalizes seemingly different
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dhulipala:2020:SPS,
author = "Laxman Dhulipala and Charles McGuffey and Hongbo Kang
and Yan Gu and Guy E. Blelloch and Phillip B. Gibbons
and Julian Shun",
title = "{Sage}: parallel semi-asymmetric graph algorithms for
{NVRAMs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "9",
pages = "1598--1613",
month = may,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3397230.3397251",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Jul 8 18:23:01 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/abs/10.14778/3397230.3397251",
abstract = "Non-volatile main memory (NVRAM) technologies provide
an attractive set of features for large-scale graph
analytics, including byte-addressability, low idle
power, and improved memory-density. NVRAM systems today
have an order of magnitude more NVRAM \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2020:PIN,
author = "Yuqing Zhu and Jing Tang and Xueyan Tang",
title = "Pricing influential nodes in online social networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1614--1627",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401961",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401961",
abstract = "Influential nodes with rich connections in online
social networks (OSNs) are of great values to initiate
marketing campaigns. However, the potential influence
spread that can be generated by these influential nodes
is hidden behind the structures of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2020:KSA,
author = "Bintao Sun and Maximilien Danisch and T-H. Hubert Chan
and Mauro Sozio",
title = "{KClist++}: a simple algorithm for finding $k$-clique
densest subgraphs in large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1628--1640",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401962",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401962",
abstract = "The problem of finding densest subgraphs has received
increasing attention in recent years, finding
applications in biology, finance, as well as social
network analysis. The $k$-clique densest subgraph
problem is a generalization of the densest subgraph
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wellenzohn:2020:DIC,
author = "Kevin Wellenzohn and Michael H. B{\"o}hlen and Sven
Helmer",
title = "Dynamic interleaving of content and structure for
robust indexing of semi-structured hierarchical data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1641--1653",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401963",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401963",
abstract = "We propose a robust index for semi-structured
hierarchical data that supports content-and-structure
(CAS) queries specified by path and value predicates.
At the heart of our approach is a novel dynamic
interleaving scheme that merges the path and value
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Agarwal:2020:CGS,
author = "Shubhangi Agarwal and Sourav Dutta and Arnab
Bhattacharya",
title = "{ChiSeL}: graph similarity search using chi-squared
statistics in large probabilistic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1654--1668",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401964",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401964",
abstract = "Subgraph querying is one of the most important
primitives in many applications. Although the field is
well studied for deterministic graphs, in many
situations, the graphs are probabilistic in nature. In
this paper, we address the problem of subgraph
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tan:2020:FID,
author = "Zijing Tan and Ai Ran and Shuai Ma and Sheng Qin",
title = "Fast incremental discovery of pointwise order
dependencies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1669--1681",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401965",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401965",
abstract = "Pointwise order dependencies (PODs) are dependencies
that specify ordering semantics on attributes of
tuples. POD discovery refers to the process of
identifying the set $ \Sigma $ of valid and minimal
PODs on a given data set D. In practice D is typically
large \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Livshits:2020:ADC,
author = "Ester Livshits and Alireza Heidari and Ihab F. Ilyas
and Benny Kimelfeld",
title = "Approximate denial constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1682--1695",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401966",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401966",
abstract = "The problem of mining integrity constraints from data
has been extensively studied over the past two decades
for commonly used types of constraints, including the
classic Functional Dependencies (FDs) and the more
general Denial Constraints (DCs). In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rehrmann:2020:SOO,
author = "Robin Rehrmann and Carsten Binnig and Alexander
B{\"o}hm and Kihong Kim and Wolfgang Lehner",
title = "Sharing opportunities for {OLTP} workloads in
different isolation levels",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1696--1708",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401967",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401967",
abstract = "OLTP applications are usually executed by a high
number of clients in parallel and are typically faced
with high throughput demand as well as a constraint
latency requirement for individual statements.
Interestingly, OLTP workloads are often read-heavy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Neumann:2020:BBM,
author = "Stefan Neumann and Pauli Miettinen",
title = "Biclustering and {Boolean} matrix factorization in
data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1709--1722",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401968",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401968",
abstract = "We study clustering of bipartite graphs and Boolean
matrix factorization in data streams. We consider a
streaming setting in which the vertices from the left
side of the graph arrive one by one together with all
of their incident edges. We provide an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jian:2020:EER,
author = "Xun Jian and Yue Wang and Lei Chen",
title = "Effective and efficient relational community detection
and search in large dynamic heterogeneous information
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1723--1736",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401969",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401969",
abstract = "Community search in heterogeneous information networks
(HINs) has attracted much attention in graph analysis.
Given a vertex, the goal is to find a densely-connected
sub-graph that contains the vertex. In practice, the
user may need to restrict the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2020:NLS,
author = "Hyeonji Kim and Byeong-Hoon So and Wook-Shin Han and
Hongrae Lee",
title = "Natural language to {SQL}: where are we today?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1737--1750",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401970",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401970",
abstract = "Translating natural language to SQL (NL2SQL) has
received extensive attention lately, especially with
the recent success of deep learning technologies.
However, despite the large number of studies, we do not
have a thorough understanding of how good \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Che:2020:ATD,
author = "Yulin Che and Zhuohang Lai and Shixuan Sun and Yue
Wang and Qiong Luo",
title = "Accelerating truss decomposition on heterogeneous
processors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1751--1764",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401971",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401971",
abstract = "Truss decomposition is to divide a graph into a
hierarchy of subgraphs, or trusses. A subgraph is a
$k$-truss ($k \geq 2$) if each edge is in at least
$k - 2$ triangles in the subgraph. Existing algorithms
work by first counting the number of triangles each
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mukherjee:2020:SDS,
author = "Rohan Mukherjee and Swarat Chaudhuri and Chris
Jermaine",
title = "Searching a database of source codes using
contextualized code search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1765--1778",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401972",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401972",
abstract = "Consider the case where a programmer has written some
part of a program, but has left part of the program
(such as a method or a function body) incomplete. The
goal is to use the context surrounding the missing code
to automatically ``figure out'' which \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:DSE,
author = "Yan Li and Tingjian Ge and Cindy Chen",
title = "Data stream event prediction based on timing knowledge
and state transitions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1779--1792",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401973",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401973",
abstract = "We study a practical problem of predicting the
upcoming events in data streams using a novel approach.
Treating event time orders as relationship types
between event entities, we build a dynamic knowledge
graph and use it to predict future event timing.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{McSherry:2020:SAP,
author = "Frank McSherry and Andrea Lattuada and Malte
Schwarzkopf and Timothy Roscoe",
title = "Shared arrangements: practical inter-query sharing for
streaming dataflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "10",
pages = "1793--1806",
month = jun,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3401960.3401974",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:36:56 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3401960.3401974",
abstract = "Current systems for data-parallel, incremental
processing and view maintenance over high-rate streams
isolate the execution of independent queries. This
creates unwanted redundancy and overhead in the
presence of concurrent incrementally maintained
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gupta:2020:SBD,
author = "Peeyush Gupta and Michael J. Carey and Sharad Mehrotra
and Roberto Yus",
title = "{SmartBench}: a benchmark for data management in smart
spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1807--1820",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407791",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407791",
abstract = "This paper proposes SmartBench, a benchmark focusing
on queries resulting from (near) real-time applications
and longer-term analysis of IoT data. SmartBench,
derived from a deployed smart building monitoring
system, is comprised of: (1) An extensible \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boniol:2020:SGB,
author = "Paul Boniol and Themis Palpanas",
title = "{Series2Graph}: graph-based subsequence anomaly
detection for time series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1821--1834",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407792",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407792",
abstract = "Subsequence anomaly detection in long sequences is an
important problem with applications in a wide range of
domains. However, the approaches that have been
proposed so far in the literature have severe
limitations: they either require prior domain
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2020:SCS,
author = "Dan Zhang and Madelon Hulsebos and Yoshihiko Suhara
and {\c{C}}agatay Demiralp and Jinfeng Li and
Wang-Chiew Tan",
title = "{Sato}: contextual semantic type detection in tables",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1835--1848",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407793",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407793",
abstract = "Detecting the semantic types of data columns in
relational tables is important for various data
preparation and information retrieval tasks such as
data cleaning, schema matching, data discovery, and
semantic search. However, existing detection \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2020:TTP,
author = "Qijian He and Wei Yang and Bingren Chen and Yangyang
Geng and Liusheng Huang",
title = "{TransNet}: training privacy-preserving neural network
over transformed layer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1849--1862",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407794",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407794",
abstract = "The accuracy of neural network can be improved by
training over multi-participants' pooled dataset, but
privacy problem of sharing sensitive data obstructs
this collaborative learning. To solve this
contradiction, we propose TransNet, a novel solution
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2020:CAG,
author = "Wenfei Fan and Ruochun Jin and Muyang Liu and Ping Lu
and Chao Tian and Jingren Zhou",
title = "Capturing associations in graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1863--1876",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407795",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407795",
abstract = "This paper proposes a class of graph association
rules, denoted by GARs, to specify regularities between
entities in graphs. A GAR is a combination of a graph
pattern and a dependency; it may take as predicates ML
(machine learning) classifiers for link \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Renz-Wieland:2020:DPA,
author = "Alexander Renz-Wieland and Rainer Gemulla and Steffen
Zeuch and Volker Markl",
title = "Dynamic parameter allocation in parameter servers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1877--1890",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407796",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407796",
abstract = "To keep up with increasing dataset sizes and model
complexity, distributed training has become a necessity
for large machine learning tasks. Parameter servers
ease the implementation of distributed parameter
management---a key concern in distributed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Freitag:2020:AWC,
author = "Michael Freitag and Maximilian Bandle and Tobias
Schmidt and Alfons Kemper and Thomas Neumann",
title = "Adopting worst-case optimal joins in relational
database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1891--1904",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407797",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407797",
abstract = "Worst-case optimal join algorithms are attractive from
a theoretical point of view, as they offer
asymptotically better runtime than binary joins on
certain types of queries. In particular, they avoid
enumerating large intermediate results by processing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{McKenna:2020:WAM,
author = "Ryan McKenna and Raj Kumar Maity and Arya Mazumdar and
Gerome Miklau",
title = "A workload-adaptive mechanism for linear queries under
local differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1905--1918",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407798",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407798",
abstract = "We propose a new mechanism to accurately answer a
user-provided set of linear counting queries under
local differential privacy (LDP). Given a set of linear
counting queries (the workload) our mechanism
automatically adapts to provide accuracy on the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:SSP,
author = "Yisu Remy Wang and Shana Hutchison and Jonathan Leang
and Bill Howe and Dan Suciu",
title = "{SPORES}: sum-product optimization via relational
equality saturation for large scale linear algebra",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1919--1932",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407799",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407799",
abstract = "Machine learning algorithms are commonly specified in
linear algebra (LA). LA expressions can be rewritten
into more efficient forms, by taking advantage of input
properties such as sparsity, as well as program
properties such as common subexpressions. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fernandez:2020:DMP,
author = "Raul Castro Fernandez and Pranav Subramaniam and
Michael J. Franklin",
title = "Data market platforms: trading data assets to solve
data problems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1933--1947",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407800",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407800",
abstract = "Data only generates value for a few organizations with
expertise and resources to make data shareable,
discoverable, and easy to integrate. Sharing data that
is easy to discover and integrate is hard because data
owners lack information (who needs what \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mahdavi:2020:BEE,
author = "Mohammad Mahdavi and Ziawasch Abedjan",
title = "{Baran}: effective error correction via a unified
context representation and transfer learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1948--1961",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407801",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407801",
abstract = "Traditional error correction solutions leverage
handmaid rules or master data to find the correct
values. Both are often amiss in real-world scenarios.
Therefore, it is desirable to additionally learn
corrections from a limited number of example \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2020:RDS,
author = "Ju Fan and Junyou Chen and Tongyu Liu and Yuwei Shen
and Guoliang Li and Xiaoyong Du",
title = "Relational data synthesis using generative adversarial
networks: a design space exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1962--1975",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407802",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407802",
abstract = "The proliferation of big data has brought an urgent
demand for privacy-preserving data publishing.
Traditional solutions to this demand have limitations
on effectively balancing the tradeoff between privacy
and utility of the released data. Thus, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2020:LLP,
author = "Lei Yang and Hong Wu and Tieying Zhang and Xuntao
Cheng and Feifei Li and Lei Zou and Yujie Wang and
Rongyao Chen and Jianying Wang and Gui Huang",
title = "{Leaper}: a learned prefetcher for cache invalidation
in {LSM}-tree based storage engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1976--1989",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407803",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407803",
abstract = "Frequency-based cache replacement policies that work
well on page-based database storage engines are no
longer sufficient for the emerging LSM-tree (
Log-Structure Merge-tree ) based storage engines. Due
to the append-only and copy-on-write techniques
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kang:2020:ASG,
author = "Daniel Kang and Edward Gan and Peter Bailis and
Tatsunori Hashimoto and Matei Zaharia",
title = "Approximate selection with guarantees using proxies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "1990--2003",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407804",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407804",
abstract = "Due to the falling costs of data acquisition and
storage, researchers and industry analysts often want
to find all instances of rare events in large datasets.
For instance, scientists can cheaply capture thousands
of hours of video, but are limited by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kang:2020:EIC,
author = "Minji Kang and Soyee Choi and Gihwan Oh and Sang-Won
Lee",
title = "{2R}: efficiently isolating cold pages in flash
storages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2004--2017",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407805",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407805",
abstract = "Given skewed writes common in databases, the
conventional 1R-Greedy FTL incurs huge write
amplification, most of which is contributed by cold
pages amounting to 80\% of data. Since 1R-Greedy
manages all flash blocks in one region at no type
distinction, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bashardoost:2020:KT,
author = "Bahar Ghadiri Bashardoost and Ren{\'e}e J. Miller and
Kelly Lyons and Fatemeh Nargesian",
title = "Knowledge translation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2018--2032",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407806",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407806",
abstract = "We introduce Kensho, a tool for generating mapping
rules between two Knowledge Bases (KBs). To create the
mapping rules, Kensho starts with a set of
correspondences and enriches them with additional
semantic information automatically identified from the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Petersohn:2020:TSD,
author = "Devin Petersohn and Stephen Macke and Doris Xin and
William Ma and Doris Lee and Xiangxi Mo and Joseph E.
Gonzalez and Joseph M. Hellerstein and Anthony D.
Joseph and Aditya Parameswaran",
title = "Towards scalable dataframe systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2033--2046",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407807",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407807",
abstract = "Dataframes are a popular abstraction to represent,
prepare, and analyze data. Despite the remarkable
success of dataframe libraries in R and Python,
dataframes face performance issues even on moderately
large datasets. Moreover, there is significant
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2020:AFP,
author = "Yi Lu and Xiangyao Yu and Lei Cao and Samuel Madden",
title = "{Aria}: a fast and practical deterministic {OLTP}
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2047--2060",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407808",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407808",
abstract = "Deterministic databases are able to efficiently run
transactions across different replicas without
coordination. However, existing state-of-the-art
deterministic databases require that transaction
read/write sets are known before execution, making such
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Miao:2020:COS,
author = "Dongjing Miao and Zhipeng Cai and Jianzhong Li and
Xiangyu Gao and Xianmin Liu",
title = "The computation of optimal subset repairs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2061--2074",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407809",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407809",
abstract = "Computing an optimal subset repair of an inconsistent
database is becoming a standalone research problem and
has a wide range of applications. However, it has not
been well-studied yet. A tight inapproximability bound
of the problem computing optimal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Christodoulakis:2020:PPB,
author = "Christina Christodoulakis and Eric B. Munson and Moshe
Gabel and Angela Demke Brown and Ren{\'e}e J. Miller",
title = "{Pytheas}: pattern-based table discovery in {CSV}
files",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2075--2089",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407810",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407810",
abstract = "CSV is a popular Open Data format widely used in a
variety of domains for its simplicity and effectiveness
in storing and disseminating data. Unfortunately, data
published in this format often does not conform to
strict specifications, making automated \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2020:PPV,
author = "Yuncheng Wu and Shaofeng Cai and Xiaokui Xiao and Gang
Chen and Beng Chin Ooi",
title = "Privacy preserving vertical federated learning for
tree-based models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2090--2103",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407811",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407811",
abstract = "Federated learning (FL) is an emerging paradigm that
enables multiple organizations to jointly train a model
without revealing their private data to each other.
This paper studies vertical federated learning, which
tackles the scenarios where (i) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Al-Baghdadi:2020:TBC,
author = "Ahmed Al-Baghdadi and Xiang Lian",
title = "Topic-based community search over spatial-social
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2104--2117",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407812",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407812",
abstract = "Recently, the community search problem has attracted
significant attention, due to its wide spectrum of
real-world applications such as event organization,
friend recommendation, advertisement in e-commence, and
so on. Given a query vertex, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fritz:2020:LME,
author = "Manuel Fritz and Michael Behringer and Holger
Schwarz",
title = "{LOG-Means}: efficiently estimating the number of
clusters in large datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2118--2131",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407813",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407813",
abstract = "Clustering is a fundamental primitive in manifold
applications. In order to achieve valuable results,
parameters of the clustering algorithm, e.g., the
number of clusters, have to be set appropriately, which
is a tremendous pitfall. To this end, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Krastnikov:2020:EOD,
author = "Simeon Krastnikov and Florian Kerschbaum and Douglas
Stebila",
title = "Efficient oblivious database joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2132--2145",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407814",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407814",
abstract = "A major algorithmic challenge in designing
applications intended for secure remote execution is
ensuring that they are oblivious to their inputs, in
the sense that their memory access patterns do not leak
sensitive information to the server. This \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Issa:2020:ETQ,
author = "Ousmane Issa and Angela Bonifati and Farouk Toumani",
title = "Evaluating top-$k$ queries with inconsistency
degrees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2146--2158",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407815",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407815",
abstract = "We study the problem of augmenting relational tuples
with inconsistency awareness and tackling top-k queries
under a set of denial constraints (DCs). We define a
notion of inconsistent tuples with respect to a set of
DCs and define two measures of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nakandala:2020:CDS,
author = "Supun Nakandala and Yuhao Zhang and Arun Kumar",
title = "{Cerebro}: a data system for optimized deep learning
model selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2159--2173",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407816",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See errata \cite{Nakandala:2021:ECD}.",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407816",
abstract = "Deep neural networks (deep nets) are revolutionizing
many machine learning (ML) applications. But there is a
major bottleneck to wider adoption: the pain and
resource intensiveness of model selection. This
empirical process involves exploring deep net
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gan:2020:COP,
author = "Edward Gan and Peter Bailis and Moses Charikar",
title = "{CoopStore}: optimizing precomputed summaries for
aggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2174--2187",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407817",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407817",
abstract = "An emerging class of data systems partition their data
and precompute approximate summaries (i.e., sketches
and samples) for each segment to reduce query costs.
They can then aggregate and combine the segment
summaries to estimate results without \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koide:2020:FSS,
author = "Satoshi Koide and Chuan Xiao and Yoshiharu Ishikawa",
title = "Fast subtrajectory similarity search in road networks
under weighted edit distance constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2188--2201",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407818",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407818",
abstract = "In this paper, we address a similarity search problem
for spatial trajectories in road networks. In
particular, we focus on the subtrajectory similarity
search problem, which involves finding in a database
the subtrajectories similar to a query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2020:SAG,
author = "Yu Liu and Lei Zou and Qian Ge and Zhewei Wei",
title = "{SimTab}: accuracy-guaranteed {SimRank} queries
through tighter confidence bounds and multi-armed
bandits",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2202--2214",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407819",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407819",
abstract = "SimRank is a classic measure of vertex-pair similarity
according to the structure of graphs. Top-$k$ and
thresholding SimRank queries are two important types of
similarity search with numerous applications in web
mining, social network analysis, spam \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dutt:2020:EAS,
author = "Anshuman Dutt and Chi Wang and Vivek Narasayya and
Surajit Chaudhuri",
title = "Efficiently approximating selectivity functions using
low overhead regression models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2215--2228",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407820",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407820",
abstract = "Today's query optimizers use fast selectivity
estimation techniques but are known to be susceptible
to large estimation errors. Recent work on supervised
learned models for selectivity estimation significantly
improves accuracy while ensuring relatively \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2020:IID,
author = "Yin Lin and Yifan Guan and Abolfazl Asudeh and H. V.
Jagadish",
title = "Identifying insufficient data coverage in databases
with multiple relations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2229--2242",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407821",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407821",
abstract = "In today's data-driven world, it is critical that we
use appropriate datasets for analysis and
decision-making. Datasets could be biased because they
reflect existing inequalities in the world, due to the
data scientists' biased world view, or due to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:CMA,
author = "Lingxiao Li and Muhammad Aamir Cheema and Mohammed
Eunus Ali and Hua Lu and David Taniar",
title = "Continuously monitoring alternative shortest paths on
road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2243--2255",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407822",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407822",
abstract = "Modern navigation systems do not only provide shortest
paths but also some alternative paths to provide more
options to the users. This paper is the first to study
the problem of continuously reporting alternative paths
for a user traveling along a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2020:HMC,
author = "Geon Lee and Jihoon Ko and Kijung Shin",
title = "Hypergraph motifs: concepts, algorithms, and
discoveries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2256--2269",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407823",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407823",
abstract = "Hypergraphs naturally represent group interactions,
which are omnipresent in many domains: collaborations
of researchers, co-purchases of items, joint
interactions of proteins, to name a few. In this work,
we propose tools for answering the following \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Birnick:2020:HSE,
author = "Johann Birnick and Thomas Bl{\"a}sius and Tobias
Friedrich and Felix Naumann and Thorsten Papenbrock and
Martin Schirneck",
title = "Hitting set enumeration with partial information for
unique column combination discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2270--2283",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407824",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407824",
abstract = "Unique column combinations (UCCs) are a fundamental
concept in relational databases. They identify entities
in the data and support various data management
activities. Still, UCCs are usually not explicitly
defined and need to be discovered. State-of-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2020:SDS,
author = "Yue Chen and Zhida Chen and Gao Cong and Ahmed R.
Mahmood and Walid G. Aref",
title = "{SSTD}: a distributed system on streaming
spatio-textual data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2284--2296",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407825",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407825",
abstract = "Streaming spatio-textual data that contains
geolocations and textual contents, e.g., geo-tagged
tweets, is becoming increasingly available. Users can
register continuous queries to receive up-to-date
results continuously, or pose snapshot queries to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mohammed:2020:CPI,
author = "Haneen Mohammed and Ziyun Wei and Eugene Wu and Ravi
Netravali",
title = "Continuous prefetch for interactive data
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2297--2311",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407826",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407826",
abstract = "Interactive data visualization and exploration (DVE)
applications are often network-bottlenecked due to
bursty request patterns, large response sizes, and
heterogeneous deployments over a range of networks and
devices. This makes it difficult to ensure \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:EES,
author = "Zheng Wang and Cheng Long and Gao Cong and Yiding
Liu",
title = "Efficient and effective similar subtrajectory search
with deep reinforcement learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2312--2325",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407827",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407827",
abstract = "Similar trajectory search is a fundamental problem and
has been well studied over the past two decades.
However, the similar subtrajectory search (SimSub)
problem, aiming to return a portion of a trajectory
(i.e., a subtrajectory), which is the most \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2020:BSE,
author = "Zequn Sun and Qingheng Zhang and Wei Hu and Chengming
Wang and Muhao Chen and Farahnaz Akrami and Chengkai
Li",
title = "A benchmarking study of embedding-based entity
alignment for knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2326--2340",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407828",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407828",
abstract = "Entity alignment seeks to find entities in different
knowledge graphs (KGs) that refer to the same
real-world object. Recent advancement in KG embedding
impels the advent of embedding-based entity alignment,
which encodes entities in a continuous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qi:2020:ELS,
author = "Jianzhong Qi and Guanli Liu and Christian S. Jensen
and Lars Kulik",
title = "Effectively learning spatial indices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2341--2354",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407829",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407829",
abstract = "Machine learning, especially deep learning, is used
increasingly to enable better solutions for data
management tasks previously solved by other means,
including database indexing. A recent study shows that
a neural network can not only learn to predict
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2020:SLB,
author = "Qiyu Liu and Libin Zheng and Yanyan Shen and Lei
Chen",
title = "Stable learned bloom filters for data streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2355--2367",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407830",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407830",
abstract = "Bloom filter and its variants are elegant
space-efficient probabilistic data structures for
approximate set membership queries. It has been
recently shown that the space cost of Bloom filters can
be significantly reduced via a combination with pre-trained
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jin:2020:ATL,
author = "Zhongjun Jin and Yeye He and Surajit Chaudhuri",
title = "Auto-transform: learning-to-transform by patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2368--2381",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407831",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407831",
abstract = "Data Transformation is a long-standing problem in data
management. Recent work adopts a
``transform-by-example'' (TBE) paradigm to infer
transformation programs based on user-provided
input/output examples, which greatly improves
usability, and brought \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kossmann:2020:MMM,
author = "Jan Kossmann and Stefan Halfpap and Marcel Jankrift
and Rainer Schlosser",
title = "Magic mirror in my hand, which is the best in the
land?: an experimental evaluation of index selection
algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2382--2395",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407832",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407832",
abstract = "Indexes are essential for the efficient processing of
database workloads. Proposed solutions for the relevant
and challenging index selection problem range from
metadata-based simple heuristics, over sophisticated
multi-step algorithms, to approaches \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Damme:2020:MAQ,
author = "Patrick Damme and Annett Ungeth{\"u}m and Johannes
Pietrzyk and Alexander Krause and Dirk Habich and
Wolfgang Lehner",
title = "{MorphStore}: analytical query engine with a holistic
compression-enabled processing model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2396--2410",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407833",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407833",
abstract = "In this paper, we present MorphStore, an open-source
in-memory columnar analytical query engine with a novel
holistic compression-enabled processing model.
Basically, compression using lightweight integer
compression algorithms already plays an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Parchas:2020:FED,
author = "Panos Parchas and Yonatan Naamad and Peter {Van
Bouwel} and Christos Faloutsos and Michalis
Petropoulos",
title = "Fast and effective distribution-key recommendation for
{Amazon Redshift}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2411--2423",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407834",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407834",
abstract = "How should we split data among the nodes of a
distributed data warehouse in order to boost
performance for a forecasted workload? In this paper,
we study the effect of different data partitioning
schemes on the overall network cost of pairwise joins.
We \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pappachan:2020:SMA,
author = "Primal Pappachan and Roberto Yus and Sharad Mehrotra
and Johann-Christoph Freytag",
title = "{Sieve}: a middleware approach to scalable access
control for database management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2424--2437",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407835",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407835",
abstract = "Current approaches for enforcing Fine Grained Access
Control (FGAC) in DBMS do not scale to scenarios when
the number of access control policies are in the order
of thousands. This paper identifies such a use case in
the context of emerging smart spaces \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sreekanti:2020:CSF,
author = "Vikram Sreekanti and Chenggang Wu and Xiayue Charles
Lin and Johann Schleier-Smith and Joseph E. Gonzalez
and Joseph M. Hellerstein and Alexey Tumanov",
title = "{Cloudburst}: stateful functions-as-a-service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2438--2452",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407836",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407836",
abstract = "Function-as-a-Service (FaaS) platforms and
``serverless'' cloud computing are becoming
increasingly popular due to ease-of-use and operational
simplicity. Current FaaS offerings are targeted at
stateless functions that do minimal I/O and
communication. We \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Suprem:2020:OAD,
author = "Abhijit Suprem and Joy Arulraj and Calton Pu and
Jo{\~a}o Ferreira",
title = "{ODIN}: automated drift detection and recovery in
video analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2453--2465",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407837",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407837",
abstract = "Recent advances in computer vision have led to a
resurgence of interest in visual data analytics.
Researchers are developing systems for effectively and
efficiently analyzing visual data at scale. A
significant challenge that these systems encounter
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Piao:2020:MRA,
author = "Chengzhi Piao and Weiguo Zheng and Yu Rong and Hong
Cheng",
title = "Maximizing the reduction ability for near-maximum
independent set computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2466--2478",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407838",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407838",
abstract = "Finding the maximum independent set is a fundamental
NP-hard problem in graph theory. Recent studies have
paid much attention to designing efficient algorithms
that find a maximal independent set of good quality
(the more vertices the better). \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2020:FTA,
author = "Zhao Chen and Peng Cheng and Lei Chen and Xuemin Lin
and Cyrus Shahabi",
title = "Fair task assignment in spatial crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2479--2492",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407839",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407839",
abstract = "With the pervasiveness of mobile devices, wireless
broadband and sharing economy, spatial crowdsourcing is
becoming part of our daily life. Existing studies on
spatial crowdsourcing usually focus on enhancing the
platform interests and customer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2020:DSC,
author = "Hao Zhang and Jeffrey Xu Yu and Yikai Zhang and
Kangfei Zhao and Hong Cheng",
title = "Distributed subgraph counting: a general approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2493--2507",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407840",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407840",
abstract = "In this paper, we study local subgraph counting, which
is to count the occurrences of a user-given pattern
graph p around every node v in a data graph G, when v
matches to a given orbit o in p, where the orbit serves
as a center to count p. In general, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Karagiannis:2020:SMI,
author = "Georgios Karagiannis and Mohammed Saeed and Paolo
Papotti and Immanuel Trummer",
title = "{Scrutinizer}: a mixed-initiative approach to
large-scale, data-driven claim verification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2508--2521",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407841",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407841",
abstract = "Organizations spend significant amounts of time and
money to manually fact check text documents summarizing
data. The goal of the Scrutinizer system is to reduce
verification overheads by supporting human fact
checkers in translating text claims into \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Krivosheev:2020:DPC,
author = "Evgeny Krivosheev and Siarhei Bykau and Fabio Casati
and Sunil Prabhakar",
title = "Detecting and preventing confused labels in
crowdsourced data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2522--2535",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407842",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407842",
abstract = "Crowdsourcing is a challenging activity for many
reasons, from task design to workers' training,
identification of low-quality annotators, and many
more. A particularly subtle form of error is due to
confusion of observations, that is, crowd workers
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:OHC,
author = "Rong-Hua Li and Sen Gao and Lu Qin and Guoren Wang and
Weihua Yang and Jeffrey Xu Yu",
title = "Ordering heuristics for $k$-clique listing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2536--2548",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407843",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407843",
abstract = "Listing all $k$-cliques in a graph is a fundamental
graph mining problem that finds many important
applications in community detection and social network
analysis. Unfortunately, the problem of $k$-clique
listing is often deemed infeasible for a large $k$, as
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:DSM,
author = "Jinfeng Li and Yuliang Li and Xiaolan Wang and
Wang-Chiew Tan",
title = "Deep or simple models for semantic tagging?: it
depends on your data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2549--2562",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407844",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407844",
abstract = "Semantic tagging, which has extensive applications in
text mining, predicts whether a given piece of text
conveys the meaning of a given semantic tag. The
problem of semantic tagging is largely solved with
supervised learning and today, deep learning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bilal:2020:DBC,
author = "Muhammad Bilal and Marco Serafini and Marco Canini and
Rodrigo Rodrigues",
title = "Do the best cloud configurations grow on trees?: an
experimental evaluation of black box algorithms for
optimizing cloud workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2563--2575",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407845",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407845",
abstract = "Cloud configuration optimization is the procedure to
determine the number and the type of instances to use
when deploying an application in cloud environments,
given a cost or performance objective. In the absence
of a performance model for the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2020:FLD,
author = "Alexander Zhou and Yue Wang and Lei Chen",
title = "Finding large diverse communities on networks: the
edge maximum $ k*$-partite clique",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2576--2589",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407846",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407846",
abstract = "In this work we examine the problem of finding large,
diverse communities on graphs where the users are
separated into distinct groups. More specifically, this
work considers diversity to be the inclusion of users
from multiple groups as opposed to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{vanderLinde:2020:PCS,
author = "Albert van der Linde and Jo{\~a}o Leit{\~a}o and Nuno
Pregui{\c{c}}a",
title = "Practical client-side replication: weak consistency
semantics for insecure settings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2590--2605",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407847",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407847",
abstract = "Client-side replication and direct client-to-client
synchronization can be used to create highly available,
low-latency interactive applications. Causal
consistency, the strongest available consistency model
under network partitions, is an attractive \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rong:2020:APS,
author = "Kexin Rong and Yao Lu and Peter Bailis and Srikanth
Kandula and Philip Levis",
title = "Approximate partition selection for big-data workloads
using summary statistics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2606--2619",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407848",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407848",
abstract = "Many big-data clusters store data in large partitions
that support access at a coarse, partition-level
granularity. As a result, approximate query processing
via row-level sampling is inefficient, often requiring
reads of many partitions. In this work, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Winter:2020:MMH,
author = "Christian Winter and Tobias Schmidt and Thomas Neumann
and Alfons Kemper",
title = "Meet me halfway: split maintenance of continuous
views",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2620--2633",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407849",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407849",
abstract = "From Industry 4.0-driven factories to real-time
trading algorithms, businesses depend on analytics on
high-velocity real-time data. Often these analytics are
performed not in dedicated stream processing engines
but on views within a general-purpose \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2020:UPB,
author = "Youmin Chen and Youyou Lu and Kedong Fang and Qing
Wang and Jiwu Shu",
title = "{uTree}: a persistent {B+-tree} with low tail
latency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2634--2648",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407850",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407850",
abstract = "Tail latency is a critical design issue in recent
storage systems. B$^+$-tree, as a fundamental building
block in storage systems, incurs high tail latency,
especially when placed in persistent memory (PM). Our
empirical study specifies two factors that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boncz:2020:FFR,
author = "Peter Boncz and Thomas Neumann and Viktor Leis",
title = "{FSST}: fast random access string compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2649--2661",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407851",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407851",
abstract = "Strings are prevalent in real-world data sets. They
often occupy a large fraction of the data and are slow
to process. In this work, we present Fast Static Symbol
Table (FSST), a lightweight compression scheme for
strings. On text data, FSST offers \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vogel:2020:MBC,
author = "Lukas Vogel and Viktor Leis and Alexander van Renen
and Thomas Neumann and Satoshi Imamura and Alfons
Kemper",
title = "{Mosaic}: a budget-conscious storage engine for
relational database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2662--2675",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407852",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407852",
abstract = "Relational database systems are purpose-built for a
specific storage device class (e.g., HDD, SSD, or
DRAM). They do not cope well with the multitude of
storage devices that are competitive at their price
`sweet spots'. To make use of different storage
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Blanusa:2020:MCE,
author = "Jovan Blanusa and Radu Stoica and Paolo Ienne and
Kubilay Atasu",
title = "Manycore clique enumeration with fast set
intersections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2676--2690",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407853",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407853",
abstract = "Listing all maximal cliques of a given graph has
important applications in the analysis of social and
biological networks. Parallelisation of maximal clique
enumeration (MCE) algorithms on modern manycore
processors is challenging due to the task-level
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bater:2020:SPP,
author = "Johes Bater and Yongjoo Park and Xi He and Xiao Wang
and Jennie Rogers",
title = "{SAQE}: practical privacy-preserving approximate query
processing for data federations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2691--2705",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407854",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407854",
abstract = "A private data federation enables clients to query the
union of data from multiple data providers without
revealing any extra private information to the client
or any other data providers. Unfortunately, this strong
end-to-end privacy guarantee requires \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kuhlman:2020:RAA,
author = "Caitlin Kuhlman and Elke Rundensteiner",
title = "Rank aggregation algorithms for fair consensus",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2706--2719",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407855",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407855",
abstract = "Aggregating multiple rankings in a database is an
important task well studied by the database community.
High-stakes application domains include hiring,
lending, and education where multiple decision makers
rank candidates and their input is then \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Glasbergen:2020:SUA,
author = "Brad Glasbergen and Michael Abebe and Khuzaima Daudjee
and Amit Levi",
title = "{Sentinel}: universal analysis and insight for data
systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2720--2733",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407856",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407856",
abstract = "Systems continue to grow in complexity in response to
the need to support vast quantities of data and a wide
variety of workloads. Small changes in workloads and
system configuration can result in significantly
different system behaviour and performance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2020:ODC,
author = "Jingzhi Fang and Yanyan Shen and Yue Wang and Lei
Chen",
title = "Optimizing {DNN} computation graph using graph
substitutions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2734--2746",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407857",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407857",
abstract = "Deep learning has achieved great success in various
real-world applications. As deep neural networks (DNNs)
are getting larger, the inference and training cost of
DNNs increases significantly. Since one round of
inference or one iteration in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sen:2020:ANL,
author = "Jaydeep Sen and Chuan Lei and Abdul Quamar and Fatma
{\"O}zcan and Vasilis Efthymiou and Ayushi Dalmia and
Greg Stager and Ashish Mittal and Diptikalyan Saha and
Karthik Sankaranarayanan",
title = "{ATHENA++}: natural language querying for complex
nested {SQL} queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2747--2759",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407858",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407858",
abstract = "Natural Language Interfaces to Databases (NLIDB)
systems eliminate the requirement for an end user to
use complex query languages like SQL, by translating
the input natural language (NL) queries to SQL
automatically. Although a significant volume of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2020:CAD,
author = "Min Xu and Bolin Ding and Tianhao Wang and Jingren
Zhou",
title = "Collecting and analyzing data jointly from multiple
services under local differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2760--2772",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407859",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407859",
abstract = "Users' sensitive data can be collected and analyzed
under local differential privacy (LDP) without the need
to trust the data collector. Most previous work on LDP
can be applied when each user's data is generated and
collected from a single service or \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gan:2020:IDA,
author = "Yifan Gan and Xueyuan Ren and Drew Ripberger and
Spyros Blanas and Yang Wang",
title = "{IsoDiff}: debugging anomalies caused by weak
isolation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2773--2786",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407860",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407860",
abstract = "Weak isolation levels, such as Read Committed and
Snapshot Isolation, are widely used by databases for
their higher concurrency, but may introduce subtle
correctness errors in applications that only experts
can identify. This paper proposes IsoDiff, a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Barsky:2020:SRN,
author = "Marina Barsky and Jonathan Gabor and Mariano P.
Consens and Alex Thomo",
title = "Suffix rank: a new scalable algorithm for indexing
large string collections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "12",
pages = "2787--2800",
month = aug,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3407790.3407861",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:33:57 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3407790.3407861",
abstract = "We investigate the problem of building a suffix array
substring index for inputs significantly larger than
main memory. This problem is especially important in
the context of biological sequence analysis, where
biological polymers can be thought of as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeng:2020:SBI,
author = "Yuxiang Zeng and Yongxin Tong and Yuguang Song and Lei
Chen",
title = "The simpler the better: an indexing approach for
shared-route planning queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3517--3530",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424574",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424574",
abstract = "Ridesharing services have gained global popularity as
a convenient, economic, and sustainable transportation
mode in recent years. One fundamental challenge in
these services is planning the shared-routes (i.e.,
sequences of origins and destinations) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tanabe:2020:ACC,
author = "Takayuki Tanabe and Takashi Hoshino and Hideyuki
Kawashima and Osamu Tatebe",
title = "An analysis of concurrency control protocols for
in-memory databases with {CCBench}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3531--3544",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424575",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424575",
abstract = "This paper presents yet another concurrency control
analysis platform, CCBench. CCBench supports seven
protocols (Silo, TicToc, MOCC, Cicada, SI, SI with
latch-free SSN, 2PL) and seven versatile optimization
methods and enables the configuration of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:IUS,
author = "Tianhao Wang and Bolin Ding and Min Xu and Zhicong
Huang and Cheng Hong and Jingren Zhou and Ninghui Li
and Somesh Jha",
title = "Improving utility and security of the shuffler-based
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3545--3558",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424576",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424576",
abstract = "When collecting information, local differential
privacy (LDP) alleviates privacy concerns of users
because their private information is randomized before
being sent it to the central aggregator. LDP imposes
large amount of noise as each user executes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kipf:2020:CIL,
author = "Andreas Kipf and Damian Chromejko and Alexander Hall
and Peter Boncz and David G. Andersen",
title = "Cuckoo index: a lightweight secondary index
structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3559--3572",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424577",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424577",
abstract = "In modern data warehousing, data skipping is essential
for high query performance. While index structures such
as B-trees or hash tables allow for precise pruning,
their large storage requirements make them impractical
for indexing secondary columns. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Abebe:2020:MAP,
author = "Michael Abebe and Brad Glasbergen and Khuzaima
Daudjee",
title = "{MorphoSys}: automatic physical design metamorphosis
for distributed database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3573--3587",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424578",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424578",
abstract = "Distributed database systems are widely used to meet
the demands of storing and managing computation-heavy
workloads. To boost performance and minimize resource
and data contention, these systems require selecting a
distributed physical design that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Behnezhad:2020:PGA,
author = "Soheil Behnezhad and Laxman Dhulipala and Hossein
Esfandiari and Jakub Lacki and Vahab Mirrokni and
Warren Schudy",
title = "Parallel graph algorithms in constant adaptive rounds:
theory meets practice",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3588--3602",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424579",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424579",
abstract = "We study fundamental graph problems such as graph
connectivity, minimum spanning forest (MSF), and
approximate maximum (weight) matching in a distributed
setting. In particular, we focus on the Adaptive
Massively Parallel Computation (AMPC) model, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:DLP,
author = "Runhui Wang and Dong Deng",
title = "{DeltaPQ}: lossless product quantization code
compression for high dimensional similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "13",
number = "13",
pages = "3603--3616",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3424573.3424580",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3424573.3424580",
abstract = "High dimensional data is ubiquitous and plays an
important role in many applications. However, the size
of high dimensional data is usually excessively large.
To alleviate this problem, in this paper, we develop
novel techniques to compress and search \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Marcus:2020:BLI,
author = "Ryan Marcus and Andreas Kipf and Alexander van Renen
and Mihail Stoian and Sanchit Misra and Alfons Kemper
and Thomas Neumann and Tim Kraska",
title = "Benchmarking learned indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "1--13",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421425",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421425",
abstract = "Recent advancements in learned index structures
propose replacing existing index structures, like
B-Trees, with approximate learned models. In this work,
we present a unified benchmark that compares well-tuned
implementations of three learned index \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:TGC,
author = "Zuozhi Wang and Kai Zeng and Botong Huang and Wei Chen
and Xiaozong Cui and Bo Wang and Ji Liu and Liya Fan
and Dachuan Qu and Zhenyu Hou and Tao Guan and Chen Li
and Jingren Zhou",
title = "{Tempura}: a general cost-based optimizer framework for
incremental data processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "14--27",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421427",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421427",
abstract = "Incremental processing is widely-adopted in many
applications, ranging from incremental view
maintenance, stream computing, to recently emerging
progressive data warehouse and intermittent query
processing. Despite many algorithms developed on this
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Heo:2020:IGD,
author = "Geon Heo and Yuji Roh and Seonghyeon Hwang and Dayun
Lee and Steven Euijong Whang",
title = "{Inspector Gadget}: a data programming-based labeling
system for industrial images",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "28--36",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421429",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421429",
abstract = "As machine learning for images becomes democratized in
the Software 2.0 era, one of the serious bottlenecks is
securing enough labeled data for training. This problem
is especially critical in a manufacturing setting where
smart factories rely on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2020:SAN,
author = "Renchi Yang and Jieming Shi and Xiaokui Xiao and Yin
Yang and Juncheng Liu and Sourav S. Bhowmick",
title = "Scaling attributed network embedding to massive
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "37--49",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421430",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421430",
abstract = "Given a graph G where each node is associated with a
set of attributes, attributed network embedding (ANE)
maps each node $ v \in G $ to a compact vector $ X_v $,
which can be used in downstream machine learning tasks.
Ideally, $ X_v$ should capture node $v$'s affinity.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:DEM,
author = "Yuliang Li and Jinfeng Li and Yoshihiko Suhara and
AnHai Doan and Wang-Chiew Tan",
title = "Deep entity matching with pre-trained language
models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "50--60",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421431",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421431",
abstract = "We present Ditto, a novel entity matching system based
on pre-trained Transformer-based language models. We
fine-tune and cast EM as a sequence-pair classification
problem to leverage such models with a simple
architecture. Our experiments show that a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2020:NOC,
author = "Zongheng Yang and Amog Kamsetty and Sifei Luan and
Eric Liang and Yan Duan and Xi Chen and Ion Stoica",
title = "{NeuroCard}: one cardinality estimator for all
tables",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "1",
pages = "61--73",
month = sep,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3421424.3421432",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:02 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3421424.3421432",
abstract = "Query optimizers rely on accurate cardinality
estimates to produce good execution plans. Despite
decades of research, existing cardinality estimators
are inaccurate for complex queries, due to making lossy
modeling assumptions and not capturing inter-.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ding:2020:TLM,
author = "Jialin Ding and Vikram Nathan and Mohammad Alizadeh
and Tim Kraska",
title = "{Tsunami}: a learned multi-dimensional index for
correlated data and skewed workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "74--86",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425880",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425880",
abstract = "Filtering data based on predicates is one of the most
fundamental operations for any modern data warehouse.
Techniques to accelerate the execution of filter
expressions include clustered indexes, specialized sort
orders (e.g., Z-order), multi-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kang:2020:JOP,
author = "Daniel Kang and Ankit Mathur and Teja Veeramacheneni
and Peter Bailis and Matei Zaharia",
title = "Jointly optimizing preprocessing and inference for
{DNN}-based visual analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "87--100",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425881",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425881",
abstract = "While deep neural networks (DNNs) are an increasingly
popular way to query large corpora of data, their
significant runtime remains an active area of research.
As a result, researchers have proposed systems and
optimizations to reduce these costs by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Menon:2020:PCQ,
author = "Prashanth Menon and Amadou Ngom and Lin Ma and Todd C.
Mowry and Andrew Pavlo",
title = "Permutable compiled queries: dynamically adapting
compiled queries without recompiling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "101--113",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425882",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425882",
abstract = "Just-in-time (JIT) query compilation is a technique to
improve analytical query performance in database
management systems (DBMSs). But the cost of compiling
each query can be significant relative to its execution
time. This overhead prohibits the DBMS \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Min:2020:EEM,
author = "Seung Won Min and Vikram Sharma Mailthody and Zaid
Qureshi and Jinjun Xiong and Eiman Ebrahimi and Wen-mei
Hwu",
title = "{EMOGI}: efficient memory-access for out-of-memory
graph-traversal in {GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "114--127",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425883",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425883",
abstract = "Modern analytics and recommendation systems are
increasingly based on graph data that capture the
relations between entities being analyzed. Practical
graphs come in huge sizes, offer massive parallelism,
and are stored in sparse-matrix formats such as
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2020:SFA,
author = "Yinda Zhang and Jinyang Li and Yutian Lei and Tong
Yang and Zhetao Li and Gong Zhang and Bin Cui",
title = "On-off sketch: a fast and accurate sketch on
persistence",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "128--140",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425884",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425884",
abstract = "Approximate stream processing has attracted much
attention recently. Prior art mostly focuses on
characteristics like frequency, cardinality, and
quantile. Persistence, as a new characteristic, is
getting increasing attention. Unlike frequency,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tran:2020:RTD,
author = "Luan Tran and Min Y. Mun and Cyrus Shahabi",
title = "Real-time distance-based outlier detection in data
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "141--153",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425885",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425885",
abstract = "Real-time outlier detection in data streams has drawn
much attention recently as many applications need to be
able to detect abnormal behaviors as soon as they
occur. The arrival and departure of streaming data on
edge devices impose new challenges to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Poppe:2020:SIL,
author = "Olga Poppe and Tayo Amuneke and Dalitso Banda and
Aritra De and Ari Green and Manon Knoertzer and Ehi
Nosakhare and Karthik Rajendran and Deepak Shankargouda
and Meina Wang and Alan Au and Carlo Curino and Qun Guo
and Alekh Jindal and Ajay Kalhan and Morgan Oslake and
Sonia Parchani and Vijay Ramani and Raj Sellappan and
Saikat Sen and Sheetal Shrotri and Soundararajan
Srinivasan and Ping Xia and Shize Xu and Alicia Yang
and Yiwen Zhu",
title = "{Seagull}: an infrastructure for load prediction and
optimized resource allocation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "154--162",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425886",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425886",
abstract = "Microsoft Azure is dedicated to guarantee high quality
of service to its customers, in particular, during
periods of high customer activity, while controlling
cost. We employ a Data Science (DS) driven solution to
predict user load and leverage these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:EKM,
author = "Sheng Wang and Yuan Sun and Zhifeng Bao",
title = "On the efficiency of {K-means} clustering: evaluation,
optimization, and algorithm selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "163--175",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425887",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425887",
abstract = "This paper presents a thorough evaluation of the
existing methods that accelerate Lloyd's algorithm for
fast $k$-means clustering. To do so, we analyze the
pruning mechanisms of existing methods, and summarize
their common pipeline into a unified \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2020:RHA,
author = "Shixuan Sun and Xibo Sun and Yulin Che and Qiong Luo
and Bingsheng He",
title = "{RapidMatch}: a holistic approach to subgraph query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "176--188",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425888",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425888",
abstract = "A subgraph query searches for all embeddings in a data
graph that are identical to a query graph. Two kinds of
algorithms, either graph exploration based or join
based, have been developed for processing subgraph
queries. Due to algorithmic and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xia:2020:TLP,
author = "Yu Xia and Xiangyao Yu and Andrew Pavlo and Srinivas
Devadas",
title = "{Taurus}: lightweight parallel logging for in-memory
database management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "189--201",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425889",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425889",
abstract = "Existing single-stream logging schemes are unsuitable
for in-memory database management systems (DBMSs) as
the single log is often a performance bottleneck. To
overcome this problem, we present Taurus, an efficient
parallel logging scheme that uses \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paul:2020:IEE,
author = "Johns Paul and Bingsheng He and Shengliang Lu and
Chiew Tong Lau",
title = "Improving execution efficiency of just-in-time
compilation based query processing on {GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "202--214",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425890",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425890",
abstract = "In recent years, we have witnessed significant efforts
to improve the performance of Online Analytical
Processing (OLAP) on graphics processing units (GPUs).
Most existing studies have focused on improving memory
efficiency since memory stalls can play \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:PTS,
author = "Shuang Wang and Hakan Ferhatosmanoglu",
title = "{PPQ}-trajectory: spatio-temporal quantization for
querying in large trajectory repositories",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "215--227",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425891",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425891",
abstract = "We present PPQ-trajectory, a spatio-temporal
quantization based solution for querying large dynamic
trajectory data. PPQ-trajectory includes a
partition-wise predictive quantizer (PPQ) that
generates an error-bounded codebook with
autocorrelation and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hu:2020:ADP,
author = "Xiao Hu and Shouzhuo Sun and Shweta Patwa and Debmalya
Panigrahi and Sudeepa Roy",
title = "Aggregated deletion propagation for counting
conjunctive query answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "2",
pages = "228--240",
month = oct,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3425879.3425892",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:03 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3425879.3425892",
abstract = "We investigate the computational complexity of
minimizing the source side-effect in order to remove a
given number of tuples from the output of a conjunctive
query. This is a variant of the well-studied deletion
propagation problem, the difference being \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Luo:2020:BMW,
author = "Chen Luo and Michael J. Carey",
title = "Breaking down memory walls: adaptive memory management
in {LSM}-based storage systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "241--254",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3430915.3430916",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3430915.3430916",
abstract = "Log-Structured Merge-trees (LSM-trees) have been
widely used in modern NoSQL systems. Due to their
out-of-place update design, LSM-trees have introduced
memory walls among the memory components of multiple
LSM-trees and between the write memory and the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Karlas:2020:NNC,
author = "Bojan Karlas and Peng Li and Renzhi Wu and Nezihe
Merve G{\"u}rel and Xu Chu and Wentao Wu and Ce Zhang",
title = "Nearest neighbor classifiers over incomplete
information: from certain answers to certain
predictions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "255--267",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442426",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442426",
abstract = "Machine learning (ML) applications have been thriving
recently, largely attributed to the increasing
availability of data. However, inconsistency and
incomplete information are ubiquitous in real-world
datasets, and their impact on ML applications
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kingsbury:2020:EII,
author = "Kyle Kingsbury and Peter Alvaro",
title = "{Elle}: inferring isolation anomalies from
experimental observations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "268--280",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442427",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442427",
abstract = "Users who care about their data store it in databases,
which (at least in principle) guarantee some form of
transactional isolation. However, experience shows that
many databases do not provide the isolation guarantees
they claim. With the recent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kiefer:2020:SGF,
author = "Martin Kiefer and Ilias Poulakis and Sebastian
Bre{\ss} and Volker Markl",
title = "{Scotch}: generating {FPGA}-accelerators for sketching
at line rate",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "281--293",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442428",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442428",
abstract = "Sketching algorithms are a powerful tool for
single-pass data summarization. Their numerous
applications include approximate query processing,
machine learning, and large-scale network monitoring.
In the presence of high-bandwidth interconnects or
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Khayati:2020:OOR,
author = "Mourad Khayati and Ines Arous and Zakhar Tymchenko and
Philippe Cudr{\'e}-Mauroux",
title = "{ORBITS}: online recovery of missing values in
multiple time series streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "294--306",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442429",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442429",
abstract = "With the emergence of the Internet of Things (IoT),
time series streams have become ubiquitous in our daily
life. Recording such data is rarely a perfect process,
as sensor failures frequently occur, yielding
occasional blocks of data that go missing in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2020:TTU,
author = "Xiang Deng and Huan Sun and Alyssa Lees and You Wu and
Cong Yu",
title = "{TURL}: table understanding through representation
learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "307--319",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442430",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442430",
abstract = "Relational tables on the Web store a vast amount of
knowledge. Owing to the wealth of such tables, there
has been tremendous progress on a variety of tasks in
the area of table understanding. However, existing work
generally relies on heavily-engineered \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guo:2020:EUD,
author = "Long Guo and Lifeng Hua and Rongfei Jia and Fei Fang
and Binqiang Zhao and Bin Cui",
title = "{EdgeDIPN}: a unified deep intent prediction network
deployed at the edge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "320--328",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442431",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442431",
abstract = "With the rapid growth of e-commerce in recent years,
e-commerce platforms are becoming a primary place for
people to find, compare and ultimately purchase
products. To improve online shopping experience for
consumers and increase sales for sellers, it \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2020:LCW,
author = "Yiming Lin and Daokun Jiang and Roberto Yus and
Georgios Bouloukakis and Andrew Chio and Sharad
Mehrotra and Nalini Venkatasubramanian",
title = "{LOCATER}: cleaning {WiFi} connectivity datasets for
semantic localization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "329--341",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442432",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442432",
abstract = "This paper explores the data cleaning challenges that
arise in using WiFi connectivity data to locate users
to semantic indoor locations such as buildings,
regions, rooms. WiFi connectivity data consists of
sporadic connections between devices and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2020:MMT,
author = "Hao Liu and Jindong Han and Yanjie Fu and Jingbo Zhou
and Xinjiang Lu and Hui Xiong",
title = "Multi-modal transportation recommendation with unified
route representation learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "342--350",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442433",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442433",
abstract = "Multi-modal transportation recommendation aims to
provide the most appropriate travel route with various
transportation modes according to certain criteria.
After analyzing large-scale navigation data, we find
that route representations exhibit two \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2020:DDF,
author = "Yue Wang and Ruiqi Xu and Zonghao Feng and Yulin Che
and Lei Chen and Qiong Luo and Rui Mao",
title = "{DISK}: a distributed framework for single-source
{SimRank} with accuracy guarantee",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "351--363",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442434",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442434",
abstract = "Measuring similarities among different nodes is
important in graph analysis. SimRank is one of the most
popular similarity measures. Given a graph G ( V, E )
and a source node u, a single-source SimRank query
returns the similarities between u and each \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Didona:2020:TBU,
author = "Diego Didona and Nikolas Ioannou and Radu Stoica and
Kornilios Kourtis",
title = "Toward a better understanding and evaluation of tree
structures on flash {SSDs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "364--377",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442435",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442435",
abstract = "Solid-state drives (SSDs) are extensively used to
deploy persistent data stores, as they provide low
latency random access, high write throughput, high data
density, and low cost. Tree-based data structures are
widely used to build persistent data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2020:AMD,
author = "Jianyu Yang and Tianhao Wang and Ninghui Li and Xiang
Cheng and Sen Su",
title = "Answering multi-dimensional range queries under local
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "378--390",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442436",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442436",
abstract = "In this paper, we tackle the problem of answering
multi-dimensional range queries under local
differential privacy. There are three key technical
challenges: capturing the correlations among
attributes, avoiding the curse of dimensionality, and
dealing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Palyvos-Giannas:2020:ASF,
author = "Dimitris Palyvos-Giannas and Bastian Havers and Marina
Papatriantafilou and Vincenzo Gulisano",
title = "{Ananke}: a streaming framework for live forward
provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "391--403",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442437",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442437",
abstract = "Data streaming enables online monitoring of large and
continuous event streams in Cyber-Physical Systems
(CPSs). In such scenarios, fine-grained backward
provenance tools can connect streaming query results to
the source data producing them, allowing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lakhotia:2020:RRC,
author = "Kartik Lakhotia and Rajgopal Kannan and Viktor
Prasanna and Cesar A. F. {De Rose}",
title = "{RECEIPT}: refine coarse-grained independent tasks for
parallel tip decomposition of bipartite graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "404--417",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442438",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442438",
abstract = "Tip decomposition is a crucial kernel for mining dense
subgraphs in bipartite networks, with applications in
spam detection, analysis of affiliation networks etc.
It creates a hierarchy of vertex-induced subgraphs with
varying densities determined by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deep:2020:CEW,
author = "Shaleen Deep and Anja Gruenheid and Paraschos Koutris
and Jeffrey Naughton and Stratis Viglas",
title = "Comprehensive and efficient workload compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "418--430",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442439",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442439",
abstract = "This work studies the problem of constructing a
representative workload from a given input analytical
query workload where the former serves as an
approximation with guarantees of the latter. We discuss
our work in the context of workload analysis and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2020:CCO,
author = "Yongjun He and Jiacheng Lu and Tianzheng Wang",
title = "{CoroBase}: coroutine-oriented main-memory database
engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "431--444",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442440",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442440",
abstract = "Data stalls are a major overhead in main-memory
database engines due to the use of pointer-rich data
structures. Lightweight coroutines ease the
implementation of software prefetching to hide data
stalls by overlapping computation and asynchronous data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Smith:2020:SQN,
author = "Jaclyn Smith and Michael Benedikt and Milos Nikolic
and Amir Shaikhha",
title = "Scalable querying of nested data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "3",
pages = "445--457",
month = nov,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.5555/3430915.3442441",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 15 05:34:04 MST 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.5555/3430915.3442441",
abstract = "While large-scale distributed data processing
platforms have become an attractive target for query
processing, these systems are problematic for
applications that deal with nested collections.
Programmers are forced either to perform non-trivial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gong:2020:SCE,
author = "Long Gong and Ziheng Liu and Liang Liu and Jun Xu and
Mitsunori Ogihara and Tong Yang",
title = "Space- and computationally-efficient set
reconciliation via parity bitmap sketch {(PBS)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "458--470",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436906",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436906",
abstract = "Set reconciliation is a fundamental algorithmic
problem that arises in many networking, system, and
database applications. In this problem, two large sets
A and B of objects (bitcoins, files, records, etc.) are
stored respectively at two different \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shetiya:2020:AAS,
author = "Suraj Shetiya and Saravanan Thirumuruganathan and Nick
Koudas and Gautam Das",
title = "{Astrid}: accurate selectivity estimation for string
predicates using deep learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "471--484",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436907",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436907",
abstract = "Accurate selectivity estimation for string predicates
is a long-standing research challenge in databases.
Supporting pattern matching on strings (such as prefix,
substring, and suffix) makes this problem much more
challenging, thereby necessitating a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2020:CTR,
author = "Nan Zheng and Zachary G. Ives",
title = "Compact, tamper-resistant archival of fine-grained
provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "485--497",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436909",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436909",
abstract = "Data provenance tools aim to facilitate reproducible
data science and auditable data analyses, by tracking
the processes and inputs responsible for each result of
an analysis. Fine-grained provenance further enables
sophisticated reasoning about why \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Muller:2020:RDI,
author = "Ingo M{\"u}ller and Ghislain Fourny and Stefan
Irimescu and Can Berker Cikis and Gustavo Alonso",
title = "{Rumble}: data independence for large messy data
sets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "498--506",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436910",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436910",
abstract = "This paper introduces Rumble, a query execution engine
for large, heterogeneous, and nested collections of
JSON objects built on top of Apache Spark. While data
sets of this type are more and more wide-spread, most
existing tools are built around a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chapman:2020:CQF,
author = "Adriane Chapman and Paolo Missier and Giulia Simonelli
and Riccardo Torlone",
title = "Capturing and querying fine-grained provenance of
preprocessing pipelines in data science",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "507--520",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436911",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436911",
abstract = "Data processing pipelines that are designed to clean,
transform and alter data in preparation for learning
predictive models, have an impact on those models'
accuracy and performance, as well on other properties,
such as model fairness. It is therefore \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Farias:2020:LDD,
author = "Victor A. E. Farias and Felipe T. Brito and Cheryl
Flynn and Javam C. Machado and Subhabrata Majumdar and
Divesh Srivastava",
title = "Local dampening: differential privacy for non-numeric
queries via local sensitivity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "521--533",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436912",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436912",
abstract = "Differential privacy is the state-of-the-art formal
definition for data release under strong privacy
guarantees. A variety of mechanisms have been proposed
in the literature for releasing the noisy output of
numeric queries (e.g., using the Laplace \ldots{}).",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2020:MDS,
author = "Tianyu Li and Matthew Butrovich and Amadou Ngom and
Wan Shen Lim and Wes McKinney and Andrew Pavlo",
title = "Mainlining databases: supporting fast transactional
workloads on universal columnar data file formats",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "534--546",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436913",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436913",
abstract = "The proliferation of modern data processing tools has
given rise to open-source columnar data formats. These
formats help organizations avoid repeated conversion of
data to a new format for each application. However,
these formats are read-only, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2020:AEC,
author = "Shengliang Lu and Bingsheng He and Yuchen Li and Hao
Fu",
title = "Accelerating exact constrained shortest paths on
{GPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "547--559",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436914",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436914",
abstract = "The recently emerging applications such as
software-defined networks and autonomous vehicles
require efficient and exact solutions for constrained
shortest paths (CSP), which finds the shortest path in
a graph while satisfying some user-defined \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mo:2020:TEW,
author = "Songsong Mo and Zhifeng Bao and Ping Zhang and Zhiyong
Peng",
title = "Towards an efficient weighted random walk domination",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "560--572",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436915",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436915",
abstract = "In this paper, we propose and study a new problem
called the weighted random walk domination. Given a
weighted graph G ( V, E ) and a budget B of the
weighted random walk, it aims to find a k -size set S,
which can minimize the total costs of the remaining
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guo:2020:SMM,
author = "Guimu Guo and Da Yan and M. Tamer {\"O}zsu and Zhe
Jiang and Jalal Khalil",
title = "Scalable mining of maximal quasi-cliques: an
algorithm-system codesign approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "573--585",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436916",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436916",
abstract = "Given a user-specified minimum degree threshold $
\gamma $, a $ \gamma $-quasiclique is a subgraph $ g =
(V_g, E_g)$ where each vertex $ \nu \in V_g$ connects
to at least $ \gamma $ fraction of the other vertices
(i.e., $ \lceil \gamma \cdot (|V_g| - 1) \rceil $
vertices) in $g$. Quasi-clique is one of the most
natural \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kokoris-Kogias:2020:CPD,
author = "Eleftherios Kokoris-Kogias and Enis Ceyhun Alp and
Linus Gasser and Philipp Jovanovic and Ewa Syta and
Bryan Ford",
title = "{CALYPSO}: private data management for decentralized
ledgers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "586--599",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436917",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436917",
abstract = "Distributed ledgers provide high availability and
integrity, making them a key enabler for practical and
secure computation of distributed workloads among
mutually distrustful parties. Many practical
applications also require strong confidentiality,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deeds:2020:SFL,
author = "Kyle Deeds and Brian Hentschel and Stratos Idreos",
title = "Stacked filters: learning to filter by structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "600--612",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436919",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436919",
abstract = "We present Stacked Filters, a new probabilistic filter
which is fast and robust similar to query-agnostic
filters (such as Bloom and Cuckoo filters), and at the
same time brings low false positive rates and sizes
similar to classifier-based filters \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Banerjee:2020:MSW,
author = "Prithu Banerjee and Wei Chen and Laks V. S.
Lakshmanan",
title = "Maximizing social welfare in a competitive diffusion
model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "613--625",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436920",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436920",
abstract = "Influence maximization (IM) has garnered a lot of
attention in the literature owing to applications such
as viral marketing and infection containment. It aims
to select a small number of seed users to adopt an item
such that adoption propagates to a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gugnani:2020:UIR,
author = "Shashank Gugnani and Arjun Kashyap and Xiaoyi Lu",
title = "Understanding the idiosyncrasies of real persistent
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "626--639",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436921",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436921",
abstract = "High capacity persistent memory (PMEM) is finally
commercially available in the form of Intel's Optane DC
Persistent Memory Module (DCPMM). Researchers have
raced to evaluate and understand the performance of
DCPMM itself as well as systems and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gale:2020:EMR,
author = "Abraham Gale and Am{\'e}lie Marian",
title = "Explaining monotonic ranking functions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "640--652",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436922",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436922",
abstract = "Ranking functions are commonly used to assist in
decision-making in a wide variety of applications. As
the general public realizes the significant societal
impacts of the widespread use of algorithms in
decision-making, there has been a push towards
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dhulipala:2020:CFS,
author = "Laxman Dhulipala and Changwan Hong and Julian Shun",
title = "{ConnectIt}: a framework for static and incremental
parallel graph connectivity algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "653--667",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436923",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436923",
abstract = "Connected components is a fundamental kernel in graph
applications. The fastest existing multicore algorithms
for solving graph connectivity are based on some form
of edge sampling and/or linking and compressing trees.
However, many combinations of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kouadri:2020:QSA,
author = "Wissam Mammar Kouadri and Mourad Ouziri and Salima
Benbernou and Karima Echihabi and Themis Palpanas and
Iheb {Ben Amor}",
title = "Quality of sentiment analysis tools: the reasons of
inconsistency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "668--681",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436924",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436924",
abstract = "In this paper, we present a comprehensive study that
evaluates six state-of-the-art sentiment analysis tools
on five public datasets, based on the quality of
predictive results in the presence of semantically
equivalent documents, i.e., how consistent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Garcia:2020:HLM,
author = "Rolando Garcia and Eric Liu and Vikram Sreekanti and
Bobby Yan and Anusha Dandamudi and Joseph E. Gonzalez
and Joseph M. Hellerstein and Koushik Sen",
title = "Hindsight logging for model training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "682--693",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436925",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436925",
abstract = "In modern Machine Learning, model training is an
iterative, experimental process that can consume
enormous computation resources and developer time. To
aid in that process, experienced model developers log
and visualize program variables during training
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2020:SSI,
author = "Lin Jiang and Junqiao Qiu and Zhijia Zhao",
title = "Scalable structural index construction for {JSON}
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "694--707",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436926",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436926",
abstract = "JavaScript Object Notation (JSON) and its variants
have gained great popularity in recent years.
Unfortunately, the performance of their analytics is
often dragged down by the expensive JSON parsing. To
address this, recent work has shown that building
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rui:2020:EJA,
author = "Ran Rui and Hao Li and Yi-Cheng Tu",
title = "Efficient join algorithms for large database tables in
a multi-{GPU} environment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "4",
pages = "708--720",
month = dec,
year = "2020",
CODEN = "????",
DOI = "https://doi.org/10.14778/3436905.3436927",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Feb 23 08:32:42 MST 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3436905.3436927",
abstract = "Relational join processing is one of the core
functionalities in database management systems. It has
been demonstrated that GPUs as a general-purpose
parallel computing platform is very promising in
processing relational joins. However, join algorithms
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yan:2021:FAP,
author = "Shuyuan Yan and Bolin Ding and Wei Guo and Jingren
Zhou and Zhewei Wei and Xiaowei Jiang and Sheng Xu",
title = "{FlashP}: an analytical pipeline for real-time
forecasting of time-series relational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "721--729",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446096",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446096",
abstract = "Interactive response time is important in analytical
pipelines for users to explore a sufficient number of
possibilities and make informed business decisions. We
consider a forecasting pipeline with large volumes of
high-dimensional time series data. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Duong:2021:ESS,
author = "Chi Thang Duong and Trung Dung Hoang and Hongzhi Yin
and Matthias Weidlich and Quoc Viet Hung Nguyen and
Karl Aberer",
title = "Efficient streaming subgraph isomorphism with graph
neural networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "730--742",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446097",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446097",
abstract = "Queries to detect isomorphic subgraphs are important
in graph-based data management. While the problem of
subgraph isomorphism search has received considerable
attention for the static setting of a single query, or
a batch thereof, existing approaches \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2021:EBC,
author = "Yi Lu and Xiangyao Yu and Lei Cao and Samuel Madden",
title = "Epoch-based commit and replication in distributed
{OLTP} databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "743--756",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446098",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446098",
abstract = "Many modern data-oriented applications are built on
top of distributed OLTP databases for both scalability
and high availability. Such distributed databases
enforce atomicity, durability, and consistency through
two-phase commit (2PC) and synchronous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2021:HCM,
author = "Zhe Lin and Fan Zhang and Xuemin Lin and Wenjie Zhang
and Zhihong Tian",
title = "Hierarchical core maintenance on large dynamic
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "757--770",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446099",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446099",
abstract = "The model of $k$-core and its decomposition have been
applied in various areas, such as social networks, the
world wide web, and biology. A graph can be decomposed
into an elegant $k$-core hierarchy to facilitate
cohesive subgraph discovery and network \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mohan:2021:AMD,
author = "Jayashree Mohan and Amar Phanishayee and Ashish
Raniwala and Vijay Chidambaram",
title = "Analyzing and mitigating data stalls in {DNN}
training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "771--784",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446100",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446100",
abstract = "Training Deep Neural Networks (DNNs) is
resource-intensive and time-consuming. While prior
research has explored many different ways of reducing
DNN training time, the impact of input data pipeline,
i.e., fetching raw data items from storage and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hu:2021:PMH,
author = "Daokun Hu and Zhiwen Chen and Jianbing Wu and Jianhua
Sun and Hao Chen",
title = "Persistent memory hash indexes: an experimental
evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "785--798",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446101",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446101",
abstract = "Persistent memory (PM) is increasingly being leveraged
to build hash-based indexing structures featuring cheap
persistence, high performance, and instant recovery,
especially with the recent release of Intel Optane DC
Persistent Memory Modules. However, most of them are
evaluated on DRAM-based emulators with unreal
assumptions, or focus on the evaluation of specific
metrics with important properties sidestepped. Thus, it
is essential to understand how well the proposed hash
indexes perform on real PM and how they differentiate
from each other if a wider range of performance metrics
are considered. To this end, this paper provides a
comprehensive evaluation of persistent hash tables. In
particular, we focus on the evaluation of six
state-of-the-art hash tables including Level hashing,
CCEH, Dash, PCLHT, Clevel, and SOFT, with real PM
hardware. Our evaluation was conducted using a unified
benchmarking framework and representative workloads.
Besides characterizing common performance properties,
we also explore how hardware configurations (such as PM
bandwidth, CPU instructions, and NUMA) affect the
performance of PM-based hash tables. With our in-depth
analysis, we identify design trade-offs and good
paradigms in prior arts, and suggest desirable
optimizations and directions for the future development
of PM-based hash tables.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:OMD,
author = "Cheng Chen and Jun Yang and Mian Lu and Taize Wang and
Zhao Zheng and Yuqiang Chen and Wenyuan Dai and
Bingsheng He and Weng-Fai Wong and Guoan Wu and Yuping
Zhao and Andy Rudoff",
title = "Optimizing in-memory database engine for {AI}-powered
on-line decision augmentation using persistent memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "799--812",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446102",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446102",
abstract = "On-line decision augmentation (OLDA) has been
considered as a promising paradigm for real-time
decision making powered by Artificial Intelligence
(AI). OLDA has been widely used in many applications
such as real-time fraud detection, personalized
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Usta:2021:DMT,
author = "Arif Usta and Akifhan Karakayali and {\"O}zg{\"u}r
Ulusoy",
title = "{DBTagger}: multi-task learning for keyword mapping in
{NLIDBs} using bi-directional recurrent neural
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "813--821",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446103",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446103",
abstract = "Translating Natural Language Queries (NLQs) to
Structured Query Language (SQL) in interfaces deployed
in relational databases is a challenging task, which
has been widely studied in database community recently.
Conventional rule based systems utilize \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sarkhel:2021:IIE,
author = "Ritesh Sarkhel and Arnab Nandi",
title = "Improving information extraction from visually rich
documents using visual span representations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "822--834",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446104",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446104",
abstract = "Along with textual content, visual features play an
essential role in the semantics of visually rich
documents. Information extraction (IE) tasks perform
poorly on these documents if these visual cues are not
taken into account. In this paper, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:ZHT,
author = "Gang Liu and Leying Chen and Shimin Chen",
title = "{Zen}: a high-throughput log-free {OLTP} engine for
non-volatile main memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "835--848",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446105",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446105",
abstract = "Emerging Nonvolatile memory (NVM) technologies like
{3D XPoint} promise significant performance potential for
OLTP databases. However, transactional databases need
to be redesigned because the key assumptions that
non-volatile storage is orders of magnitude slower than
DRAM and only supports blocked-oriented access have
changed. NVMs are byte-addressable and almost as fast
as DRAM. The capacity of NVM is much (4-16x) larger
than DRAM. Such NVM characteristics make it possible to
build OLTP database entirely in NVM main
memory.\par
This paper studies the structure of OLTP engines with
hybrid NVM and DRAM memory. We observe three challenges
to design an OLTP engine for NVM: tuple metadata
modifications, NVM write redundancy, and NVM space
management. We propose Zen, a high-throughput log-free
OLTP engine for NVM. Zen addresses the three design
challenges with three novel techniques: metadata
enhanced tuple cache, log-free persistent transactions,
and light-weight NVM space management. Experimental
results on a real machine equipped with Intel Optane DC
Persistent Memory show that Zen achieves up to 10.1x
improvement compared with existing solutions to run an
OLTP database as large as the size of NVM while
achieving fast failure recovery.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ji:2021:DPB,
author = "Tianxi Ji and Pan Li and Emre Yilmaz and Erman Ayday
and Yanfang (Fanny) Ye and Jinyuan Sun",
title = "Differentially private binary- and matrix-valued data
query: an {XOR} mechanism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "5",
pages = "849--862",
month = jan,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3446095.3446106",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 24 11:29:44 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3446095.3446106",
abstract = "Differential privacy has been widely adopted to
release continuous- and scalar-valued information on a
database without compromising the privacy of individual
data records in it. The problem of querying binary- and
matrix-valued information on a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nakandala:2021:ECD,
author = "Supun Nakandala and Yuhao Zhang and Arun Kumar",
title = "Errata for {``Cerebro: a data system for optimized
deep learning model selection''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "863--863",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447691",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{Nakandala:2020:CDS}.",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447691",
abstract = "We discovered that there was an inconsistency in the
communication cost formulation for the decentralized
fine-grained training method in Table 2 of our paper
[1]. We used Horovod as the archetype for decentralized
fine-grained approaches, and its \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yin:2021:PBD,
author = "Lujia Yin and Yiming Zhang and Zhaoning Zhang and
Yuxing Peng and Peng Zhao",
title = "{ParaX}: boosting deep learning for big data analytics
on many-core {CPUs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "864--877",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447692",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447692",
abstract = "Despite the fact that GPUs and accelerators are more
efficient in deep learning (DL), commercial clouds like
Facebook and Amazon now heavily use CPUs in DL
computation because there are large numbers of CPUs
which would otherwise sit idle during off-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cai:2021:OTF,
author = "Walter Cai and Philip A. Bernstein and Wentao Wu and
Badrish Chandramouli",
title = "Optimization of threshold functions over streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "878--889",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447693",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447693",
abstract = "A common stream processing application is alerting,
where the data stream management system (DSMS)
continuously evaluates a threshold function over
incoming streams. If the threshold is crossed, the DSMS
raises an alarm. The threshold function is often
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2021:BCI,
author = "Xuliang Zhu and Xin Huang and Byron Choi and Jiaxin
Jiang and Zhaonian Zou and Jianliang Xu",
title = "Budget constrained interactive search for multiple
targets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "890--902",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447694",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447694",
abstract = "Interactive graph search leverages human intelligence
to categorize target labels in a hierarchy, which is
useful for image classification, product
categorization, and database search. However, many
existing interactive graph search studies aim at
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:SMK,
author = "Yangjun Chen and Hoang Hai Nguyen",
title = "On the string matching with $k$ differences in {DNA}
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "903--915",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447695",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447695",
abstract = "In this paper, we discuss an efficient and effective
index mechanism for the string matching with $k$
differences, by which we will find all the substrings
of a target string $y$ of length $n$ that align with a
pattern string $x$ of length $m$ with not more than
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fujiwara:2021:FAA,
author = "Yasuhiro Fujiwara and Sekitoshi Kanai and Yasutoshi
Ida and Atsutoshi Kumagai and Naonori Ueda",
title = "Fast algorithm for anchor graph hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "916--928",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447696",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447696",
abstract = "Anchor graph hashing is used in many applications such
as cancer detection, web page classification, and drug
discovery. It computes the hash codes from the
eigenvectors of the matrix representing the
similarities between data points and anchor points;
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:ACG,
author = "Wangda Zhang and Junyoung Kim and Kenneth A. Ross and
Eric Sedlar and Lukas Stadler",
title = "Adaptive code generation for data-intensive
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "929--942",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447697",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447697",
abstract = "Modern database management systems employ
sophisticated query optimization techniques that enable
the generation of efficient plans for queries over very
large data sets. A variety of other applications also
process large data sets, but cannot leverage \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tsamoura:2021:MKB,
author = "Efthymia Tsamoura and David Carral and Enrico Malizia
and Jacopo Urbani",
title = "Materializing knowledge bases via trigger graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "943--956",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447699",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447699",
abstract = "The chase is a well-established family of algorithms
used to materialize Knowledge Bases (KBs) for tasks
like query answering under dependencies or data
cleaning. A general problem of chase algorithms is that
they might perform redundant computations. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:DEE,
author = "Jinfei Liu and Jian Lou and Junxu Liu and Li Xiong and
Jian Pei and Jimeng Sun",
title = "{Dealer}: an end-to-end model marketplace with
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "957--969",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447700",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447700",
abstract = "Data-driven machine learning has become ubiquitous. A
marketplace for machine learning models connects data
owners and model buyers, and can dramatically
facilitate data-driven machine learning applications.
In this paper, we take a formal data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rahman:2021:NIS,
author = "Sajjadur Rahman and Mangesh Bendre and Yuyang Liu and
Shichu Zhu and Zhaoyuan Su and Karrie Karahalios and
Aditya G. Parameswaran",
title = "{NOAH}: interactive spreadsheet exploration with
dynamic hierarchical overviews",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "970--983",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447701",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447701",
abstract = "Spreadsheet systems are by far the most popular
platform for data exploration on the planet, supporting
millions of rows of data. However, exploring
spreadsheets that are this large via operations such as
scrolling or issuing formulae can be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2021:EBT,
author = "Yixing Yang and Yixiang Fang and Maria E. Orlowska and
Wenjie Zhang and Xuemin Lin",
title = "Efficient bi-triangle counting for large bipartite
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "984--996",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447702",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447702",
abstract = "A bipartite network is a network with two disjoint
vertex sets and its edges only exist between vertices
from different sets. It has received much interest
since it can be used to model the relationship between
two different sets of objects in many \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tata:2021:GSE,
author = "Sandeep Tata and Navneet Potti and James B. Wendt and
Lauro Beltr{\~a}o Costa and Marc Najork and Beliz
Gunel",
title = "{Glean}: structured extractions from templatic
documents",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "997--1005",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447703",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447703",
abstract = "Extracting structured information from templatic
documents is an important problem with the potential to
automate many real-world business workflows such as
payment, procurement, and payroll. The core challenge
is that such documents can be laid out in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2021:IGL,
author = "Jun Gao and Jiazun Chen and Zhao Li and Ji Zhang",
title = "{ICS-GNN}: lightweight interactive community search
via graph neural network",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1006--1018",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447704",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447704",
abstract = "Searching a community containing a given query vertex
in an online social network enjoys wide applications
like recommendation, team organization, etc. When
applied to real-life networks, the existing approaches
face two major limitations. First, they \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2021:BEN,
author = "Yuanyuan Sun and Sheng Wang and Huorong Li and Feifei
Li",
title = "Building enclave-native storage engines for practical
encrypted databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1019--1032",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447705",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447705",
abstract = "Data confidentiality is one of the biggest concerns
that hinders enterprise customers from moving their
workloads to the cloud. Thanks to the trusted execution
environment (TEE), it is now feasible to build
encrypted databases in the enclave that can \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Thorne:2021:NLP,
author = "James Thorne and Majid Yazdani and Marzieh Saeidi and
Fabrizio Silvestri and Sebastian Riedel and Alon
Halevy",
title = "From natural language processing to neural databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1033--1039",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447706",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447706",
abstract = "In recent years, neural networks have shown impressive
performance gains on long-standing AI problems, such as
answering queries from text and machine translation.
These advances raise the question of whether neural
nets can be used at the core of query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:RER,
author = "Haibo Wang and Chaoyi Ma and Olufemi O. Odegbile and
Shigang Chen and Jih-Kwon Peir",
title = "Randomized error removal for online spread estimation
in data streaming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1040--1052",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447707",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447707",
abstract = "Measuring flow spread in real time from large,
high-rate data streams has numerous practical
applications, where a data stream is modeled as a
sequence of data items from different flows and the
spread of a flow is the number of distinct items in the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{DeLeo:2021:TAS,
author = "Dean {De Leo} and Peter Boncz",
title = "{Teseo} and the analysis of structural dynamic
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1053--1066",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447708",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See errata \cite{Leo:2021:ETA}.",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447708",
abstract = "We present Teseo, a new system for the storage and
analysis of dynamic structural graphs in main-memory
and the addition of transactional support. Teseo
introduces a novel design based on sparse arrays, large
arrays interleaved with gaps, and a fat tree,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gubner:2021:CDS,
author = "Tim Gubner and Peter Boncz",
title = "Charting the design space of query execution using
{VOILA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1067--1079",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447709",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447709",
abstract = "Database architecture, while having been studied for
four decades now, has delivered only a few designs with
well-understood properties. These few are followed by
most actual systems. Acquiring more knowledge about the
design space is a very time-consuming \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:HES,
author = "Zhiqi Wang and Jin Xue and Zili Shao",
title = "{Heracles}: an efficient storage model and data
flushing for performance monitoring timeseries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1080--1092",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447710",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447710",
abstract = "Performance-monitoring timeseries systems such as
Prometheus and InfluxDB play a critical role in
assuring reliability and observability. These systems
commonly adopt a column-oriented storage model, by
which timeseries samples from different time-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Macke:2021:FGL,
author = "Stephen Macke and Hongpu Gong and Doris Jung-Lin Lee
and Andrew Head and Doris Xin and Aditya Parameswaran",
title = "Fine-grained lineage for safer notebook interactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1093--1101",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447712",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447712",
abstract = "Computational notebooks have emerged as the platform
of choice for data science and analytical workflows,
enabling rapid iteration and exploration. By keeping
intermediate program state in memory and segmenting
units of execution into so-called ``cells'', \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tsitsulin:2021:FAG,
author = "Anton Tsitsulin and Marina Munkhoeva and Davide Mottin
and Panagiotis Karras and Ivan Oseledets and Emmanuel
M{\"u}ller",
title = "{FREDE}: anytime graph embeddings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1102--1110",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447713",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447713",
abstract = "Low-dimensional representations, or embeddings, of a
graph's nodes facilitate several practical data science
and data engineering tasks. As such embeddings rely,
explicitly or implicitly, on a similarity measure among
nodes, they require the computation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:AGM,
author = "Xiaodong Li and Reynold Cheng and Kevin Chen-Chuan
Chang and Caihua Shan and Chenhao Ma and Hongtai Cao",
title = "On analyzing graphs with motif-paths",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "6",
pages = "1111--1123",
month = feb,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3447689.3447714",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:38 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3447689.3447714",
abstract = "Path-based solutions have been shown to be useful for
various graph analysis tasks, such as link prediction
and graph clustering. However, they are no longer
adequate for handling complex and gigantic graphs.
Recently, motif-based analysis has attracted \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tsaras:2021:CIM,
author = "Dimitris Tsaras and George Trimponias and Lefteris
Ntaflos and Dimitris Papadias",
title = "Collective influence maximization for multiple
competing products with an awareness-to-influence
model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1124--1136",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450981",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450981",
abstract = "Influence maximization (IM) is a fundamental task in
social network analysis. Typically, IM aims at
selecting a set of seeds for the network that
influences the maximum number of individuals. Motivated
by practical applications, in this paper we focus
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2021:FGS,
author = "Yahui Sun and Xiaokui Xiao and Bin Cui and Saman
Halgamuge and Theodoros Lappas and Jun Luo",
title = "Finding group {Steiner} trees in graphs with both
vertex and edge weights",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1137--1149",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450982",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450982",
abstract = "Given an undirected graph and a number of vertex
groups, the group Steiner trees problem is to find a
tree such that (i) this tree contains at least one
vertex in each vertex group; and (ii) the sum of vertex
and edge weights in this tree is minimized. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Abeywickrama:2021:OBM,
author = "Tenindra Abeywickrama and Victor Liang and Kian-Lee
Tan",
title = "Optimizing bipartite matching in real-world
applications by incremental cost computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1150--1158",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450983",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450983",
abstract = "The Kuhn-Munkres (KM) algorithm is a classical
combinatorial optimization algorithm that is widely
used for minimum cost bipartite matching in many
real-world applications, such as transportation. For
example, a ride-hailing service may use it to find
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Trummer:2021:CNE,
author = "Immanuel Trummer",
title = "The case for {NLP}-enhanced database tuning: towards
tuning tools that ``read the manual''",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1159--1165",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450984",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450984",
abstract = "A large body of knowledge on database tuning is
available in the form of natural language text. We
propose to leverage natural language processing (NLP)
to make that knowledge accessible to automated tuning
tools. We describe multiple avenues to exploit
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Maiyya:2021:EUC,
author = "Sujaya Maiyya and Faisal Nawab and Divyakant Agrawal
and Amr {El Abbadi}",
title = "Errata for {``Unifying consensus and atomic commitment
for effective cloud data management''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1166--1166",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450985",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{Maiyya:2019:UCA}.",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450985",
abstract = "This errata article discusses and corrects a minor
error in our work published in VLDB 2019. The
discrepancy specifically pertains to Algorithms 3 and
4. The algorithms presented in the paper are biased
towards a commit decision in a specific failure
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Istvan:2021:SDD,
author = "Zsolt Istv{\'a}n and Soujanya Ponnapalli and Vijay
Chidambaram",
title = "Software-defined data protection: low overhead policy
compliance at the storage layer is within reach!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1167--1174",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450986",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450986",
abstract = "Most modern data processing pipelines run on top of a
distributed storage layer, and securing the whole
system, and the storage layer in particular, against
accidental or malicious misuse is crucial to ensuring
compliance to rules and regulations. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:TRT,
author = "Tianyi Li and Lu Chen and Christian S. Jensen and
Torben Bach Pedersen",
title = "{TRACE}: real-time compression of streaming
trajectories in road networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1175--1187",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450987",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450987",
abstract = "The deployment of vehicle location services generates
increasingly massive vehicle trajectory data, which
incurs high storage and transmission costs. A range of
studies target offline compression to reduce the
storage cost. However, to enable online \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Saha:2021:SPC,
author = "Arkaprava Saha and Ruben Brokkelkamp and Yllka Velaj
and Arijit Khan and Francesco Bonchi",
title = "Shortest paths and centrality in uncertain networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1188--1201",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450988",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450988",
abstract = "Computing the shortest path between a pair of nodes is
a fundamental graph primitive, which has critical
applications in vehicle routing, finding functional
pathways in biological networks, survivable network
design, among many others. In this work, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:ADAa,
author = "Tongyu Liu and Ju Fan and Yinqing Luo and Nan Tang and
Guoliang Li and Xiaoyong Du",
title = "Adaptive data augmentation for supervised learning
over missing data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1202--1214",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450989",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450989",
abstract = "Real-world data is dirty, which causes serious
problems in (supervised) machine learning (ML). The
widely used practice in such scenario is to first
repair the labeled source (a.k.a. train) data using
rule-, statistical- or ML-based methods and then use
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2021:KPA,
author = "Fuheng Zhao and Sujaya Maiyya and Ryan Wiener and
Divyakant Agrawal and Amr {El Abbadi}",
title = "{KLL$^\pm $} approximate quantile sketches over
dynamic datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1215--1227",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450990",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450990",
abstract = "Recently the long standing problem of optimal
construction of quantile sketches was resolved by
Karnin, Lang, and Liberty using the KLL sketch (FOCS
2016). The algorithm for KLL is restricted to online
insert operations and no delete operations. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jankov:2021:DNM,
author = "Dimitrije Jankov and Binhang Yuan and Shangyu Luo and
Chris Jermaine",
title = "Distributed numerical and machine learning
computations via two-phase execution of aggregated join
trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1228--1240",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450991",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450991",
abstract = "When numerical and machine learning (ML) computations
are expressed relationally, classical query execution
strategies (hash-based joins and aggregations) can do a
poor job distributing the computation. In this paper,
we propose a two-phase execution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{VanAken:2021:IML,
author = "Dana {Van Aken} and Dongsheng Yang and Sebastien
Brillard and Ari Fiorino and Bohan Zhang and Christian
Bilien and Andrew Pavlo",
title = "An inquiry into machine learning-based automatic
configuration tuning services on real-world database
management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "7",
pages = "1241--1253",
month = mar,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3450980.3450992",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Apr 13 13:43:39 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3450980.3450992",
abstract = "Modern database management systems (DBMS) expose
dozens of configurable knobs that control their runtime
behavior. Setting these knobs correctly for an
application's workload can improve the performance and
efficiency of the DBMS. But because of their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tang:2021:RRP,
author = "Nan Tang and Ju Fan and Fangyi Li and Jianhong Tu and
Xiaoyong Du and Guoliang Li and Sam Madden and Mourad
Ouzzani",
title = "{RPT}: relational pre-trained transformer is almost
all you need towards democratizing data preparation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1254--1261",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457391",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457391",
abstract = "Can AI help automate human-easy but computer-hard data
preparation tasks that burden data scientists,
practitioners, and crowd workers? We answer this
question by presenting RPT, a denoising autoencoder for
tuple-to-X models (`` X '' could be tuple, token,
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zou:2021:LAP,
author = "Jia Zou and Amitabh Das and Pratik Barhate and Arun
Iyengar and Binhang Yuan and Dimitrije Jankov and Chris
Jermaine",
title = "{Lachesis}: automatic partitioning for {UDF}-centric
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1262--1275",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457392",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457392",
abstract = "Partitioning is effective in avoiding expensive
shuffling operations. However, it remains a significant
challenge to automate this process for Big Data
analytics workloads that extensively use user defined
functions (UDFs), where sub-computations are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2021:ULI,
author = "Jiacheng Wu and Yong Zhang and Shimin Chen and Jin
Wang and Yu Chen and Chunxiao Xing",
title = "Updatable learned index with precise positions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1276--1288",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457393",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457393",
abstract = "Index plays an essential role in modern database
engines to accelerate the query processing. The new
paradigm of ``learned index'' has significantly changed
the way of designing index structures in DBMS. The key
insight is that indexes could be regarded \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2021:MMS,
author = "Ziquan Fang and Lu Pan and Lu Chen and Yuntao Du and
Yunjun Gao",
title = "{MDTP}: a multi-source deep traffic prediction
framework over spatio-temporal trajectory data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1289--1297",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457394",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457394",
abstract = "Traffic prediction has drawn increasing attention for
its ubiquitous real-life applications in traffic
management, urban computing, public safety, and so on.
Recently, the availability of massive trajectory data
and the success of deep learning motivate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Min:2021:SCS,
author = "Seunghwan Min and Sung Gwan Park and Kunsoo Park and
Dora Giammarresi and Giuseppe F. Italiano and Wook-Shin
Han",
title = "Symmetric continuous subgraph matching with
bidirectional dynamic programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1298--1310",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457395",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457395",
abstract = "In many real datasets such as social media streams and
cyber data sources, graphs change over time through a
graph update stream of edge insertions and deletions.
Detecting critical patterns in such dynamic graphs
plays an important role in various \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Suzuki:2021:ADP,
author = "Tomoya Suzuki and Kazuhiro Hiwada and Hirotsugu
Kajihara and Shintaro Sano and Shuou Nomura and Tatsuo
Shiozawa",
title = "Approaching {DRAM} performance by using
microsecond-latency flash memory for small-sized random
read accesses: a new access method and its graph
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1311--1324",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457397",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457397",
abstract = "For applications in which small-sized random accesses
frequently occur for datasets that exceed DRAM
capacity, placing the datasets on SSD can result in
poor application performance. For the read-intensive
case we focus on in this paper, low latency \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Orogat:2021:CTB,
author = "Abdelghny Orogat and Isabelle Liu and Ahmed El-Roby",
title = "{CBench}: towards better evaluation of question
answering over knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1325--1337",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457398",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457398",
abstract = "Recently, there has been an increase in the number of
knowledge graphs that can be only queried by experts.
However, describing questions using structured queries
is not straightforward for non-expert users who need to
have sufficient knowledge about \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2021:TRA,
author = "Binhang Yuan and Dimitrije Jankov and Jia Zou and
Yuxin Tang and Daniel Bourgeois and Chris Jermaine",
title = "Tensor relational algebra for distributed machine
learning system design",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1338--1350",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457399",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457399",
abstract = "We consider the question: what is the abstraction that
should be implemented by the computational engine of a
machine learning system? Current machine learning
systems typically push whole tensors through a series
of compute kernels such as matrix \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2021:PDD,
author = "Wenfei Fan and Chao Tian and Yanghao Wang and Qiang
Yin",
title = "Parallel discrepancy detection and incremental
detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1351--1364",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457400",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457400",
abstract = "This paper studies how to catch duplicates, mismatches
and conflicts in the same process. We adopt a class of
entity enhancing rules that embed machine learning
predicates, unify entity resolution and conflict
resolution, and are collectively defined \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:TCA,
author = "Tiantian Liu and Huan Li and Hua Lu and Muhammad Aamir
Cheema and Lidan Shou",
title = "Towards crowd-aware indoor path planning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1365--1377",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457401",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457401",
abstract = "Indoor venues accommodate many people who collectively
form crowds. Such crowds in turn influence people's
routing choices, e.g., people may prefer to avoid
crowded rooms when walking from A to B. This paper
studies two types of crowd-aware indoor path \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gupta:2021:PES,
author = "Surabhi Gupta and Karthik Ramachandra",
title = "Procedural extensions of {SQL}: understanding their
usage in the wild",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1378--1391",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457402",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457402",
abstract = "Procedural extensions of SQL have been in existence
for many decades now. However, little is known about
their magnitude of usage and their complexity in
real-world workloads. Procedural code executing in a
RDBMS is known to have inefficiencies and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bharadwaj:2021:DRD,
author = "Sagar Bharadwaj and Praveen Gupta and Ranjita Bhagwan
and Saikat Guha",
title = "Discovering related data at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1392--1400",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457403",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457403",
abstract = "Analysts frequently require data from multiple sources
for their tasks, but finding these sources is
challenging in exabyte-scale data lakes. In this paper,
we address this problem for our enterprise's data lake
by using machine-learning to identify \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cereda:2021:CCG,
author = "Stefano Cereda and Stefano Valladares and Paolo
Cremonesi and Stefano Doni",
title = "{CGPTuner}: a contextual {Gaussian} process bandit
approach for the automatic tuning of {IT}
configurations under varying workload conditions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1401--1413",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457404",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457404",
abstract = "Properly selecting the configuration of a database
management system (DBMS) is essential to increase
performance and reduce costs. However, the task is
astonishingly tricky due to a large number of tunable
configuration parameters and their inter-dependencies \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schiavio:2021:LAI,
author = "Filippo Schiavio and Daniele Bonetta and Walter
Binder",
title = "Language-agnostic integrated queries in a managed
polyglot runtime",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1414--1426",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457405",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457405",
abstract = "Language-integrated query (LINQ) frameworks offer a
convenient programming abstraction for processing
in-memory collections of data, allowing developers to
concisely express declarative queries using
general-purpose programming languages. Existing LINQ \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kulkarni:2021:AHT,
author = "Chinmay Kulkarni and Badrish Chandramouli and Ryan
Stutsman",
title = "Achieving high throughput and elasticity in a
larger-than-memory store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1427--1440",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457406",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457406",
abstract = "Millions of sensors, mobile applications and machines
now generate billions of events. Specialized many-core
key-value stores (KVSs) can ingest and index these
events at high rates (over 100 Mops/s on one machine)
if events are generated on the same \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yao:2021:ESB,
author = "Kai Yao and Lijun Chang",
title = "Efficient size-bounded community search over large
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "8",
pages = "1441--1453",
month = apr,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3457390.3457407",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:31 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3457390.3457407",
abstract = "The problem of community search, which aims to find a
cohesive subgraph containing user-given query vertices,
has been extensively studied recently. Most of the
existing studies mainly focus on the cohesiveness of
the returned community, while ignoring \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2021:MVA,
author = "Jianwen Zhao and Yufei Tao",
title = "Minimum vertex augmentation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1454--1466",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461536",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461536",
abstract = "This paper introduces a class of graph problems named
minimum vertex augmentation (MVA). Given an input graph
G where each vertex carries a binary color 0 or 1, we
want to flip the colors of the fewest 0-vertices such
that the subgraph induced by all \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gaffney:2021:DIS,
author = "Kevin P. Gaffney and Robert Claus and Jignesh M.
Patel",
title = "Database isolation by scheduling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1467--1480",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461537",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461537",
abstract = "Transaction isolation is conventionally achieved by
restricting access to the physical items in a database.
To maximize performance, isolation functionality is
often packaged with recovery, I/O, and data access
methods in a monolithic transactional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Park:2021:SSS,
author = "Jong-Hyeok Park and Soyee Choi and Gihwan Oh and
Sang-Won Lee",
title = "{SaS}: {SSD} as {SQL} database system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1481--1488",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461538",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461538",
abstract = "Every database engine runs on top of an operating
system in the host, strictly separated with the
storage. This more-than-half-century-old IHDE
(In-Host-Database-Engine) architecture, however,
reveals its limitations when run on fast flash memory
SSDs. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2021:FFL,
author = "Rong Zhu and Ziniu Wu and Yuxing Han and Kai Zeng and
Andreas Pfadler and Zhengping Qian and Jingren Zhou and
Bin Cui",
title = "{FLAT}: fast, lightweight and accurate method for
cardinality estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1489--1502",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461539",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461539",
abstract = "Query optimizers rely on accurate cardinality
estimation (CardEst) to produce good execution plans.
The core problem of CardEst is how to model the rich
joint distribution of attributes in an accurate and
compact manner. Despite decades of research, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2021:FAA,
author = "Tsz Nam Chan and Zhe Li and Leong Hou U. and Jianliang
Xu and Reynold Cheng",
title = "Fast augmentation algorithms for network kernel
density visualization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1503--1516",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461540",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461540",
abstract = "Network kernel density visualization, or NKDV, has
been extensively used to visualize spatial data points
in various domains, including traffic accident hotspot
detection, crime hotspot detection, disease outbreak
detection, and business and urban \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:AAG,
author = "Jiawei Wang and Cheng Li and Kai Ma and Jingze Huo and
Feng Yan and Xinyu Feng and Yinlong Xu",
title = "{AUTOGR}: automated geo-replication with fast system
performance and preserved application semantics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1517--1530",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461541",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461541",
abstract = "Geo-replication is essential for providing low latency
response and quality Internet services. However,
designing fast and correct geo-replicated services is
challenging due to the complex trade-off between
performance and consistency semantics in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:LAD,
author = "Qing Liu and Xuliang Zhu and Xin Huang and Jianliang
Xu",
title = "Local algorithms for distance-generalized core
decomposition over large dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1531--1543",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461542",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461542",
abstract = "The distance-generalized core, also called $(k,
h)$-core, is defined as the maximal subgraph in which
every vertex has at least $k$ vertices at distance no
longer than $h$. Compared with $k$-core, $(k, h)$-core can
identify more fine-grained subgraphs and, hence,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Benson:2021:VEH,
author = "Lawrence Benson and Hendrik Makait and Tilmann Rabl",
title = "{Viper}: an efficient hybrid {PMem-DRAM} key-value
store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1544--1556",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461543",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461543",
abstract = "Key-value stores (KVSs) have found wide application in
modern software systems. For persistence, their data
resides in slow secondary storage, which requires KVSs
to employ various techniques to increase their read and
write performance from and to the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeighami:2021:ESC,
author = "Sepanta Zeighami and Cyrus Shahabi and John Krumm",
title = "Estimating spread of contact-based contagions in a
population through sub-sampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1557--1569",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461544",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461544",
abstract = "Various phenomena such as viruses, gossips, and
physical objects (e.g., packages and marketing
pamphlets) can be spread through physical contacts. The
spread depends on how people move, i.e., their mobility
patterns. In practice, mobility patterns of an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Herodotou:2021:TTS,
author = "Herodotos Herodotou and Elena Kakoulli",
title = "{Trident}: task scheduling over tiered storage systems
in big data platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1570--1582",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461545",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461545",
abstract = "The recent advancements in storage technologies have
popularized the use of tiered storage systems in
data-intensive compute clusters. The Hadoop Distributed
File System (HDFS), for example, now supports storing
data in memory, SSDs, and HDDs, while \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cong:2021:CCE,
author = "Zicun Cong and Lingyang Chu and Yu Yang and Jian Pei",
title = "Comprehensible counterfactual explanation on
{Kolmogorov--Smirnov} test",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1583--1596",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461546",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461546",
abstract = "The Kolmogorov--Smirnov (KS) test is popularly used in
many applications, such as anomaly detection,
astronomy, database security and AI systems. One
challenge remained untouched is how we can obtain an
explanation on why a test set fails the KS test. In
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2021:ALS,
author = "Hongkuan Zhou and Ajitesh Srivastava and Hanqing Zeng
and Rajgopal Kannan and Viktor Prasanna",
title = "Accelerating large scale real-time {GNN} inference
using channel pruning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1597--1605",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461547",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461547",
abstract = "Graph Neural Networks (GNNs) are proven to be powerful
models to generate node embedding for downstream
applications. However, due to the high computation
complexity of GNN inference, it is hard to deploy GNNs
for large-scale or real-time applications. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Leis:2021:TCO,
author = "Viktor Leis and Maximilian Kuschewski",
title = "Towards cost-optimal query processing in the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1606--1612",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461549",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461549",
abstract = "Public cloud providers offer hundreds of heterogeneous
hardware instances. For analytical query processing
systems, this presents a major challenge: depending on
the hardware configuration, performance and cost may
differ by orders of magnitude. We \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gong:2021:AIG,
author = "Shufeng Gong and Chao Tian and Qiang Yin and Wenyuan
Yu and Yanfeng Zhang and Liang Geng and Song Yu and Ge
Yu and Jingren Zhou",
title = "Automating incremental graph processing with flexible
memoization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1613--1625",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461550",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461550",
abstract = "The ever-growing amount of dynamic graph data demands
efficient techniques of incremental graph processing.
However, incremental graph algorithms are challenging
to develop. Existing approaches usually require users
to manually design nontrivial \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jepsen:2021:NST,
author = "Theo Jepsen and Alberto Lerner and Fernando Pedone and
Robert Soul{\'e} and Philippe Cudr{\'e}-Mauroux",
title = "In-network support for transaction triaging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1626--1639",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461551",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461551",
abstract = "We introduce Transaction Triaging, a set of techniques
that manipulate streams of transaction requests and
responses while they travel to and from a database
server. Compared to normal transaction streams, the
triaged ones execute faster once they reach \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:WRL,
author = "Xiaoying Wang and Changbo Qu and Weiyuan Wu and
Jiannan Wang and Qingqing Zhou",
title = "Are we ready for learned cardinality estimation?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1640--1654",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461552",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461552",
abstract = "Cardinality estimation is a fundamental but long
unresolved problem in query optimization. Recently,
multiple papers from different research groups
consistently report that learned models have the
potential to replace existing cardinality estimators.
In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lemiesz:2021:ADS,
author = "Jakub Lemiesz",
title = "On the algebra of data sketches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1655--1667",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461553",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461553",
abstract = "We consider the problem of designing a distributed
data sketch for scenario in which data stream is
observed by many independent network nodes. We require
that a sketch apart from being computationally and
memory efficient should also be mergeable in a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hou:2021:MPA,
author = "Guanhao Hou and Xingguang Chen and Sibo Wang and
Zhewei Wei",
title = "Massively parallel algorithms for {Personalized
PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1668--1680",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461554",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461554",
abstract = "Personalized PageRank (PPR) has wide applications in
search engines, social recommendations, community
detection, and so on. Nowadays, graphs are becoming
massive and many IT companies need to deal with large
graphs that cannot be fitted into the memory of most
commodity servers. However, most existing
state-of-the-art solutions for PPR computation only
work for single-machines and are inefficient for the
distributed framework since such solutions either (i)
result in an excessively large number of communication
rounds, or (ii) incur high communication costs in each
round.
Motivated by this, we present Delta-Push, an efficient
framework for single-source and top-$k$ PPR queries in
distributed settings. Our goal is to reduce the number
of rounds while guaranteeing that the load, i.e., the
maximum number of messages an executor sends or
receives in a round, can be bounded by the capacity of
each executor. We first present a non-trivial
combination of a redesigned parallel push algorithm and
the Monte-Carlo method to answer single-source PPR
queries. The solution uses pre-sampled random walks to
reduce the number of rounds for the push algorithm.
Theoretical analysis under the Massively Parallel
Computing (MPC) model shows that our proposed solution
bounds the communication rounds to [EQUATION] under a
load of O(m/p), where m is the number of edges of the
input graph, p is the number of executors, and $
\epsilon $ is a user-defined error parameter. In the
meantime, as the number of executors increases to $ p'
= \gamma \cdot p$, the load constraint can be relaxed
since each executor can hold $ O(\gamma \cdot m / p')$
messages with invariant local memory. In such
scenarios, multiple queries can be processed in batches
simultaneously. We show that with a load of $ O(\gamma
\cdot m / p')$, our Delta-Push can process $ \gamma $
queries in a batch with [EQUATION] rounds, while other
baseline solutions still keep the same round cost for
each batch. We further present a new top-$k$ algorithm
that is friendly to the distributed framework and
reduces the number of rounds required in practice.
Extensive experiments show that our proposed solution
is more efficient than alternatives.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schleich:2021:GQC,
author = "Maximilian Schleich and Zixuan Geng and Yihong Zhang
and Dan Suciu",
title = "{GeCo}: quality counterfactual explanations in real
time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1681--1693",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3461555",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3461555",
abstract = "Machine learning is increasingly applied in
high-stakes decision making that directly affect
people's lives, and this leads to an increased demand
for systems to explain their decisions. Explanations
often take the form of counterfactuals, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Salazar:2021:AFE,
author = "Ricardo Salazar and Felix Neutatz and Ziawasch
Abedjan",
title = "Automated feature engineering for algorithmic
fairness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "9",
pages = "1694--1702",
month = may,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3461535.3463474",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 23 06:39:32 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3461535.3463474",
abstract = "One of the fundamental problems of machine ethics is
to avoid the perpetuation and amplification of
discrimination through machine learning applications.
In particular, it is desired to exclude the influence
of attributes with sensitive information, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Addanki:2021:HDR,
author = "Raghavendra Addanki and Sainyam Galhotra and Barna
Saha",
title = "How to design robust algorithms using noisy comparison
{Oracle}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1703--1716",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467862",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467862",
abstract = "Metric based comparison operations such as finding
maximum, nearest and farthest neighbor are fundamental
to studying various clustering techniques such as k
-center clustering and agglomerative hierarchical
clustering. These techniques crucially rely on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boniol:2021:SSS,
author = "Paul Boniol and John Paparrizos and Themis Palpanas
and Michael J. Franklin",
title = "{SAND}: streaming subsequence anomaly detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1717--1729",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467863",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467863",
abstract = "With the increasing demand for real-time analytics and
decision making, anomaly detection methods need to
operate over streams of values and handle drifts in
data distribution. Unfortunately, existing approaches
have severe limitations: they either \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiao:2021:OFU,
author = "Yingtai Xiao and Zeyu Ding and Yuxin Wang and Danfeng
Zhang and Daniel Kifer",
title = "Optimizing fitness-for-use of differentially private
linear queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1730--1742",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467864",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467864",
abstract = "In practice, differentially private data releases are
designed to support a variety of applications. A data
release is fit for use if it meets target accuracy
requirements for each application. In this paper, we
consider the problem of answering linear \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cao:2021:CED,
author = "Xinle Cao and Jian Liu and Hao Lu and Kui Ren",
title = "Cryptanalysis of an encrypted database in {SIGMOD
'14}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1743--1755",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467865",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467865",
abstract = "Encrypted database is an innovative technology
proposed to solve the data confidentiality issue in
cloud-based DB systems. It allows a data owner to
encrypt its database before uploading it to the service
provider; and it allows the service provider to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jin:2021:USM,
author = "Tianyuan Jin and Yu Yang and Renchi Yang and Jieming
Shi and Keke Huang and Xiaokui Xiao",
title = "Unconstrained submodular maximization with modular
costs: tight approximation and application to profit
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1756--1768",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467866",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467866",
abstract = "Given a set V, the problem of unconstrained submodular
maximization with modular costs (USM-MC) asks for a
subset $ S \subseteq V $ that maximizes $ f(S) - c(S) $,
where $f$ is a non-negative, monotone, and
submodular function that gauges the utility of S, and c
is a non-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:DDL,
author = "Yuhao Zhang and Frank McQuillan and Nandish Jayaram
and Nikhil Kak and Ekta Khanna and Orhan Kislal and
Domino Valdano and Arun Kumar",
title = "Distributed deep learning on data systems: a
comparative analysis of approaches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1769--1782",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467867",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467867",
abstract = "Deep learning (DL) is growing in popularity for many
data analytics applications, including among
enterprises. Large business-critical datasets in such
settings typically reside in RDBMSs or other data
systems. The DB community has long aimed to bring
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sheng:2021:PSM,
author = "Siyuan Sheng and Qun Huang and Sa Wang and Yungang
Bao",
title = "{PR}-sketch: monitoring per-key aggregation of
streaming data with nearly full accuracy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1783--1796",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467868",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467868",
abstract = "Computing per-key aggregation is indispensable in
streaming data analysis formulated as two phases, an
update phase and a recovery phase. As the size and
speed of data streams rise, accurate per-key
information is useful in many applications like
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutsoukos:2021:TAG,
author = "Dimitrios Koutsoukos and Supun Nakandala and
Konstantinos Karanasos and Karla Saur and Gustavo
Alonso and Matteo Interlandi",
title = "Tensors: an abstraction for general data processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1797--1804",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467869",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467869",
abstract = "Deep Learning (DL) has created a growing demand for
simpler ways to develop complex models and efficient
ways to execute them. Thus, a significant effort has
gone into frameworks like PyTorch or TensorFlow to
support a variety of DL models and run \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pujol:2021:BSM,
author = "David Pujol and Yikai Wu and Brandon Fain and Ashwin
Machanavajjhala",
title = "Budget sharing for multi-analyst differential
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1805--1817",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467870",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467870",
abstract = "Large organizations that collect data about
populations (like the US Census Bureau) release summary
statistics that are used by multiple stakeholders for
resource allocation and policy making problems. These
organizations are also legally required to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Poepsel-Lemaitre:2021:LDS,
author = "Rudi Poepsel-Lemaitre and Martin Kiefer and Joscha von
Hein and Jorge-Arnulfo Quian{\'e}-Ruiz and Volker
Markl",
title = "In the land of data streams where synopses are
missing, one framework to bring them all",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1818--1831",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467871",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467871",
abstract = "In pursuit of real-time data analysis, approximate
summarization structures, i.e., synopses, have gained
importance over the years. However, existing stream
processing systems, such as Flink, Spark, and Storm, do
not support synopses as first class \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:DAI,
author = "Yifan Li and Xiaohui Yu and Nick Koudas",
title = "Data acquisition for improving machine learning
models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1832--1844",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467872",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467872",
abstract = "The vast advances in Machine Learning (ML) over the
last ten years have been powered by the availability of
suitably prepared data for training purposes. The
future of ML-enabled enterprise hinges on data. As
such, there is already a vibrant market \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:EAR,
author = "Xiaoshuang Chen and Kai Wang and Xuemin Lin and Wenjie
Zhang and Lu Qin and Ying Zhang",
title = "Efficiently answering reachability and path queries on
temporal bipartite graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1845--1858",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467873",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467873",
abstract = "Bipartite graphs are naturally used to model
relationships between two different types of entities,
such as people-location, author-paper, and
customer-product. When modeling real-world applications
like disease outbreaks, edges are often enriched with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ciaccia:2021:PQT,
author = "Paolo Ciaccia and Davide Martinenghi and Riccardo
Torlone",
title = "Preference queries over taxonomic domains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1859--1871",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467874",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467874",
abstract = "When composing multiple preferences characterizing the
most suitable results for a user, several issues may
arise. Indeed, preferences can be partially
contradictory, suffer from a mismatch with the level of
detail of the actual data, and even lack \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yan:2021:RDL,
author = "Baoyue Yan and Xuntao Cheng and Bo Jiang and Shibin
Chen and Canfang Shang and Jianying Wang and Gui Huang
and Xinjun Yang and Wei Cao and Feifei Li",
title = "Revisiting the design of {LSM}-tree based {OLTP}
storage engine with persistent memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1872--1885",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467875",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467875",
abstract = "The recent byte-addressable and large-capacity
commercialized persistent memory (PM) is promising to
drive database as a service (DBaaS) into unchartered
territories. This paper investigates how to leverage
PMs to revisit the conventional LSM-tree based
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ge:2021:KCA,
author = "Chang Ge and Shubhankar Mohapatra and Xi He and Ihab
F. Ilyas",
title = "{Kamino}: constraint-aware differentially private data
synthesis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1886--1899",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467876",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467876",
abstract = "Organizations are increasingly relying on data to
support decisions. When data contains private and
sensitive information, the data owner often desires to
publish a synthetic database instance that is similarly
useful as the true data, while ensuring \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:TCE,
author = "Yingqiang Zhang and Chaoyi Ruan and Cheng Li and
Xinjun Yang and Wei Cao and Feifei Li and Bo Wang and
Jing Fang and Yuhui Wang and Jingze Huo and Chao Bi",
title = "Towards cost-effective and elastic cloud database
deployment via memory disaggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1900--1912",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467877",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467877",
abstract = "It is challenging for cloud-native relational
databases to meet the ever-increasing needs of scaling
compute and memory resources independently and
elastically. The recent emergence of memory
disaggregation architecture, relying on high-speed RDMA
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Peeters:2021:DOF,
author = "Ralph Peeters and Christian Bizer",
title = "Dual-objective fine-tuning of {BERT} for entity
matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "10",
pages = "1913--1921",
month = jun,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3467861.3467878",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Oct 27 15:40:22 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3467861.3467878",
abstract = "An increasing number of data providers have adopted
shared numbering schemes such as GTIN, ISBN, DUNS, or
ORCID numbers for identifying entities in the
respective domain. This means for data integration that
shared identifiers are often available for a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Besta:2021:GEH,
author = "Maciej Besta and Zur Vonarburg-Shmaria and Yannick
Schaffner and Leonardo Schwarz and Grzegorz Kwasniewski
and Lukas Gianinazzi and Jakub Beranek and Kacper Janda
and Tobias Holenstein and Sebastian Leisinger and Peter
Tatkowski and Esref Ozdemir and Adrian Balla and Marcin
Copik and Philipp Lindenberger and Marek Konieczny and
Onur Mutlu and Torsten Hoefler",
title = "{GraphMineSuite}: enabling high-performance and
programmable graph mining algorithms with set algebra",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1922--1935",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476252",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476252",
abstract = "We propose GraphMineSuite (GMS): the first
benchmarking suite for graph mining that facilitates
evaluating and constructing high-performance graph
mining algorithms. First, GMS comes with a benchmark
specification based on extensive literature review,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Takenouchi:2021:PES,
author = "Keita Takenouchi and Takashi Ishio and Joji Okada and
Yuji Sakata",
title = "{PATSQL}: efficient synthesis of {SQL} queries from
example tables with quick inference of projected
columns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1937--1949",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476253",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476253",
abstract = "SQL is one of the most popular tools for data
analysis, and it is now used by an increasing number of
users without having expertise in databases. Several
studies have proposed programming-by-example approaches
to help such non-experts to write correct \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:FFA,
author = "Jie Liu and Wenqian Dong and Qingqing Zhou and Dong
Li",
title = "{Fauce}: fast and accurate deep ensembles with
uncertainty for cardinality estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1950--1963",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476254",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476254",
abstract = "Cardinality estimation is a fundamental and critical
problem in databases. Recently, many estimators based
on deep learning have been proposed to solve this
problem and they have achieved promising results.
However, these estimators struggle to provide
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:CSE,
author = "Mengzhao Wang and Xiaoliang Xu and Qiang Yue and
Yuxiang Wang",
title = "A comprehensive survey and experimental comparison of
graph-based approximate nearest neighbor search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1964--1978",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476255",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476255",
abstract = "Approximate nearest neighbor search (ANNS) constitutes
an important operation in a multitude of applications,
including recommendation systems, information
retrieval, and pattern recognition. In the past decade,
graph-based ANNS algorithms have been the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2021:TPP,
author = "Zifeng Yuan and Huey Eng Chua and Sourav S. Bhowmick
and Zekun Ye and Wook-Shin Han and Byron Choi",
title = "Towards plug-and-play visual graph query interfaces:
data-driven selection of canned patterns for large
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1979--1991",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476256",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476256",
abstract = "Canned patterns (i.e., small subgraph patterns) in
visual graph query interfaces (a.k.a. GUI) facilitate
efficient query formulation by enabling
pattern-at-a-time construction mode. However, existing
GUIs for querying large networks either do not expose
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2021:TMG,
author = "Shixuan Sun and Yuhang Chen and Shengliang Lu and
Bingsheng He and Yuchen Li",
title = "{ThunderRW}: an in-memory graph random walk engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "1992--2005",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476257",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476257",
abstract = "As random walk is a powerful tool in many graph
processing, mining and learning applications, this
paper proposes an efficient in-memory random walk
engine named ThunderRW. Compared with existing parallel
systems on improving the performance of a single
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dong:2021:BCC,
author = "Zheng Dong and Xin Huang and Guorui Yuan and Hengshu
Zhu and Hui Xiong",
title = "Butterfly-core community search over labeled graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2006--2018",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476258",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476258",
abstract = "Community search aims at finding densely connected
subgraphs for query vertices in a graph. While this
task has been studied widely in the literature, most of
the existing works only focus on finding homogeneous
communities rather than heterogeneous \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Negi:2021:FLL,
author = "Parimarjan Negi and Ryan Marcus and Andreas Kipf and
Hongzi Mao and Nesime Tatbul and Tim Kraska and
Mohammad Alizadeh",
title = "{Flow-loss}: learning cardinality estimates that
matter",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2019--2032",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476259",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476259",
abstract = "Recently there has been significant interest in using
machine learning to improve the accuracy of cardinality
estimation. This work has focused on improving average
estimation error, but not all estimates matter equally
for downstream tasks like query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2021:QHK,
author = "Michael Yu and Dong Wen and Lu Qin and Ying Zhang and
Wenjie Zhang and Xuemin Lin",
title = "On querying historical $k$-cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2033--2045",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476260",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476260",
abstract = "Many real-world relationships between entities can be
modeled as temporal graphs, where each edge is
associated with a timestamp or a time interval
representing its occurrence. K-core is a fundamental
model used to capture cohesive subgraphs in a simple
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cormode:2021:FEU,
author = "Graham Cormode and Samuel Maddock and Carsten Maple",
title = "Frequency estimation under local differential
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2046--2058",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476261",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476261",
abstract = "Private collection of statistics from a large
distributed population is an important problem, and has
led to large scale deployments from several leading
technology companies. The dominant approach requires
each user to randomly perturb their input, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zogaj:2021:DML,
author = "Fatjon Zogaj and Jos{\'e} Pablo Cambronero and Martin
C. Rinard and J{\"u}rgen Cito",
title = "Doing more with less: characterizing dataset
downsampling for {AutoML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2059--2072",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476262",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476262",
abstract = "Automated machine learning (AutoML) promises to
democratize machine learning by automatically
generating machine learning pipelines with little to no
user intervention. Typically, a search procedure is
used to repeatedly generate and validate candidate
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:LBE,
author = "Yifan Li and Xiaohui Yu and Nick Koudas",
title = "{LES 3}: learning-based exact set similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2073--2086",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476263",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476263",
abstract = "Set similarity search is a problem of central interest
to a wide variety of applications such as data cleaning
and web search. Past approaches on set similarity
search utilize either heavy indexing structures,
incurring large search costs or indexes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Min:2021:LGC,
author = "Seung Won Min and Kun Wu and Sitao Huang and Mert
Hidayetoglu and Jinjun Xiong and Eiman Ebrahimi and
Deming Chen and Wen-mei Hwu",
title = "Large graph convolutional network training with
{GPU}-oriented data communication architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2087--2100",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476264",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476264",
abstract = "Graph Convolutional Networks (GCNs) are increasingly
adopted in large-scale graph-based recommender systems.
Training GCN requires the minibatch generator
traversing graphs and sampling the sparsely located
neighboring nodes to obtain their features. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2021:FHP,
author = "Yifei Yang and Matt Youill and Matthew Woicik and
Yizhou Liu and Xiangyao Yu and Marco Serafini and
Ashraf Aboulnaga and Michael Stonebraker",
title = "{FlexPushdownDB}: hybrid pushdown and caching in a
cloud {DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2101--2113",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476265",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476265",
abstract = "Modern cloud databases adopt a storage-disaggregation
architecture that separates the management of
computation and storage. A major bottleneck in such an
architecture is the network connecting the computation
and storage layers. Two solutions have been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:AMA,
author = "Zhiwei Chen and Shaoxu Song and Ziheng Wei and Jingyun
Fang and Jiang Long",
title = "Approximating median absolute deviation with bounded
error",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2114--2126",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476266",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476266",
abstract = "The median absolute deviation (MAD) is a statistic
measuring the variability of a set of quantitative
elements. It is known to be more robust to outliers
than the standard deviation (SD), and thereby widely
used in outlier detection. Computing the exact
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:EEG,
author = "Mengxuan Zhang and Lei Li and Xiaofang Zhou",
title = "An experimental evaluation and guideline for path
finding in weighted dynamic network",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2127--2140",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476267",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476267",
abstract = "Shortest path computation is a building block of
various network applications. Since real-life networks
evolve as time passes, the Dynamic Shortest Path (DSP)
problem has drawn lots of attention in recent years.
However, as DSP has many factors related \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vandevoort:2021:RAR,
author = "Brecht Vandevoort and Bas Ketsman and Christoph Koch
and Frank Neven",
title = "Robustness against read committed for transaction
templates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2141--2153",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476268",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476268",
abstract = "The isolation level Multiversion Read Committed (RC),
offered by many database systems, is known to trade
consistency for increased transaction throughput.
Sometimes, transaction workloads can be safely executed
under RC obtaining the perfect isolation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:LLC,
author = "Huayi Zhang and Lei Cao and Samuel Madden and Elke
Rundensteiner",
title = "{LANCET}: labeling complex data at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2154--2166",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476269",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476269",
abstract = "Cutting-edge machine learning techniques often require
millions of labeled data objects to train a robust
model. Because relying on humans to supply such a huge
number of labels is rarely practical, automated methods
for label generation are needed. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:VSE,
author = "Yang Li and Yu Shen and Wentao Zhang and Jiawei Jiang
and Bolin Ding and Yaliang Li and Jingren Zhou and Zhi
Yang and Wentao Wu and Ce Zhang and Bin Cui",
title = "{VolcanoML}: speeding up end-to-end {AutoML} via
scalable search space decomposition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2167--2176",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476270",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476270",
abstract = "End-to-end AutoML has attracted intensive interests
from both academia and industry, which automatically
searches for ML pipelines in a space induced by feature
engineering, algorithm/model selection, and
hyper-parameter tuning. Existing AutoML systems,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2021:QTF,
author = "Peng Cheng and Jiabao Jin and Lei Chen and Xuemin Lin
and Libin Zheng",
title = "A queueing-theoretic framework for vehicle dispatching
in dynamic car-hailing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2177--2189",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476271",
abstract = "With the rapid development of smart mobile devices,
the car-hailing platforms (e.g., Uber or Lyft) have
attracted much attention from the academia and the
industry. In this paper, we consider a dynamic
car-hailing problem, namely maximum revenue vehicle
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cai:2021:DSD,
author = "Kuntai Cai and Xiaoyu Lei and Jianxin Wei and Xiaokui
Xiao",
title = "Data synthesis via differentially private {Markov}
random fields",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2190--2202",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476272",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476272",
abstract = "This paper studies the synthesis of high-dimensional
datasets with differential privacy (DP). The
state-of-the-art solution addresses this problem by
first generating a set M of noisy low-dimensional
marginals of the input data D, and then use them to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Whittaker:2021:SRS,
author = "Michael Whittaker and Ailidani Ailijiang and Aleksey
Charapko and Murat Demirbas and Neil Giridharan and
Joseph M. Hellerstein and Heidi Howard and Ion Stoica
and Adriana Szekeres",
title = "Scaling replicated state machines with
compartmentalization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2203--2215",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476273",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476273",
abstract = "State machine replication protocols, like MultiPaxos
and Raft, are a critical component of many distributed
systems and databases. However, these protocols offer
relatively low throughput due to several bottlenecked
components. Numerous existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sarkar:2021:CAL,
author = "Subhadeep Sarkar and Dimitris Staratzis and Zichen Zhu
and Manos Athanassoulis",
title = "Constructing and analyzing the {LSM} compaction design
space",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2216--2229",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476274",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476274",
abstract = "Log-structured merge (LSM) trees offer efficient
ingestion by appending incoming data, and thus, are
widely used as the storage layer of production NoSQL
data stores. To enable competitive read performance,
LSM-trees periodically re-organize data to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hellings:2021:BSB,
author = "Jelle Hellings and Mohammad Sadoghi",
title = "{ByShard}: sharding in a {Byzantine} environment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2230--2243",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476275",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476275",
abstract = "The emergence of blockchains has fueled the
development of resilient systems that can deal with
Byzantine failures due to crashes, bugs, or even
malicious behavior. Recently, we have also seen the
exploration of sharding in these resilient systems,
this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ertl:2021:SFG,
author = "Otmar Ertl",
title = "{SetSketch}: filling the gap between {MinHash} and
{HyperLogLog}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2244--2257",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476276",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476276",
abstract = "MinHash and HyperLogLog are sketching algorithms that
have become indispensable for set summaries in big data
applications. While HyperLogLog allows counting
different elements with very little space, MinHash is
suitable for the fast comparison of sets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bao:2021:CEM,
author = "Ergute Bao and Yin Yang and Xiaokui Xiao and Bolin
Ding",
title = "{CGM}: an enhanced mechanism for streaming data
collection with local differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2258--2270",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476277",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476277",
abstract = "Local differential privacy (LDP) is a well-established
privacy protection scheme for collecting sensitive
data, which has been integrated into major platforms
such as iOS, Chrome, and Windows. The main idea is that
each individual randomly perturbs her \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Leo:2021:ETA,
author = "Dean {De Leo} and Per Fuchs and Peter Boncz",
title = "Errata for {``Teseo and the analysis of structural
dynamic graphs'': (PVLDB {\bf 14}(6):1053--1066)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2271--2272",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476278",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{DeLeo:2021:TAS}.",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476278",
abstract = "In our paper [4], we experimentally evaluated our
work, Teseo, together with five other systems under the
LDBC Graphalytics benchmark [6]. We developed and
publicly released [2] an ad-hoc driver for the purpose.
Since the time the paper was published, a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Musleh:2021:QMB,
author = "Mashaal Musleh and Sofiane Abbar and Rade Stanojevic
and Mohamed Mokbel",
title = "{QARTA}: an {ML}-based system for accurate map
services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2273--2282",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476279",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476279",
abstract = "Maps services are ubiquitous in widely used
applications including navigation systems, ride
sharing, and items/food delivery. Though there are
plenty of efforts to support such services through
designing more efficient algorithms, we believe that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cunningham:2021:RWT,
author = "Teddy Cunningham and Graham Cormode and Hakan
Ferhatosmanoglu and Divesh Srivastava",
title = "Real-world trajectory sharing with local differential
privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2283--2295",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476280",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476280",
abstract = "Sharing trajectories is beneficial for many real-world
applications, such as managing disease spread through
contact tracing and tailoring public services to a
population's travel patterns. However, public concern
over privacy and data protection has \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sinthong:2021:PRQ,
author = "Phanwadee Sinthong and Michael J. Carey",
title = "{PolyFrame}: a retargetable query-based approach to
scaling dataframes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2296--2304",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476281",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 06:21:49 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476281",
abstract = "In the last few years, the field of data science has
been growing rapidly as various businesses have adopted
statistical and machine learning techniques to empower
their decision-making and applications. Scaling data
analyses to large volumes of data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shi:2021:SCD,
author = "Jessica Shi and Laxman Dhulipala and David Eisenstat
and Jakub {\L}{\k{a}}cki and Vahab Mirrokni",
title = "Scalable community detection via parallel correlation
clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2305--2313",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476282",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476282",
abstract = "Graph clustering and community detection are central
problems in modern data mining. The increasing need for
analyzing billion-scale data calls for faster and more
scalable algorithms for these problems. There are
certain trade-offs between the quality \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2021:SSB,
author = "Cheng Xu and Ce Zhang and Jianliang Xu and Jian Pei",
title = "{SlimChain}: scaling blockchain transactions through
off-chain storage and parallel processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2314--2326",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476283",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476283",
abstract = "Blockchain technology has emerged as the cornerstone
of many decentralized applications operating among
otherwise untrusted peers. However, it is well known
that existing blockchain systems do not scale well.
Transactions are often executed and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:TOG,
author = "Side Li and Arun Kumar",
title = "Towards an optimized {GROUP BY} abstraction for
large-scale machine learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2327--2340",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476284",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476284",
abstract = "Many applications that use large-scale machine
learning (ML) increasingly prefer different models for
subgroups (e.g., countries) to improve accuracy,
fairness, or other desiderata. We call this emerging
popular practice learning over groups, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kang:2021:AAA,
author = "Daniel Kang and John Guibas and Peter Bailis and
Tatsunori Hashimoto and Yi Sun and Matei Zaharia",
title = "Accelerating approximate aggregation queries with
expensive predicates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2341--2354",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476285",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476285",
abstract = "Researchers and industry analysts are increasingly
interested in computing aggregation queries over large,
unstructured datasets with selective predicates that
are computed using expensive deep neural networks
(DNNs). As these DNNs are expensive and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schmidt:2021:FDA,
author = "Tobias Schmidt and Maximilian Bandle and Jana Giceva",
title = "A four-dimensional analysis of partitioned approximate
filters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2355--2368",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476286",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476286",
abstract = "With today's data deluge, approximate filters are
particularly attractive to avoid expensive operations
like remote data/disk accesses. Among the many filter
variants available, it is non-trivial to find the most
suitable one and its optimal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chiosa:2021:SOP,
author = "Monica Chiosa and Thomas B. Preu{\ss}er and Gustavo
Alonso",
title = "{SKT}: a one-pass multi-sketch data analytics
accelerator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2369--2382",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476287",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476287",
abstract = "Data analysts often need to characterize a data stream
as a first step to its further processing. Some of the
initial insights to be gained include, e.g., the
cardinality of the data set and its frequency
distribution. Such information is typically \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fent:2021:PAG,
author = "Philipp Fent and Thomas Neumann",
title = "A practical approach to groupjoin and nested
aggregates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2383--2396",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476288",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476288",
abstract = "Groupjoins, the combined execution of a join and a
subsequent group by, are common in analytical queries,
and occur in about 1/8 of the queries in TPC-H and
TPC-DS. While they were originally invented to improve
performance, efficient parallel execution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wei:2021:RVQ,
author = "Ziyun Wei and Immanuel Trummer and Connor Anderson",
title = "Robust voice querying with {MUVE}: optimally
visualizing results of phonetically similar queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2397--2409",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476289",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476289",
abstract = "Recently proposed voice query interfaces translate
voice input into SQL queries. Unreliable speech
recognition on top of the intrinsic challenges of
text-to-SQL translation makes it hard to reliably
interpret user input. We present MUVE (Multiplots for
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2021:CCF,
author = "Yinjun Wu and James Weimer and Susan B. Davidson",
title = "{CHEF}: a cheap and fast pipeline for iteratively
cleaning label uncertainties",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2410--2418",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476290",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476290",
abstract = "High-quality labels are expensive to obtain for many
machine learning tasks, such as medical image
classification tasks. Therefore, probabilistic (weak)
labels produced by weak supervision tools are used to
seed a process in which influential samples \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Siddiqui:2021:CAG,
author = "Tarique Siddiqui and Surajit Chaudhuri and Vivek
Narasayya",
title = "{COMPARE}: accelerating groupwise comparison in
relational databases for data analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2419--2431",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476291",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476291",
abstract = "Data analysis often involves comparing subsets of data
across many dimensions for finding unusual trends and
patterns. While the comparison between subsets of data
can be expressed using SQL, they tend to be complex to
write, and suffer from poor \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Durner:2021:CUC,
author = "Dominik Durner and Badrish Chandramouli and Yinan Li",
title = "{Crystal}: a unified cache storage system for
analytical databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2432--2444",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476292",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476292",
abstract = "Cloud analytical databases employ a disaggregated
storage model, where the elastic compute layer accesses
data persisted on remote cloud storage in
block-oriented columnar formats. Given the high latency
and low bandwidth to remote storage and the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cetorelli:2021:SEP,
author = "Valerio Cetorelli and Paolo Atzeni and Valter
Crescenzi and Franco Milicchio",
title = "The smallest extraction problem",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2445--2458",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476293",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476293",
abstract = "We introduce landmark grammars, a new family of
context-free grammars aimed at describing the HTML
source code of pages published by large and templated
websites and therefore at effectively tackling Web data
extraction problems. Indeed, they address \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Thirumuruganathan:2021:DLB,
author = "Saravanan Thirumuruganathan and Han Li and Nan Tang
and Mourad Ouzzani and Yash Govind and Derek Paulsen
and Glenn Fung and AnHai Doan",
title = "Deep learning for blocking in entity matching: a
design space exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2459--2472",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476294",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476294",
abstract = "Entity matching (EM) finds data instances that refer
to the same real-world entity. Most EM solutions
perform blocking then matching. Many works have applied
deep learning (DL) to matching, but far fewer works
have applied DL to blocking. These blocking \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:GID,
author = "Wentao Zhang and Zhi Yang and Yexin Wang and Yu Shen
and Yang Li and Liang Wang and Bin Cui",
title = "{GRAIN}: improving data efficiency of {\em gra\/}ph
neural networks via diversified {\em in\/}fluence
maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2473--2482",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476295",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476295",
abstract = "Data selection methods, such as active learning and
core-set selection, are useful tools for improving the
data efficiency of deep learning models on large-scale
datasets. However, recent deep learning models have
moved forward from independent and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bandle:2021:DTM,
author = "Maximilian Bandle and Jana Giceva",
title = "Database technology for the masses: sub-operators as
first-class entities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2483--2490",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476296",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476296",
abstract = "A wealth of technology has evolved around relational
databases over decades that has been successfully tried
and tested in many settings and use cases. Yet, the
majority of it remains overlooked in the pursuit of
performance (e.g., NoSQL) or new \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gupta:2021:CSL,
author = "Pranjal Gupta and Amine Mhedhbi and Semih Salihoglu",
title = "Columnar storage and list-based processing for graph
database management systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2491--2504",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476297",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476297",
abstract = "We revisit column-oriented storage and query
processing techniques in the context of contemporary
graph database management systems (GDBMSs). Similar to
column-oriented RDBMSs, GDBMSs support read-heavy
analytical workloads that however have \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2021:PLB,
author = "Yiwen Zhu and Matteo Interlandi and Abhishek Roy and
Krishnadhan Das and Hiren Patel and Malay Bag and
Hitesh Sharma and Alekh Jindal",
title = "{Phoebe}: a learning-based checkpoint optimizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2505--2518",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476298",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476298",
abstract = "Easy-to-use programming interfaces paired with
cloud-scale processing engines have enabled big data
system users to author arbitrarily complex analytical
jobs over massive volumes of data. However, as the
complexity and scale of analytical jobs increase,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nargesian:2021:TDS,
author = "Fatemeh Nargesian and Abolfazl Asudeh and H. V.
Jagadish",
title = "Tailoring data source distributions for fairness-aware
data integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2519--2532",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476299",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476299",
abstract = "Data scientists often develop data sets for analysis
by drawing upon sources of data available to them. A
major challenge is to ensure that the data set used for
analysis has an appropriate representation of relevant
(demographic) groups: it meets \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bansal:2021:MVI,
author = "Parikshit Bansal and Prathamesh Deshpande and Sunita
Sarawagi",
title = "Missing value imputation on multidimensional time
series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2533--2545",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476300",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476300",
abstract = "We present DeepMVI, a deep learning method for missing
value imputation in multidimensional time-series
datasets. Missing values are commonplace in decision
support platforms that aggregate data over long time
stretches from disparate sources, whereas \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rezig:2021:HSD,
author = "El Kindi Rezig and Mourad Ouzzani and Walid G. Aref
and Ahmed K. Elmagarmid and Ahmed R. Mahmood and
Michael Stonebraker",
title = "{Horizon}: scalable dependency-driven data cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2546--2554",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476301",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476301",
abstract = "A large class of data repair algorithms rely on
integrity constraints to detect and repair errors. A
well-studied class of constraints is Functional
Dependencies (FDs, for short). Although there has been
an increased interest in developing general data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shaowang:2021:DDS,
author = "Ted Shaowang and Nilesh Jain and Dennis D. Matthews
and Sanjay Krishnan",
title = "Declarative data serving: the future of machine
learning inference on the edge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2555--2562",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476302",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476302",
abstract = "Recent advances in computer architecture and
networking have ushered in a new age of edge computing,
where computation is placed close to the point of data
collection to facilitate low-latency decision making.
As the complexity of such deployments grow \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2021:APS,
author = "Junwen Yang and Yeye He and Surajit Chaudhuri",
title = "Auto-pipeline: synthesizing complex data pipelines
by-target using reinforcement learning and search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2563--2575",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476303",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476303",
abstract = "Recent work has made significant progress in helping
users to automate single data preparation steps, such
as string-transformations and table-manipulation
operators (e.g., Join, GroupBy, Pivot, etc.). We in
this work propose to automate multiple such \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lockhart:2021:EIQ,
author = "Brandon Lockhart and Jinglin Peng and Weiyuan Wu and
Jiannan Wang and Eugene Wu",
title = "Explaining inference queries with {Bayesian}
optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2576--2585",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476304",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476304",
abstract = "Obtaining an explanation for an SQL query result can
enrich the analysis experience, reveal data errors, and
provide deeper insight into the data. Inference query
explanation seeks to explain unexpected aggregate query
results on inference data; such \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:DBF,
author = "Chunwei Liu and Hao Jiang and John Paparrizos and
Aaron J. Elmore",
title = "Decomposed bounded floats for fast compression and
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2586--2598",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476305",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476305",
abstract = "Modern data-intensive applications often generate
large amounts of low precision float data with a
limited range of values. Despite the prevalence of such
data, there is a lack of an effective solution to
ingest, store, and analyze bounded, low-precision,
numeric data. To address this gap, we propose Buff, a
new compression technique that uses a decomposed
columnar storage and encoding methods to provide
effective compression, fast ingestion, and high-speed
in-situ adaptive query operators with SIMD support.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tziavelis:2021:BEJ,
author = "Nikolaos Tziavelis and Wolfgang Gatterbauer and Mirek
Riedewald",
title = "Beyond equi-joins: ranking, enumeration and
factorization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2599--2612",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476306",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476306",
abstract = "We study theta-joins in general and join predicates
with conjunctions and disjunctions of inequalities in
particular, focusing on ranked enumeration where the
answers are returned incrementally in an order dictated
by a given ranking function. Our \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jacob:2021:EBE,
author = "Vincent Jacob and Fei Song and Arnaud Stiegler and
Bijan Rad and Yanlei Diao and Nesime Tatbul",
title = "{Exathlon}: a benchmark for explainable anomaly
detection over time series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2613--2626",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476307",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476307",
abstract = "Access to high-quality data repositories and
benchmarks have been instrumental in advancing the
state of the art in many experimental research domains.
While advanced analytics tasks over time series data
have been gaining lots of attention, lack of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kuchnik:2021:PCR,
author = "Michael Kuchnik and George Amvrosiadis and Virginia
Smith",
title = "Progressive compressed records: taking a byte out of
deep learning data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2627--2641",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476308",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476308",
abstract = "Deep learning accelerators efficiently train over vast
and growing amounts of data, placing a newfound burden
on commodity networks and storage devices. A common
approach to conserve bandwidth involves resizing or
compressing data prior to training. We \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alsaudi:2021:TFQ,
author = "Abdulrahman Alsaudi and Yasser Altowim and Sharad
Mehrotra and Yaming Yu",
title = "{TQEL}: framework for query-driven linking of top-$k$
entities in social media blogs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "11",
pages = "2642--2654",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476249.3476309",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 18:05:40 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476249.3476309",
abstract = "Social media analysis over blogs (such as tweets)
often requires determining top-k mentions of a certain
category (e.g., movies) in a collection (e.g., tweets
collected over a given day). Such queries require
entity linking (EL) function to be executed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2021:KEN,
author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U. and Weng
Hou Tong and Shivansh Mittal and Ye Li and Reynold
Cheng",
title = "{KDV-explorer}: a near real-time kernel density
visualization system for spatial analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2655--2658",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476312",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476312",
abstract = "Kernel density visualization (KDV) is a commonly used
visualization tool for many spatial analysis tasks,
including disease outbreak detection, crime hotspot
detection, and traffic accident hotspot detection.
Although the most popular geographical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:RRI,
author = "Zhebin Zhang and Dajie Dong and Yuhang Ma and Yilong
Ying and Dawei Jiang and Ke Chen and Lidan Shou and
Gang Chen",
title = "{Refiner}: a reliable incentive-driven federated
learning system powered by blockchain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2659--2662",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476313",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476313",
abstract = "Modern mobile applications often produce decentralized
data, i.e., a huge amount of privacy-sensitive data
distributed over a large number of mobile devices.
Techniques for learning models from decentralized data
must properly handle two natures of such \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Uotila:2021:MMM,
author = "Valter Uotila and Jiaheng Lu and Dieter Gawlick and
Zhen Hua Liu and Souripriya Das and Gregory
Pogossiants",
title = "{MultiCategory}: multi-model query processing meets
category theory and functional programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2663--2666",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476314",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476314",
abstract = "The variety of data is one of the important issues in
the era of Big Data. The data are naturally organized
in different formats and models, including structured
data, semi-structured data, and unstructured data.
Prior research has envisioned an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:CCQ,
author = "Qichen Wang and Chaoqi Zhang and Danish Alsayed and Ke
Yi and Bin Wu and Feifei Li and Chaoqun Zhan",
title = "{Cquirrel}: continuous query processing over acyclic
relational schemas",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2667--2670",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476315",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476315",
abstract = "We will demonstrate Cquirrel, a continuous query
processing engine built on top of Flink. Cquirrel
assumes a relational schema where the foreign-key
constraints form a directed acyclic graph, and supports
any selection-projection-join-aggregation query
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mao:2021:DDF,
author = "Yuetian Mao and Shuai Yuan and Nan Cui and Tianjiao Du
and Beijun Shen and Yuting Chen",
title = "{DeFiHap}: detecting and fixing {HiveQL}
anti-patterns",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2671--2674",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476316",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476316",
abstract = "The emergence of Hive greatly facilitates the
management of massive data stored in various places.
Meanwhile, data scientists face challenges during
HiveQL programming --- they may not use correct and/or
efficient HiveQL statements in their programs;
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Helal:2021:DKD,
author = "Ahmed Helal and Mossad Helali and Khaled Ammar and
Essam Mansour",
title = "A demonstration of {KGLac}: a data discovery and
enrichment platform for data science",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2675--2678",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476317",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476317",
abstract = "Data science growing success relies on knowing where a
relevant dataset exists, understanding its impact on a
specific task, finding ways to enrich a dataset, and
leveraging insights derived from it. With the growth of
open data initiatives, data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Faure-Giovagnoli:2021:AVL,
author = "Pierre Faure-Giovagnoli and Marie {Le Guilly} and
Jean-Marc Petit and Vasile-Marian Scuturici",
title = "{ADESIT}: visualize the limits of your data in a
machine learning process",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2679--2682",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476318",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476318",
abstract = "Thanks to the numerous machine learning tools
available to us nowadays, it is easier than ever to
derive a model from a dataset in the frame of a
supervised learning problem. However, when this model
behaves poorly compared with an expected performance,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yan:2021:PAM,
author = "Yinzhao Yan and Raymond Chi-Wing Wong",
title = "{Path Advisor}: a multi-functional campus map tool for
shortest path",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2683--2686",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476319",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476319",
abstract = "The shortest path in both the two dimensional (2D)
plane and the three dimensional (3D) terrain is
extensively used both in industry and academia.
Although there are some map visualization tools for
viewing the shortest path in 2D and 3D views, we find
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:IHL,
author = "Liangde Li and Supun Nakandala and Arun Kumar",
title = "Intermittent human-in-the-loop model selection using
{Cerebro}: a demonstration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2687--2690",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476320",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476320",
abstract = "Deep learning (DL) is revolutionizing many fields.
However, there is a major bottleneck for the wide
adoption of DL: the pain of model selection, which
requires exploring a large config space of model
architecture and training hyper-parameters before
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Funke:2021:LLC,
author = "Henning Funke and Jens Teubner",
title = "Low-latency compilation of {SQL} queries to machine
code",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2691--2694",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476321",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476321",
abstract = "Query compilation has proven to be one of the most
efficient query processing techniques. Despite its fast
processing speed, the additional compilation times of
the technique limit its applicability. This is because
the approach is most beneficial only \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Groppe:2021:SDS,
author = "Sven Groppe and Rico Klinckenberg and Benjamin
Warnke",
title = "Sound of databases: sonification of a semantic web
database engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2695--2698",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476322",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476322",
abstract = "Sonifications map data to auditory dimensions and
offer a new audible experience to their listeners. We
propose a sonification of query processing paired with
a corresponding visualization both integrated in a web
application. In this demonstration we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:HHM,
author = "Zihao Chen and Zhizhen Xu and Chen Xu and Juan Soto
and Volker Markl and Weining Qian and Aoying Zhou",
title = "{HyMAC}: a hybrid matrix computation system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2699--2702",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476323",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476323",
abstract = "Distributed matrix computation is common in
large-scale data processing and machine learning
applications. Iterative-convergent algorithms involving
matrix computation share a common property: parameters
converge non-uniformly. This property can be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2021:GOS,
author = "Jingbo Xu and Zhanning Bai and Wenfei Fan and Longbin
Lai and Xue Li and Zhao Li and Zhengping Qian and Lei
Wang and Lei Wang and Yanyan Wang and Wenyuan Yu and
Jingren Zhou",
title = "{GraphScope}: a one-stop large graph processing
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2703--2706",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476324",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476324",
abstract = "Due to diverse graph data and algorithms, programming
and orchestration of complex computation pipelines have
become the major challenges to making use of graph
applications for Web-scale data analysis. GraphScope
aims to provide a one-stop and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Renz-Wieland:2021:JMI,
author = "Alexander Renz-Wieland and Tobias Drobisch and Zoi
Kaoudi and Rainer Gemulla and Volker Markl",
title = "Just move it!: dynamic parameter allocation in
action",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2707--2710",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476325",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476325",
abstract = "Parameter servers (PSs) ease the implementation of
distributed machine learning systems, but their
performance can fall behind that of single machine
baselines due to communication overhead. We demonstrate
Lapse, an open source PS with dynamic parameter
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Orogat:2021:CDC,
author = "Abdelghny Orogat and Ahmed El-Roby",
title = "{CBench}: demonstrating comprehensive evaluation of
question answering systems over knowledge graphs
through deep analysis of benchmarks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2711--2714",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476326",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476326",
abstract = "A plethora of question answering (QA) systems that
retrieve answers to natural language questions from
knowledge graphs have been developed in recent years.
However, choosing a benchmark to accurately assess the
quality of a question answering system is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Woltmann:2021:PPM,
author = "Lucas Woltmann and Dominik Olwig and Claudio Hartmann
and Dirk Habich and Wolfgang Lehner",
title = "{PostCENN}: {PostgreSQL} with machine learning models
for cardinality estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2715--2718",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476327",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476327",
abstract = "In this demo, we present PostCENN, an enhanced
PostgreSQL database system with an end-to-end
integration of machine learning (ML) models for
cardinality estimation. In general, cardinality
estimation is a topic with a long history in the
database \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:DDU,
author = "Jinyang Li and Yuval Moskovitch and H. V. Jagadish",
title = "{DENOUNCER}: detection of unfairness in classifiers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2719--2722",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476328",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476328",
abstract = "The use of automated data-driven tools for
decision-making has gained popularity in recent years.
At the same time, the reported cases of algorithmic
bias and discrimination increase as well, which in turn
lead to an extensive study of algorithmic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Abbar:2021:DQM,
author = "Sofiane Abbar and Rade Stanojevic and Mashaal Musleh
and Mohamed ElShrif and Mohamed Mokbel",
title = "A demonstration of {QARTA}: an {ML}-based system for
accurate map services",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2723--2726",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476329",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476329",
abstract = "This demo presents QARTA; an open-source full-fledged
system for highly accurate and scalable map services.
QARTA employs machine learning techniques to: (a)
construct its own highly accurate map in terms of both
map topology and edge weights, and (b) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Smith:2021:TTN,
author = "Jaclyn Smith and Michael Benedikt and Brandon Moore
and Milos Nikolic",
title = "{TraNCE}: transforming nested collections
efficiently",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2727--2730",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476330",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476330",
abstract = "Nested relational query languages have long been seen
as an attractive tool for scenarios involving large
hierarchical datasets. There has been a resurgence of
interest in nested relational languages. One driver has
been the affinity of these languages \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Diestelkamper:2021:DMA,
author = "Ralf Diestelk{\"a}mper and Seokki Lee and Boris Glavic
and Melanie Herschel",
title = "Debugging missing answers for {Spark} queries over
nested data with {Breadcrumb}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2731--2734",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476331",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476331",
abstract = "We present Breadcrumb, a system that aids developers
in debugging queries through query-based explanations
for missing answers. Given as input a query and an
expected, but missing, query result, Breadcrumb
identifies operators in the input query that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2021:DPW,
author = "Renzhi Wu and Prem Sakala and Peng Li and Xu Chu and
Yeye He",
title = "Demonstration of {Panda}: a weakly supervised entity
matching system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2735--2738",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476332",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476332",
abstract = "Entity matching (EM) refers to the problem of
identifying tuple pairs in one or more relations that
refer to the same real world entities. Supervised
machine learning (ML) approaches, and deep learning
based approaches in particular, typically achieve
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:ADAb,
author = "Jiabin Liu and Fu Zhu and Chengliang Chai and Yuyu Luo
and Nan Tang",
title = "Automatic data acquisition for deep learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2739--2742",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476333",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476333",
abstract = "Deep learning (DL) has widespread applications and has
revolutionized many industries. Although automated
machine learning (AutoML) can help us away from coding
for DL models, the acquisition of lots of high-quality
data for model training remains a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2021:DSD,
author = "Xuanhe Zhou and Lianyuan Jin and Ji Sun and Xinyang
Zhao and Xiang Yu and Jianhua Feng and Shifu Li and
Tianqing Wang and Kun Li and Luyang Liu",
title = "{DBMind}: a self-driving platform in {openGauss}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2743--2746",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476334",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476334",
abstract = "We demonstrate a self-driving system DBMind, which
provides three autonomous capabilities in database,
including self-monitoring, self-diagnosis and
self-optimization. First, self-monitoring judiciously
collects database metrics and detects anomalies
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2021:DDE,
author = "Qiongqiong Lin and Jiayao Zhang and Jinfei Liu and Kui
Ren and Jian Lou and Junxu Liu and Li Xiong and Jian
Pei and Jimeng Sun",
title = "Demonstration of {Dealer}: an end-to-end model
marketplace with differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2747--2750",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476335",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476335",
abstract = "Data-driven machine learning (ML) has witnessed great
success across a variety of application domains. Since
ML model training relies on a large amount of data,
there is a growing demand for high-quality data to be
collected for ML model training. Data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mu:2021:AAC,
author = "Tianyu Mu and Hongzhi Wang and Shenghe Zheng and
Shaoqing Zhang and Cheng Liang and Haoyun Tang",
title = "{Assassin}: an automatic classification system based
on algorithm selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2751--2754",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476336",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476336",
abstract = "The increasing complexity of data analysis tasks makes
it dependent on human expertise and challenging for
non-experts. One of the major challenges faced in data
analysis is the selection of the proper algorithm for
given tasks and data sets. Motivated \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cao:2021:AMD,
author = "Lei Cao and Dongqing Xiao and Yizhou Yan and Samuel
Madden and Guoliang Li",
title = "{ATLANTIC}: making database differentially private and
faster with accuracy guarantee",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2755--2758",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476337",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476337",
abstract = "Differential privacy promises to enable data sharing
and general data analytics while protecting individual
privacy. Because the private data is often stored in
the form of relational database that supports SQL
queries, making SQL-based analytics \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xie:2021:DMS,
author = "Anze Xie and Anders Carlsson and Jason Mohoney and
Roger Waleffe and Shanan Peters and Theodoros
Rekatsinas and Shivaram Venkataraman",
title = "Demo of {Marius}: a system for large-scale graph
embeddings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2759--2762",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476338",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476338",
abstract = "Graph embeddings have emerged as the de facto
representation for modern machine learning over graph
data structures. The goal of graph embedding models is
to convert high-dimensional sparse graphs into
low-dimensional, dense and continuous vector spaces
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Muller:2021:PPO,
author = "Heiko M{\"u}ller and Sonia Castelo and Munaf Qazi and
Juliana Freire",
title = "From papers to practice: the \pkg{openclean}
open-source data cleaning library",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2763--2766",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476339",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476339",
abstract = "Data preparation is still a major bottleneck for many
data science projects. Even though many sophisticated
algorithms and tools have been proposed in the research
literature, it is difficult for practitioners to
integrate them into their data wrangling \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ge:2021:DAD,
author = "Yongming Ge and Vanessa Lin and Maureen Daum and
Brandon Haynes and Alvin Cheung and Magdalena
Balazinska",
title = "Demonstration of {Apperception}: a database management
system for geospatial video data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2767--2770",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476340",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476340",
abstract = "Many recent video applications---including traffic
monitoring, drone analytics, autonomous driving, and
virtual reality---require piecing together, combining,
and operating over many related video streams. Despite
the massive data volumes involved and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Karatzoglidi:2021:AEC,
author = "Mary Karatzoglidi and Paraskevas Kerasiotis and Verena
Kantere",
title = "Automated energy consumption forecasting with
{EnForce}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2771--2774",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476341",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476341",
abstract = "The need to reduce energy consumption on a global
scale has been of high importance during the last
years. Research has created methods to make highly
accurate forecasts on the energy consumption of
buildings and there have been efforts towards the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jang:2021:RWG,
author = "Myung-Hwan Jang and Yong-Yeon Jo and Sang-Wook Kim",
title = "{RealGraph$^{Web}$}: a graph analysis platform on the
web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2775--2778",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476342",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476342",
abstract = "In this demo, we present RealGraph$^{Web}$, a
web-based platform that provides various kinds of graph
analysis services. RealGraph$^{Web}$ is based on
RealGraph, a graph engine that addresses the problem of
performance degradation in processing real-world big
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ghosh:2021:IDS,
author = "Arthita Ghosh and Deven Bansod and Arpit Narechania
and Prashanth Dintyala and Su Timurturkan and Joy
Arulraj",
title = "Interactive demonstration of {SQLCheck}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2779--2782",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476343",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476343",
abstract = "We will demonstrate a prototype of sqlcheck, a
holistic toolchain for automatically finding and fixing
anti-patterns in database applications. The advent of
modern database-as-a-service platforms has made it easy
for developers to quickly create \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2021:CET,
author = "Yiming Lin and Pramod Khargonekar and Sharad Mehrotra
and Nalini Venkatasubramanian",
title = "{T-Cove}: an exposure tracing system based on cleaning
{Wi-Fi} events on organizational premises",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2783--2786",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476344",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476344",
abstract = "WiFi connectivity events, generated when a mobile
device connects to WiFi access points can serve as a
robust, passive, (almost) zero-cost indoor localization
technology. The challenge is the coarse level
localization it offers that limits its \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:DGE,
author = "Paul Y. Wang and Sainyam Galhotra and Romila Pradhan
and Babak Salimi",
title = "Demonstration of generating explanations for black-box
algorithms using {Lewis}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2787--2790",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476345",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476345",
abstract = "Explainable artificial intelligence (XAI) aims to
reduce the opacity of AI-based decision-making systems,
allowing humans to scrutinize and trust them. Unlike
prior work that attributes the responsibility for an
algorithm's decisions to its inputs as a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Castelo:2021:ADS,
author = "Sonia Castelo and R{\'e}mi Rampin and A{\'e}cio Santos
and Aline Bessa and Fernando Chirigati and Juliana
Freire",
title = "{Auctus}: a dataset search engine for data discovery
and augmentation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2791--2794",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476346",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476346",
abstract = "The large volumes of structured data currently
available, from Web tables to open-data portals and
enterprise data, open up new opportunities for progress
in answering many important scientific, societal, and
business questions. However, finding \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rehman:2021:DRS,
author = "Mohammed Suhail Rehman and Silu Huang and Aaron J.
Elmore",
title = "A demonstration of {RELIC}: a system for retrospective
lineage inference of data workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2795--2798",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476347",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476347",
abstract = "The ad-hoc, heterogeneous process of modern data
science typically involves loading, cleaning, and
mutating dataset(s) into multiple versions recorded as
artifacts by various tools within a single data science
workflow. Lineage information, including \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:SSC,
author = "Zhihao Chen and Haizhen Zhuo and Quanqing Xu and
Xiaodong Qi and Chengyu Zhu and Zhao Zhang and Cheqing
Jin and Aoying Zhou and Ying Yan and Hui Zhang",
title = "{SChain}: a scalable consortium blockchain exploiting
intra- and inter-block concurrency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2799--2802",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476348",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476348",
abstract = "We demonstrate SChain, a consortium blockchain that
scales transaction processing to support large-scale
enterprise applications. The unique advantage of SChain
stems from the exploitation of both intra- and
inter-block concurrency. The intra-block \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Anastasiou:2021:EEP,
author = "Chrysovalantis Anastasiou and Constantinos Costa and
Panos K. Chrysanthis and Cyrus Shahabi",
title = "{EPICGen}: an experimental platform for indoor
congestion generation and forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2803--2806",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476349",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476349",
abstract = "Effectively and accurately forecasting the congestion
in indoor spaces has become particularly important
during the pandemic in order to reduce the risk of
exposure to airborne viruses. However, there is a lack
of readily available indoor congestion \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arnaout:2021:WKB,
author = "Hiba Arnaout and Simon Razniewski and Gerhard Weikum
and Jeff Z. Pan",
title = "{Wikinegata}: a knowledge base with interesting
negative statements",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2807--2810",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476350",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476350",
abstract = "Databases about general-world knowledge, so-called
knowledge bases (KBs), are important in applications
such as search and question answering. Traditionally,
although KBs use open world assumption, popular KBs
only store positive information, but \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2021:FEE,
author = "Jinwei Zhu and Kun Cheng and Jiayang Liu and Liang
Guo",
title = "Full encryption: an end to end encryption mechanism in
{GaussDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2811--2814",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476351",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476351",
abstract = "In this paper, we present a novel mechanism called
Full Encryption (FE) in GaussDB. FE-in-GaussDB provides
column-level encryption for sensitive data, and secures
the asset from any malicious cloud administrator or
information leakage attack. It ensures \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mandamadiotis:2021:DIA,
author = "Antonis Mandamadiotis and Stavroula Eleftherakis and
Apostolos Glenis and Dimitrios Skoutas and Yannis
Stavrakas and Georgia Koutrika",
title = "{DatAgent}: the imminent age of intelligent data
assistants",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2815--2818",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476352",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476352",
abstract = "In this demonstration, we present DatAgent, an
intelligent data assistant system that allows users to
ask queries in natural language, and can respond in
natural language as well. Moreover, the system actively
guides the user using different types of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rezig:2021:DDD,
author = "El Kindi Rezig and Anshul Bhandari and Anna Fariha and
Benjamin Price and Allan Vanterpool and Vijay Gadepally
and Michael Stonebraker",
title = "{DICE}: data discovery by example",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2819--2822",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476353",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476353",
abstract = "In order to conduct analytical tasks, data scientists
often need to find relevant data from an avalanche of
sources (e.g., data lakes, large organizational
databases). This effort is typically made in an ad hoc,
non-systematic manner, which makes it a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schuhknecht:2021:AAP,
author = "Felix Schuhknecht and Aaron Priesterroth and Justus
Henneberg and Reza Salkhordeh",
title = "{AnyOLAP}: analytical processing of arbitrary
data-intensive applications without {ETL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2823--2826",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476354",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476354",
abstract = "The volume of data that is processed and produced by
modern data-intensive applications is constantly
increasing. Of course, along with the volume, the
interest in analyzing and interpreting this data
increases as well. As a consequence, more and more
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jacob:2021:DEB,
author = "Vincent Jacob and Fei Song and Arnaud Stiegler and
Bijan Rad and Yanlei Diao and Nesime Tatbul",
title = "A demonstration of the {Exathlon} benchmarking
platform for explainable anomaly detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2827--2830",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476355",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476355",
abstract = "In this demo, we introduce Exathlon --- a new
benchmarking platform for explainable anomaly detection
over high-dimensional time series. We designed Exathlon
to support data scientists and researchers in
developing and evaluating learned models and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shaikhha:2021:IRH,
author = "Amir Shaikhha and Maximilian Schleich and Dan
Olteanu",
title = "An intermediate representation for hybrid database and
machine learning workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2831--2834",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476356",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476356",
abstract = "IFAQ is an intermediate representation and compilation
framework for hybrid database and machine learning
workloads expressible using iterative programs with
functional aggregate queries. We demonstrate IFAQ for
several OLAP queries, linear algebra \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pastor:2021:HDY,
author = "Eliana Pastor and Andrew Gavgavian and Elena Baralis
and Luca de Alfaro",
title = "How divergent is your data?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2835--2838",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476357",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476357",
abstract = "We present DivExplorer, a tool that enables users to
explore datasets and find subgroups of data for which a
classifier behaves in an anomalous manner. These
subgroups, denoted as divergent subgroups, may exhibit,
for example, higher-than-normal false \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Berro:2021:ERP,
author = "Auday Berro and Mohammad-Ali Yaghub Zade Fard and
Marcos Baez and Boualem Benatallah and Khalid
Benabdeslem",
title = "An extensible and reusable pipeline for automated
utterance paraphrases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2839--2842",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476358",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476358",
abstract = "In this demonstration paper we showcase an extensible
and reusable pipeline for automatic paraphrase
generation, i.e., reformulating sentences using
different words. Capturing the nuances of human
language is fundamental to the effectiveness of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Beedkar:2021:CGD,
author = "Kaustubh Beedkar and David Brekardin and Jorge-Arnulfo
Quian{\'e}-Ruiz and Volker Markl",
title = "Compliant geo-distributed data processing in action",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2843--2846",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476359",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476359",
abstract = "In this paper we present our work on compliant
geo-distributed data processing. Our work focuses on
the new dimension of dataflow constraints that regulate
the movement of data across geographical or
institutional borders. For example, European \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yadav:2021:QDV,
author = "Piyush Yadav and Dhaval Salwala and Felipe Arruda
Pontes and Praneet Dhingra and Edward Curry",
title = "Query-driven video event processing for the {Internet
of Multimedia Things}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2847--2850",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476360",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476360",
abstract = "Advances in Deep Neural Network (DNN) techniques have
revolutionized video analytics and unlocked the
potential for querying and mining video event patterns.
This paper details GNOSIS, an event processing platform
to perform near-real-time video event \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutroumanis:2021:DNU,
author = "Nikolaos Koutroumanis and Nikolaos Kousathanas and
Christos Doulkeridis and Akrivi Vlachou",
title = "A demonstration of {NoDA}: unified access to {NoSQL}
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2851--2854",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476361",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476361",
abstract = "In this demo paper, we present a system prototype,
called NoDA, that unifies access to NoSQL stores, by
exposing a single interface to big data developers.
This hides the heterogeneity of NoSQL stores, in terms
of different query languages, non- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sen:2021:APP,
author = "Rathijit Sen and Abhishek Roy and Alekh Jindal and Rui
Fang and Jeff Zheng and Xiaolei Liu and Ruiping Li",
title = "{AutoExecutor}: predictive parallelism for {Spark SQL}
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2855--2858",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476362",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476362",
abstract = "Right-sizing resources for query execution is
important for cost-efficient performance, but
estimating how performance is affected by resource
allocations, upfront, before query execution is
difficult. We demonstrate AutoExecutor, a predictive
system \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:CBA,
author = "Jiaxiang Liu and Karl Knopf and Yiqing Tan and Bolin
Ding and Xi He",
title = "Catch a {Blowfish} alive: a demonstration of
policy-aware differential privacy for interactive data
exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2859--2862",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476363",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476363",
abstract = "Policy-aware differential privacy (DP) frameworks such
as Blowfish privacy enable more accurate query answers
than standard DP. In this work, we build the first
policy-aware DP system for interactive data
exploration, BlowfishDB, that aims to (i) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ouellette:2021:RDL,
author = "Paul Ouellette and Aidan Sciortino and Fatemeh
Nargesian and Bahar Ghadiri Bashardoost and Erkang Zhu
and Ken Q. Pu and Ren{\'e}e J. Miller",
title = "{RONIN}: data lake exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2863--2866",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476364",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476364",
abstract = "Dataset discovery can be performed using search (with
a query or keywords) to find relevant data. However,
the result of this discovery can be overwhelming to
explore. Existing navigation techniques mostly focus on
linkage graphs that enable navigation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boniol:2021:SAS,
author = "Paul Boniol and John Paparrizos and Themis Palpanas
and Michael J. Franklin",
title = "{SAND} in action: subsequence anomaly detection for
streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2867--2870",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476365",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476365",
abstract = "Subsequence anomaly detection in long data series is a
significant problem. While the demand for real-time
analytics and decision making increases, anomaly
detection methods have to operate over streams and
handle drifts in data distribution. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutras:2021:VAM,
author = "Christos Koutras and Kyriakos Psarakis and George
Siachamis and Andra Ionescu and Marios Fragkoulis and
Angela Bonifati and Asterios Katsifodimos",
title = "{Valentine} in action: matching tabular data at
scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2871--2874",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476366",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476366",
abstract = "Capturing relationships among heterogeneous datasets
in large data lakes --- traditionally termed schema
matching --- is one of the most challenging problems
that corporations and institutions face nowadays.
Discovering and integrating datasets heavily \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guan:2021:GDE,
author = "Sheng Guan and Hanchao Ma and Sutanay Choudhury and
Yinghui Wu",
title = "{GEDet}: detecting erroneous nodes with a few
examples",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2875--2878",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476367",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476367",
abstract = "Detecting nodes with erroneous values in real-world
graphs remains challenging due to the lack of examples
and various error scenarios. We demonstrate GEDet, an
error detection engine that can detect erroneous nodes
in graphs with a few examples. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2021:GUE,
author = "Wenfei Fan and Tao He and Longbin Lai and Xue Li and
Yong Li and Zhao Li and Zhengping Qian and Chao Tian
and Lei Wang and Jingbo Xu and Youyang Yao and Qiang
Yin and Wenyuan Yu and Jingren Zhou and Diwen Zhu and
Rong Zhu",
title = "{GraphScope}: a unified engine for big graph
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2879--2892",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476369",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476369",
abstract = "GraphScope is a system and a set of language
extensions that enable a new programming interface for
large-scale distributed graph computing. It generalizes
previous graph processing frameworks (e.g., Pregel,
GraphX) and distributed graph databases (e.g.,
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shang:2021:DSI,
author = "Zeyuan Shang and Emanuel Zgraggen and Benedetto
Buratti and Philipp Eichmann and Navid Karimeddiny and
Charlie Meyer and Wesley Runnels and Tim Kraska",
title = "{Davos}: a system for interactive data-driven decision
making",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2893--2905",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476370",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476370",
abstract = "Recently, a new horizon in data analytics,
prescriptive analytics, is becoming more and more
important to make data-driven decisions. As opposed to
the progress of democratizing data acquisition and
access, making data-driven decisions remains a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qin:2021:MEU,
author = "An Qin and Mengbai Xiao and Yongwei Wu and Xinjie
Huang and Xiaodong Zhang",
title = "{Mixer}: efficiently understanding and retrieving
visual content at web-scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2906--2917",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476371",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476371",
abstract = "Visual contents, including images and videos, are
dominant on the Internet today. The conventional search
engine is mainly designed for textual documents, which
must be extended to process and manage increasingly
high volumes of visual data objects. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Justo:2021:TPF,
author = "David Justo and Shaoqing Yi and Lukas Stadler and
Nadia Polikarpova and Arun Kumar",
title = "Towards a polyglot framework for factorized {ML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2918--2931",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476372",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476372",
abstract = "Optimizing machine learning (ML) workloads on
structured data is a key concern for data platforms.
One class of optimizations called ``factorized ML''
helps reduce ML runtimes over multi-table datasets by
pushing ML computations down through joins, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dayan:2021:EML,
author = "Niv Dayan and Moshe Twitto and Yuval Rochman and Uri
Beitler and Itai {Ben Zion} and Edward Bortnikov and
Shmuel Dashevsky and Ofer Frishman and Evgeni Ginzburg
and Igal Maly and Avraham (Poza) Meir and Mark Mokryn
and Iddo Naiss and Noam Rabinovich",
title = "The end of {Moore}'s law and the rise of the data
processor",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2932--2944",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476373",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476373",
abstract = "With the end of Moore's Law, database architects are
turning to hardware accelerators to offload
computationally intensive tasks from the CPU. In this
paper, we show that accelerators can facilitate far
more than just computation: they enable algorithms and
data structures that lavishly expand computation in
order to optimize for disparate cost metrics. We
introduce the Pliops Extreme Data Processor (XDP), a
novel storage engine implemented from the ground up
using customized hardware. At its core, XDP consists of
an accelerated hash table to index the data in storage
using less memory and fewer storage accesses for
queries than the best alternative. XDP also employs an
accelerated compressor, a capacitor, and a lock-free
RAID sub-system to minimize storage space and recovery
time while minimizing performance penalties. As a
result, XDP overcomes cost contentions that have so far
been inescapable.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Murray:2021:TDM,
author = "Derek G. Murray and Jir{\'\i} Simsa and Ana Klimovic
and Ihor Indyk",
title = "\pkg{tf.data}: a machine learning data processing
framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2945--2958",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476374",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476374",
abstract = "Training machine learning models requires feeding
input data for models to ingest. Input pipelines for
machine learning jobs are often challenging to
implement efficiently as they require reading large
volumes of data, applying complex transformations,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Eltabakh:2021:BBA,
author = "Mohamed Eltabakh and Anantha Subramanian and Awny
Al-Omari and Mohammed Al-Kateb and Sanjay Nair and
Mahbub Hasan and Wellington Cabrera and Charles Zhang
and Amit Kishore and Snigdha Prasad",
title = "Not black-box anymore!: enabling analytics-aware
optimizations in {Teradata Vantage}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2959--2971",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476375",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476375",
abstract = "Teradata Vantage is a platform for integrating a broad
range of analytical functions and capabilities with
Teradata's SQL engine. One of the main challenges in
optimizing the execution of these analytical functions
is that many of them are not only \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:FAE,
author = "Yingda Chen and Jiamang Wang and Yifeng Lu and Ying
Han and Zhiqiang Lv and Xuebin Min and Hua Cai and Wei
Zhang and Haochuan Fan and Chao Li and Tao Guan and Wei
Lin and Yangqing Jia and Jingren Zhou",
title = "{Fangorn}: adaptive execution framework for
heterogeneous workloads on shared clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2972--2985",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476376",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476376",
abstract = "Pervasive needs for data explorations at all scales
have populated modern distributed platforms with
workloads of different characteristics. The growing
complexities and diversities have thereafter imposed
distinct challenges to execute them on shared
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Agiwal:2021:NPS,
author = "Ankur Agiwal and Kevin Lai and Gokul Nath Babu
Manoharan and Indrajit Roy and Jagan Sankaranarayanan
and Hao Zhang and Tao Zou and Min Chen and Zongchang
(Jim) Chen and Ming Dai and Thanh Do and Haoyu Gao and
Haoyan Geng and Raman Grover and Bo Huang and Yanlai
Huang and Zhi (Adam) Li and Jianyi Liang and Tao Lin
and Li Liu and Yao Liu and Xi Mao and Yalan (Maya) Meng
and Prashant Mishra and Jay Patel and Rajesh S. R. and
Vijayshankar Raman and Sourashis Roy and Mayank Singh
Shishodia and Tianhang Sun and Ye (Justin) Tang and
Junichi Tatemura and Sagar Trehan and Ramkumar Vadali
and Prasanna Venkatasubramanian and Gensheng Zhang and
Kefei Zhang and Yupu Zhang and Zeleng Zhuang and Goetz
Graefe and Divyakant Agrawal and Jeff Naughton and
Sujata Kosalge and Hakan Hac{\i}g{\"u}m{\"u}{\c{s}}",
title = "{Napa}: powering scalable data warehousing with robust
query performance at {Google}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2986--2997",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476377",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476377",
abstract = "Google services continuously generate vast amounts of
application data. This data provides valuable insights
to business users. We need to store and serve these
planet-scale data sets under the extremely demanding
requirements of scalability, sub-second \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2021:ABR,
author = "Rubao Lee and Minghong Zhou and Chi Li and Shenggang
Hu and Jianping Teng and Dongyang Li and Xiaodong
Zhang",
title = "The art of balance: a {RateupDB\textsuperscript{TM}} experience of
building a {CPU\slash GPU} hybrid database product",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "2999--3013",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476378",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476378",
abstract = "GPU-accelerated database systems have been studied for
more than 10 years, ranging from prototyping
development to industry products serving in multiple
domains of data applications. Existing GPU database
research solutions are often focused on specific
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2021:RTL,
author = "Audrey Cheng and Xiao Shi and Lu Pan and Anthony
Simpson and Neil Wheaton and Shilpa Lawande and Nathan
Bronson and Peter Bailis and Natacha Crooks and Ion
Stoica",
title = "{RAMP-TAO}: layering atomic transactions on
{Facebook}'s online {TAO} data store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3014--3027",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476379",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476379",
abstract = "Facebook's graph store TAO, like many other
distributed data stores, traditionally prioritizes
availability, efficiency, and scalability over strong
consistency or isolation guarantees to serve its large,
read-dominant workloads. As product developers
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:OAD,
author = "Guoliang Li and Xuanhe Zhou and Ji Sun and Xiang Yu
and Yue Han and Lianyuan Jin and Wenbo Li and Tianqing
Wang and Shifu Li",
title = "{openGauss}: an autonomous database system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3028--3042",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476380",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476380",
abstract = "Although learning-based database optimization
techniques have been studied from academia in recent
years, they have not been widely deployed in commercial
database systems. In this work, we build an autonomous
database framework and integrate our \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Potharaju:2021:HIS,
author = "Rahul Potharaju and Terry Kim and Eunjin Song and
Wentao Wu and Lev Novik and Apoorve Dave and Andrew
Fogarty and Pouria Pirzadeh and Vidip Acharya and
Gurleen Dhody and Jiying Li and Sinduja Ramanujam and
Nicolas Bruno and C{\'e}sar A. Galindo-Legaria and
Vivek Narasayya and Surajit Chaudhuri and Anil K. Nori
and Tomas Talius and Raghu Ramakrishnan",
title = "{Hyperspace}: the indexing subsystem of {Azure
Synapse}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3043--3055",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476382",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476382",
abstract = "Microsoft recently introduced Azure Synapse Analytics,
which offers an integrated experience across data
ingestion, storage, and querying in Apache Spark and
T-SQL over data in the lake, including files and
warehouse tables. In this paper, we present \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2021:SVB,
author = "Bolong Zheng and Lei Bi and Juan Cao and Hua Chai and
Jun Fang and Lu Chen and Yunjun Gao and Xiaofang Zhou
and Christian S. Jensen",
title = "{SpeakNav}: voice-based route description language
understanding for template-driven path search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3056--3068",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476383",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476383",
abstract = "Many navigation applications take natural language
speech as input, which avoids users typing in words and
thus improves traffic safety. However, navigation
applications often fail to understand a user's
free-form description of a route. In addition,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gomes:2021:RML,
author = "Ana Sofia Gomes and Jo{\~a}o Oliveirinha and Pedro
Cardoso and Pedro Bizarro",
title = "{Railgun}: managing large streaming windows under
{MAD} requirements",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3069--3082",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476384",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476384",
abstract = "Some mission critical systems, e.g., fraud detection,
require accurate, real-time metrics over long time
sliding windows on applications that demand high
throughput and low latencies. As these applications
need to run ``forever'' and cope with large, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Edara:2021:BMW,
author = "Pavan Edara and Mosha Pasumansky",
title = "Big metadata: when metadata is big data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3083--3095",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476385",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476385",
abstract = "The rapid emergence of cloud data warehouses like
Google BigQuery has redefined the landscape of data
analytics. With the growth of data volumes, such
systems need to scale to hundreds of EiB of data in the
near future. This growth is accompanied by an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Stoddard:2021:TRF,
author = "Josh Stoddard and Adam Mustafa and Naveen Goela",
title = "{Tanium Reveal}: a federated search engine for
querying unstructured file data on large enterprise
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3096--3109",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476386",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476386",
abstract = "Tanium Reveal is a federated search engine deployed on
large-scale enterprise networks that is capable of
executing data queries across billions of private data
files within 60 seconds. Data resides at the edge of
networks, potentially distributed on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gencer:2021:HJL,
author = "Can Gencer and Marko Topolnik and Viliam {\v{D}}urina
and Emin Demirci and Ensar B. Kahveci and Ali
G{\"u}rb{\"u}z and Ond{\v{r}}ej Luk{\'a}{\v{s}} and
J{\'o}zsef Bart{\'o}k and Grzegorz Gierlach and
Franti{\v{s}}ek Hartman and Ufuk Y{\i}lmaz and Mehmet
Do{\u{g}}an and Mohamed Mandouh and Marios Fragkoulis
and Asterios Katsifodimos",
title = "{Hazelcast Jet}: low-latency stream processing at the
99.99-th percentile",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3110--3121",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476387",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476387",
abstract = "Jet is an open source, high performance, distributed
stream processor built at Hazelcast during the last
five years. Jet was engineered with millisecond latency
on the 99.99th percentile as its primary design goal.
Originally Jet's purpose was to be an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Roy:2021:SWO,
author = "Abhishek Roy and Alekh Jindal and Priyanka Gomatam and
Xiating Ouyang and Ashit Gosalia and Nishkam Ravi and
Swinky Mann and Prakhar Jain",
title = "{SparkCruise}: workload optimization in managed spark
clusters at {Microsoft}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3122--3134",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476388",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476388",
abstract = "Today cloud companies offer fully managed Spark
services. This has made it easy to onboard new
customers but has also increased the volume of users
and their workload sizes. However, both cloud providers
and users lack the tools and time to optimize
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Akidau:2021:WSP,
author = "Tyler Akidau and Edmon Begoli and Slava Chernyak and
Fabian Hueske and Kathryn Knight and Kenneth Knowles
and Daniel Mills and Dan Sotolongo",
title = "Watermarks in stream processing systems: semantics and
comparative analysis of {Apache Flink} and {Google}
cloud dataflow",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3135--3147",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476389",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476389",
abstract = "Streaming data processing is an exercise in taming
disorder: from oftentimes huge torrents of information,
we hope to extract powerful and timely analyses. But
when dealing with streaming data, the unbounded and
temporally disordered nature of real-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Power:2021:CBD,
author = "Conor Power and Hiren Patel and Alekh Jindal and Jyoti
Leeka and Bob Jenkins and Michael Rys and Ed Triou and
Dexin Zhu and Lucky Katahanas and Chakrapani Bhat
Talapady and Joshua Rowe and Fan Zhang and Rich Draves
and Marc Friedman and Ivan Santa Maria Filho and Amrish
Kumar",
title = "The {Cosmos} big data platform at {Microsoft}: over a
decade of progress and a decade to look forward",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3148--3161",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476390",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476390",
abstract = "The twenty-first century has been dominated by the
need for large scale data processing, marking the birth
of big data platforms such as Cosmos. This paper
describes the evolution of the exabyte-scale Cosmos big
data platform at Microsoft; our journey \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pandis:2021:EAR,
author = "Ippokratis Pandis",
title = "The evolution of {Amazon Redshift}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3162--3174",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476391",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476391",
abstract = "In 2013, Amazon Web Services revolutionized the data
warehousing industry by launching Amazon Redshift [7],
the first fully managed, petabyte-scale
enterprise-grade cloud data warehouse. Amazon Redshift
made it simple and cost-effective to efficiently
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Razniewski:2021:LMK,
author = "Simon Razniewski and Hiba Arnaout and Shrestha Ghosh
and Fabian Suchanek",
title = "On the limits of machine knowledge: completeness,
recall and negation in web-scale knowledge bases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3175--3177",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476401",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476401",
abstract = "General-purpose knowledge bases (KBs) are an important
component of several data-driven applications.
Pragmatically constructed from available web sources,
these KBs are far from complete, which poses a set of
challenges in curation as well as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Orr:2021:MMP,
author = "Laurel Orr and Atindriyo Sanyal and Xiao Ling and
Karan Goel and Megan Leszczynski",
title = "Managing {ML} pipelines: feature stores and the coming
wave of embedding ecosystems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3178--3181",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476402",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476402",
abstract = "The industrial machine learning pipeline requires
iterating on model features, training and deploying
models, and monitoring deployed models at scale.
Feature stores were developed to manage and standardize
the engineer's workflow in this end-to-end \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:DAM,
author = "Yuliang Li and Xiaolan Wang and Zhengjie Miao and
Wang-Chiew Tan",
title = "Data augmentation for {ML}-driven data preparation and
integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3182--3185",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476403",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476403",
abstract = "In recent years, we have witnessed the development of
novel data augmentation (DA) techniques for creating
additional training data needed by machine learning
based solutions. In this tutorial, we will provide a
comprehensive overview of techniques \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zalipynis:2021:ADP,
author = "Ramon Antonio Rodriges Zalipynis",
title = "Array {DBMS}: past, present, and (near) future",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3186--3189",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476404",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476404",
abstract = "Array DBMSs strive to be the best systems for
managing, processing, and even visualizing big N-d
arrays. The last decade blossomed with R\&D in array
DBMS, making it a young and fast-evolving area. We
present the first comprehensive tutorial on array
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:MLD,
author = "Guoliang Li and Xuanhe Zhou and Lei Cao",
title = "Machine learning for databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3190--3193",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476405",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476405",
abstract = "Machine learning techniques have been proposed to
optimize the databases. For example, traditional
empirical database optimization techniques (e.g., cost
estimation, join order selection, knob tuning, index
and view advisor) cannot meet the high-\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kargar:2021:ELN,
author = "Saeed Kargar and Faisal Nawab",
title = "Extending the lifetime of {NVM}: challenges and
opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3194--3197",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476406",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476406",
abstract = "Recently, Non-Volatile Memory (NVM) technology has
revolutionized the landscape of memory systems. With
many advantages, such as non volatility and near zero
standby power consumption, these byte-addressable
memory technologies are taking the place of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Echihabi:2021:NTH,
author = "Karima Echihabi and Kostas Zoumpatianos and Themis
Palpanas",
title = "New trends in high-{D} vector similarity search:
{AI}-driven, progressive, and distributed",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3198--3201",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476407",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476407",
abstract = "Similarity search is a core operation of many critical
applications, involving massive collections of
high-dimensional (high-d) objects. Objects can be data
series, text, multimedia, graphs, database tables or
deep network embeddings. In this tutorial, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jindal:2021:MLC,
author = "Alekh Jindal and Matteo Interlandi",
title = "Machine learning for cloud data systems: the progress
so far and the path forward",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3202--3205",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476408",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476408",
abstract = "The goal of this tutorial is to educate the audience
about the state of the art in ML for cloud data
systems, both in research and in practice. The tutorial
is divided in two parts: the progress, and the path
forward. Part I covers the recent successes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Davidson:2021:JCT,
author = "Susan B. Davidson",
title = "It's not just cookies and tea",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3206--3206",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476409",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476409",
abstract = "Three of the major research themes over my career have
been concurrency, integration and provenance. In this
talk, I will explain why these themes are not only
important in database research, but how they have
played a role in my personal success. I \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Neumann:2021:ECQ,
author = "Thomas Neumann",
title = "Evolution of a compiling query engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3207--3210",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476410",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476410",
abstract = "In 2011 we showed how to use dynamic code generation
to process queries in a data-centric manner. This
execution model can produce compact and efficient code
and was successfully used by both our own systems and
systems of other groups. As the systems \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pavlo:2021:MYD,
author = "Andrew Pavlo and Matthew Butrovich and Lin Ma and
Prashanth Menon and Wan Shen Lim and Dana {Van Aken}
and William Zhang",
title = "Make your database system dream of electric sheep:
towards self-driving operation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3211--3221",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476411",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476411",
abstract = "Database management systems (DBMSs) are notoriously
difficult to deploy and administer. Self-driving DBMSs
seek to remove these impediments by managing themselves
automatically. Despite decades of DBMS auto-tuning
research, a truly autonomous, self-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kraska:2021:TIO,
author = "Tim Kraska",
title = "Towards instance-optimized data systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3222--3232",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476392",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476392",
abstract = "In recent years, we have seen increased interest in
applying machine learning to system problems. For
example, there has been work on applying machine
learning to improve query optimization, indexing,
storage layouts, scheduling, log-structured merge
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Weikum:2021:KGD,
author = "Gerhard Weikum",
title = "Knowledge graphs 2021: a data odyssey",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3233--3238",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476393",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476393",
abstract = "Providing machines with comprehensive knowledge of the
world's entities and their relationships has been a
long-standing vision and challenge for AI. Over the
last 15 years, huge knowledge bases, also known as
knowledge graphs, have been automatically \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ives:2021:FDB,
author = "Zachary G. Ives and Rachel Pottinger and Arun Kumar
and Johannes Gehrke and Jana Giceva",
title = "The future of data(base) education: is the ``cow
book'' dead?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3239--3239",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476394",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476394",
abstract = "This panel encourages a debate over the future of
database education and its relationship to Data
Science: Are Computer Science (CS) and Data Science
(DS) different disciplines about to split, and how does
that affect how we teach our field? Is there a ``data''
course that belongs in CS that all of our students
should take? Who is the traditional database course,
e.g. based on the ``cow book'', relevant to? What
traditional topics should we not be teaching in our
core data course(s) and which ones should be added?
What do we teach the student who has one elective for
data science? How does our community position itself
for leadership in CS given the popularity of DS?",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Remis:2021:UVI,
author = "Luis Remis and Chaunt{\'e} W. Lacewell",
title = "Using {VDMS} to index and search {100M} images",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "12",
pages = "3240--3252",
month = jul,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3476311.3476381",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:41:16 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3476311.3476381",
abstract = "Data scientists spend most of their time dealing with
data preparation, rather than doing what they know
best: build machine learning models and algorithms to
solve previously unsolvable problems. In this paper, we
describe the Visual Data Management \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:TEF,
author = "Jian Liu and Kefei Wang and Feng Chen",
title = "{TSCache}: an efficient flash-based caching scheme for
time-series data workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3253--3266",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484225",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484225",
abstract = "Time-series databases are becoming an indispensable
component in today's data centers. In order to manage
the rapidly growing time-series data, we need an
effective and efficient system solution to handle the
huge traffic of time-series data queries. A \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:MRL,
author = "Huayi Wang and Jingfan Meng and Long Gong and Jun Xu
and Mitsunori Ogihara",
title = "{MP-RW-LSH}: an efficient multi-probe {LSH} solution
to {ANNS-L$_1$}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3267--3280",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484226",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484226",
abstract = "Approximate Nearest Neighbor Search (ANNS) is a
fundamental algorithmic problem, with numerous
applications in many areas of computer science.
Locality-Sensitive Hashing (LSH) is one of the most
popular solution approaches for ANNS. A common
shortcoming of many LSH schemes is that since they
probe only a single bucket in a hash table, they need
to use a large number of hash tables to achieve a high
query accuracy. For ANNS-L2, a multi-probe scheme was
proposed to overcome this drawback by strategically
probing multiple buckets in a hash table. In this work,
we propose MP-RW-LSH, the first and so far only
multi-probe LSH solution to ANNS in L1 distance, and
show that it achieves a better tradeoff between
scalability and query efficiency than all existing
LSH-based solutions. We also explain why a
state-of-the-art ANNS-L1 solution called Cauchy
projection LSH (CP-LSH) is fundamentally not suitable
for multi-probe extension. Finally, as a use case, we
construct, using MP-RW-LSH as the underlying
``ANNS-L$_1$ engine'', a new ANNS-E (E for edit
distance) solution that beats the state of the art.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mailis:2021:VSK,
author = "Theofilos Mailis and Yannis Kotidis and Stamatis
Christoforidis and Evgeny Kharlamov and Yannis
Ioannidis",
title = "View selection over knowledge graphs in triple
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3281--3294",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484227",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484227",
abstract = "Knowledge Graphs (KGs) are collections of
interconnected and annotated entities that have become
powerful assets for data integration, search
enhancement, and other industrial applications.
Knowledge Graphs such as DBPEDIA may contain billion of
triple \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:FHO,
author = "Dongjie Li and Siyi Lv and Yanyu Huang and Yijing Liu
and Tong Li and Zheli Liu and Liang Guo",
title = "Frequency-hiding order-preserving encryption with
small client storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3295--3307",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484228",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484228",
abstract = "The range query on encrypted databases is usually
implemented using the order-preserving encryption (OPE)
technique which preserves the order of plaintexts.
Since the frequency leakage of plaintexts makes OPE
vulnerable to frequency-analyzing attacks, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutsoukos:2021:MMR,
author = "Dimitrios Koutsoukos and Ingo M{\"u}ller and Renato
Marroqu{\'\i}n and Ana Klimovic and Gustavo Alonso",
title = "{Modularis}: modular relational analytics over
heterogeneous distributed platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3308--3321",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484229",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484229",
abstract = "The enormous quantity of data produced every day
together with advances in data analytics has led to a
proliferation of data management and analysis systems.
Typically, these systems are built around highly
specialized monolithic operators optimized for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lou:2021:TTA,
author = "Yunkai Lou and Chaokun Wang and Tiankai Gu and Hao
Feng and Jun Chen and Jeffrey Xu Yu",
title = "Time-topology analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3322--3334",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484230",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484230",
abstract = "Many real-world networks have been evolving, and are
finely modeled as temporal graphs from the viewpoint of
the graph theory. A temporal graph is informative, and
always contains two types of information, i.e., the
temporal information and topological \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bernau:2021:QIC,
author = "Daniel Bernau and G{\"u}nther Eibl and Philip W.
Grassal and Hannah Keller and Florian Kerschbaum",
title = "Quantifying identifiability to choose and audit
$\epsilon$ in differentially private deep learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3335--3347",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484231",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484231",
abstract = "Differential privacy allows bounding the influence
that training data records have on a machine learning
model. To use differential privacy in machine learning,
data scientists must choose privacy parameters
($\epsilon$, $\delta$). Choosing meaningful privacy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Laigner:2021:DMM,
author = "Rodrigo Laigner and Yongluan Zhou and Marcos Antonio
Vaz Salles and Yijian Liu and Marcos Kalinowski",
title = "Data management in microservices: state of the
practice, challenges, and research directions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3348--3361",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484232",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484232",
abstract = "Microservices have become a popular architectural
style for data-driven applications, given their ability
to functionally decompose an application into small and
autonomous services to achieve scalability, strong
isolation, and specialization of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ammerlaan:2021:PDM,
author = "Remmelt Ammerlaan and Gilbert Antonius and Marc
Friedman and H. M. Sajjad Hossain and Alekh Jindal and
Peter Orenberg and Hiren Patel and Shi Qiao and Vijay
Ramani and Lucas Rosenblatt and Abhishek Roy and Irene
Shaffer and Soundarajan Srinivasan and Markus Weimer",
title = "{PerfGuard}: deploying {ML}-for-systems without
performance regressions, almost!",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3362--3375",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484233",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484233",
abstract = "Modern data processing systems require optimization at
massive scale, and using machine learning to optimize
these systems (ML-for-systems) has shown promising
results. Unfortunately, ML-for-systems is subject to
over generalizations that do not capture \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ding:2021:DDS,
author = "Bailu Ding and Surajit Chaudhuri and Johannes Gehrke
and Vivek Narasayya",
title = "{DSB}: a decision support benchmark for
workload-driven and traditional database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3376--3388",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484234",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484234",
abstract = "We describe a new benchmark, DSB, for evaluating both
workload-driven and traditional database systems on
modern decision support workloads. DSB is adapted from
the widely-used industrial-standard TPC-DS benchmark.
It enhances the TPC-DS benchmark with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hernandez:2021:CHP,
author = "Daniel Hern{\'a}ndez and Luis Gal{\'a}rraga and Katja
Hose",
title = "Computing how-provenance for {SPARQL} queries via
query rewriting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3389--3401",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484235",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484235",
abstract = "Over the past few years, we have witnessed the
emergence of large knowledge graphs built by extracting
and combining information from multiple sources. This
has propelled many advances in query processing over
knowledge graphs, however the aspect of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:UUD,
author = "Junxiong Wang and Immanuel Trummer and Debabrota
Basu",
title = "{UDO}: universal database optimization using
reinforcement learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3402--3414",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484236",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484236",
abstract = "UDO is a versatile tool for offline tuning of database
systems for specific workloads. UDO can consider a
variety of tuning choices, reaching from picking
transaction code variants over index selections up to
database system parameter tuning. UDO uses \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Feldmann:2021:ITA,
author = "Anja Feldmann",
title = "{Internet} traffic analysis at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3415--3415",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484237",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484237",
abstract = "In this talk, I will use multiple internet measurement
studies as examples to outline the challenges that we
face when performing internet-scale traffic analysis,
including implications of the COVID-19 pandemic on
internet traffic as well as detecting \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutra:2021:PSG,
author = "Danai Koutra",
title = "The power of summarization in graph mining and
learning: smaller data, faster methods, more
interpretability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3416--3416",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484238",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484238",
abstract = "Our ability to generate, collect, and archive data
related to everyday activities, such as interacting on
social media, browsing the web, and monitoring
well-being, is rapidly increasing. Getting the most
benefit from this large-scale data requires \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shah:2021:SPL,
author = "Nigam Shah",
title = "Summarizing patients like mine via an on-demand
consultation service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3417--3417",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484242",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484242",
abstract = "Using evidence derived from previously collected
medical records to guide patient care has been a
long-standing vision of clinicians and informaticians,
and one with the potential to transform medical
practice. We offered an on-demand consultation
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vanschoren:2021:TSO,
author = "Joaquin Vanschoren",
title = "Towards scalable online machine learning
collaborations with {OpenML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3418--3418",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484239",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484239",
abstract = "Is massively collaborative machine learning possible?
Can we share and organize our collective knowledge of
machine learning to solve ever more challenging
problems? In a way, yes: as a community, we are already
very successful at developing high-. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vartak:2021:MMI,
author = "Manasi Vartak",
title = "From {ML} models to intelligent applications: the rise
of {MLOps}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3419--3419",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484240",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484240",
abstract = "The last 5+ years in ML have focused on building the
best models, hyperparameter optimization, parallel
training, massive neural networks, etc. Now that the
building of models has become easy, models are being
integrated into every piece of software and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zaharia:2021:DPF,
author = "Matei Zaharia",
title = "Designing production-friendly machine learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "14",
number = "13",
pages = "3420--3420",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3484224.3484241",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Oct 29 16:38:15 MDT 2021",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3484224.3484241",
abstract = "Building production ML applications is difficult
because of their resource cost and complex failure
modes. I will discuss these challenges from two
perspectives: the Stanford DAWN Lab and experience with
large-scale commercial ML users at Databricks. I
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2021:ASA,
author = "Kang Zhao and Liuyihan Song and Yingya Zhang and Pan
Pan and Yinghui Xu and Rong Jin",
title = "{ANN} softmax: acceleration of extreme classification
training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "1--10",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485451",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485451",
abstract = "Thanks to the popularity of GPU and the growth of its
computational power, more and more deep learning tasks,
such as face recognition, image retrieval and word
embedding, can take advantage of extreme classification
to improve accuracy. However, it \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2021:WTD,
author = "Gyeong-In Yu and Saeed Amizadeh and Sehoon Kim and
Artidoro Pagnoni and Ce Zhang and Byung-Gon Chun and
Markus Weimer and Matteo Interlandi",
title = "{WindTunnel}: towards differentiable {ML} pipelines
beyond a single model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "11--20",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485452",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485452",
abstract = "While deep neural networks (DNNs) have shown to be
successful in several domains like computer vision,
non-DNN models such as linear models and gradient
boosting trees are still considered state-of-the-art
over tabular data. When using these models, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Skiadopoulos:2021:DDO,
author = "Athinagoras Skiadopoulos and Qian Li and Peter Kraft
and Kostis Kaffes and Daniel Hong and Shana Mathew and
David Bestor and Michael Cafarella and Vijay Gadepally
and Goetz Graefe and Jeremy Kepner and Christos
Kozyrakis and Tim Kraska and Michael Stonebraker and
Lalith Suresh and Matei Zaharia",
title = "{DBOS}: a {DBMS}-oriented operating system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "21--30",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485454",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485454",
abstract = "This paper lays out the rationale for building a
completely new operating system (OS) stack. Rather than
build on a single node OS together with separate
cluster schedulers, distributed filesystems, and
network managers, we argue that a distributed
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jain:2021:DIA,
author = "Arjit Jain and Sunita Sarawagi and Prithviraj Sen",
title = "Deep indexed active learning for matching
heterogeneous entity representations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "31--45",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485455",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485455",
abstract = "Given two large lists of records, the task in entity
resolution (ER) is to find the pairs from the Cartesian
product of the lists that correspond to the same real
world entity. Typically, passive learning methods on
such tasks require large amounts of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2021:LQR,
author = "Xuanhe Zhou and Guoliang Li and Chengliang Chai and
Jianhua Feng",
title = "A learned query rewrite system using {Monte Carlo}
tree search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "46--58",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485456",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485456",
abstract = "Query rewrite transforms a SQL query into an
equivalent one but with higher performance. However,
SQL rewrite is an NP-hard problem, and existing
approaches adopt heuristics to rewrite the queries.
These heuristics have two main limitations. First, the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2021:DCP,
author = "Yin Lin and Brit Youngmann and Yuval Moskovitch and H.
V. Jagadish and Tova Milo",
title = "On detecting cherry-picked generalizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "59--71",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485457",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485457",
abstract = "Generalizing from detailed data to statements in a
broader context is often critical for users to make
sense of large data sets. Correspondingly, poorly
constructed generalizations might convey misleading
information even if the statements are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2021:FNF,
author = "Jiayi Wang and Chengliang Chai and Jiabin Liu and
Guoliang Li",
title = "{FACE}: a normalizing flow based cardinality
estimator",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "72--84",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485458",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485458",
abstract = "Cardinality estimation is one of the most important
problems in query optimization. Recently, machine
learning based techniques have been proposed to
effectively estimate cardinality, which can be broadly
classified into query-driven and data-driven \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2021:LCE,
author = "Ji Sun and Jintao Zhang and Zhaoyan Sun and Guoliang
Li and Nan Tang",
title = "Learned cardinality estimation: a design space
exploration and a comparative evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "85--97",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485459",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485459",
abstract = "Cardinality estimation is core to the query optimizers
of DBMSs. Non-learned methods, especially based on
histograms and samplings, have been widely used in
commercial and open-source DBMSs. Nevertheless,
histograms and samplings can only be used to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2021:DAD,
author = "Dong He and Maureen Daum and Walter Cai and Magdalena
Balazinska",
title = "{DeepEverest}: accelerating declarative top-{$K$}
queries for deep neural network interpretation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "98--111",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485460",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485460",
abstract = "We design, implement, and evaluate DeepEverest, a
system for the efficient execution of interpretation by
example queries over the activation values of a deep
neural network. DeepEverest consists of an efficient
indexing technique and a query execution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chatterjee:2021:CCC,
author = "Subarna Chatterjee and Meena Jagadeesan and Wilson Qin
and Stratos Idreos",
title = "{Cosine}: a cloud-cost optimized self-designing
key--value storage engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "112--126",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485461",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485461",
abstract = "We present a self-designing key-value storage engine,
Cosine, which can always take the shape of the close to
``perfect'' engine architecture given an input
workload, a cloud budget, a target performance, and
required cloud SLAs. By identifying and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Adnan:2021:ARS,
author = "Muhammad Adnan and Yassaman Ebrahimzadeh Maboud and
Divya Mahajan and Prashant J. Nair",
title = "Accelerating recommendation system training by
leveraging popular choices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "1",
pages = "127--140",
month = sep,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3485450.3485462",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jan 20 16:04:55 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3485450.3485462",
abstract = "Recommender models are commonly used to suggest
relevant items to a user for e-commerce and online
advertisement-based applications. These models use
massive embedding tables to store numerical
representation of items' and users' categorical
variables \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2021:BCE,
author = "Jianye Yang and Yun Peng and Wenjie Zhang",
title = "$ (p, q) $-biclique counting and enumeration for large
sparse bipartite graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "141--153",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489497",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489497",
  abstract =     "In this paper, we study the problem of (p,
                 q)-biclique counting and enumeration for large sparse
                 bipartite graphs. Given a bipartite G = (U, V, E), and
two integer parameters p and q, we aim to efficiently
count and enumerate all (p, q)-bicliques in G,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Graur:2021:EQL,
author = "Dan Graur and Ingo M{\"u}ller and Mason Proffitt and
Ghislain Fourny and Gordon T. Watts and Gustavo
Alonso",
title = "Evaluating query languages and systems for high-energy
physics data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "154--168",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489498",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489498",
abstract = "In the domain of high-energy physics (HEP), query
languages in general and SQL in particular have found
limited acceptance. This is surprising since HEP data
analysis matches the SQL model well: the data is fully
structured and queried using mostly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hao:2021:DHC,
author = "Kongzhang Hao and Long Yuan and Wenjie Zhang",
title = "Distributed hop-constrained $s$--$t$ simple path
enumeration at billion scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "169--182",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489499",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489499",
abstract = "Hop-constrained s-t simple path (HC-s-t path)
enumeration is a fundamental problem in graph analysis
and has received considerable attention recently.
Straightforward distributed solutions are inefficient
                 and suffer from poor scalability when addressing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2021:EAO,
author = "Jingzhi Fang and Yanyan Shen and Yue Wang and Lei
Chen",
title = "{ETO}: accelerating optimization of {DNN} operators by
high-performance tensor program reuse",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "183--195",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489500",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489500",
abstract = "Recently, deep neural networks (DNNs) have achieved
great success in various applications, where low
inference latency is important. Existing solutions
either manually tune the kernel library or utilize
search-based compilation to reduce the operator
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Grulich:2021:BEE,
author = "Philipp Marian Grulich and Steffen Zeuch and Volker
Markl",
title = "{Babelfish}: efficient execution of polyglot queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "196--210",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489501",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489501",
abstract = "Today's users of data processing systems come from
different domains, have different levels of expertise,
and prefer different programming languages. As a
result, analytical workload requirements shifted from
relational to polyglot queries involving \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2021:BCU,
author = "Alexander Zhou and Yue Wang and Lei Chen",
title = "Butterfly counting on uncertain bipartite graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "211--223",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489502",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489502",
abstract = "When considering uncertain bipartite networks, the
number of instances of the popular graphlet structure
the butterfly may be used as an important metric to
quickly gauge information about the network. This
Uncertain Butterfly Count has practical usages
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cui:2021:MGG,
author = "Yue Cui and Kai Zheng and Dingshan Cui and Jiandong
Xie and Liwei Deng and Feiteng Huang and Xiaofang
Zhou",
title = "{METRO}: a generic graph neural network framework for
multivariate time series forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "224--236",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489503",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489503",
abstract = "Multivariate time series forecasting has been drawing
increasing attention due to its prevalent applications.
It has been commonly assumed that leveraging latent
dependencies between pairs of variables can enhance
prediction accuracy. However, most \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ge:2021:LAE,
author = "Congcong Ge and Xiaoze Liu and Lu Chen and Yunjun Gao
and Baihua Zheng",
title = "{LargeEA}: aligning entities for large-scale knowledge
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "237--245",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489504",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489504",
abstract = "Entity alignment (EA) aims to find equivalent entities
in different knowledge graphs (KGs). Current EA
approaches suffer from scalability issues, limiting
their usage in real-world EA scenarios. To tackle this
challenge, we propose LargeEA to align \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2021:HHG,
author = "Kejing Lu and Mineichi Kudo and Chuan Xiao and
Yoshiharu Ishikawa",
title = "{HVS}: hierarchical graph structure based on {Voronoi}
diagrams for solving approximate nearest neighbor
search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "246--258",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489506",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489506",
abstract = "Approximate nearest neighbor search (ANNS) is a
fundamental problem that has a wide range of
applications in information retrieval and data mining.
Among state-of-the-art in-memory ANNS methods,
graph-based methods have attracted particular interest
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arman:2021:OHP,
author = "Arif Arman and Dmitri Loguinov",
title = "{Origami}: a high-performance mergesort framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "259--271",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489507",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489507",
abstract = "Mergesort is a popular algorithm for sorting
real-world workloads as it is immune to data skewness,
suitable for parallelization using vectorized
intrinsics, and relatively simple to multi-thread. In
                 this paper, we introduce Origami, an in-memory merge-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2021:LSL,
author = "Renzhi Wu and Bolin Ding and Xu Chu and Zhewei Wei and
Xiening Dai and Tao Guan and Jingren Zhou",
title = "Learning to be a statistician: learned estimator for
number of distinct values",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "272--284",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489508",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489508",
abstract = "Estimating the number of distinct values (NDV) in a
column is useful for many tasks in database systems,
such as columnstore compression and data profiling. In
this work, we focus on how to derive accurate NDV
estimations from random (online/offline) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2021:PFP,
author = "Shangdi Yu and Yiqiu Wang and Yan Gu and Laxman
Dhulipala and Julian Shun",
title = "{ParChain}: a framework for parallel hierarchical
agglomerative clustering using nearest-neighbor chain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "285--298",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489509",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489509",
abstract = "This paper studies the hierarchical clustering
problem, where the goal is to produce a dendrogram that
represents clusters at varying scales of a data set. We
propose the ParChain framework for designing parallel
                 hierarchical agglomerative clustering \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chauhan:2021:ARP,
author = "Komal Chauhan and Kartik Jain and Sayan Ranu and
Srikanta Bedathur and Amitabha Bagchi",
title = "Answering regular path queries through exemplars",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "299--311",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489510",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489510",
abstract = "Regular simple path query (RPQ) is one of the
fundamental operators in graph analytics. In an RPQ,
the input is a graph, a source node and a regular
expression. The goal is to identify all nodes that are
connected to the source through a simple path
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Miao:2021:SHE,
author = "Xupeng Miao and Hailin Zhang and Yining Shi and
Xiaonan Nie and Zhi Yang and Yangyu Tao and Bin Cui",
title = "{HET}: scaling out huge embedding model training via
cache-enabled distributed framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "312--320",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489511",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489511",
abstract = "Embedding models have been an effective learning
paradigm for high-dimensional data. However, one open
issue of embedding models is that their representations
(latent factors) often result in large parameter space.
We observe that existing distributed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:FFG,
author = "Pengfei Li and Yu Hua and Jingnan Jia and Pengfei
Zuo",
title = "{FINEdex}: a fine-grained learned index scheme for
scalable and concurrent memory systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "321--334",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489512",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489512",
abstract = "Index structures in memory systems become important to
improve the entire system performance. The promising
learned indexes leverage deep-learning models to
complement existing index structures and obtain
significant performance improvements. Existing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bai:2021:TTA,
author = "Jiyang Bai and Peixiang Zhao",
title = "{TaGSim}: type-aware graph similarity learning and
computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "335--347",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489513",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489513",
abstract = "Computing similarity between graphs is a fundamental
and critical problem in graph-based applications, and
one of the most commonly used graph similarity measures
is graph edit distance (GED), defined as the minimum
number of graph edit operations that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2021:AIC,
author = "Yuqing Zhu and Jing Tang and Xueyan Tang and Lei
Chen",
title = "Analysis of influence contribution in social
advertising",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "348--360",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489514",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489514",
abstract = "Online Social Network (OSN) providers usually conduct
advertising campaigns by inserting social ads into
promoted posts. Whenever a user engages in a promoted
ad, she may further propagate the promoted ad to her
followers recursively and the propagation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Theodorakis:2021:SSN,
author = "Georgios Theodorakis and Fotios Kounelis and Peter
Pietzuch and Holger Pirk",
title = "{Scabbard}: single-node fault-tolerant stream
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "361--374",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489515",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489515",
abstract = "Single-node multi-core stream processing engines
(SPEs) can process hundreds of millions of tuples per
second. Yet making them fault-tolerant with
exactly-once semantics while retaining this performance
is an open challenge: due to the limited I/O \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Konstantinidis:2021:EPC,
author = "George Konstantinidis and Jet Holt and Adriane
Chapman",
title = "Enabling personal consent in databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "2",
pages = "375--387",
month = oct,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3489496.3489516",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:26:54 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3489496.3489516",
abstract = "Users have the right to consent to the use of their
data, but current methods are limited to very
coarse-grained expressions of consent, as
``opt-in/opt-out'' choices for certain uses. In this
paper we identify the need for fine-grained consent
management \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:ESB,
author = "Yejia Liu and Weiyuan Wu and Lampros Flokas and
Jiannan Wang and Eugene Wu",
title = "Enabling {SQL}-based training data debugging for
federated learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "388--400",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494125",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494125",
abstract = "How can we debug a logistic regression model in a
federated learning setting when seeing the model behave
unexpectedly (e.g., the model rejects all high-income
customers' loan applications)? The SQL-based training
data debugging framework has proved \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vaidya:2021:LQL,
author = "Kapil Vaidya and Anshuman Dutt and Vivek Narasayya and
Surajit Chaudhuri",
title = "Leveraging query logs and machine learning for
parametric query optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "401--413",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494126",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494126",
abstract = "Parametric query optimization (PQO) must address two
problems: identify a relatively small number of plans
to cache for a parameterized query (populateCache), and
efficiently select the best cached plan to use for
executing any instance of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2021:PTS,
author = "Yao Lu and Srikanth Kandula and Arnd Christian
K{\"o}nig and Surajit Chaudhuri",
title = "Pre-training summarization models of structured
datasets for cardinality estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "414--426",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494127",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494127",
abstract = "We consider the problem of pre-training models which
convert structured datasets into succinct summaries
that can be used to answer cardinality estimation
queries. Doing so avoids per-dataset training and, in
our experiments, reduces the time to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rao:2021:XEF,
author = "Susie Xi Rao and Shuai Zhang and Zhichao Han and Zitao
Zhang and Wei Min and Zhiyao Chen and Yinan Shan and
Yang Zhao and Ce Zhang",
title = "{xFraud}: explainable fraud transaction detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "427--436",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494128",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494128",
abstract = "At online retail platforms, it is crucial to actively
detect the risks of transactions to improve customer
experience and minimize financial loss. In this work,
we propose xFraud, an explainable fraud transaction
prediction framework which is mainly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2021:SMG,
author = "Ye Yuan and Delong Ma and Zhenyu Wen and Zhiwei Zhang
and Guoren Wang",
title = "Subgraph matching over graph federation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "437--450",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494129",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494129",
abstract = "Many real-life applications require processing graph
data across heterogeneous sources. In this paper, we
define the graph federation that indicates that the
graph data sources are temporarily federated and offer
their data for users. Next, we propose a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Niu:2021:PBD,
author = "Xing Niu and Boris Glavic and Ziyu Liu and Pengyuan Li
and Dieter Gawlick and Vasudha Krishnaswamy and Zhen
Hua Liu and Danica Porobic",
title = "Provenance-based data skipping",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "451--464",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494130",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494130",
abstract = "Database systems use static analysis to determine
upfront which data is needed for answering a query and
use indexes and other physical design techniques to
speed-up access to that data. However, for important
classes of queries, e.g., HAVING and top-k \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jin:2021:DTL,
author = "Di Jin and Bunyamin Sisman and Hao Wei and Xin Luna
Dong and Danai Koutra",
title = "Deep transfer learning for multi-source entity linkage
via domain adaptation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "465--477",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494131",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494131",
abstract = "Multi-source entity linkage focuses on integrating
knowledge from multiple sources by linking the records
that represent the same real world entity. This is
critical in high-impact applications such as data
cleaning and user stitching. The state-of-the-art
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xing:2021:EEI,
author = "Lu Xing and Eric Lee and Tong An and Bo-Cheng Chu and
Ahmed Mahmood and Ahmed M. Aly and Jianguo Wang and
Walid G. Aref",
title = "An experimental evaluation and investigation of waves
of misery in {$R$}-trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "478--490",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494132",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494132",
abstract = "Waves of misery is a phenomenon where spikes of many
node splits occur over short periods of time in tree
indexes. Waves of misery negatively affect the
performance of tree indexes in insertion-heavy
workloads. Waves of misery have been first observed
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:PPR,
author = "Yongyi Liu and Ahmed R. Mahmood and Amr Magdy and
Sergio Rey",
title = "{PRUC}: {P-regions} with user-defined constraint",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "491--503",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494133",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494133",
abstract = "This paper introduces a generalized spatial
regionalization problem, namely, PRUC (P-Regions with
User-defined Constraint) that partitions spatial areas
into homogeneous regions. PRUC accounts for
user-defined constraints imposed over aggregate region
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2021:PIR,
author = "Yile Chen and Xiucheng Li and Gao Cong and Cheng Long
and Zhifeng Bao and Shang Liu and Wanli Gu and Fuzheng
Zhang",
title = "Points-of-interest relationship inference with
spatial-enriched graph neural networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "504--512",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494134",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494134",
abstract = "As a fundamental component in location-based services,
inferring the relationship between points-of-interests
(POIs) is very critical for service providers to offer
good user experience to business owners and customers.
Most of the existing methods for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2021:SSA,
author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U. and Byron
Choi and Jianliang Xu",
title = "{SAFE}: a share-and-aggregate bandwidth exploration
framework for kernel density visualization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "513--526",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494135",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494135",
abstract = "Kernel density visualization (KDV) has been the de
facto method in many spatial analysis tasks, including
ecological modeling, crime hotspot detection, traffic
accident hotspot detection, and disease outbreak
detection. In these tasks, domain experts \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dittrich:2021:NYD,
author = "Jens Dittrich and Joris Nix and Christian Sch{\"o}n",
title = "The next 50 years in database indexing or: the case
for automatically generated index structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "527--540",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494136",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494136",
abstract = "Index structures are a building block of query
processing and computer science in general. Since the
dawn of computer technology there have been index
structures. And since then, a myriad of index
structures are being invented and published each and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chapnik:2021:DDA,
author = "Koral Chapnik and Ilya Kolchinsky and Assaf Schuster",
title = "{DARLING}: data-aware load shedding in complex event
processing systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "541--554",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494137",
abstract = "Complex event processing (CEP) is widely employed to
detect user-defined combinations, or patterns, of
events in massive streams of incoming data. Numerous
applications such as healthcare, fraud detection, and
more, use CEP technologies to capture \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhuo:2021:RMO,
author = "Danyang Zhuo and Kaiyuan Zhang and Zhuohan Li and
Siyuan Zhuang and Stephanie Wang and Ang Chen and Ion
Stoica",
title = "Rearchitecting in-memory object stores for low
latency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "555--568",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494138",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494138",
abstract = "Low latency is increasingly critical for modern
workloads, to the extent that compute functions are
explicitly scheduled to be co-located with their
in-memory object stores for faster access. However, the
traditional object store architecture mandates
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ma:2021:MTE,
author = "Pingchuan Ma and Shuai Wang",
title = "{MT-Teql}: evaluating and augmenting neural {NLIDB} on
real-world linguistic and schema variations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "569--582",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494139",
abstract = "Natural Language Interface to Database (NLIDB)
translates human utterances into SQL queries and
enables database interactions for non-expert users.
Recently, neural network models have become a major
approach to implementing NLIDB. However, neural NLIDB
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shi:2021:TPE,
author = "Jessica Shi and Laxman Dhulipala and Julian Shun",
title = "Theoretically and practically efficient parallel
nucleus decomposition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "583--596",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494140",
abstract = "This paper studies the nucleus decomposition problem,
which has been shown to be useful in finding dense
substructures in graphs. We present a novel parallel
algorithm that is efficient both in theory and in
practice. Our algorithm achieves a work \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2021:AHP,
author = "Baotong Lu and Jialin Ding and Eric Lo and Umar Farooq
Minhas and Tianzheng Wang",
title = "{APEX}: a high-performance learned index on persistent
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "597--610",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494141",
abstract = "The recently released persistent memory (PM) offers
high performance, persistence, and is cheaper than
DRAM. This opens up new possibilities for indexes that
operate and persist data directly on the memory bus.
Recent learned indexes exploit data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Campos:2021:UTS,
author = "David Campos and Tung Kieu and Chenjuan Guo and
Feiteng Huang and Kai Zheng and Bin Yang and Christian
S. Jensen",
title = "Unsupervised time series outlier detection with
diversity-driven convolutional ensembles",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "611--623",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494142",
abstract = "With the sweeping digitalization of societal, medical,
industrial, and scientific processes, sensing
technologies are being deployed that produce increasing
volumes of time series data, thus fueling a plethora of
new or improved applications. In this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Miao:2021:EED,
author = "Xiaoye Miao and Yangyang Wu and Lu Chen and Yunjun Gao
and Jun Wang and Jianwei Yin",
title = "Efficient and effective data imputation with influence
functions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "624--632",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494143",
abstract = "Data imputation has been extensively explored to solve
the missing data problem. The dramatically rising
volume of missing data makes the training of imputation
models computationally infeasible in real-life
scenarios. In this paper, we propose an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kochsiek:2021:PTK,
author = "Adrian Kochsiek and Rainer Gemulla",
title = "Parallel training of knowledge graph embedding models:
a comparison of techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "633--645",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494144",
abstract = "Knowledge graph embedding (KGE) models represent the
entities and relations of a knowledge graph (KG) using
dense continuous representations called embeddings. KGE
methods have recently gained traction for tasks such as
knowledge graph completion and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vitagliano:2021:DLT,
author = "Gerardo Vitagliano and Lan Jiang and Felix Naumann",
title = "Detecting layout templates in complex multiregion
files",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "646--658",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494145",
abstract = "Spreadsheets are among the most commonly used file
formats for data management, distribution, and
analysis. Their widespread employment makes it easy to
gather large collections of data, but their flexible
canvas-based structure makes automated analysis
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Maliszewski:2021:WPJ,
author = "Kajetan Maliszewski and Jorge-Arnulfo Quian{\'e}-Ruiz
and Jonas Traub and Volker Markl",
title = "What is the price for joining securely?: benchmarking
equi-joins in trusted execution environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "659--672",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494146",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494146",
abstract = "Protection of personal data has been raised to be
among the top requirements of modern systems. At the
same time, it is now frequent that the owner of the
data and the owner of the computing infrastructure are
two entities with limited trust between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ho:2021:ETP,
author = "Van Long Ho and Nguyen Ho and Torben Bach Pedersen",
title = "Efficient temporal pattern mining in big time series
using mutual information",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "673--685",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494147",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494147",
abstract = "Very large time series are increasingly available from
an ever wider range of IoT-enabled sensors deployed in
different environments. Significant insights can be
gained by mining temporal patterns from these time
series. Unlike traditional pattern \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:ELC,
author = "Junhua Zhang and Long Yuan and Wentao Li and Lu Qin
and Ying Zhang",
title = "Efficient label-constrained shortest path queries on
road networks: a tree decomposition approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "686--698",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494148",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494148",
abstract = "Computing the shortest path between two vertices is a
fundamental problem in road networks. Most of the
existing works assume that the edges in the road
networks have no labels, but in many real applications,
the edges have labels and label constraints \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Suri:2021:ENC,
author = "Sahaana Suri and Ihab F. Ilyas and Christopher R{\'e}
and Theodoros Rekatsinas",
title = "{Ember}: no-code context enrichment via
similarity-based keyless joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "699--712",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494149",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494149",
abstract = "Structured data, or data that adheres to a pre-defined
schema, can suffer from fragmented context: information
describing a single entity can be scattered across
multiple datasets or tables tailored for specific
business needs, with no explicit linking \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vu:2021:IPE,
author = "Tin Vu and Ahmed Eldawy and Vagelis Hristidis and
Vassilis Tsotras",
title = "Incremental partitioning for efficient spatial data
analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "713--726",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494150",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494150",
abstract = "Big spatial data has become ubiquitous, from mobile
applications to satellite data. In most of these
applications, data is continuously growing to huge
volumes. Existing systems for big spatial data organize
records at either the record-level or block-level
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2021:LAV,
author = "Doris Jung-Lin Lee and Dixin Tang and Kunal Agarwal
and Thyne Boonmark and Caitlyn Chen and Jake Kang and
Ujjaini Mukhopadhyay and Jerry Song and Micah Yong and
Marti A. Hearst and Aditya G. Parameswaran",
title = "{Lux}: always-on visualization recommendations for
exploratory dataframe workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "727--738",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494151",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494151",
abstract = "Exploratory data science largely happens in
computational notebooks with dataframe APIs, such as
pandas, that support flexible means to transform,
clean, and analyze data. Yet, visually exploring data
in dataframes remains tedious, requiring substantial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Petersohn:2021:FRB,
author = "Devin Petersohn and Dixin Tang and Rehan Durrani and
Areg Melik-Adamyan and Joseph E. Gonzalez and Anthony
D. Joseph and Aditya G. Parameswaran",
title = "Flexible rule-based decomposition and metadata
independence in {Modin}: a parallel dataframe system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "3",
pages = "739--751",
month = nov,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3494124.3494152",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Feb 5 06:35:56 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3494124.3494152",
abstract = "Dataframes have become universally popular as a means
to represent data in various stages of structure, and
manipulate it using a rich set of operators---thereby
becoming an essential tool in the data scientists'
toolbox. However, dataframe systems, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Han:2021:CED,
author = "Yuxing Han and Ziniu Wu and Peizhi Wu and Rong Zhu and
Jingyi Yang and Liang Wei Tan and Kai Zeng and Gao Cong
and Yanzhao Qin and Andreas Pfadler and Zhengping Qian
and Jingren Zhou and Jiangneng Li and Bin Cui",
title = "Cardinality estimation in {DBMS}: a comprehensive
benchmark evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "752--765",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503586",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503586",
abstract = "Cardinality estimation (CardEst) plays a significant
role in generating high-quality query plans for a query
optimizer in DBMS. In the last decade, an increasing
number of advanced CardEst methods (especially
ML-based) have been proposed with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2021:RRD,
author = "Qizhen Zhang and Philip A. Bernstein and Daniel S.
Berger and Badrish Chandramouli",
title = "{Redy}: remote dynamic memory cache",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "766--779",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503587",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503587",
abstract = "Redy is a cloud service that provides high performance
caches using RDMA-accessible remote memory. An
application can customize the performance of each cache
with a service level objective (SLO) for latency and
throughput. By using remote memory, it can \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boissier:2021:RBC,
author = "Martin Boissier",
title = "Robust and budget-constrained encoding configurations
for in-memory database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "780--793",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503588",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503588",
abstract = "Data encoding has been applied to database systems for
decades as it mitigates bandwidth bottlenecks and
reduces storage requirements. But even in the presence
of these advantages, most in-memory database systems
use data encoding only conservatively as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tan:2021:FNR,
author = "Shulong Tan and Weijie Zhao and Ping Li",
title = "Fast neural ranking on bipartite graph indices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "794--803",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503589",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503589",
abstract = "Neural network based ranking has been widely adopted
owing to its powerful capacity in modeling complex
relationships (e.g., users and items, questions and
answers). Online neural network ranking, i.e., the so
called fast neural ranking, is considered a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gan:2021:BSD,
author = "Shaoduo Gan and Jiawei Jiang and Binhang Yuan and Ce
Zhang and Xiangru Lian and Rui Wang and Jianbin Chang
and Chengjun Liu and Hongmei Shi and Shengzhuo Zhang
and Xianghong Li and Tengxu Sun and Sen Yang and Ji
Liu",
title = "{Bagua}: scaling up distributed learning with system
relaxations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "804--813",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503590",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503590",
abstract = "Recent years have witnessed a growing list of systems
for distributed data-parallel training. Existing
systems largely fit into two paradigms, i.e., parameter
server and MPI-style collective operations. On the
algorithmic side, researchers have proposed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2021:SCO,
author = "Tsz Nam Chan and Pak Lon Ip and Leong Hou U. and Byron
Choi and Jianliang Xu",
title = "{SWS}: a complexity-optimized solution for
spatial-temporal kernel density visualization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "814--827",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503591",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503591",
abstract = "Spatial-temporal kernel density visualization (STKDV)
has been extensively used in a wide range of
applications, e.g., disease outbreak analysis, traffic
accident hotspot detection, and crime hotspot
detection. While STKDV can provide accurate and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2021:PFA,
author = "Junxu Liu and Jian Lou and Li Xiong and Jinfei Liu and
Xiaofeng Meng",
title = "Projected federated averaging with heterogeneous
differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "828--840",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503592",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503592",
abstract = "Federated Learning (FL) is a promising framework for
multiple clients to learn a joint model without
directly sharing the data. In addition to high utility
of the joint model, rigorous privacy protection of the
data and communication efficiency are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Haimovich:2021:PPS,
author = "Daniel Haimovich and Dima Karamshuk and Thomas J.
Leeper and Evgeniy Riabenko and Milan Vojnovic",
title = "Popularity prediction for social media over arbitrary
time horizons",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "841--849",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503593",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503593",
abstract = "Predicting the popularity of social media content in
real time requires approaches that efficiently operate
at global scale. Popularity prediction is important for
many applications, including detection of harmful viral
content to enable timely content \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Doshi:2021:LWS,
author = "Ishita Doshi and Dhritiman Das and Ashish Bhutani and
Rajeev Kumar and Rushi Bhatt and Niranjan
Balasubramanian",
title = "{LANNS}: a web-scale approximate nearest neighbor
lookup system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "850--858",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503594",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503594",
abstract = "Nearest neighbor search (NNS) has a wide range of
applications in information retrieval, computer vision,
machine learning, databases, and other areas. Existing
state-of-the-art algorithm for nearest neighbor search,
Hierarchical Navigable Small World \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pena:2021:FDD,
author = "Eduardo H. M. Pena and Eduardo C. de Almeida and Felix
Naumann",
title = "Fast detection of denial constraint violations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "859--871",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503595",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503595",
abstract = "The detection of constraint-based errors is a critical
task in many data cleaning solutions. Previous works
perform the task either using traditional data
management systems or using specialized systems that
speed up error detection. Unfortunately, both
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2021:CFF,
author = "Bowen Yu and Guanyu Feng and Huanqi Cao and Xiaohan Li
and Zhenbo Sun and Haojie Wang and Xiaowei Zhu and
Weimin Zheng and Wenguang Chen",
title = "{Chukonu}: a fully-featured high-performance big data
framework that integrates a native compute engine into
{Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "872--885",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503596",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503596",
abstract = "Apache Spark is a widely deployed big data analytics
framework that offers such attractive features as
resiliency, load-balancing, and a rich ecosystem.
However, there is still plenty of room for improvement
in its performance. Although a data-parallel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jin:2021:CNM,
author = "Sian Jin and Chengming Zhang and Xintong Jiang and
Yunhe Feng and Hui Guan and Guanpeng Li and Shuaiwen
Leon Song and Dingwen Tao",
title = "{COMET}: a novel memory-efficient deep learning
training framework by using error-bounded lossy
compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "886--899",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503597",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503597",
abstract = "Deep neural networks (DNNs) are becoming increasingly
deeper, wider, and non-linear due to the growing
demands on prediction accuracy and analysis quality.
Training wide and deep neural networks require large
amounts of storage resources such as memory \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2021:FMF,
author = "Zitao Li and Bolin Ding and Ce Zhang and Ninghui Li
and Jingren Zhou",
title = "Federated matrix factorization with privacy
guarantee",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "900--913",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503598",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503598",
abstract = "Matrix factorization (MF) approximates unobserved
ratings in a rating matrix, whose rows correspond to
users and columns correspond to items to be rated, and
has been serving as a fundamental building block in
recommendation systems. This paper \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Duong:2021:SRG,
author = "Chi Thang Duong and Trung Dung Hoang and Hongzhi Yin
and Matthias Weidlich and Quoc Viet Hung Nguyen and
Karl Aberer",
title = "Scalable robust graph embedding with {Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "914--922",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503599",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503599",
abstract = "Graph embedding aims at learning a vector-based
representation of vertices that incorporates the
structure of the graph. This representation then
enables inference of graph properties. Existing graph
embedding techniques, however, do not scale well to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paul:2021:DWC,
author = "Debjyoti Paul and Jie Cao and Feifei Li and Vivek
Srikumar",
title = "Database workload characterization with query plan
encoders",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "923--935",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503600",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503600",
abstract = "Smart databases are adopting artificial intelligence
(AI) technologies to achieve instance optimality, and
in the future, databases will come with prepackaged AI
models within their core components. The reason is that
every database runs on different \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Modi:2021:NQO,
author = "Abhishek Modi and Kaushik Rajan and Srinivas Thimmaiah
and Prakhar Jain and Swinky Mann and Ayushi Agarwal and
Ajith Shetty and {Shahid K. I.} and Ashit Gosalia and
Partho Sarthi",
title = "New query optimization techniques in the {Spark}
engine of {Azure Synapse}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "936--948",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503601",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503601",
abstract = "The cost of big-data query execution is dominated by
stateful operators. These include sort and
hash-aggregate that typically materialize intermediate
data in memory, and exchange that materializes data to
disk and transfers data over the network. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sinthong:2021:DDQ,
author = "Phanwadee Sinthong and Dhaval Patel and Nianjun Zhou
and Shrey Shrivastava and Arun Iyengar and Anuradha
Bhamidipaty",
title = "{DQDF}: data-quality-aware dataframes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "949--957",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503602",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503602",
abstract = "Data quality assessment is an essential process of any
data analysis process including machine learning. The
process is time-consuming as it involves multiple
independent data quality checks that are performed
iteratively at scale on evolving data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Agarwal:2021:RGC,
author = "Archita Agarwal and Marilyn George and Aaron Jeyaraj
and Malte Schwarzkopf",
title = "Retrofitting {GDPR} compliance onto legacy databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "958--970",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503603",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503603",
abstract = "New privacy laws like the European Union's General
Data Protection Regulation (GDPR) require database
administrators (DBAs) to identify all information
related to an individual on request, e.g., to return or
delete it. This requires time-consuming \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2021:AAC,
author = "Xinle Wu and Dalin Zhang and Chenjuan Guo and Chaoyang
He and Bin Yang and Christian S. Jensen",
title = "{AutoCTS}: automated correlated time series
forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "971--983",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503604",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503604",
abstract = "Correlated time series (CTS) forecasting plays an
essential role in many cyber-physical systems, where
multiple sensors emit time series that capture
interconnected processes. Solutions based on deep
learning that deliver state-of-the-art CTS \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sudhir:2021:RLM,
author = "Sivaprasad Sudhir and Michael Cafarella and Samuel
Madden",
title = "Replicated layout for in-memory database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "4",
pages = "984--997",
month = dec,
year = "2021",
CODEN = "????",
DOI = "https://doi.org/10.14778/3503585.3503606",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Apr 15 06:48:40 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3503585.3503606",
abstract = "Scanning and filtering are the foundations of
analytical database systems. Modern DBMSs employ a
variety of techniques to partition and layout data to
improve the performance of these operations. To
accelerate query performance, systems tune data layout
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sanghi:2022:PCD,
author = "Anupam Sanghi and Shadab Ahmed and Jayant R. Haritsa",
title = "Projection-compliant database generation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "998--1010",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510398",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510398",
abstract = "Synthesizing data using declarative formalisms has
been persuasively advocated in contemporary data
generation frameworks. In particular, they specify
operator output volumes through row-cardinality
constraints. However, thus far, adherence to these
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jin:2022:MRE,
author = "Guodong Jin and Semih Salihoglu",
title = "Making {RDBMSs} efficient on graph workloads through
predefined joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1011--1023",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510400",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510400",
abstract = "Joins in native graph database management systems
(GDBMSs) are predefined to the system as edges, which
are indexed in adjacency list indices and serve as
pointers. This contrasts with and can be more
performant than value-based joins in RDBMSs. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deep:2022:REJ,
author = "Shaleen Deep and Xiao Hu and Paraschos Koutris",
title = "Ranked enumeration of join queries with projections",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1024--1037",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510401",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510401",
abstract = "Join query evaluation with ordering is a fundamental
data processing task in relational database management
systems. SQL and custom graph query languages such as
Cypher offer this functionality by allowing users to
specify the order via the ORDER BY \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shin:2022:HSC,
author = "Ahnjae Shin and Joo Seong Jeong and Do Yoon Kim and
Soyoung Jung and Byung-Gon Chun",
title = "{Hippo}: sharing computations in hyper-parameter
optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1038--1052",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510402",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510402",
abstract = "Hyper-parameter optimization is crucial for pushing
the accuracy of a deep learning model to its limits.
However, a hyper-parameter optimization job, referred
to as a study, involves numerous trials of training a
model using different training knobs, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rinberg:2022:DJC,
author = "Arik Rinberg and Tomer Solomon and Roee Shlomo and Guy
Khazma and Gal Lushi and Idit Keidar and Paula
Ta-Shma",
title = "{DSON}: {JSON CRDT} using delta-mutations for document
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1053--1065",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510403",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510403",
abstract = "We propose DSON, a space efficient $ \delta $-based
CRDT approach for distributed JSON document stores,
enabling high availability at a global scale, while
providing strong eventual consistency guarantees. We
define the semantics of our CRDT based approach
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeighami:2022:NDD,
author = "Sepanta Zeighami and Ritesh Ahuja and Gabriel Ghinita
and Cyrus Shahabi",
title = "A neural database for differentially private spatial
range queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1066--1078",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510404",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510404",
abstract = "Mobile apps and location-based services generate large
amounts of location data. Location density information
from such datasets benefits research on traffic
optimization, context-aware notifications and public
health (e.g., disease spread). To preserve \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Maltry:2022:CAR,
author = "Marcel Maltry and Jens Dittrich",
title = "A critical analysis of recursive model indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1079--1091",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510405",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510405",
abstract = "The recursive model index (RMI) has recently been
introduced as a machine-learned replacement for
traditional indexes over sorted data, achieving
remarkably fast lookups. Follow-up work focused on
explaining RMI's performance and automatically
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ge:2022:HBD,
author = "Zerui Ge and Dumitrel Loghin and Beng Chin Ooi and
Pingcheng Ruan and Tianwen Wang",
title = "Hybrid blockchain database systems: design and
performance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1092--1104",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510406",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510406",
abstract = "With the emergence of hybrid blockchain database
systems, we aim to provide an in-depth analysis of the
performance and trade-offs among a few representative
systems. To achieve this goal, we implement Veritas and
BlockchainDB from scratch. For Veritas, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bonifati:2022:TQT,
author = "Angela Bonifati and Stefania Dumbrava and George
Fletcher and Jan Hidders and Matthias Hofer and Wim
Martens and Filip Murlak and Joshua Shinavier and
S{\l}awek Staworko and Dominik Tomaszuk",
title = "Threshold queries in theory and in the wild",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1105--1118",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510407",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510407",
abstract = "Threshold queries are an important class of queries
that only require computing or counting answers up to a
specified threshold value. To the best of our
knowledge, threshold queries have been largely
disregarded in the research literature, which is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sichert:2022:UDO,
author = "Moritz Sichert and Thomas Neumann",
title = "User-defined operators: efficiently integrating custom
algorithms into modern databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "5",
pages = "1119--1131",
month = jan,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3510397.3510408",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed May 25 08:14:25 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3510397.3510408",
abstract = "In recent years, complex data mining and machine
learning algorithms have become more common in data
analytics. Several specialized systems exist to
evaluate these algorithms on ever-growing data sets,
which are built to efficiently execute different
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:PEP,
author = "Yue Wang and Vivek Narasayya and Yeye He and Surajit
Chaudhuri",
title = "{PACk}: an efficient partition-based distributed
agglomerative hierarchical clustering algorithm for
deduplication",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1132--1145",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514062",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514062",
abstract = "The Agglomerative Hierarchical Clustering (AHC)
algorithm is widely used in real-world applications. As
data volumes continue to grow, efficient scale-out
techniques for AHC are becoming increasingly important.
In this paper, we propose a Partition- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chang:2022:NOA,
author = "Lijun Chang and Zhiyi Wang",
title = "A near-optimal approach to edge connectivity-based
hierarchical graph decomposition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1146--1158",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514063",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514063",
abstract = "Driven by applications in graph analytics, the problem
of efficiently computing all k -edge connected
components ( k -ECCs) of a graph G for a user-given k
has been extensively and well studied. It is known that
the k -ECCs of G for all possible values of k
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tong:2022:HFE,
author = "Yongxin Tong and Xuchen Pan and Yuxiang Zeng and
Yexuan Shi and Chunbo Xue and Zimu Zhou and Xiaofei
Zhang and Lei Chen and Yi Xu and Ke Xu and Weifeng Lv",
title = "{Hu-Fu}: efficient and secure spatial queries over
data federation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1159--1172",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514064",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514064",
abstract = "Data isolation has become an obstacle to scale up
query processing over big data, since sharing raw data
among data owners is often prohibitive due to security
concerns. A promising solution is to perform secure
queries over a federation of multiple \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fuchs:2022:SUT,
author = "Per Fuchs and Domagoj Margan and Jana Giceva",
title = "{Sortledton}: a universal, transactional graph data
structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1173--1186",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514065",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514065",
abstract = "Despite the wide adoption of graph processing across
many different application domains, there is no
underlying data structure that can serve a variety of
graph workloads (analytics, traversals, and pattern
matching) on dynamic graphs with transactional
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:NLF,
author = "Bowen Zhang and Shengan Zheng and Zhenlin Qi and
Linpeng Huang",
title = "{NBTree}: a lock-free {PM}-friendly persistent
{B+}-tree for {eADR}-enabled {PM} systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1187--1200",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514066",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514066",
abstract = "Persistent memory (PM) promises near-DRAM performance
as well as data persistency. Recently, a new feature
called eADR is available on the 2$^{nd}$ generation
Intel Optane PM with the 3$^{rd}$ generation Intel Xeon
Scalable Processors. eADR ensures that data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tuli:2022:TDT,
author = "Shreshth Tuli and Giuliano Casale and Nicholas R.
Jennings",
title = "{TranAD}: deep transformer networks for anomaly
detection in multivariate time series data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1201--1214",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514067",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514067",
abstract = "Efficient anomaly detection and diagnosis in
multivariate time-series data is of great importance
for modern industrial applications. However, building a
system that is able to quickly and accurately pinpoint
anomalous observations is a challenging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2022:SPO,
author = "Fuheng Zhao and Divyakant Agrawal and Amr {El Abbadi}
and Ahmed Metwally",
title = "{SpaceSaving$ \pm $}: an optimal algorithm for
frequency estimation and frequent items in the
bounded-deletion model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1215--1227",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514068",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See errata \cite{Zhao:2023:ESO}.",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514068",
abstract = "In this paper, we propose the first deterministic
algorithms to solve the frequency estimation and
frequent item problems in the bounded-deletion model.
We establish the space lower bound for solving the
deterministic frequent items problem in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2022:BEG,
author = "Chenguang Zheng and Hongzhi Chen and Yuxuan Cheng and
Zhezheng Song and Yifan Wu and Changji Li and James
Cheng and Hao Yang and Shuai Zhang",
title = "{ByteGNN}: efficient graph neural network training at
large scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1228--1242",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514069",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514069",
abstract = "Graph neural networks (GNNs) have shown excellent
performance in a wide range of applications such as
recommendation, risk control, and drug discovery. With
the increase in the volume of graph data, distributed
GNN systems become essential to support \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2022:QDG,
author = "Yuli Jiang and Yu Rong and Hong Cheng and Xin Huang
and Kangfei Zhao and Junzhou Huang",
title = "Query driven-graph neural networks for community
search: from non-attributed, attributed, to interactive
attributed",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1243--1255",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514070",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514070",
abstract = "Given one or more query vertices, Community Search
(CS) aims to find densely intra-connected and loosely
inter-connected structures containing query vertices.
Attributed Community Search (ACS), a related problem,
is more challenging since it finds \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:HTT,
author = "Yang Li and Yu Shen and Huaijun Jiang and Wentao Zhang
and Jixiang Li and Ji Liu and Ce Zhang and Bin Cui",
title = "{Hyper-Tune}: towards efficient hyper-parameter tuning
at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1256--1265",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514071",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514071",
abstract = "The ever-growing demand and complexity of machine
learning are putting pressure on hyper-parameter tuning
systems: while the evaluation cost of models continues
to increase, the scalability of state-of-the-arts
starts to become a crucial bottleneck. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Minartz:2022:MCD,
author = "Koen Minartz and Jens E. d'Hondt and Odysseas
Papapetrou",
title = "Multivariate correlations discovery in static and
streaming data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1266--1278",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514072",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514072",
abstract = "Correlation analysis is an invaluable tool in many
domains, for better understanding data and extracting
salient insights. Most works to date focus on detecting
high pairwise correlations. A generalization of this
problem with known applications but no \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Poppe:2022:MPA,
author = "Olga Poppe and Qun Guo and Willis Lang and Pankaj
Arora and Morgan Oslake and Shize Xu and Ajay Kalhan",
title = "{Moneyball}: proactive auto-scaling in {Microsoft
Azure SQL} database serverless",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1279--1287",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514073",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514073",
abstract = "Microsoft Azure SQL Database is among the leading
relational database service providers in the cloud.
Serverless compute automatically scales resources based
on workload demand. When a database becomes idle its
resources are reclaimed. When activity \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2022:PRP,
author = "Kewei Cheng and Xian Li and Yifan Ethan Xu and Xin
Luna Dong and Yizhou Sun",
title = "{PGE}: robust product graph embedding learning for
error detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1288--1296",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514074",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514074",
abstract = "Although product graphs (PGs) have gained increasing
attentions in recent years for their successful
applications in product search and recommendations, the
extensive power of PGs can be limited by the inevitable
involvement of various kinds of errors. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Manne:2022:CMR,
author = "Naga Nithin Manne and Shilvi Satpati and Tanu Malik
and Amitabha Bagchi and Ashish Gehani and Amitabh
Chaudhary",
title = "{CHEX}: multiversion replay with ordered checkpoints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "6",
pages = "1297--1310",
month = feb,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3514061.3514075",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:17 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3514061.3514075",
abstract = "In scientific computing and data science disciplines,
it is often necessary to share application workflows
and repeat results. Current tools containerize
application workflows, and share the resulting
container for repeating results. These tools, due to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Even:2022:PFP,
author = "Tomer Even and Guy Even and Adam Morrison",
title = "Prefix filter: practically and theoretically better
than {Bloom}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1311--1323",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523211",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523211",
abstract = "Many applications of approximate membership query data
structures, or filters, require only an incremental
filter that supports insertions but not deletions.
However, the design space of incremental filters is
missing a ``sweet spot'' filter that combines
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yamada:2022:SDS,
author = "Hiroyuki Yamada and Jun Nemoto",
title = "{Scalar DL}: scalable and practical {Byzantine} fault
detection for transactional database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1324--1336",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523212",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523212",
abstract = "This paper presents Scalar DL, a Byzantine fault
detection (BFD) middleware for transactional database
systems. Scalar DL manages two separately administered
database replicas in a database system and can detect
Byzantine faults in the database system \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2022:NLR,
author = "Gyuyeong Kim and Wonjun Lee",
title = "In-network leaderless replication for distributed data
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1337--1349",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523213",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523213",
abstract = "Leaderless replication allows any replica to handle
any type of request to achieve read scalability and
high availability for distributed data stores. However,
this entails burdensome coordination overhead of
replication protocols, degrading write \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2022:FAC,
author = "Xin Sun and Xin Huang and Di Jin",
title = "Fast algorithms for core maximization on large
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1350--1362",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523214",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523214",
abstract = "Core maximization, that enlarges the $k$-core as much
as possible by inserting a few new edges into a graph,
is particularly useful for social group engagement and
network stability improvement. However, the core
maximization problem has been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pan:2022:NSC,
author = "Shuye Pan and Peng Wang and Chen Wang and Wei Wang and
Jianmin Wang",
title = "{NLC}: search correlated window pairs on long time
series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1363--1375",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523215",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523215",
abstract = "Nowadays, many applications, like Internet of Things
and Industrial Internet, collect data points from
sensors continuously to form long time series. Finding
correlation between time series is a fundamental task
for many time series mining problems. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:EBL,
author = "Hanzhi Wang and Zhewei Wei and Junhao Gan and Ye Yuan
and Xiaoyong Du and Ji-Rong Wen",
title = "Edge-based local push for personalized {PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1376--1389",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523216",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523216",
abstract = "Personalized PageRank (PPR) is a popular node
proximity metric in graph mining and network research.
A single-source PPR (SSPPR) query asks for the PPR
value of each node on the graph. Due to its importance
and wide applications, decades of efforts have
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2022:CSD,
author = "Harry Kai-Ho Chan and Huan Li and Xiao Li and Hua Lu",
title = "Continuous social distance monitoring in indoor
space",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1390--1402",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523217",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523217",
abstract = "The COVID-19 pandemic has caused over 6 million deaths
since 2020. To contain the spread of the virus, social
distancing is one of the most simple yet effective
approaches. Motivated by this, in this paper we study
the problem of continuous social \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2022:DSC,
author = "Xibo Sun and Shixuan Sun and Qiong Luo and Bingsheng
He",
title = "An in-depth study of continuous subgraph matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1403--1416",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523218",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523218",
abstract = "Continuous subgraph matching (CSM) algorithms find the
occurrences of a given pattern on a stream of data
graphs online. A number of incremental CSM algorithms
have been proposed. However, a systematical study on
these algorithms is missing to identify \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mishra:2022:OST,
author = "Abhinav Mishra and Ram Sriharsha and Sichen Zhong",
title = "{OnlineSTL}: scaling time series decomposition by
$100 \times$",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1417--1425",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523219",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523219",
abstract = "Decomposing a complex time series into trend,
seasonality, and remainder components is an important
primitive that facilitates time series anomaly
detection, change point detection, and forecasting.
Although numerous batch algorithms are known for time
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:SSS,
author = "Haoyu Li and Qizhi Chen and Yixin Zhang and Tong Yang
and Bin Cui",
title = "{Stingy sketch}: a sketch framework for accurate and
fast frequency estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1426--1438",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523220",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523220",
abstract = "Recording the frequency of items in highly skewed data
streams is a fundamental and hot problem in recent
years. The literature demonstrates that sketch is the
most promising solution. The typical metrics to measure
a sketch are accuracy and speed, but \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:SDP,
author = "Yang Wang and Miao Yu and Yujie Hui and Fang Zhou and
Yuyang Huang and Rui Zhu and Xueyuan Ren and Tianxi Li
and Xiaoyi Lu",
title = "A study of database performance sensitivity to
experiment settings",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1439--1452",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523221",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523221",
abstract = "To allow performance comparison across different
systems, our community has developed multiple
benchmarks, such as TPC-C and YCSB, which are widely
used. However, despite such effort, interpreting and
comparing performance numbers is still a challenging
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chao:2022:ITC,
author = "Zemin Chao and Hong Gao and Yinan An and Jianzhong
Li",
title = "The inherent time complexity and an efficient
algorithm for subsequence matching problem",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1453--1465",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523222",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523222",
abstract = "Subsequence matching is an important and fundamental
problem on time series data. This paper studies the
inherent time complexity of the subsequence matching
problem and designs a more efficient algorithm for
solving the problem. Firstly, it is proved \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chai:2022:SDA,
author = "Chengliang Chai and Jiabin Liu and Nan Tang and
Guoliang Li and Yuyu Luo",
title = "Selective data acquisition in the wild for model
charging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1466--1478",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523223",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523223",
abstract = "The lack of sufficient labeled data is a key
bottleneck for practitioners in many real-world
supervised machine learning (ML) tasks. In this paper,
we study a new problem, namely selective data
acquisition in the wild for model charging: given a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2022:DAR,
author = "Wenfei Fan and Wenzhi Fu and Ruochun Jin and Ping Lu
and Chao Tian",
title = "Discovering association rules from big graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1479--1492",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523224",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523224",
abstract = "This paper tackles two challenges to discovery of
graph rules. Existing discovery methods often (a)
return an excessive number of rules, and (b) do not
scale with large graphs given the intractability of the
discovery problem. We propose an application-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Han:2022:DEE,
author = "Xiaolin Han and Reynold Cheng and Chenhao Ma and
Tobias Grubenmann",
title = "{DeepTEA}: effective and efficient online
time-dependent trajectory outlier detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1493--1505",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523225",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523225",
abstract = "In this paper, we study anomalous trajectory
detection, which aims to extract abnormal movements of
vehicles on the roads. This important problem, which
facilitates understanding of traffic behavior and
detection of taxi fraud, is challenging due to the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Simonini:2022:ERD,
author = "Giovanni Simonini and Luca Zecchini and Sonia
Bergamaschi and Felix Naumann",
title = "Entity resolution on-demand",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "7",
pages = "1506--1518",
month = mar,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3523210.3523226",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:18 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3523210.3523226",
abstract = "Entity Resolution (ER) aims to identify and merge
records that refer to the same real-world entity. ER is
typically employed as an expensive cleaning step on the
entire data before consuming it. Yet, determining which
entities are useful once cleaned \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alhazmi:2022:FBC,
author = "Afnan Alhazmi and Tom Blount and George
Konstantinidis",
title = "{ForBackBench}: a benchmark for chasing vs.
query-rewriting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1519--1532",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529338",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529338",
abstract = "The problems of Data Integration/Exchange (DE) and
Ontology Based Data Access (OBDA) have been extensively
studied across different communities. The underlying
problem is common: using a number of differently
structured data-sources mapped to a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:ASB,
author = "Jeremy Chen and Yuqing Huang and Mushi Wang and Semih
Salihoglu and Ken Salem",
title = "Accurate summary-based cardinality estimation through
the lens of cardinality estimation graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1533--1545",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529339",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529339",
abstract = "This paper is an experimental and analytical study of
two classes of summary-based cardinality estimators
that use statistics about input relations and
small-size joins in the context of graph database
management systems: (i) optimistic estimators that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liao:2022:DDC,
author = "Xuankun Liao and Qing Liu and Jiaxin Jiang and Xin
Huang and Jianliang Xu and Byron Choi",
title = "Distributed {D-core} decomposition over large directed
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1546--1558",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529340",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529340",
abstract = "Given a directed graph $G$ and integers $k$ and $l$, a
D-core is the maximal subgraph $H \subseteq G$ such
that for every vertex of $H$, its in-degree and
out-degree are no smaller than $k$ and $l$,
respectively. For a directed graph $G$, the problem of
D-core decomposition \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:EMB,
author = "Lu Chen and Chengfei Liu and Rui Zhou and Jiajie Xu
and Jianxin Li",
title = "Efficient maximal biclique enumeration for large
sparse bipartite graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1559--1571",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529341",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529341",
abstract = "Maximal bicliques are effective to reveal meaningful
information hidden in bipartite graphs. Maximal
biclique enumeration (MBE) is challenging since the
number of the maximal bicliques grows exponentially
w.r.t. the number of vertices in a bipartite \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2022:TGF,
author = "Hongkuan Zhou and Da Zheng and Israt Nisa and
Vasileios Ioannidis and Xiang Song and George Karypis",
title = "{TGL}: a general framework for temporal {GNN} training
on billion-scale graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1572--1580",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529342",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529342",
abstract = "Many real world graphs contain time domain
information. Temporal Graph Neural Networks capture
temporal information as well as structural and
contextual information in the generated dynamic node
embeddings. Researchers have shown that these
embeddings \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2022:DLF,
author = "Binhang Yuan and Cameron R. Wolfe and Chen Dun and
Yuxin Tang and Anastasios Kyrillidis and Chris
Jermaine",
title = "Distributed learning of fully connected neural
networks using independent subnet training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1581--1590",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529343",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529343",
abstract = "Distributed machine learning (ML) can bring more
computational resources to bear than single-machine
learning, thus enabling reductions in training time.
Distributed learning partitions models and data over
many machines, allowing model and dataset \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Burckhardt:2022:NEE,
author = "Sebastian Burckhardt and Badrish Chandramouli and
Chris Gillum and David Justo and Konstantinos Kallas
and Connor McMahon and Christopher S. Meiklejohn and
Xiangfeng Zhu",
title = "{Netherite}: efficient execution of serverless
workflows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1591--1604",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529344",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529344",
abstract = "Serverless is a popular choice for cloud service
architects because it can provide scalability and
load-based billing with minimal developer effort.
Functions-as-a-service (FaaS) are originally stateless,
but emerging frameworks add stateful \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huynh:2022:ERT,
author = "Andy Huynh and Harshal A. Chaudhari and Evimaria Terzi
and Manos Athanassoulis",
title = "{Endure}: a robust tuning paradigm for {LSM} trees
under workload uncertainty",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1605--1618",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529345",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529345",
abstract = "Log-Structured Merge trees (LSM trees) are
increasingly used as the storage engines behind several
data systems, frequently deployed in the cloud. Similar
to other database architectures, LSM trees consider
information about the expected workload (e.g.,
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:EDB,
author = "Hongzheng Li and Yingxia Shao and Junping Du and Bin
Cui and Lei Chen",
title = "An {I/O}-efficient disk-based graph system for
scalable second-order random walk of large graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1619--1631",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529346",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529346",
abstract = "Random walk is widely used in many graph analysis
tasks, especially the first-order random walk. However,
as a simplification of real-world problems, the
first-order random walk is poor at modeling
higher-order structures in the data. Recently, second-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vaidya:2022:SLE,
author = "Kapil Vaidya and Subarna Chatterjee and Eric Knorr and
Michael Mitzenmacher and Stratos Idreos and Tim
Kraska",
title = "{SNARF}: a learning-enhanced range filter",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1632--1644",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529347",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529347",
abstract = "We present Sparse Numerical Array-Based Range Filters
(SNARF), a learned range filter that efficiently
supports range queries for numerical data. SNARF
creates a model of the data distribution to map the
keys into a bit array which is stored in a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:DEI,
author = "Xin Chen and You Peng and Sibo Wang and Jeffrey Xu
Yu",
title = "{DLCR}: efficient indexing for label-constrained
reachability queries on large dynamic graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1645--1657",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529348",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529348",
abstract = "Many real-world graphs, e.g., social networks,
biological networks, knowledge graphs, naturally come
with edge-labels, with different labels representing
different relationships between nodes. On such
edge-labeled graphs, an important query is the label-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2022:QTT,
author = "Yue Zhao and Gao Cong and Jiachen Shi and Chunyan
Miao",
title = "{QueryFormer}: a tree transformer model for query plan
representation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1658--1670",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529349",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529349",
abstract = "Machine learning has become a prominent method in many
database optimization problems such as cost estimation,
index selection and query optimization. Translating
query execution plans into their vectorized
representations is non-trivial. Recently, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2022:ICI,
author = "Leon Lee and Siphrey Xie and Yunus Ma and Shimin
Chen",
title = "Index checkpoints for instant recovery in in-memory
database systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1671--1683",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529350",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529350",
abstract = "We observe that the time bottleneck during the
recovery phase of an IMDB (In-Memory DataBase system)
shifts from log replaying to index rebuilding after the
state-of-art techniques for instant recovery have been
applied. In this paper, we investigate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Esmailoghli:2022:MMA,
author = "Mahdi Esmailoghli and Jorge-Arnulfo Quian{\'e}-Ruiz
and Ziawasch Abedjan",
title = "{MATE}: multi-attribute table extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1684--1696",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529353",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529353",
abstract = "A core operation in data discovery is to find joinable
tables for a given table. Real-world tables include
both unary and n-ary join keys. However, existing table
discovery systems are optimized for unary joins and are
ineffective and slow in the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paparrizos:2022:TUE,
author = "John Paparrizos and Yuhao Kang and Paul Boniol and
Ruey S. Tsay and Themis Palpanas and Michael J.
Franklin",
title = "{TSB-UAD}: an end-to-end benchmark suite for
univariate time-series anomaly detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1697--1711",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529354",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529354",
abstract = "The detection of anomalies in time series has gained
ample academic and industrial attention. However, no
comprehensive benchmark exists to evaluate time-series
anomaly detection methods. It is common to use (i)
proprietary or synthetic data, often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Leone:2022:CRE,
author = "Manuel Leone and Stefano Huber and Akhil Arora and
Alberto Garc{\'\i}a-Dur{\'a}n and Robert West",
title = "A critical re-evaluation of neural methods for entity
alignment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1712--1725",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529355",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529355",
abstract = "Neural methods have become the de-facto choice for the
vast majority of data analysis tasks, and entity
alignment (EA) is no exception. Not surprisingly, more
than 50 different neural EA methods have been published
since 2017. However, surprisingly, an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paganelli:2022:AHB,
author = "Matteo Paganelli and Francesco {Del Buono} and Andrea
Baraldi and Francesco Guerra",
title = "Analyzing how {BERT} performs entity matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "8",
pages = "1726--1738",
month = apr,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3529337.3529356",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 24 09:22:19 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3529337.3529356",
abstract = "State-of-the-art Entity Matching (EM) approaches rely
on transformer architectures, such as BERT, for
generating highly contextualized embeddings of terms.
The embeddings are then used to predict whether pairs
of entity descriptions refer to the same \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arun:2022:SBF,
author = "Balaji Arun and Binoy Ravindran",
title = "Scalable {Byzantine} fault tolerance via partial
decentralization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1739--1752",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538599",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538599",
abstract = "Byzantine consensus is a critical component in many
permissioned Blockchains and distributed ledgers. We
propose a new paradigm for designing BFT protocols
called DQBFT that addresses three major performance and
scalability challenges that plague past \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:EEB,
author = "Huan Li and Lanjing Yi and Bo Tang and Hua Lu and
Christian S. Jensen",
title = "Efficient and error-bounded spatiotemporal quantile
monitoring in edge computing environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1753--1765",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538600",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538600",
abstract = "Underlying many types of data analytics, a
spatiotemporal quantile monitoring (SQM) query
continuously returns the quantiles of a dataset
observed in a spatiotemporal range. In this paper, we
study SQM in an Internet of Things (IoT) based edge
computing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kato:2022:HDP,
author = "Fumiyuki Kato and Tsubasa Takahashi and Shun Takagi
and Yang Cao and Seng Pei Liew and Masatoshi
Yoshikawa",
title = "{HDPView}: differentially private materialized view
for exploring high dimensional relational data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1766--1778",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538601",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538601",
abstract = "How can we explore the unknown properties of
high-dimensional sensitive relational data while
preserving privacy? We study how to construct an
explorable privacy-preserving materialized view under
differential privacy. No existing state-of-the-art
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schmidl:2022:ADT,
author = "Sebastian Schmidl and Phillip Wenig and Thorsten
Papenbrock",
title = "Anomaly detection in time series: a comprehensive
evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1779--1797",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538602",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538602",
abstract = "Detecting anomalous subsequences in time series data
is an important task in areas ranging from
manufacturing processes over finance applications to
health care monitoring. An anomaly can indicate
important events, such as production faults, delivery
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Youngmann:2022:GED,
author = "Brit Youngmann and Sihem Amer-Yahia and Aurelien
Personnaz",
title = "Guided exploration of data summaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1798--1807",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538603",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538603",
abstract = "Data summarization is the process of producing
interpretable and representative subsets of an input
dataset. It is usually performed following a one-shot
process with the purpose of finding the best summary. A
useful summary contains k individually \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:FDT,
author = "Xinyi Zhang and Zhuo Chang and Yang Li and Hong Wu and
Jian Tan and Feifei Li and Bin Cui",
title = "Facilitating database tuning with hyper-parameter
optimization: a comprehensive experimental evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1808--1821",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538604",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538604",
abstract = "Recently, using automatic configuration tuning to
improve the performance of modern database management
systems (DBMSs) has attracted increasing interest from
the database community. This is embodied with a number
of systems featuring advanced tuning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:ESV,
author = "Zuan Wang and Xiaofeng Ding and Hai Jin and Pan Zhou",
title = "Efficient secure and verifiable location-based skyline
queries over encrypted data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1822--1834",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538605",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538605",
abstract = "Supporting secure location-based services on encrypted
data that is outsourced to cloud computing platforms
remains an ongoing challenge for efficiency due to
expensive ciphertext calculation overhead. Furthermore,
since the clouds may not be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2022:TIC,
author = "Zhuoyue Zhao and Dong Xie and Feifei Li",
title = "{AB-tree}: index for concurrent random sampling and
updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1835--1847",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538606",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538606",
abstract = "There has been an increasing demand for real-time data
analytics. Approximate Query Processing (AQP) is a
popular option for that because it can use random
sampling to trade some accuracy for lower query
latency. However, the state-of-the-art AQP system
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2022:RTR,
author = "Chenguang Fang and Shaoxu Song and Yinan Mei",
title = "On repairing timestamps for regular interval time
series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1848--1860",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538607",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538607",
abstract = "Time series data are often with regular time
intervals, e.g., in IoT scenarios sensor data collected
with a pre-specified frequency, air quality data
regularly recorded by outdoor monitors, and GPS signals
periodically received from multiple satellites.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2022:TEP,
author = "Wenfei Fan and Ruochun Jin and Ping Lu and Chao Tian
and Ruiqi Xu",
title = "Towards event prediction in temporal graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1861--1874",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538608",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538608",
abstract = "This paper proposes a class of temporal association
rules, denoted by TACOs, for event prediction. As
opposed to previous graph rules, TACOs monitor updates
to graphs, and can be used to capture temporal
interests in recommendation and catch frauds in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liang:2022:DCH,
author = "Yihuai Liang and Yan Li and Byeong-Seok Shin",
title = "Decentralized crowdsourcing for human intelligence
tasks with efficient on-chain cost",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1875--1888",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538609",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538609",
abstract = "Crowdsourcing for Human Intelligence Tasks (HIT) has
been widely used to crowdsource human knowledge, such
as image annotation for machine learning. We use a
public blockchain to play the role of traditional
centralized HIT systems, such that the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:TDB,
author = "Yue Wang and Ruiqi Xu and Xun Jian and Alexander Zhou
and Lei Chen",
title = "Towards distributed bitruss decomposition on bipartite
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1889--1901",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538610",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538610",
abstract = "Mining cohesive subgraphs on bipartite graphs is an
important task. The k -bitruss is one of many popular
cohesive subgraph models, which is the maximal subgraph
where each edge is contained in at least k butterflies.
The bitruss decomposition problem is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gagliardelli:2022:GSM,
author = "Luca Gagliardelli and George Papadakis and Giovanni
Simonini and Sonia Bergamaschi and Themis Palpanas",
title = "Generalized supervised meta-blocking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1902--1910",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538611",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538611",
abstract = "Entity Resolution is a core data integration task that
relies on Blocking to scale to large datasets.
Schema-agnostic blocking achieves very high recall,
requires no domain knowledge and applies to data of any
structuredness and schema heterogeneity. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{An:2022:YRO,
author = "Mijin An and Soojun Im and Dawoon Jung and Sang-Won
Lee",
title = "Your read is our priority in flash storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1911--1923",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538612",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538612",
abstract = "When replacing a dirty victim page upon page miss, the
conventional buffer managers flush the dirty victim
first to the storage before reading the missing page.
This read-after-write (RAW) protocol, unfortunately,
causes the read stall problem on flash \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bhattacharya:2022:NWO,
author = "Arindam Bhattacharya and Chathur Gudesa and Amitabha
Bagchi and Srikanta Bedathur",
title = "New wine in an old bottle: data-aware hash functions
for {Bloom} filters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1924--1936",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538613",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538613",
abstract = "In many applications of Bloom filters, it is possible
to exploit the patterns present in the inserted and
non-inserted keys to achieve more compression than the
standard Bloom filter. A new class of Bloom filters
called Learned Bloom filters use machine \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Peng:2022:SEA,
author = "Jingshu Peng and Zhao Chen and Yingxia Shao and Yanyan
Shen and Lei Chen and Jiannong Cao",
title = "{Sancus}: staleness-aware communication-avoiding
full-graph decentralized training in large-scale graph
neural networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1937--1950",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538614",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538614",
abstract = "Graph neural networks (GNNs) have emerged due to their
success at modeling graph data. Yet, it is challenging
for GNNs to efficiently scale to large graphs. Thus,
distributed GNNs come into play. To avoid communication
caused by expensive data movement \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bucchi:2022:CCE,
author = "Marco Bucchi and Alejandro Grez and Andr{\'e}s
Quintana and Cristian Riveros and Stijn Vansummeren",
title = "{CORE}: a complex event recognition engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1951--1964",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538615",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538615",
abstract = "Complex Event Recognition (CER) systems are a
prominent technology for finding user-defined query
patterns over large data streams in real time. CER
query evaluation is known to be computationally
challenging, since it requires maintaining a set of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2022:TEE,
author = "Audrey Cheng and Xiao Shi and Aaron Kabcenell and
Shilpa Lawande and Hamza Qadeer and Jason Chan and
Harrison Tin and Ryan Zhao and Peter Bailis and Mahesh
Balakrishnan and Nathan Bronson and Natacha Crooks and
Ion Stoica",
title = "{TAOBench}: an end-to-end benchmark for social network
workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "9",
pages = "1965--1977",
month = may,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3538598.3538616",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Jul 28 06:16:23 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3538598.3538616",
abstract = "The continued emergence of large social network
applications has introduced a scale of data and query
volume that challenges the limits of existing data
stores. However, few benchmarks accurately simulate
these request patterns, leaving researchers in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kakaraparthy:2022:VHA,
author = "Aarati Kakaraparthy and Jignesh M. Patel and Brian P.
Kroth and Kwanghyun Park",
title = "{VIP} hashing: adapting to skew in popularity of data
on the fly",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "1978--1990",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547306",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547306",
abstract = "All data is not equally popular. Often, some portion
of data is more frequently accessed than the rest,
which causes a skew in popularity of the data items.
Adapting to this skew can improve performance, and this
topic has been studied extensively in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vincon:2022:NDP,
author = "Tobias Vin{\c{c}}on and Christian Kn{\"o}dler and
Leonardo Solis-Vasquez and Arthur Bernhardt and Sajjad
Tamimi and Lukas Weber and Florian Stock and Andreas
Koch and Ilia Petrov",
title = "Near-data processing in database systems on native
computational storage under {HTAP} workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "1991--2004",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547307",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547307",
abstract = "Today's Hybrid Transactional and Analytical Processing
(HTAP) systems, tackle the ever-growing data in
combination with a mixture of transactional and
analytical workloads. While optimizing for aspects such
as data freshness and performance isolation, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Echihabi:2022:HAD,
author = "Karima Echihabi and Panagiota Fatourou and Kostas
Zoumpatianos and Themis Palpanas and Houda Benbrahim",
title = "{Hercules} against data series similarity search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2005--2018",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547308",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547308",
abstract = "We propose Hercules, a parallel tree-based technique
for exact similarity search on massive disk-based data
series collections. We present novel index construction
and query answering algorithms that leverage different
summarization techniques, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Siddiqui:2022:DLO,
author = "Tarique Siddiqui and Wentao Wu and Vivek Narasayya and
Surajit Chaudhuri",
title = "{DISTILL}: low-overhead data-driven techniques for
filtering and costing indexes for scalable index
tuning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2019--2031",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547309",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547309",
abstract = "Many database systems offer index tuning tools that
help automatically select appropriate indexes for
improving the performance of an input workload. Index
tuning is a resource-intensive and time-consuming task
requiring expensive optimizer calls for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:OML,
author = "Zhihui Yang and Zuozhi Wang and Yicong Huang and Yao
Lu and Chen Li and X. Sean Wang",
title = "Optimizing machine learning inference queries with
correlative proxy models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2032--2044",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547310",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547310",
abstract = "We consider accelerating machine learning (ML)
inference queries on unstructured datasets. Expensive
operators such as feature extractors and classifiers
are deployed as user-defined functions (UDFs), which
are not penetrable with classic query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Su:2022:BSD,
author = "Li Su and Xiaoming Qin and Zichao Zhang and Rui Yang
and Le Xu and Indranil Gupta and Wenyuan Yu and Kai
Zeng and Jingren Zhou",
title = "{Banyan}: a scoped dataflow engine for graph query
service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2045--2057",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547311",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547311",
abstract = "Graph query services (GQS) are widely used today to
interactively answer graph traversal queries on
large-scale graph data. Existing graph query engines
focus largely on optimizing the latency of a single
query. This ignores significant challenges posed
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2022:FEU,
author = "Ziyue Huang and Yuan Qiu and Ke Yi and Graham
Cormode",
title = "Frequency estimation under multiparty differential
privacy: one-shot and streaming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2058--2070",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547312",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547312",
abstract = "We study the fundamental problem of frequency
estimation under both privacy and communication
constraints, where the data is distributed among k
parties. We consider two application scenarios: (1)
one-shot, where the data is static and the aggregator
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ali:2022:OIS,
author = "Ahsan Ali and Riccardo Pinciroli and Feng Yan and
Evgenia Smirni",
title = "Optimizing inference serving on serverless platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2071--2084",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547313",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547313",
abstract = "Serverless computing is gaining popularity for machine
learning (ML) serving workload due to its autonomous
resource scaling, easy to use and pay-per-use cost
model. Existing serverless platforms work well for
image-based ML inference, where requests \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alkowaileet:2022:CFS,
author = "Wail Y. Alkowaileet and Michael J. Carey",
title = "Columnar formats for schemaless {LSM}-based document
stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2085--2097",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547314",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547314",
abstract = "In the last decade, document store database systems
have gained more traction for storing and querying
large volumes of semi-structured data. However, the
flexibility of the document stores' data models has
limited their ability to store data in a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qiu:2022:ESP,
author = "Yu-Xuan Qiu and Dong Wen and Lu Qin and Wentao Li and
Rong-Hua Li and Ying Zhang",
title = "Efficient shortest path counting on large road
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2098--2110",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547315",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547315",
abstract = "The shortest path distance and related concepts lay
the foundations of many real-world applications in road
network analysis. The shortest path count has drawn
much research attention in academia, not only as a
closeness metric accompanying the shortest \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fu:2022:TCE,
author = "Fangcheng Fu and Xupeng Miao and Jiawei Jiang and
Huanran Xue and Bin Cui",
title = "Towards communication-efficient vertical federated
learning training via cache-enabled local updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2111--2120",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547316",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547316",
abstract = "Vertical federated learning (VFL) is an emerging
paradigm that allows different parties (e.g.,
organizations or enterprises) to collaboratively build
machine learning models with privacy protection. In the
training phase, VFL only exchanges the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2022:DED,
author = "Yifan Zhu and Lu Chen and Yunjun Gao and Baihua Zheng
and Pengfei Wang",
title = "{DESIRE}: an efficient dynamic cluster-based forest
indexing for similarity search in multi-metric spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2121--2133",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547317",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547317",
abstract = "Similarity search finds similar objects for a given
query object based on a certain similarity metric.
Similarity search in metric spaces has attracted
increasing attention, as the metric space can
accommodate any type of data and support flexible
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2022:AAB,
author = "Junghoon Kim and Kaiyu Feng and Gao Cong and Diwen Zhu
and Wenyuan Yu and Chunyan Miao",
title = "{ABC}: attributed bipartite co-clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2134--2147",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547318",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547318",
abstract = "Finding a set of co-clusters in a bipartite network is
a fundamental and important problem. In this paper, we
present the Attributed Bipartite Co-clustering (ABC)
problem which unifies two main concepts: (i) bipartite
modularity optimization, and (ii) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiao:2022:TSD,
author = "Jinzhao Xiao and Yuxiang Huang and Changyu Hu and
Shaoxu Song and Xiangdong Huang and Jianmin Wang",
title = "Time series data encoding for efficient storage: a
comparative analysis in {Apache IoTDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2148--2160",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547319",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547319",
abstract = "Not only the vast applications but also the distinct
features of time series data stimulate the booming
growth of time series database management systems, such
as Apache IoTDB, InfluxDB, OpenTSDB and so on. Almost
all these systems employ columnar \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:SLO,
author = "Teng Zhang and Jian Tan and Xin Cai and Jianying Wang
and Feifei Li and Jianling Sun",
title = "{SA-LSM}: optimize data layout for {LSM}-tree based
storage using survival analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2161--2174",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547320",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547320",
abstract = "A significant fraction of data in cloud storage is
rarely accessed, referred to as cold data. Accurately
identifying and efficiently managing cold data on
cost-effective storages is one of the major challenges
for cloud providers, which balances between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ferragina:2022:IMV,
author = "Paolo Ferragina and Giovanni Manzini and Travis Gagie
and Dominik K{\"o}ppl and Gonzalo Navarro and Manuel
Striani and Francesco Tosoni",
title = "Improving matrix-vector multiplication via lossless
grammar-compressed matrices",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2175--2187",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547321",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547321",
abstract = "As nowadays Machine Learning (ML) techniques are
generating huge data collections, the problem of how to
efficiently engineer their storage and operations is
becoming of paramount importance. In this article we
propose a new lossless compression scheme \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2022:NRL,
author = "Shangyu Wu and Yufei Cui and Jinghuan Yu and Xuan Sun
and Tei-Wei Kuo and Chun Jason Xue",
title = "{NFL}: robust learned index via distribution
transformation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2188--2200",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547322",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547322",
abstract = "Recent works on learned index open a new direction for
the indexing field. The key insight of the learned
index is to approximate the mapping between keys and
positions with piece-wise linear functions. Such
methods require partitioning key space for a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zare:2022:LLG,
author = "Hamidreza Zare and Viveck Ramesh Cadambe and Bhuvan
Urgaonkar and Nader Alfares and Praneet Soni and Chetan
Sharma and Arif A. Merchant",
title = "{LEGOStore}: a linearizable geo-distributed store
combining replication and erasure coding",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2201--2215",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547323",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547323",
abstract = "We design and implement LEGOStore, an erasure coding
(EC) based linearizable data store over geo-distributed
public cloud data centers (DCs). For such a data store,
the confluence of the following factors opens up
opportunities for EC to be latency- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Simpson:2022:MMU,
author = "Michael Simpson and Farnoosh Hashemi and Laks V. S.
Lakshmanan",
title = "Misinformation mitigation under differential
propagation rates and temporal penalties",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2216--2229",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547324",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547324",
abstract = "We propose an information propagation model that
captures important temporal aspects that have been well
observed in the dynamics of fake news diffusion, in
contrast with the diffusion of truth. The model
accounts for differential propagation rates of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2022:SDL,
author = "Lixi Zhou and Jiaqing Chen and Amitabh Das and Hong
Min and Lei Yu and Ming Zhao and Jia Zou",
title = "Serving deep learning models with deduplication from
relational databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2230--2243",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547325",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547325",
abstract = "Serving deep learning models from relational databases
brings significant benefits. First, features extracted
from databases do not need to be transferred to any
decoupled deep learning systems for inferences, and
thus the system management overhead can \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2022:DOI,
author = "Zichun Huang and Shimin Chen",
title = "Density-optimized intersection-free mapping and matrix
multiplication for join-project operations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2244--2256",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547326",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547326",
abstract = "A Join-Project operation is a join operation followed
by a duplicate eliminating projection operation. It is
used in a large variety of applications, including
entity matching, set analytics, and graph analytics.
Previous work proposes a hybrid design \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jahangiri:2022:DTO,
author = "Shiva Jahangiri and Michael J. Carey and
Johann-Christoph Freytag",
title = "Design trade-offs for a robust dynamic hybrid hash
join",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2257--2269",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547327",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547327",
abstract = "Hybrid Hash Join (HHJ) has proven to be one of the
most efficient and widely-used join algorithms. While
HHJ's performance depends largely on accurate
statistics and information about the input relations,
it may not always be practical or possible for a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Foufoulas:2022:YYE,
author = "Yannis Foufoulas and Alkis Simitsis and Lefteris
Stamatogiannakis and Yannis Ioannidis",
title = "{YeSQL}: ``you extend {SQL}'' with rich and highly
performant user-defined functions in relational
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2270--2283",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547328",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547328",
abstract = "The diversity and complexity of modern data management
applications have led to the extension of the
relational paradigm with syntactic and semantic support
for User-Defined Functions (UDFs). Although
well-established in traditional DBMS settings, UDFs
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ahmetaj:2022:MSS,
author = "Shqiponja Ahmetaj and Bianca L{\"o}hnert and Magdalena
Ortiz and Mantas Simkus",
title = "Magic shapes for {SHACL} validation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "10",
pages = "2284--2296",
month = jun,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3547305.3547329",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Sep 8 11:58:53 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3547305.3547329",
abstract = "A key prerequisite for the successful adoption of the
Shapes Constraint Language (SHACL)---the W3C
standardized constraint language for RDF graphs---is
the availability of automated tools that efficiently
validate targeted constraints (known as shapes
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Merchant:2022:SGR,
author = "Arpit Merchant and Aristides Gionis and Michael
Mathioudakis",
title = "Succinct graph representations as distance oracles: an
experimental evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2297--2306",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551794",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551794",
abstract = "Distance oracles answer shortest-path queries between
any pair of nodes in a graph. They are often built
using succinct graph representations such as spanners,
sketches, and compressors to minimize oracle size and
query answering latency. Node \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2022:ECS,
author = "Yangqin Jiang and Yixiang Fang and Chenhao Ma and Xin
Cao and Chunshan Li",
title = "Effective community search over large star-schema
heterogeneous information networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2307--2320",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551795",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551795",
abstract = "Community search (CS) enables personalized community
discovery and has found a wide spectrum of emerging
applications such as setting up social events and
friend recommendation. While CS has been extensively
studied for conventional homogeneous networks,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ting:2022:NDT,
author = "Kai Ming Ting and Zongyou Liu and Hang Zhang and Ye
Zhu",
title = "A new distributional treatment for time series and an
anomaly detection investigation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2321--2333",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551796",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551796",
abstract = "Time series is traditionally treated with two main
approaches, i.e., the time domain approach and the
frequency domain approach. These approaches must rely
on a sliding window so that time-shift versions of a
periodic subsequence can be measured to be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Denham:2022:WUL,
author = "Benjamin Denham and Edmund M-K. Lai and Roopak Sinha
and M. Asif Naeem",
title = "{Witan}: unsupervised labelling function generation
for assisted data programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2334--2347",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551797",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551797",
abstract = "Effective supervised training of modern machine
learning models often requires large labelled training
datasets, which could be prohibitively costly to
acquire for many practical applications. Research
addressing this problem has sought ways to leverage
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bao:2022:SMM,
author = "Ergute Bao and Yizheng Zhu and Xiaokui Xiao and Yin
Yang and Beng Chin Ooi and Benjamin Hong Meng Tan and
Khin Mi Mi Aung",
title = "{Skellam} mixture mechanism: a novel approach to
federated learning with differential privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2348--2360",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551798",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551798",
abstract = "Deep neural networks have strong capabilities of
memorizing the underlying training data, which can be a
serious privacy concern. An effective solution to this
problem is to train models with differential privacy
(DP), which provides rigorous privacy \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hilprecht:2022:ZSC,
author = "Benjamin Hilprecht and Carsten Binnig",
title = "Zero-shot cost models for out-of-the-box learned cost
prediction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2361--2374",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551799",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551799",
abstract = "In this paper, we introduce zero-shot cost models,
which enable learned cost estimation that generalizes
to unseen databases. In contrast to state-of-the-art
workload-driven approaches, which require to execute a
large set of training queries on every \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Choi:2022:WMG,
author = "Dalsu Choi and Hyunsik Yoon and Hyubjin Lee and Yon
Dohn Chung",
title = "{Waffle}: in-memory grid index for moving objects with
reinforcement learning-based configuration tuning
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2375--2388",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551800",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551800",
abstract = "Location-based services for moving objects are close
to our lives. For example, ride-sharing services,
micro-mobility services, navigation and traffic
management, delivery services, and autonomous driving
are all based on moving objects. The efficient
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jungmair:2022:DOF,
author = "Michael Jungmair and Andr{\'e} Kohn and Jana Giceva",
title = "Designing an open framework for query optimization and
compilation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2389--2401",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551801",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551801",
abstract = "Since its invention, data-centric code generation has
been adopted for query compilation by various database
systems in academia and industry. These database
systems are fast but maximize performance at the
expense of developer friendliness, flexibility,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nguyen:2022:PST,
author = "Lam-Duy Nguyen and Sang-Won Lee and Beomseok Nam",
title = "In-page shadowing and two-version timestamp ordering
for mobile {DBMSs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2402--2414",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551802",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551802",
abstract = "Increasing the concurrency level in mobile database
systems has not received much attention, mainly because
the concurrency requirements of mobile workloads has
been regarded to be low. Contrary to popular belief,
mobile workloads require higher \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2022:REA,
author = "Shixuan Sun and Xibo Sun and Bingsheng He and Qiong
Luo",
title = "{RapidFlow}: an efficient approach to continuous
subgraph matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2415--2427",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551803",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551803",
abstract = "Continuous subgraph matching (CSM) is an important
building block in many real-time graph processing
applications. Given a subgraph query Q and a data graph
stream, a CSM algorithm reports the occurrences of Q in
the stream. Specifically, when a new \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Helali:2022:SAA,
author = "Mossad Helali and Essam Mansour and Ibrahim Abdelaziz
and Julian Dolby and Kavitha Srinivas",
title = "A scalable {AutoML} approach based on graph neural
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2428--2436",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551804",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551804",
abstract = "AutoML systems build machine learning models
automatically by performing a search over valid data
transformations and learners, along with
hyper-parameter optimization for each learner. Many
AutoML systems use meta-learning to guide search for
optimal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pappachan:2022:DTT,
author = "Primal Pappachan and Shufan Zhang and Xi He and Sharad
Mehrotra",
title = "Don't be a tattle-tale: preventing leakages through
data dependencies on access control protected data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2437--2449",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551805",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551805",
abstract = "We study the problem of answering queries when (part
of) the data may be sensitive and should not be leaked
to the querier. Simply restricting the computation to
non-sensitive part of the data may leak sensitive data
through inference based on data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2022:ELB,
author = "Qingyu Xu and Feng Zhang and Zhiming Yao and Lv Lu and
Xiaoyong Du and Dong Deng and Bingsheng He",
title = "Efficient load-balanced butterfly counting on {GPU}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2450--2462",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551806",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551806",
abstract = "Butterfly counting is an important and costly
operation for large bipartite graphs. GPUs are popular
parallel heterogeneous devices and can bring
significant performance improvement for data science
applications. Unfortunately, no work enables efficient
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Benson:2022:PBB,
author = "Lawrence Benson and Leon Papke and Tilmann Rabl",
title = "{PerMA}-bench: benchmarking persistent memory access",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2463--2476",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551807",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551807",
abstract = "Persistent memory's (PMem) byte-addressability and
persistence at DRAM-like speed with SSD-like capacity
have the potential to cause a major performance shift
in database storage systems. With the availability of
Intel Optane DC Persistent Memory, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2022:EPM,
author = "Yuliang He and Duo Lu and Kaisong Huang and Tianzheng
Wang",
title = "Evaluating persistent memory range indexes: part two",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2477--2490",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551808",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551808",
abstract = "Scalable persistent memory (PM) has opened up new
opportunities for building indexes that operate and
persist data directly on the memory bus, potentially
enabling instant recovery, low latency and high
throughput. When real PM hardware (Intel Optane
\ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yogatama:2022:ODP,
author = "Bobbi W. Yogatama and Weiwei Gong and Xiangyao Yu",
title = "Orchestrating data placement and query execution in
heterogeneous {CPU-GPU DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2491--2503",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551809",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551809",
abstract = "There has been a growing interest in using GPU to
accelerate data analytics due to its massive
parallelism and high memory bandwidth. The main
constraint of using GPU for data analytics is the
limited capacity of GPU memory. Heterogeneous CPU-GPU
query \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:IMO,
author = "Weicheng Wang and Raymond Chi-Wing Wong",
title = "Interactive mining with ordered and unordered
attributes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2504--2516",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551810",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551810",
abstract = "There are various queries proposed to assist users in
finding their favorite tuples from a dataset with the
help of user interaction. Specifically, they interact
with a user by asking questions. Each question presents
two tuples, which are selected from \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:FDS,
author = "Wenzhe Yang and Sheng Wang and Yuan Sun and Zhiyong
Peng",
title = "Fast dataset search with {Earth Mover}'s distance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2517--2529",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551811",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551811",
abstract = "The amount of spatial data in open data portals has
increased rapidly, raising the demand for spatial
dataset search in large data repositories. In this
paper, we tackle spatial dataset search by using the
Earth Mover's Distance (EMD) to measure the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pereira:2022:AST,
author = "Jo{\~a}o L. M. Pereira and Jo{\~a}o Casanova and
Helena Galhardas and Dennis Shasha",
title = "{AcX}: system, techniques, and experiments for acronym
expansion",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2530--2544",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551812",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551812",
abstract = "In this information-accumulating world, each of us
must learn continuously. To participate in a new field,
or even a sub-field, one must be aware of the
terminology including the acronyms that specialists
know so well, but newcomers do not. Building on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:GTH,
author = "Hongzhi Chen and Changji Li and Chenguang Zheng and
Chenghuan Huang and Juncheng Fang and James Cheng and
Jian Zhang",
title = "{G-Tran}: a high performance distributed graph
database with a decentralized architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2545--2558",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551813",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551813",
abstract = "Graph transaction processing poses unique challenges
such as random data access due to the irregularity of
graph structures, low throughput and high abort rate
due to the relatively large read/write sets in graph
transactions. To address these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Konig:2022:TPS,
author = "Arnd Christian K{\"o}nig and Yi Shan and Tobias
Ziegler and Aarati Kakaraparthy and Willis Lang and
Justin Moeller and Ajay Kalhan and Vivek Narasayya",
title = "Tenant placement in over-subscribed
database-as-a-service clusters",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2559--2571",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551814",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551814",
abstract = "Relational cloud Database-as-a-Service offerings run
on multi-tenant infrastructure consisting of clusters
of nodes, with each node hosting multiple tenant
databases. Such clusters may be over-subscribed to
increase resource utilization and improve \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:EBS,
author = "Yue Chen and Kaiyu Feng and Gao Cong and Han Mao
Kiah",
title = "Example-based spatial pattern matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2572--2584",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551815",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551815",
abstract = "The prevalence of GPS-enabled mobile devices and
location-based services yield massive volume of spatial
objects where each object contains information
including geographical location, name, address,
category and other attributes. This paper introduces
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Peng:2022:NFP,
author = "Zeshun Peng and Yanfeng Zhang and Qian Xu and Haixu
Liu and Yuxiao Gao and Xiaohua Li and Ge Yu",
title = "{NeuChain}: a fast permissioned blockchain system with
deterministic ordering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2585--2598",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551816",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551816",
abstract = "Blockchain serves as a replicated transactional
processing system in a trustless distributed
environment. Existing blockchain systems all rely on an
explicit ordering step to determine the global order of
transactions that are collected from multiple
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{McKenna:2022:AAI,
author = "Ryan McKenna and Brett Mullins and Daniel Sheldon and
Gerome Miklau",
title = "{AIM}: an adaptive and iterative mechanism for
differentially private synthetic data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2599--2612",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551817",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551817",
abstract = "We propose AIM, a new algorithm for differentially
private synthetic data generation. AIM is a
workload-adaptive algorithm within the paradigm of
algorithms that first selects a set of queries, then
privately measures those queries, and finally
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Toussaint:2022:TNV,
author = "Etienne Toussaint and Paolo Guagliardo and Leonid
Libkin and Juan Sequeda",
title = "Troubles with nulls, views from the users",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2613--2625",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551818",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551818",
abstract = "Incomplete data, in the form of null values, has been
extensively studied since the inception of the
relational model in the 1970s. Anecdotally, one hears
that the way in which SQL, the standard language for
relational databases, handles nulls creates a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Park:2022:GSE,
author = "Yeonhong Park and Sunhong Min and Jae W. Lee",
title = "{Ginex}: {SSD}-enabled billion-scale graph neural
network training on a single machine via provably
optimal in-memory caching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2626--2639",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551819",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551819",
abstract = "Graph Neural Networks (GNNs) are receiving a spotlight
as a powerful tool that can effectively serve various
inference tasks on graph structured data. As the size
of real-world graphs continues to scale, the GNN
training system faces a scalability \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:SPQ,
author = "Junhua Zhang and Wentao Li and Long Yuan and Lu Qin
and Ying Zhang and Lijun Chang",
title = "Shortest-path queries on complex networks:
experiments, analyses, and improvement",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2640--2652",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551820",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551820",
abstract = "The shortest-path query, which returns the shortest
path between two vertices, is a basic operation on
complex networks and has numerous applications. To
handle shortest-path queries, one option is to use
traversal-based methods (e.g., breadth-first \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ghayyur:2022:MAA,
author = "Sameera Ghayyur and Dhrubajyoti Ghosh and Xi He and
Sharad Mehrotra",
title = "{MIDE}: accuracy aware minimally invasive data
exploration for decision support",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2653--2665",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551821",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551821",
abstract = "This paper studies privacy in the context of
decision-support queries that classify objects as
either true or false based on whether they satisfy the
query. Mechanisms to ensure privacy may result in false
positives and false negatives. In decision- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ghosh:2022:JJT,
author = "Dhrubajyoti Ghosh and Peeyush Gupta and Sharad
Mehrotra and Roberto Yus and Yasser Altowim",
title = "{JENNER}: just-in-time enrichment in query
processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2666--2678",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551822",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551822",
abstract = "Emerging domains, such as sensor-driven smart spaces
and social media analytics, require incoming data to be
enriched prior to its use. Enrichment often consists of
machine learning (ML) functions that are too
expensive/infeasible to execute at \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:CCA,
author = "Jiaoyi Zhang and Yihan Gao",
title = "{CARMI}: a cache-aware learned index with a cost-based
construction algorithm",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "11",
pages = "2679--2691",
month = jul,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3551793.3551823",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Oct 29 08:52:37 MDT 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3551793.3551823",
abstract = "Learned indexes, which use machine learning models to
replace traditional index structures, have shown
promising results in recent studies. However, existing
learned indexes exhibit a performance gap between
synthetic and real-world datasets, making \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chiosa:2022:HAC,
author = "Monica Chiosa and Fabio Maschi and Ingo M{\"u}ller and
Gustavo Alonso and Norman May",
title = "Hardware acceleration of compression and encryption in
{SAP HANA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3277--3291",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554822",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554822",
abstract = "With the advent of cloud computing, where
computational resources are expensive and data movement
needs to be secured and minimized, database management
systems need to reconsider their architecture to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Graf:2022:FPB,
author = "Martin Graf and Lukas Laskowski and Florian Papsdorf
and Florian Sold and Roland Gremmelspacher and Felix
Naumann and Fabian Panse",
title = "{Frost}: a platform for benchmarking and exploring
data matching results",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3292--3305",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554823",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554823",
abstract = "``Bad'' data has a direct impact on 88\% of companies,
with the average company losing 12\% of its revenue due
to it. Duplicates --- multiple but different
representations of the same real-world entities --- are
among the main \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:BHP,
author = "Changji Li and Hongzhi Chen and Shuai Zhang and
Yingqian Hu and Chao Chen and Zhenjie Zhang and Meng Li
and Xiangchen Li and Dongqing Han and Xiaohui Chen and
Xudong Wang and Huiming Zhu and Xuwei Fu and Tingwei Wu
and Hongfei Tan and Hengtian Ding and Mengjin Liu and
Kangcheng Wang and Ting Ye and Lei Li and Xin Li and Yu
Wang and Chenguang Zheng and Hao Yang and James Cheng",
title = "{ByteGraph}: a high-performance distributed graph
database in {ByteDance}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3306--3318",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554824",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554824",
abstract = "Most products at ByteDance, e.g., TikTok, Douyin, and
Toutiao, naturally generate massive amounts of graph
data. To efficiently store, query and update massive
graph data is challenging for the broad range of
products at \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Das:2022:CEC,
author = "Prakash Das and Shivangi Srivastava and Valentin
Moskovich and Anmol Chaturvedi and Anant Mittal and
Yongqin Xiao and Mosharaf Chowdhury",
title = "{CDI-E}: an elastic cloud service for data
engineering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3319--3331",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554825",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554825",
abstract = "We live in the gilded age of data-driven computing.
With public clouds offering virtually unlimited amounts
of compute and storage, enterprises collecting data
about every aspect of their businesses, and advances in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:OED,
author = "Sheng Wang and Yiran Li and Huorong Li and Feifei Li
and Chengjin Tian and Le Su and Yanshan Zhang and
Yubing Ma and Lie Yan and Yuanyuan Sun and Xuntao Cheng
and Xiaolong Xie and Yu Zou",
title = "{Operon}: an encrypted database for
ownership-preserving data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3332--3345",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554826",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554826",
abstract = "The past decade has witnessed the rapid development of
cloud computing and data-centric applications. While
these innovations offer numerous attractive features
for data processing, they also bring in new issues
about \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gong:2022:TPF,
author = "Caixin Gong and Chengjin Tian and Zhengheng Wang and
Sheng Wang and Xiyu Wang and Qiulei Fu and Wu Qin and
Long Qian and Rui Chen and Jiang Qi and Ruo Wang and
Guoyun Zhu and Chenghu Yang and Wei Zhang and Feifei
Li",
title = "{Tair-PMem}: a fully durable non-volatile memory
database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3346--3358",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554827",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554827",
abstract = "In-memory databases (IMDBs) have been the backbone of
modern systems that demand high throughput and low
latency. Because of the cost and volatility of DRAM,
IMDBs become incompetent when dealing with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lambov:2022:TMC,
author = "Branimir Lambov",
title = "Trie memtables in {Cassandra}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3359--3371",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554828",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554828",
abstract = "This paper discusses a new memtable implementation for
Apache Cassandra which is based on tries (also called
prefix trees) and byte-comparable representations of
database keys. The implementation is already in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pedreira:2022:VMU,
author = "Pedro Pedreira and Orri Erling and Masha Basmanova and
Kevin Wilfong and Laith Sakka and Krishna Pai and Wei
He and Biswapesh Chattopadhyay",
title = "{Velox}: {Meta}'s unified execution engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3372--3384",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554829",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554829",
abstract = "The ad-hoc development of new specialized computation
engines targeted to very specific data workloads has
created a siloed data landscape. Commonly, these
engines share little to nothing with each other and are
hard to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:OMT,
author = "Zhenkun Yang and Chuanhui Yang and Fusheng Han and
Mingqiang Zhuang and Bing Yang and Zhifeng Yang and
Xiaojun Cheng and Yuzhong Zhao and Wenhui Shi and
Huafeng Xi and Huang Yu and Bin Liu and Yi Pan and
Boxue Yin and Junquan Chen and Quanqing Xu",
title = "{OceanBase}: a 707 million {tpmC} distributed
relational database system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3385--3397",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554830",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554830",
abstract = "We have designed and developed OceanBase, a
distributed relational database system from the very
basics for a decade. Being a scale-out multi-tenant
system, OceanBase is cross-region fault tolerant, which
is based on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lan:2022:VVR,
author = "Hai Lan and Jiong Xie and Zhifeng Bao and Feifei Li
and Wei Tian and Fang Wang and Sheng Wang and Ailin
Zhang",
title = "{VRE}: a versatile, robust, and economical trajectory
data system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3398--3410",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554831",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554831",
abstract = "Managing massive trajectory data from various moving
objects has always been a demanding task. A desired
trajectory data system should be versatile in its
supported query types and distance functions, of low
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:BBH,
author = "Jianjun Chen and Yonghua Ding and Ye Liu and Fangshi
Li and Li Zhang and Mingyi Zhang and Kui Wei and Lixun
Cao and Dan Zou and Yang Liu and Lei Zhang and Rui Shi
and Wei Ding and Kai Wu and Shangyu Luo and Jason Sun
and Yuming Liang",
title = "{ByteHTAP}: {ByteDance}'s {HTAP} system with high data
freshness and strong data consistency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3411--3424",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554832",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554832",
abstract = "In recent years, at ByteDance, we see more and more
business scenarios that require performing complex
analysis over freshly imported data, together with
transaction support and strong data consistency. In
this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wingerath:2022:BCW,
author = "Wolfram Wingerath and Benjamin Wollmer and Markus
Bestehorn and Stephan Succo and Sophie Ferrlein and
Florian B{\"u}cklers and J{\"o}rn Domnik and Fabian
Panse and Erik Witt and Anil Sener and Felix Gessert
and Norbert Ritter",
title = "{Beaconnect}: continuous web performance {A\slash B}
testing at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3425--3431",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554833",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554833",
abstract = "Content delivery networks (CDNs) are critical for
minimizing access latency in the Web as they
efficiently distribute online resources across the
globe. But since CDNs can only be enabled on the scope
of entire \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:COC,
author = "Zongzhi Chen and Xinjun Yang and Feifei Li and Xuntao
Cheng and Qingda Hu and Zheyu Miao and Rongbiao Xie and
Xiaofei Wu and Kang Wang and Zhao Song and Haiqing Sun
and Zechao Zhuang and Yuming Yang and Jie Xu and Liang
Yin and Wenchao Zhou and Sheng Wang",
title = "{CloudJump}: optimizing cloud databases for cloud
storages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3432--3444",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554834",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554834",
abstract = "There has been an increasing interest in building
cloud-native databases that decouple computation and
storage for elasticity. A cloud-native database often
adopts a cloud storage underneath its storage engine,
leveraging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2022:DMN,
author = "Kaiping Zheng and Shaofeng Cai and Horng Ruey Chua and
Melanie Herschel and Meihui Zhang and Beng Chin Ooi",
title = "{DyHealth}: making neural networks dynamic for
effective healthcare analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3445--3458",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554835",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554835",
abstract = "In National University Hospital (NUH) in Singapore, we
conduct healthcare analytics that analyzes
heterogeneous electronic medical records (EMR) to
support effective clinical decision-making on a daily
basis. Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mishchenko:2022:BCS,
author = "Andrey Mishchenko and Dominique Danco and Abhilash
Jindal and Adrian Blue",
title = "{Blueprint}: a constraint-solving approach for
document extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3459--3471",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554836",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554836",
abstract = "Blueprint is a declarative domain-specific language
for document extraction. Users describe document layout
using spatial, textual, semantic, and numerical fuzzy
constraints, and the language runtime extracts the
field-value \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2022:TCL,
author = "Muzhi Yu and Zhaoxiang Lin and Jinan Sun and Runyun
Zhou and Guoqiang Jiang and Hua Huang and Shikun
Zhang",
title = "{TencentCLS}: the cloud log service with high query
performances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3472--3482",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554837",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554837",
abstract = "With the trend of cloud computing, the cloud log
service is becoming increasingly important, as it plays
a critical role in tasks such as root cause analysis,
service monitoring and security audition. To meet these
needs, we \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xie:2022:GMD,
author = "Jiong Xie and Zhen Chen and Jianwei Liu and Fang Wang
and Feifei Li and Zhida Chen and Yinpei Liu and Songlu
Cai and Zhenhua Fan and Fei Xiao and Yue Chen",
title = "{Ganos}: a multidimensional, dynamic, and
scene-oriented cloud-native spatial database engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3483--3495",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554838",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554838",
abstract = "Recently, the trend of developing digital twins for
smart cities has driven a need for managing large-scale
multidimensional, dynamic, and scene-oriented spatial
data. Due to larger data scale and more complex
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lakshman:2022:MHD,
author = "Sarath Lakshman and Apaar Gupta and Rohan Suri and
Scott Lashley and John Liang and Srinath Duvuru and
Ravi Mayuram",
title = "{Magma}: a high data density storage engine used in
{Couchbase}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3496--3508",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554839",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554839",
abstract = "We present Magma, a write-optimized high data density
key-value storage engine used in the Couchbase NoSQL
distributed document database. Today's write-heavy
data-intensive applications like ad-serving, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Doppler, automated cloud SKU recommendation for migrating SQL workloads.
@Article{Cahoon:2022:DAS,
author = "Joyce Cahoon and Wenjing Wang and Yiwen Zhu and
Katherine Lin and Sean Liu and Raymond Truong and Neetu
Singh and Chengcheng Wan and Alexandra Ciortea and
Sreraman Narasimhan and Subru Krishnan",
title = "{Doppler}: automated {SKU} recommendation in migrating
{SQL} workloads to the cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3509--3521",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554840",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554840",
abstract = "Selecting the optimal cloud target to migrate SQL
estates from on-premises to the cloud remains a
challenge. Current solutions are not only
time-consuming and error-prone, requiring significant
user input, but also fail to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Meta's next-generation realtime monitoring and analytics platform.
@Article{Harizopoulos:2022:MNG,
author = "Stavros Harizopoulos and Taylor Hopper and Morton Mo
and Shyam Sundar Chandrasekaran and Tongguang Chen and
Yan Cui and Nandini Ganesh and Gary Helmling and Hieu
Pham and Sebastian Wong",
title = "{Meta}'s next-generation realtime monitoring and
analytics platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3522--3534",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554841",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554841",
abstract = "Unlike traditional database systems where data and
system availability are tied together, there is a wide
class of systems targeting realtime monitoring and
analytics over structured logs where these properties
can be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): SQLite retrospective and outlook by its developers and collaborators.
@Article{Gaffney:2022:SPP,
author = "Kevin P. Gaffney and Martin Prammer and Larry
Brasfield and D. Richard Hipp and Dan Kennedy and
Jignesh M. Patel",
title = "{SQLite}: past, present, and future",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3535--3547",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554842",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554842",
abstract = "In the two decades following its initial release,
SQLite has become the most widely deployed database
engine in existence. Today, SQLite is found in nearly
every smartphone, computer, web browser, television,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Manu, a cloud-native vector database management system.
@Article{Guo:2022:MCN,
author = "Rentong Guo and Xiaofan Luan and Long Xiang and Xiao
Yan and Xiaomeng Yi and Jigao Luo and Qianya Cheng and
Weizhi Xu and Jiarui Luo and Frank Liu and Zhenshan Cao
and Yanliang Qiao and Ting Wang and Bo Tang and Charles
Xie",
title = "{Manu}: a cloud native vector database management
system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3548--3561",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554843",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554843",
abstract = "With the development of learning-based embedding
models, embedding vectors are widely used for analyzing
and searching unstructured data. As vector collections
exceed billion-scale, fully managed and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): automated relational data explanation using external semantic knowledge.
@Article{Galhotra:2022:ARD,
author = "Sainyam Galhotra and Udayan Khurana",
title = "Automated relational data explanation using external
semantic knowledge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3562--3565",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554844",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554844",
abstract = "In data science problems, understanding the data is a
crucial first step. However, it can be challenging and
time intensive for a data scientist who is not an
expert in that domain. Several downstream tasks such as
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Kelpie, an explainability framework for embedding-based link prediction.
@Article{Rossi:2022:KEF,
author = "Andrea Rossi and Donatella Firmani and Paolo Merialdo
and Tommaso Teofili",
title = "{Kelpie}: an explainability framework for
embedding-based link prediction models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3566--3569",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554845",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554845",
abstract = "The latest generations of Link Prediction (LP) models
rely on embeddings to tackle incompleteness in
Knowledge Graphs, achieving great performance at the
cost of interpretability. Their opaqueness limits the
trust \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): OREO, detection of cherry-picked generalizations in data analytics.
@Article{Lin:2022:ODC,
author = "Yin Lin and Brit Youngmann and Yuval Moskovitch and H.
V. Jagadish and Tova Milo",
title = "{OREO}: detection of cherry-picked generalizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3570--3573",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554846",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554846",
abstract = "Data analytics often make sense of large data sets by
generalization: aggregating from the detailed data to a
more general context. Given a dataset, misleading
generalizations can sometimes be drawn from a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): DuckDB-Wasm, a WebAssembly build of DuckDB for in-browser analytics.
@Article{Kohn:2022:DWF,
author = "Andr{\'e} Kohn and Dominik Moritz and Mark Raasveldt
and Hannes M{\"u}hleisen and Thomas Neumann",
title = "{DuckDB-wasm}: fast analytical processing for the
web",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3574--3577",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554847",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554847",
abstract = "We introduce DuckDB-Wasm, a WebAssembly version of the
database system DuckDB, to provide fast analytical
processing for the Web. DuckDB-Wasm evaluates SQL
queries asynchronously in web workers, supports
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): EasyDR, a human-in-the-loop error detection and repair platform for tables.
@Article{Xi:2022:EHL,
author = "Yihai Xi and Ning Wang and Xinyu Chen and Yiyi Zhang
and Zilong Wang and Zhihong Xu and Yue Wang",
title = "{EasyDR}: a human-in-the-loop error detection \&
repair platform for holistic table cleaning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3578--3581",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554848",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554848",
abstract = "Many tables on the web suffer from multi-level and
multi-type quality problems, but existing cleaning
systems cannot provide a comprehensive quality
improvement for them. Most of these systems are
designed for solving a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Hu-fu, a data federation system for secure spatial queries.
@Article{Pan:2022:HFD,
author = "Xuchen Pan and Yongxin Tong and Chunbo Xue and Zimu
Zhou and Junping Du and Yuxiang Zeng and Yexuan Shi and
Xiaofei Zhang and Lei Chen and Yi Xu and Ke Xu and
Weifeng Lv",
title = "{Hu-fu}: a data federation system for secure spatial
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3582--3585",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554849",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554849",
abstract = "The increasing concerns on data security limit the
sharing of data distributedly stored at multiple data
owners and impede the scale of spatial queries over big
urban data. In response, data federation systems have
emerged \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): demo of CAT, synthesizing data-aware conversational agents for OLTP databases.
@Article{Gassen:2022:DCS,
author = "Marius Gassen and Benjamin H{\"a}ttasch and Benjamin
Hilprecht and Nadja Geisler and Alexander Fraser and
Carsten Binnig",
title = "Demonstrating {CAT}: synthesizing data-aware
conversational agents for transactional databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3586--3589",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554850",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554850",
abstract = "Databases for OLTP are often the backbone for
applications such as hotel room or cinema ticket
booking applications. However, developing a
conversational agent (i.e., a chatbot-like interface)
to allow end-users to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): EDA4SUM, guided exploration of data summaries.
@Article{Personnaz:2022:EGE,
author = "Aur{\'e}lien Personnaz and Brit Youngmann and Sihem
Amer-Yahia",
title = "{EDA4SUM}: guided exploration of data summaries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3590--3593",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554851",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554851",
abstract = "We demonstrate EDA4Sum, a framework dedicated to
generating guided multi-step data summarization
pipelines for very large datasets. Data summarization
is the process of producing interpretable and
representative \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): CaJaDE, explaining query results by augmenting provenance with context.
@Article{Li:2022:CEQ,
author = "Chenjie Li and Juseung Lee and Zhengjie Miao and Boris
Glavic and Sudeepa Roy",
title = "{CaJaDE}: explaining query results by augmenting
provenance with context",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3594--3597",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554852",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554852",
abstract = "In this work, we demonstrate CaJaDE (Context-Aware
Join-Augmented Deep Explanations), a system that
explains query results by augmenting provenance with
contextual information from other related tables in the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Tensor Query Processor (TQP), compiling relational operators to tensor programs.
@Article{Asada:2022:STT,
author = "Yuki Asada and Victor Fu and Apurva Gandhi and Advitya
Gemawat and Lihao Zhang and Dong He and Vivek Gupta and
Ehi Nosakhare and Dalitso Banda and Rathijit Sen and
Matteo Interlandi",
title = "Share the tensor tea: how databases can leverage the
machine learning ecosystem",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3598--3601",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554853",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554853",
abstract = "We demonstrate Tensor Query Processor (TQP): a query
processor that automatically compiles relational
operators into tensor programs. By leveraging tensor
runtimes such as PyTorch, TQP is able to: (1) integrate
with ML \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): MOCHA, visualizing operator choices in query plans for database education.
@Article{Tan:2022:MTV,
author = "Jess Tan and Desmond Yeo and Rachael Neoh and Huey-Eng
Chua and Sourav S Bhowmick",
title = "{MOCHA}: a tool for visualizing impact of operator
choices in query execution plans for database
education",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3602--3605",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554854",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554854",
abstract = "The database systems course is offered in many major
universities. A key learning goal of learners taking
such a course is to understand how sql queries are
processed in an RDBMS in practice. To this end,
comprehension of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): LIBKDV, a kernel density visualization library for geospatial analytics.
@Article{Chan:2022:LVK,
author = "Tsz Nam Chan and Pak Lon Ip and Kaiyan Zhao and Leong
Hou U and Byron Choi and Jianliang Xu",
title = "{LIBKDV}: a versatile kernel density visualization
library for geospatial analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3606--3609",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554855",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554855",
abstract = "Kernel density visualization (KDV) has been widely
used in many geospatial analysis tasks, including
traffic accident hotspot detection, crime hotspot
detection, and disease outbreak detection. Although KDV
can be \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): demonstration of multi-region CockroachDB.
@Article{Ajmani:2022:DMR,
author = "Arul Ajmani and Aayush Shah and Alexander Shraer and
Adam Storm and Rebecca Taft and Oliver Tan and Nathan
VanBenschoten",
title = "A demonstration of multi-region {CockroachDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3610--3613",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554856",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554856",
abstract = "A database service is required to meet the
consistency, performance, and availability goals of
modern applications serving a global user-base.
Configuring a database deployed across multiple regions
such that it fulfills these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): DPDS, assisting data science with data provenance.
@Article{Chapman:2022:DAD,
author = "Adriane Chapman and Luca Lauro and Paolo Missier and
Riccardo Torlone",
title = "{DPDS}: assisting data science with data provenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3614--3617",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554857",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554857",
abstract = "Successful data-driven science requires a complex
combination of data engineering pipelines and data
modelling techniques. Robust and defensible results can
only be achieved when each step in the pipeline
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): POEM, pattern-oriented explanations of CNN models.
@Article{Dadvar:2022:PPO,
author = "Vargha Dadvar and Lukasz Golab and Divesh Srivastava",
title = "{POEM}: pattern-oriented explanations of {CNN}
models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3618--3621",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554858",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554858",
abstract = "Deep learning models achieve state-of-the-art
performance in many applications, but their prediction
decisions are difficult to explain. Various solutions
exist in the area of explainable AI, for example to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): WebArrayDB, a geospatial array DBMS running in the web browser.
@Article{Zalipynis:2022:WGA,
author = "Ramon Antonio Rodriges Zalipynis and Nikita Terlych",
title = "{WebArrayDB}: a geospatial array {DBMS} in your web
browser",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3622--3625",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554859",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554859",
abstract = "Geospatial array DBMSs operate on georeferenced N -d
arrays. They provide storage engines, query parsers,
and processing capabilities as their core
functionality. Traditionally, those have been too heavy
for a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): AutoDI, towards automatic query plan regression analysis.
@Article{Lan:2022:ATA,
author = "Hai Lan and Yuanjia Zhang and Zhifeng Bao and Yu Dong
and Dongxu Huang and Liu Tang and Jian Zhang",
title = "{AutoDI}: towards an automatic plan regression
analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3626--3629",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554860",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554860",
abstract = "Manual analysis on plan regression is both
labor-intensive and inefficient for a large query plan
and numerous queries. In this paper, we demonstrate
AutoDI, an automatic detection and inference tool that
has been \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): PHOcus, efficiently archiving photos.
@Article{Davidson:2022:PEA,
author = "Susan B. Davidson and Shay Gershtein and Tova Milo and
Slava Novgorodov and May Shoshan",
title = "{PHOcus}: efficiently archiving photos",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3630--3633",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554861",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554861",
abstract = "Our ability to collect data is rapidly outstripping
our ability to effectively store and use it.
Organizations are therefore facing tough decisions of
what data to archive (or dispose of) to effectively
meet their business goals. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): VINCENT, efficient exploratory subgraph search in graph databases.
@Article{Huang:2022:VTE,
author = "Kai Huang and Qingqing Ye and Jing Zhao and Xi Zhao
and Haibo Hu and Xiaofang Zhou",
title = "{VINCENT}: towards efficient exploratory subgraph
search in graph databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3634--3637",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554862",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554862",
abstract = "Exploratory search is a search paradigm that plays a
vital role in databases, data mining, and information
retrieval to assist users to get familiar with the
underlying databases. It supports iterative query
formulation to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): ActivePDB, active probabilistic databases.
@Article{Drien:2022:AAP,
author = "Osnat Drien and Matanya Freiman and Yael Amsterdamer",
title = "{ActivePDB}: active probabilistic databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3638--3641",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554863",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554863",
abstract = "We present a novel framework for uncertain data
management, called ActivePDB. We are given a relational
probabilistic database, where each tuple is correct
with some probability; e.g., a database constructed
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): CERTEM, explaining and debugging black-box entity resolution with CERTA.
@Article{Teofili:2022:CED,
author = "Tommaso Teofili and Donatella Firmani and Nick Koudas
and Paolo Merialdo and Divesh Srivastava",
title = "{CERTEM}: explaining and debugging black-box entity
resolution systems with {CERTA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3642--3645",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554864",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554864",
abstract = "Entity resolution (ER) aims at identifying record
pairs that refer to the same real-world entity. Recent
works have focused on deep learning (DL) techniques, to
solve this problem. While such works have brought
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): satellite image search in AgoraEO (Earth Observation).
@Article{Aksoy:2022:SIS,
author = "Ahmet Kerem Aksoy and Pavel Dushev and Eleni Tzirita
Zacharatou and Holmer Hemsen and Marcela Charfuelan and
Jorge-Arnulfo Quian{\'e}-Ruiz and Beg{\"u}m Demir and
Volker Markl",
title = "Satellite image search in {AgoraEO}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3646--3649",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554865",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554865",
abstract = "The growing operational capability of global Earth
Observation (EO) creates new opportunities for
data-driven approaches to understand and protect our
planet. However, the current use of EO archives is very
restricted \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): SENSOR, data-driven sketch-based visual query interfaces for time series.
@Article{Yan:2022:SDD,
author = "Li Yan and Nerissa Xu and Guozhong Li and Sourav S
Bhowmick and Byron Choi and Jianliang Xu",
title = "{SENSOR}: data-driven construction of sketch-based
visual query interfaces for time series data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3650--3653",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554866",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554866",
abstract = "Sketching is a common approach to visually query time
series data. However, a recent study reported that
sketching a pattern for querying is ``often ineffective
on its own'' in practice due to lack of
``representative objects'' to facilitate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): DiscoPG, property graph schema discovery and exploration.
@Article{Bonifati:2022:DPG,
author = "Angela Bonifati and Stefania Dumbrava and Emile
Martinez and Fatemeh Ghasemi and Malo Jaffr{\'e} and
Pac{\^o}me Luton and Thomas Pickles",
title = "{DiscoPG}: property graph schema discovery and
exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3654--3657",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554867",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554867",
abstract = "Property graphs are becoming pervasive in a variety of
graph processing applications using interconnected
data. They allow to encode multi-labeled nodes and
edges, as well as their properties, represented as
key/value \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): SA-Q, evaluating and enhancing the output quality of sentiment analysis tools.
@Article{Maamar-Kouadri:2022:SQO,
author = "Wissam Maamar-Kouadri and Salima Benbernou and Mourad
Ouziri and Themis Palpanas and Iheb {Ben Amor}",
title = "{SA-Q}: observing, evaluating, and enhancing the
quality of the results of sentiment analysis tools",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3658--3661",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554868",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554868",
abstract = "Sentiment analysis has received constant research
attention due to its usefulness and importance in
different applications. However, despite the research
advances in this field, most current tools suffer in
prediction \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): SmartBench, automatic benchmark generation for QA over knowledge graphs.
@Article{Orogat:2022:SDA,
author = "Abdelghny Orogat and Ahmed El-Roby",
title = "{SmartBench}: demonstrating automatic generation of
comprehensive benchmarks for question answering over
knowledge graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3662--3665",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554869",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554869",
abstract = "In recent years, a significant number of question
answering (QA) systems that retrieve answers to natural
language questions from knowledge graphs (KG) have been
introduced. However, finding a benchmark that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): DADER, hands-off entity resolution with domain adaptation.
@Article{Tu:2022:DHE,
author = "Jianhong Tu and Xiaoyue Han and Ju Fan and Nan Tang
and Chengliang Chai and Guoliang Li and Xiaoyong Du",
title = "{DADER}: hands-off entity resolution with domain
adaptation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3666--3669",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554870",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554870",
abstract = "Entity resolution (ER) is a core data integration
problem that identifies pairs of data instances
referring to the same real-world entities, and the
state-of-the-art results of ER are achieved by deep
learning (DL) based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
%%% PVLDB 15(12): Sigma workbook, a spreadsheet interface for cloud data warehouses.
@Article{Gale:2022:SWS,
author = "James Gale and Max Seiden and Deepanshu Utkarsh and
Jason Frantz and Rob Woollen and {\c{C}}a{\u{g}}atay
Demiralp",
title = "Sigma workbook: a spreadsheet for cloud data
warehouses",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3670--3673",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554871",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554871",
abstract = "Cloud data warehouses (CDWs) bring large-scale data
and compute power closer to users in enterprises.
However, existing tools for analyzing data in CDWs are
either limited in ad-hoc transformations or difficult
to use for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2022:RMC,
author = "Zihao Chen and Zhizhen Xu and Baokun Han and Chen Xu
and Weining Qian and Aoying Zhou",
title = "{ReMac}: a matrix computation system with redundancy
elimination",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3674--3677",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554872",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554872",
abstract = "Distributed matrix computation solutions support query
interfaces of linear algebra expressions, which often
contain redundancy, i.e., common and loop-constant
subexpressions. However, existing solutions fail
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wenig:2022:TBT,
author = "Phillip Wenig and Sebastian Schmidl and Thorsten
Papenbrock",
title = "{TimeEval}: a benchmarking toolkit for time series
anomaly detection algorithms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3678--3681",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554873",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554873",
abstract = "Detecting anomalous subsequences in time series is an
important task in time series analytics because it
serves the identification of special events, such as
production faults, delivery bottlenecks, system
defects, or heart \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lerner:2022:DAH,
author = "Alberto Lerner and Matthias Jasny and Theo Jepsen and
Carsten Binnig and Philippe Cudr{\'e}-Mauroux",
title = "{DBMS} annihilator: a high-performance database
workload generator in action",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3682--3685",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554874",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554874",
abstract = "Modern DBMS engines can achieve unprecedented
transaction processing speeds thanks to the invention
of clever data structures, concurrency schemes, and
improvements in CPU and memory subsystems. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liang:2022:FSF,
author = "Zhiyu Liang and Hongzhi Wang",
title = "{FedTSC}: a secure federated learning system for
interpretable time series classification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3686--3689",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554875",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554875",
abstract = "We demonstrate FedTSC, a novel federated learning (FL)
system for interpretable time series classification
(TSC). FedTSC is an FL-based TSC solution that makes a
great balance among security, interpretability,
accuracy, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2022:AVA,
author = "Qingshun Wu and Yafei Li and Huiling Li and Di Zhang
and Guanglei Zhu",
title = "{AMRAS}: a visual analysis system for spatial
crowdsourcing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3690--3693",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554876",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554876",
abstract = "The wide adoption of GPS-enabled smart devices has
greatly promoted spatial crowdsourcing, where the core
issue is how to assign tasks to workers efficiently and
with high quality. In this paper, we build a novel
visual \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Al-Sayeh:2022:SCA,
author = "Hani Al-Sayeh and Muhammad Attahir Jibril and Muhammad
Waleed {Bin Saeed} and Kai-Uwe Sattler",
title = "{SparkCAD}: caching anomalies detector for {Spark}
applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3694--3697",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554877",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554877",
abstract = "Developers of Apache Spark applications can accelerate
their workloads by caching suitable intermediate
results in memory and reusing them rather than
recomputing them all over again every time they are
needed. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{vLeeuwen:2022:AQP,
author = "Wilco v. Leeuwen and Thomas Mulder and Bram van de
Wall and George Fletcher and Nikolay Yakovets",
title = "{AvantGraph} query processing engine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3698--3701",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554878",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554878",
abstract = "We demonstrate AvantGraph, a graph query processing
engine developed by the Database group at TU Eindhoven.
Designed for efficient processing of both subgraph
matching and navigational graph queries, AvantGraph
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Boniol:2022:TNL,
author = "Paul Boniol and John Paparrizos and Yuhao Kang and
Themis Palpanas and Ruey S. Tsay and Aaron J. Elmore
and Michael J. Franklin",
title = "{Theseus}: navigating the labyrinth of time-series
anomaly detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3702--3705",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554879",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554879",
abstract = "The detection of anomalies in time series has gained
ample academic and industrial attention, yet, no
comprehensive benchmark exists to evaluate time-series
anomaly detection methods. Therefore, there is no final
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hofmann:2022:DAS,
author = "Dennis Hofmann and Peter VanNostrand and Huayi Zhang
and Yizhou Yan and Lei Cao and Samuel Madden and Elke
Rundensteiner",
title = "A demonstration of {AutoOD}: a self-tuning anomaly
detection system",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3706--3709",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554880",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554880",
abstract = "Anomaly detection is a critical task in applications
like preventing financial fraud, system malfunctions,
and cybersecurity attacks. While previous research has
offered a plethora of anomaly detection algorithms,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gakhar:2022:POA,
author = "Sunny Gakhar and Joyce Cahoon and Wangchao Le and
Xiangnan Li and Kaushik Ravichandran and Hiren Patel
and Marc Friedman and Brandon Haynes and Shi Qiao and
Alekh Jindal and Jyoti Leeka",
title = "{Pipemizer}: an optimizer for analytics data
pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3710--3713",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554881",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554881",
abstract = "We demonstrate Pipemizer, an optimizer and recommender
aimed at improving the performance of queries or jobs
in pipelines. These job pipelines are ubiquitous in
modern data analytics due to jobs reading output files
written \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Redyuk:2022:DAA,
author = "Sergey Redyuk and Zoi Kaoudi and Sebastian Schelter
and Volker Markl",
title = "{DORIAN} in action: assisted design of data science
pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3714--3717",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554882",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554882",
abstract = "Existing automated machine learning solutions and
intelligent discovery assistants are popular tools that
facilitate the end-user with the design of data science
(DS) pipelines. However, they yield limited \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2022:WDN,
author = "Yuntian He and Yue Zhang and Saket Gurukar and
Srinivasan Parthasarathy",
title = "{WebMILE}: democratizing network representation
learning at scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3718--3721",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554883",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554883",
abstract = "In recent years, we have seen the success of network
representation learning (NRL) methods in diverse
domains ranging from computational chemistry to drug
discovery and from social network analysis to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Geisler:2022:DQQ,
author = "Nadja Geisler and Benjamin H{\"a}ttasch and Carsten
Binnig",
 title = "Demonstrating {Quest}: a query-driven framework to
explain classification models on tabular data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3722--3725",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554884",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554884",
abstract = "Machine learning models are everywhere now; but only
few of them are transparent in how they work. To remedy
this, local explanations aim to show users how and why
learned models produce a certain output for a given
input \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ripberger:2022:IID,
author = "Drew Ripberger and Yifan Gan and Xueyuan Ren and
Spyros Blanas and Yang Wang",
title = "{IsoBugView}: interactively debugging isolation bugs
in database applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3726--3729",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554885",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554885",
abstract = "Database applications frequently use weaker isolation
levels, such as Read Committed, for better performance,
which may lead to bugs that do not happen under
Serializable. Although a number of works have
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Foufoulas:2022:YRU,
author = "Yannis Foufoulas and Alkis Simitsis and Yannis
Ioannidis",
title = "{YeSQL}: rich user-defined functions without the
overhead",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3730--3733",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554886",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554886",
abstract = "The diversity and complexity of modern data management
applications led to the extension of the relational
paradigm with syntactic and semantic support for
User-Defined Functions (UDFs). Although
well-established in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:DAM,
author = "Zhihui Yang and Yicong Huang and Zuozhi Wang and Feng
Gao and Yao Lu and Chen Li and X. Sean Wang",
title = "Demonstration of accelerating machine learning
inference queries with correlative proxy models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3734--3737",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554887",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554887",
abstract = "We will demonstrate a prototype query-processing
engine, which utilizes correlations among predicates to
accelerate machine learning (ML) inference queries on
unstructured data. Expensive operators such as feature
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2022:DCI,
author = "Xiaozhen Liu and Zuozhi Wang and Shengquan Ni and
Sadeem Alsudais and Yicong Huang and Avinash Kumar and
Chen Li",
title = "Demonstration of collaborative and interactive
workflow-based data analytics in {Texera}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3738--3741",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554888",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554888",
abstract = "Collaborative data analytics is becoming increasingly
important due to the higher complexity of data science,
more diverse skills from different disciplines, more
common asynchronous schedules of team members, and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zalipynis:2022:SAR,
author = "Ramon Antonio Rodriges Zalipynis",
title = "{SimDB} in action: road traffic simulations completely
inside array {DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3742--3745",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554889",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554889",
 abstract = "Array DBMSs operate on big N-d arrays. Cellular
 automata (CA) work on a discrete lattice of cells,
 essentially on N-d arrays. CA facilitate decision
support as they realistically simulate complex
phenomena including road \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Badaro:2022:TTD,
author = "Gilbert Badaro and Paolo Papotti",
title = "Transformers for tabular data representation: a
tutorial on models and applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3746--3749",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554890",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554890",
abstract = "In the last few years, the natural language processing
community witnessed advances in neural representations
of free texts with transformer-based language models
(LMs). Given the importance of knowledge available in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kiehn:2022:PDM,
author = "Felix Kiehn and Mareike Schmidt and Daniel Glake and
Fabian Panse and Wolfram Wingerath and Benjamin Wollmer
and Martin Poppinga and Norbert Ritter",
title = "Polyglot data management: state of the art \& open
challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3750--3753",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554891",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554891",
abstract = "Due to the increasing variety of the current database
landscape, polyglot data management has become a hot
research topic in recent years. The underlying idea is
to combine the benefits of different data stores
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wasay:2022:MPT,
author = "Abdul Wasay and Nesime Tatbul and Justin Gottschlich",
title = "Machine programming: turning data into programmer
productivity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3754--3757",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554892",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554892",
abstract = "Machine programming is an emerging research area that
improves the software development life cycle from
design through deployment. We present a tutorial on
machine programming research highlighting aspects
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:CDN,
author = "Guoliang Li and Haowen Dong and Chao Zhang",
title = "Cloud databases: new techniques, challenges, and
opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3758--3761",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554893",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554893",
abstract = "As database vendors are increasingly moving towards
the cloud data service, i.e., databases as a service
(DBaaS), cloud databases have become prevalent.
Compared with the early cloud-hosted databases, the new
generation \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mhedhbi:2022:MTQ,
author = "Amine Mhedhbi and Semih Salihoglu",
title = "Modern techniques for querying graph-structured
relations: foundations, system implementations, and
open challenges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3762--3765",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554894",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554894",
abstract = "The last decade has seen an emergence of numerous
specialized graph DBMSs (GDBMSs) as well as
graph-optimized extensions of RDBMSs. In addition,
several query processing techniques, such as worst-case
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2022:DSD,
author = "Yixiang Fang and Wensheng Luo and Chenhao Ma",
title = "Densest subgraph discovery on large graphs:
applications, challenges, and techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3766--3769",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554895",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554895",
abstract = "As one of the most fundamental problems in graph data
mining, the densest subgraph discovery (DSD) problem
has found a broad spectrum of real applications, such
as social network community detection, graph index
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Trummer:2022:BGC,
author = "Immanuel Trummer",
 title = "From {BERT} to {GPT-3} {Codex}: harnessing the potential
of very large language models for data management",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3770--3773",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554896",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554896",
abstract = "Large language models have recently advanced the state
of the art on many natural language processing
benchmarks. The newest generation of models can be
applied to a variety of tasks with little to no
specialized \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2022:PPF,
author = "Kaisong Huang and Yuliang He and Tianzheng Wang",
title = "The past, present and future of indexing on persistent
memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3774--3777",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554897",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554897",
abstract = "Persistent memory (PM) based indexing techniques have
been proposed to build fast yet persistent indexes that
sit on the memory bus. Over the past decade, numerous
techniques have been proposed with various \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kaoudi:2022:UDA,
author = "Zoi Kaoudi and Jorge-Arnulfo Quian{\'e}-Ruiz",
title = "Unified data analytics: state-of-the-art and open
problems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3778--3781",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554898",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554898",
abstract = "There is an urgent need for unifying data analytics as
more and more application tasks become more complex:
Nowadays, it is normal to see tasks performing data
preparation, analytical processing, and machine
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2022:BGC,
author = "Wenfei Fan",
title = "Big graphs: challenges and opportunities",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3782--3797",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554899",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554899",
abstract = "Big data is typically characterized with 4V's: Volume,
Velocity, Variety and Veracity. When it comes to big
graphs, these challenges become even more staggering.
Each and every of the 4V's raises new questions, from
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Amer-Yahia:2022:TAP,
author = "Sihem Amer-Yahia",
title = "Towards {AI-powered} data-driven education",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3798--3806",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554900",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554900",
abstract = "Educational platforms are increasingly becoming
AI-driven. Besides providing a wide range of course
filtering options, personalized recommendations of
learning material and teachers are driving today's
research. While \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2022:HIN,
author = "Yizhou Sun and Jiawei Han and Xifeng Yan and Philip S.
Yu and Tianyi Wu",
title = "Heterogeneous information networks: the past, the
present, and the future",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3807--3811",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554901",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554901",
abstract = "In 2011, we proposed PathSim to systematically define
and compute similarity between nodes in a heterogeneous
information network (HIN), where nodes and links are
from different types. In the PathSim paper, we for the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Roy:2022:TIA,
author = "Sudeepa Roy",
title = "Toward interpretable and actionable data analysis with
explanations and causality",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3812--3820",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554902",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554902",
abstract = "We live in a world dominated by data, where users from
different fields routinely collect, study, and make
decisions supported by data. To aid these users, the
current trend in data analysis is to design tools that
allow \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ozcan:2022:RMD,
author = "Fatma {\"O}zcan",
title = "Reflections on my data management research journey
({VLDB} women in database research award talk)",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3821--3822",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554903",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554903",
abstract = "Data-driven decision making is critical for all kinds
of enterprises, public and private. It has been my
mission to find more efficient, and effective ways to
store, manage, query and analyze data to drive
actionable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mohan:2022:PSF,
author = "C. Mohan",
title = "Panel: startups founded by database researchers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3823--3825",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554904",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554904",
abstract = "This in-person panel, which I will be moderating, will
focus on startups founded by worldwide database
researchers. The panelists are a set of people with
different backgrounds in terms of their geographic
locations, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Balazinska:2022:CDS,
author = "Magdalena Balazinska and Surajit Chaudhuri and AnHai
Doan and Joseph M. Hellerstein and Hanuma Kodavalla and
Ippokratis Pandis and Matei Zaharia",
title = "Cloud data systems: what are the opportunities for the
database research community?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "12",
pages = "3826--3827",
month = aug,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3554821.3554905",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:11:07 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3554821.3554905",
abstract = "The panel will discuss the research opportunities for
the database research community in the context of cloud
native data services.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{John:2022:HDD,
author = "Sachin Basil John and Christoph Koch",
title = "High-Dimensional Data Cubes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3828--3840",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565839",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565839",
abstract = "This paper introduces an approach to supporting
high-dimensional data cubes at interactive query speeds
and moderate storage cost. The approach is based on
binary(-domain) data cubes that are judiciously
partially materialized; the missing information
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ceccarello:2022:FSM,
author = "Matteo Ceccarello and Johann Gamper",
title = "Fast and Scalable Mining of Time Series Motifs with
Probabilistic Guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3841--3853",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565840",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565840",
abstract = "Mining time series motifs is a fundamental, yet
expensive task in exploratory data analytics. In this
paper, we therefore propose a fast method to find the
top-$k$ motifs with probabilistic guarantees. Our
probabilistic approach is based on Locality \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deutch:2022:FEF,
author = "Daniel Deutch and Amir Gilad and Tova Milo and Amit
Mualem and Amit Somech",
title = "{FEDEX}: an Explainability Framework for Data
Exploration Steps",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3854--3868",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565841",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565841",
abstract = "When exploring a new dataset, Data Scientists often
apply analysis queries, look for insights in the
resulting dataframe, and repeat to apply further
queries. We propose in this paper a novel solution that
assists data scientists in this laborious \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xekalaki:2022:ETA,
author = "Maria Xekalaki and Juan Fumero and Athanasios
Stratikopoulos and Katerina Doka and Christos
Katsakioris and Constantinos Bitsakos and Nectarios
Koziris and Christos Kotselidis",
title = "Enabling Transparent Acceleration of Big Data
Frameworks Using Heterogeneous Hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3869--3882",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565842",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565842",
abstract = "The ever-increasing demand for high performance Big
Data analytics and data processing, has paved the way
for heterogeneous hardware accelerators, such as
Graphics Processing Units (GPUs) and Field Programmable
Gate Arrays (FPGAs), to be integrated into \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fazzone:2022:DPN,
author = "Adriano Fazzone and Tommaso Lanciano and Riccardo
Denni and Charalampos E. Tsourakakis and Francesco
Bonchi",
title = "Discovering Polarization Niches via Dense Subgraphs
with Attractors and Repulsers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3883--3896",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565843",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565843",
abstract = "Detecting niches of polarization in social media is a
first step towards deploying mitigation strategies and
avoiding radicalization. In this paper, we model
polarization niches as close-knit dense communities of
users, which are under the influence of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2022:SSU,
author = "Eunjae Lee and Sam H. Noh and Jiwon Seo",
title = "{Sage}: a System for Uncertain Network Analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3897--3910",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565844",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565844",
abstract = "We propose Sage, a system for uncertain network
analysis. Algorithms for uncertain network analysis
require large amounts of memory and computing resources
as they sample a large number of network instances and
run analysis on them. Sage makes uncertain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qin:2022:MBC,
author = "Hongchao Qin and Rong-Hua Li and Ye Yuan and Guoren
Wang and Lu Qin and Zhiwei Zhang",
title = "Mining Bursting Core in Large Temporal Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3911--3923",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565845",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565845",
abstract = "Temporal graphs are ubiquitous. Mining communities
that are bursting in a period of time is essential for
seeking real emergency events in temporal graphs.
Unfortunately, most previous studies on community
mining in temporal networks ignore the bursting
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2022:CBL,
author = "Xiang Yu and Chengliang Chai and Guoliang Li and
Jiabin Liu",
title = "Cost-Based or Learning-Based?: a Hybrid Query
Optimizer for Query Plan Selection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3924--3936",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565846",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565846",
abstract = "Traditional cost-based optimizers are efficient and
stable to generate optimal plans for simple SQL
queries, but they may not generate high-quality plans
for complicated queries. Thus learning-based optimizers
have been proposed recently that can learn \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Meng:2022:OIA,
author = "Jingfan Meng and Huayi Wang and Jun Xu and Mitsunori
Ogihara",
title = "{ONe Index for All Kernels (ONIAK)}: a Zero
Re-Indexing {LSH} Solution to {ANNS-ALT (After Linear
Transformation)}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3937--3949",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565847",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565847",
abstract = "In this work, we formulate and solve a new type of
approximate nearest neighbor search (ANNS) problems
called ANNS after linear transformation (ALT). In
ANNS-ALT, we search for the vector (in a dataset) that,
after being linearly transformed by a user- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shi:2022:LIB,
author = "Jiachen Shi and Gao Cong and Xiao-Li Li",
title = "Learned Index Benefits: Machine Learning Based Index
Performance Estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3950--3962",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565848",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565848",
abstract = "Index selection remains one of the most challenging
problems in relational database management systems. To
find an optimum index configuration for a workload,
accurately and efficiently quantifying the benefits of
each candidate index configuration is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:ORM,
author = "Jiachuan Wang and Peng Cheng and Libin Zheng and Lei
Chen and Wenjie Zhang",
title = "Online Ridesharing with Meeting Points",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3963--3975",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565849",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565849",
abstract = "Nowadays, ridesharing becomes a popular commuting
mode. Dynamically arriving riders post their origins
and destinations, then the platform assigns drivers to
serve them. In ridesharing, different groups of riders
can be served by one driver if their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bellomarini:2022:EPE,
author = "Luigi Bellomarini and Davide Benedetto and Matteo
Brandetti and Emanuel Sallinger",
title = "Exploiting the Power of Equality-Generating
Dependencies in Ontological Reasoning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3976--3988",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565850",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565850",
abstract = "Equality-generating dependencies (EGDs) allow to fully
exploit the power of existential quantification in
ontological reasoning settings modeled via
Tuple-Generating Dependencies (TGDs), by enabling
value-assignment or forcing the equivalence of fresh
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Aamand:2022:NRF,
author = "Anders Aamand and Debarati Das and Evangelos
Kipouridis and Jakob B. T. Knudsen and Peter M. R.
Rasmussen and Mikkel Thorup",
title = "No Repetition: Fast and Reliable Sampling with Highly
Concentrated Hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "3989--4001",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565851",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565851",
abstract = "Stochastic sample-based estimators are among the most
fundamental and universally applied tools in
statistics. Such estimators are particularly important
when processing huge amounts of data, where we need to
be able to answer a wide range of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Attouche:2022:WGJ,
author = "Lyes Attouche and Mohamed-Amine Baazizi and Dario
Colazzo and Giorgio Ghelli and Carlo Sartiani and
Stefanie Scherzinger",
title = "Witness Generation for {JSON} Schema",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4002--4014",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565852",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565852",
abstract = "JSON Schema is a schema language for JSON documents,
based on a complex combination of structural operators,
Boolean operators (negation included), and recursive
variables. The static analysis of JSON Schema documents
comprises practically relevant \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shankar:2022:TOP,
author = "Shreya Shankar and Aditya G. Parameswaran",
title = "Towards Observability for Production Machine Learning
Pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4015--4022",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565853",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565853",
abstract = "Software organizations are increasingly incorporating
machine learning (ML) into their product offerings,
driving a need for new data management tools. Many of
these tools facilitate the initial development of ML
applications, but sustaining these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2022:DES,
author = "Sekwon Lee and Soujanya Ponnapalli and Sharad Singhal
and Marcos K. Aguilera and Kimberly Keeton and Vijay
Chidambaram",
title = "{DINOMO}: an Elastic, Scalable, High-Performance
Key-Value Store for Disaggregated Persistent Memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4023--4037",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565854",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565854",
abstract = "We present Dinomo, a novel key-value store for
disaggregated persistent memory (DPM). Dinomo is the
first key-value store for DPM that simultaneously
achieves high common-case performance, scalability, and
lightweight online reconfiguration. We observe
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shankar:2022:BCR,
author = "Shreya Shankar and Stephen Macke and Sarah Chasins and
Andrew Head and Aditya Parameswaran",
title = "Bolt-on, Compact, and Rapid Program Slicing for
Notebooks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4038--4047",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565855",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565855",
abstract = "Computational notebooks are commonly used for
iterative workflows, such as in exploratory data
analysis. This process lends itself to the accumulation
of old code and hidden state, making it hard for users
to reason about the lineage of, e.g., plots \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2022:FMT,
author = "Weijie Sun and Zihuan Xu and Lei Chen",
title = "Fairness Matters: a Tit-for-Tat Strategy Against
Selfish Mining",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4048--4061",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565856",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565856",
abstract = "The proof-of-work (PoW) based blockchains are more
secure nowadays since profit-oriented miners contribute
more computing powers in exchange for fair revenues.
This virtuous circle only works under an
incentive-compatible consensus, which is found to be
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ding:2022:SIO,
author = "Jialin Ding and Ryan Marcus and Andreas Kipf and
Vikram Nathan and Aniruddha Nrusimha and Kapil Vaidya
and Alexander van Renen and Tim Kraska",
title = "{SageDB}: an Instance-Optimized Data Analytics
System",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4062--4078",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565857",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565857",
abstract = "Modern data systems are typically both complex and
general-purpose. They are complex because of the
numerous internal knobs and parameters that users need
to manually tune in order to achieve good performance;
they are general-purpose because they are \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Richly:2022:BCF,
author = "Keven Richly and Rainer Schlosser and Martin
Boissier",
title = "Budget-Conscious Fine-Grained Configuration
Optimization for Spatio-Temporal Applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4079--4092",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565858",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565858",
abstract = "Based on the performance requirements of modern
spatio-temporal data mining applications, in-memory
database systems are often used to store and process
the data. To efficiently utilize the scarce DRAM
capacities, modern database systems support various
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hsieh:2022:NGC,
author = "Cheng-Yu Hsieh and Jieyu Zhang and Alexander Ratner",
title = "{Nemo}: Guiding and Contextualizing Weak Supervision
for Interactive Data Programming",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "15",
number = "13",
pages = "4093--4105",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565838.3565859",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:02 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565838.3565859",
abstract = "Weak Supervision (WS) techniques allow users to
efficiently create large training datasets by
programmatically labeling data with heuristic sources
of supervision. While the success of WS relies heavily
on the provided labeling heuristics, the process
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Helt:2022:CCC,
author = "Jeffrey Helt and Abhinav Sharma and Daniel J. Abadi
and Wyatt Lloyd and Jose M. Faleiro",
title = "{C5}: cloned concurrency control that always keeps
up",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "1--14",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561262",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561262",
abstract = "Asynchronously replicated primary-backup databases are
commonly deployed to improve availability and offload
read-only transactions. To both apply replicated writes
from the primary and serve read-only transactions, the
backups implement a cloned \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:CDS,
author = "Ruihong Wang and Jianguo Wang and Stratos Idreos and
M. Tamer {\"O}zsu and Walid G. Aref",
title = "The case for distributed shared-memory databases with
{RDMA}-enabled memory disaggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "15--22",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561263",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561263",
abstract = "Memory disaggregation (MD) allows for scalable and
elastic data center design by separating compute (CPU)
from memory. With MD, compute and memory are no longer
coupled into the same server box. Instead, they are
connected to each other via ultra-fast \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2022:FED,
author = "Chenyuan Wu and Mohammad Javad Amiri and Jared Asch
and Heena Nagda and Qizhen Zhang and Boon Thau Loo",
title = "{FlexChain}: an elastic disaggregated blockchain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "23--36",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561264",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561264",
abstract = "While permissioned blockchains enable a family of data
center applications, existing systems suffer from
imbalanced loads across compute and memory,
exacerbating the underutilization of cloud resources.
This paper presents FlexChain, a novel \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:MNL,
author = "Zhen Zhang and Shuai Zheng and Yida Wang and Justin
Chiu and George Karypis and Trishul Chilimbi and Mu Li
and Xin Jin",
title = "{MiCS}: near-linear scaling for training gigantic
model on public cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "37--50",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561265",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561265",
abstract = "Existing general purpose frameworks for gigantic model
training, i.e., dense models with billions of
parameters, cannot scale efficiently on cloud
environment with various networking conditions due to
large communication overheads. In this paper, we
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:PPC,
author = "Yi Yang and Yurong Cheng and Ye Yuan and Guoren Wang
and Lei Chen and Yongjiao Sun",
title = "Privacy-preserving cooperative online matching over
spatial crowdsourcing platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "51--63",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561266",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561266",
abstract = "With the continuous development of spatial
crowdsourcing platform, online task assignment problem
has been widely studied as a typical problem in spatial
crowdsourcing. Most of the existing studies are based
on a single-platform task assignment to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:CMT,
author = "Jiayi Wang and Chengliang Chai and Nan Tang and Jiabin
Liu and Guoliang Li",
title = "Coresets over multiple tables for feature-rich and
data-efficient machine learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "64--76",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561267",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561267",
abstract = "Successful machine learning (ML) needs to learn from
good data. However, one common issue about train data
for ML practitioners is the lack of good features. To
mitigate this problem, feature augmentation is often
employed by joining with (or enriching \ldots{})",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:SMM,
author = "Zihao Zhang and Huiqi Hu and Xuan Zhou and Jiang
Wang",
title = "{Starry}: multi-master transaction processing on
semi-leader architecture",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "77--89",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561268",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561268",
abstract = "Multi-master architecture is desirable for cloud
databases in supporting large-scale transaction
processing. To enable concurrent transaction execution
on multiple computing nodes, we need an efficient
transaction commit protocol on the storage layer
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Skitsas:2022:SSE,
author = "Konstantinos Skitsas and Ioannis G. Papageorgiou and
Mohammad Sadegh Talebi and Verena Kantere and Michael
N. Katehakis and Panagiotis Karras",
title = "{SIFTER}: space-efficient value iteration for
finite-horizon {MDPs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "90--98",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561269",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561269",
abstract = "Can we solve finite-horizon Markov decision processes
(FHMDPs) while raising low memory requirements? Such
models find application in many cases where a
decision-making agent needs to act in a probabilistic
environment, from resource management to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2022:TUP,
author = "Geoffrey X. Yu and Markos Markakis and Andreas Kipf
and Per-{\AA}ke Larson and Umar Farooq Minhas and Tim
Kraska",
title = "{TreeLine}: an update-in-place key-value store for
modern storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "99--112",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561270",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561270",
abstract = "Many modern key-value stores, such as RocksDB, rely on
log-structured merge trees (LSMs). Originally designed
for spinning disks, LSMs optimize for write performance
by only making sequential writes. But this optimization
comes at the cost of reads: \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tao:2022:DPE,
author = "Yuchao Tao and Amir Gilad and Ashwin Machanavajjhala
and Sudeepa Roy",
title = "{DPXPlain}: privately explaining aggregate query
answers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "1",
pages = "113--126",
month = sep,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3561261.3561271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Thu Nov 17 11:06:34 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3561261.3561271",
abstract = "Differential privacy (DP) is the state-of-the-art and
rigorous notion of privacy for answering aggregate
database queries while preserving the privacy of
sensitive information in the data. In today's era of
data analysis, however, it poses new \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chang:2022:EMP,
author = "Lijun Chang and Mouyi Xu and Darren Strash",
title = "Efficient maximum $k$-plex computation over large
sparse graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "127--139",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565817",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565817",
abstract = "The $k$-plex model is a relaxation of the clique model
by allowing every vertex to miss up to $k$ neighbors.
Designing exact and efficient algorithms for computing
a maximum $k$-plex in a graph has been receiving
increasing interest recently. However, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hu:2022:OSE,
author = "Tianxun Hu and Tianzheng Wang and Qingqing Zhou",
title = "Online schema evolution is (almost) free for snapshot
databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "140--153",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565818",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565818",
abstract = "Modern database applications often change their
schemas to keep up with the changing requirements.
However, support for online and transactional schema
evolution remains challenging in existing database
systems. Specifically, prior work often takes ad
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:LEH,
author = "Yifan Wang and Haodi Ma and Daisy Zhe Wang",
title = "{LIDER}: an efficient high-dimensional learned index
for large-scale dense passage retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "154--166",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565819",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565819",
abstract = "Passage retrieval has been studied for decades, and
many recent approaches of passage retrieval are using
dense embeddings generated from deep neural models,
called ``dense passage retrieval''. The
state-of-the-art end-to-end dense passage retrieval
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shaham:2022:MMS,
author = "Sina Shaham and Gabriel Ghinita and Cyrus Shahabi",
title = "Models and mechanisms for spatial data fairness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "167--179",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565820",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565820",
abstract = "Fairness in data-driven decision-making studies
scenarios where individuals from certain population
segments may be unfairly treated when being considered
for loan or job applications, access to public
resources, or other types of services. In location-
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2022:IMR,
author = "Shixun Huang and Wenqing Lin and Zhifeng Bao and
Jiachen Sun",
title = "Influence maximization in real-world closed social
networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "180--192",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565821",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565821",
abstract = "In the last few years, many closed social networks
such as WhatsApp and WeChat have emerged to cater for
people's growing demand of privacy and independence. In
a closed social network, the posted content is not
available to all users or senders can set \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bonifati:2022:TLI,
author = "Angela Bonifati and Francesco {Del Buono} and
Francesco Guerra and Donato Tiano",
title = "{Time2Feat}: learning interpretable representations
for multivariate time series clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "193--201",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565822",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565822",
abstract = "Clustering multivariate time series is a critical task
in many real-world applications involving multiple
signals and sensors. Existing systems aim to maximize
effectiveness, efficiency and scalability, but fail to
guarantee the interpretability of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2022:OVF,
author = "Xiaochen Li and Yuke Hu and Weiran Liu and Hanwen Feng
and Li Peng and Yuan Hong and Kui Ren and Zhan Qin",
title = "{OpBoost}: a vertical federated tree boosting
framework based on order-preserving desensitization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "202--215",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565823",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565823",
abstract = "Vertical Federated Learning (FL) is a new paradigm
that enables users with non-overlapping attributes of
the same data samples to jointly train a model without
directly sharing the raw data. Nevertheless, recent
works show that it's still not sufficient \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Perera:2022:HSD,
author = "R. Malinga Perera and Bastian Oetomo and Benjamin I.
P. Rubinstein and Renata Borovica-Gajic",
title = "{HMAB}: self-driving hierarchy of bandits for
integrated physical database design tuning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "216--229",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565824",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565824",
abstract = "Effective physical database design tuning requires
selection of several physical design structures (PDS),
such as indices and materialised views, whose
combination influences overall system performance in a
non-linear manner. While the simplicity of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Palyvos-Giannas:2022:EEO,
author = "Dimitris Palyvos-Giannas and Katerina Tzompanaki and
Marina Papatriantafilou and Vincenzo Gulisano",
title = "{Erebus}: explaining the outputs of data streaming
queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "230--242",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565825",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565825",
abstract = "In data streaming, why-provenance can explain why a
given outcome is observed but offers no help in
understanding why an expected outcome is missing.
Explaining missing answers has been addressed in DBMSs,
but these solutions are not directly applicable
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2022:PPL,
author = "Zhou Zhang and Zhaole Chu and Peiquan Jin and Yongping
Luo and Xike Xie and Shouhong Wan and Yun Luo and Xufei
Wu and Peng Zou and Chunyang Zheng and Guoan Wu and
Andy Rudoff",
title = "{PLIN}: a persistent learned index for non-volatile
memory with high performance and instant recovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "243--255",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565826",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565826",
abstract = "Non-Volatile Memory (NVM) has emerged as an
alternative to next-generation main memories. Although
many tree indices have been proposed for NVM, they
generally use B+-tree-like structures. To further
improve the performance of NVM-aware indices, we
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:FFC,
author = "Zuozhi Wang and Shengquan Ni and Avinash Kumar and
Chen Li",
title = "{Fries}: fast and consistent runtime reconfiguration
in dataflow systems with transactional guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "256--268",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565827",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565827",
abstract = "A computing job in a big data system can take a long
time to run, especially for pipelined executions on
data streams. Developers often need to change the
computing logic of the job such as fixing a loophole in
an operator or changing the machine \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiao:2022:FAD,
author = "Renjie Xiao and Zijing Tan and Haojin Wang and Shuai
Ma",
title = "Fast approximate denial constraint discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "269--281",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565828",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565828",
abstract = "We investigate the problem of discovering approximate
denial constraints (DCs), for finding DCs that hold
with some exceptions to avoid overfitting real-life
dirty data and facilitate data cleaning tasks.
Different methods have been proposed to address
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:FDD,
author = "Haoyu Wang and Shaoxu Song",
title = "Frequency domain data encoding in {Apache IoTDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "282--290",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565829",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565829",
abstract = "Frequency domain analysis is widely conducted on time
series. While online transforming from time domain to
frequency domain is costly, e.g., by Fast Fourier
Transform (FFT), it is highly demanded to store the
frequency domain data for reuse. However, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2022:HMS,
author = "Jiping Zheng and Yuan Ma and Wei Ma and Yanhao Wang
and Xiaoyang Wang",
title = "Happiness maximizing sets under group fairness
constraints",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "291--303",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565830",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565830",
abstract = "Finding a happiness maximizing set (HMS) from a
database, i.e., selecting a small subset of tuples that
preserves the best score with respect to any
nonnegative linear utility function, is an important
problem in multi-criteria decision-making. When an
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Renggli:2022:SEF,
author = "Cedric Renggli and Xiaozhe Yao and Luka Kolar and Luka
Rimanic and Ana Klimovic and Ce Zhang",
title = "{SHiFT}: an efficient, flexible search engine for
transfer learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "304--316",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565831",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565831",
abstract = "Transfer learning can be seen as a data- and
compute-efficient alternative to training models from
scratch. The emergence of rich model repositories, such
as TensorFlow Hub, enables practitioners and
researchers to unleash the potential of these models
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Islam:2022:SCT,
author = "Md. Mouinul Islam and Dong Wei and Baruch Schieber and
Senjuti Basu Roy",
title = "Satisfying complex top-$k$ fairness constraints by
preference substitutions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "317--329",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565832",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565832",
abstract = "Given m users (voters), where each user casts her
preference for a single item (candidate) over n items
(candidates) as a ballot, the preference aggregation
problem returns k items (candidates) that have the k
highest number of preferences (votes). Our \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Karpov:2022:SSE,
author = "Nikolai Karpov and Qin Zhang",
title = "{SyncSignature}: a simple, efficient, parallelizable
framework for tree similarity joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "330--342",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565833",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565833",
abstract = "This paper introduces SyncSignature, the first fully
parallelizable algorithmic framework for tree
similarity joins under edit distance. SyncSignature
makes use of implicit-synchronized signature generation
schemes, which allow for an efficient and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2022:APG,
author = "Shuang Yang and Yahui Sun and Jiesong Liu and Xiaokui
Xiao and Rong-Hua Li and Zhewei Wei",
title = "Approximating probabilistic group {Steiner} trees in
graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "343--355",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565834",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565834",
abstract = "Consider an edge-weighted graph, and a number of
properties of interests (PoIs). Each vertex has a
probability of exhibiting each PoI. The joint
probability that a set of vertices exhibits a PoI is
the probability that this set contains at least one
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Papadias:2022:SER,
author = "Serafeim Papadias and Zoi Kaoudi and Jorge-Arnulfo
Quian{\'e}-Ruiz and Volker Markl",
title = "Space-efficient random walks on streaming graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "356--368",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565835",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565835",
abstract = "Graphs in many applications, such as social networks
and IoT, are inherently streaming, involving continuous
additions and deletions of vertices and edges at high
rates. Constructing random walks in a graph, i.e.,
sequences of vertices selected with a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:PPT,
author = "Pengfei Wang and Xiaocan Zeng and Lu Chen and Fan Ye
and Yuren Mao and Junhao Zhu and Yunjun Gao",
title = "{PromptEM}: prompt-tuning for low-resource generalized
entity matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "369--378",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565836",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565836",
abstract = "Entity Matching (EM), which aims to identify whether
two entity records from two relational tables refer to
the same real-world entity, is one of the fundamental
problems in data management. Traditional EM assumes
that two tables are homogeneous with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guo:2022:CAC,
author = "Zhihan Guo and Xinyu Zeng and Kan Wu and Wuh-Chwen
Hwang and Ziwei Ren and Xiangyao Yu and Mahesh
Balakrishnan and Philip A. Bernstein",
title = "{Cornus}: atomic commit for a cloud {DBMS} with
storage disaggregation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "2",
pages = "379--392",
month = oct,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3565816.3565837",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Nov 25 08:53:26 MST 2022",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3565816.3565837",
abstract = "Two-phase commit (2PC) is widely used in distributed
databases to ensure atomicity of distributed
transactions. Conventional 2PC was originally designed
for the shared-nothing architecture and has two
limitations: long latency due to two eager log
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2022:RTT,
author = "Haitao Yuan and Guoliang Li and Zhifeng Bao",
title = "Route Travel Time Estimation on a Road Network
Revisited: Heterogeneity, Proximity, Periodicity and
Dynamicity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "393--405",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570691",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570691",
abstract = "In this paper, we revisit the problem of route travel
time estimation on a road network and aim to boost its
accuracy by capturing and utilizing spatio-temporal
features from four significant aspects: heterogeneity,
proximity, periodicity and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2022:SOM,
author = "Yongji Wu and Matthew Lentz and Danyang Zhuo and Yao
Lu",
title = "Serving and Optimizing Machine Learning Workflows on
Heterogeneous Infrastructures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "406--419",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570692",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570692",
abstract = "With the advent of ubiquitous deployment of smart
devices and the Internet of Things, data sources for
machine learning inference have increasingly moved to
the edge of the network. Existing machine learning
inference platforms typically assume a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Geng:2022:CRB,
author = "Zixuan Geng and Maximilian Schleich and Dan Suciu",
title = "Computing Rule-Based Explanations by Leveraging
Counterfactuals",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "420--432",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570693",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570693",
abstract = "Sophisticated machine models are increasingly used for
high-stakes decisions in everyday life. There is an
urgent need to develop effective explanation techniques
for such automated decisions. Rule-Based Explanations
have been proposed for high-stake \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Peng:2022:SSI,
author = "Jinfeng Peng and Derong Shen and Nan Tang and Tieying
Liu and Yue Kou and Tiezheng Nie and Hang Cui and Ge
Yu",
title = "Self-Supervised and Interpretable Data Cleaning with
Sequence Generative Adversarial Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "433--446",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570694",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570694",
abstract = "We study the problem of self-supervised and
interpretable data cleaning, which automatically
extracts interpretable data repair rules from dirty
data. In this paper, we propose a novel framework,
namely Garf, based on sequence generative adversarial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Romero:2022:OVA,
author = "Francisco Romero and Johann Hauswald and Aditi Partap
and Daniel Kang and Matei Zaharia and Christos
Kozyrakis",
title = "Optimizing Video Analytics with Declarative Model
Relationships",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "447--460",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570695",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570695",
abstract = "The availability of vast video collections and the
accuracy of ML models has generated significant
interest in video analytics systems. Since naively
processing all frames using expensive models is
impractical, researchers have proposed optimizations
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2022:SRT,
author = "Jiaxin Jiang and Yuan Li and Bingsheng He and Bryan
Hooi and Jia Chen and Johan Kok Zhi Kang",
title = "{Spade}: a Real-Time Fraud Detection Framework on
Evolving Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "461--469",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570696",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570696",
abstract = "Real-time fraud detection is a challenge for most
financial and electronic commercial platforms. To
identify fraudulent communities, Grab, one of the
largest technology companies in Southeast Asia, forms a
graph from a set of transactions and detects \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Miao:2022:GET,
author = "Xupeng Miao and Yujie Wang and Youhe Jiang and Chunan
Shi and Xiaonan Nie and Hailin Zhang and Bin Cui",
title = "{Galvatron}: Efficient Transformer Training over
Multiple {GPUs} Using Automatic Parallelism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "470--479",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570697",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570697",
abstract = "Transformer models have achieved state-of-the-art
performance on various domains of applications and
gradually become the foundations of the advanced large
deep learning (DL) models. However, how to train these
models over multiple GPUs efficiently is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2022:IDL,
author = "Qitong Wang and Stephen Whitmarsh and Vincent Navarro
and Themis Palpanas",
title = "{iEDeaL}: a Deep Learning Framework for Detecting
Highly Imbalanced Interictal Epileptiform Discharges",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "480--490",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570698",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570698",
abstract = "Epilepsy is a chronic neurological disease, ranked as
the second most burdensome neurological disorder
worldwide. Detecting Interictal Epileptiform Discharges
(IEDs) is among the most important clinician operations
to support epilepsy diagnosis, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zapridou:2022:DLP,
author = "Eleni Zapridou and Ioannis Mytilinis and Anastasia
Ailamaki",
title = "{Dalton}: Learned Partitioning for Distributed Data
Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "491--504",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570699",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570699",
abstract = "To sustain the input rate of high-throughput streams,
modern stream processing systems rely on parallel
execution. However, skewed data yield imbalanced load
assignments and create stragglers that hinder
scalability. Deciding on a static partitioning for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Behrouz:2022:FCS,
author = "Ali Behrouz and Farnoosh Hashemi and Laks V. S.
Lakshmanan",
title = "{FirmTruss} Community Search in Multilayer Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "505--518",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570700",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570700",
abstract = "In applications such as biological, social, and
transportation networks, interactions between objects
span multiple aspects. For accurately modeling such
applications, multilayer networks have been proposed.
Community search allows for personalized \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2022:ETC,
author = "Tianyang Xu and Zhao Lu and Yuanyuan Zhu",
title = "Efficient Triangle-Connected Truss Community Search in
Dynamic Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "519--531",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570701",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570701",
abstract = "Community search studies the retrieval of certain
community structures containing query vertices, which
has received lots of attention recently. $k$-truss is a
fundamental community structure where each edge is
contained in at least $k - 2$ triangles. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sabek:2022:CLM,
author = "Ibrahim Sabek and Kapil Vaidya and Dominik Horn and
Andreas Kipf and Michael Mitzenmacher and Tim Kraska",
title = "Can Learned Models Replace Hash Functions?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "532--545",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570702",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570702",
abstract = "Hashing is a fundamental operation in database
management, playing a key role in the implementation of
numerous core database data structures and algorithms.
Traditional hash functions aim to mimic a function that
maps a key to a random value, which can \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2022:TGA,
author = "Yue Zhao and George H. Chen and Zhihao Jia",
title = "{TOD}: {GPU}-Accelerated Outlier Detection via Tensor
Operations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "546--560",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570703",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570703",
abstract = "Outlier detection (OD) is a key machine learning task
for finding rare and deviant data samples, with many
time-critical applications such as fraud detection and
intrusion detection. In this work, we propose TOD, the
first tensor-based system for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ma:2022:FFL,
author = "Chaohong Ma and Xiaohui Yu and Yifan Li and Xiaofeng
Meng and Aishan Maoliniyazi",
title = "{FILM}: a Fully Learned Index for Larger-Than-Memory
Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "3",
pages = "561--573",
month = nov,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3570690.3570704",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:37 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3570690.3570704",
abstract = "As modern applications generate data at an
unprecedented speed and often require the
querying/analysis of data spanning a large duration, it
is crucial to develop indexing techniques that cater to
larger-than-memory databases, where data reside on
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mazmudar:2022:CMI,
author = "Miti Mazmudar and Thomas Humphries and Jiaxiang Liu
and Matthew Rafuse and Xi He",
title = "Cache Me If You Can: Accuracy-Aware Inference Engine
for Differentially Private Data Exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "574--586",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574246",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574246",
abstract = "Differential privacy (DP) allows data analysts to
query databases that contain users' sensitive
information while providing a quantifiable privacy
guarantee to users. Recent interactive DP systems such
as APEx provide accuracy guarantees over the query
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Falzon:2022:RSE,
author = "Francesca Falzon and Evangelia Anna Markatou and
Zachary Espiritu and Roberto Tamassia",
title = "Range Search over Encrypted Multi-Attribute Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "587--600",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574247",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574247",
abstract = "This work addresses expressive queries over encrypted
data by presenting the first systematic study of
multi-attribute range search on a symmetrically
encrypted database outsourced to an honest-but-curious
server. Prior work includes a thorough analysis
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ren:2022:HMA,
author = "Xuanle Ren and Le Su and Zhen Gu and Sheng Wang and
Feifei Li and Yuan Xie and Song Bian and Chao Li and
Fan Zhang",
title = "{HEDA}: Multi-Attribute Unbounded Aggregation over
Homomorphically Encrypted Database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "601--614",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574248",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574248",
abstract = "Recent years have witnessed the rapid development of
the encrypted database, due to the increasing number of
data privacy breaches and the corresponding laws and
regulations that caused millions of dollars in loss.
These encrypted databases may rely on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shen:2022:DPG,
author = "Chih-Ya Shen and Shao-Heng Ko and Guang-Siang Lee and
Wang-Chien Lee and De-Nian Yang",
title = "Density Personalized Group Query",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "615--628",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574249",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574249",
abstract = "Research on new queries for finding dense subgraphs
and groups has been actively pursued due to their many
applications, especially in social network analysis and
graph mining. However, existing work faces two major
weaknesses: (i) incapability of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Geng:2022:NDH,
author = "Jinkun Geng and Anirudh Sivaraman and Balaji Prabhakar
and Mendel Rosenblum",
title = "{Nezha}: Deployable and High-Performance Consensus
Using Synchronized Clocks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "629--642",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574250",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574250",
abstract = "This paper presents a high-performance consensus
protocol, Nezha, which can be deployed by cloud tenants
without support from cloud providers. Nezha bridges the
gap between protocols such as Multi-Paxos and Raft,
which can be readily deployed, and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ahmad:2022:PPR,
author = "Ishtiyaque Ahmad and Divyakant Agrawal and Amr {El
Abbadi} and Trinabh Gupta",
title = "{Pantheon}: Private Retrieval from Public Key--Value
Store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "643--656",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574251",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574251",
abstract = "Consider a cloud server that owns a key-value store
and provides a private query service to its clients.
Preserving client privacy in this setting is difficult
because the key-value store is public, and a client
cannot encrypt or modify it. Therefore, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{DaDalt:2022:BSV,
author = "Francesco {Da Dalt} and Simon Scherrer and Adrian
Perrig",
title = "{Bayesian} Sketches for Volume Estimation in Data
Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "657--669",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574252",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574252",
abstract = "Given large data streams of items, each attributable
to a certain key and possessing a certain volume, the
aggregate volume associated with a key is difficult to
estimate in a way that is both efficient and accurate.
On the one hand, exact counting with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Moti:2022:WWA,
author = "Moin Hussain Moti and Panagiotis Simatis and Dimitris
Papadias",
title = "{Waffle}: a Workload-Aware and Query-Sensitive
Framework for Disk-Based Spatial Indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "670--683",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574253",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574253",
abstract = "Although several spatial indexes achieve fast query
processing, they are ineffective for highly dynamic
data sets because of costly updates. On the other hand,
simple structures that enable efficient updates are
slow for spatial queries. In this paper, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pena:2022:FAD,
author = "Eduardo H. M. Pena and Fabio Porto and Felix Naumann",
title = "Fast Algorithms for Denial Constraint Discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "684--696",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574254",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574254",
abstract = "Denial constraints (DCs) are an integrity constraint
formalism widely used to detect inconsistencies in
data. Several algorithms have been devised to discover
DCs from data, as manually specifying them is
burdensome and, worse yet, error-prone. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiao:2022:TQI,
author = "Pu Jiao and Sheng Di and Hanqi Guo and Kai Zhao and
Jiannan Tian and Dingwen Tao and Xin Liang and Franck
Cappello",
title = "Toward Quantity-of-Interest Preserving Lossy
Compression for Scientific Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "697--710",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574255",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574255",
abstract = "Today's scientific simulations and instruments are
producing a large amount of data, leading to
difficulties in storing, transmitting, and analyzing
these data. While error-controlled lossy compressors
are effective in significantly reducing data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Demirci:2022:SGC,
author = "Gunduz Vehbi Demirci and Aparajita Haldar and Hakan
Ferhatosmanoglu",
title = "Scalable Graph Convolutional Network Training on
Distributed-Memory Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "711--724",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574256",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574256",
abstract = "Graph Convolutional Networks (GCNs) are extensively
utilized for deep learning on graphs. The large data
sizes of graphs and their vertex features make scalable
training algorithms and distributed memory systems
necessary. Since the convolution \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schafer:2022:MSA,
author = "Patrick Sch{\"a}fer and Ulf Leser",
title = "{Motiflets}: Simple and Accurate Detection of Motifs
in Time Series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "725--737",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574257",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574257",
abstract = "A time series motif intuitively is a short time series
that repeats itself approximately the same within a
larger time series. Such motifs often represent
concealed structures, such as heart beats in an ECG
recording, the riff in a pop song, or sleep \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Narayan:2022:CFM,
author = "Avanika Narayan and Ines Chami and Laurel Orr and
Christopher R{\'e}",
title = "Can Foundation Models Wrangle Your Data?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "738--746",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574258",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574258",
abstract = "Foundation Models (FMs) are models trained on large
corpora of data that, at very large scale, can
generalize to new tasks without any task-specific
finetuning. As these models continue to grow in size,
innovations continue to push the boundaries of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2022:MDB,
author = "Bogyeong Kim and Kyoseung Koo and Undraa Enkhbat and
Sohyun Kim and Juhun Kim and Bongki Moon",
title = "{M2Bench}: a Database Benchmark for Multi-Model
Analytic Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "747--759",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574259",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574259",
abstract = "As the world becomes increasingly data-centric, the
tasks dealt with by a database management system (DBMS)
become more complex and diverse. Compared with
traditional workloads that typically require only a
single data model, modern-day computational \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Baruah:2022:POD,
author = "Nirvik Baruah and Peter Kraft and Fiodar Kazhamiaka
and Peter Bailis and Matei Zaharia",
title = "Parallelism-Optimizing Data Placement for Faster
Data-Parallel Computations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "760--771",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574260",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574260",
abstract = "Systems performing large data-parallel computations,
including online analytical processing (OLAP) systems
like Druid and search engines like Elasticsearch, are
increasingly being used for business-critical real-time
applications where providing low \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lazebnik:2022:SSB,
author = "Teddy Lazebnik and Amit Somech and Abraham Itzhak
Weinberg",
title = "{SubStrat}: a Subset-Based Optimization Strategy for
Faster {AutoML}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "772--780",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574261",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574261",
abstract = "Automated machine learning (AutoML) frameworks have
become important tools in the data scientist's arsenal,
as they dramatically reduce the manual work devoted to
the construction of ML pipelines. Such frameworks
intelligently search among millions of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gurukar:2022:MWS,
author = "Saket Gurukar and Nikil Pancha and Andrew Zhai and
Eric Kim and Samson Hu and Srinivasan Parthasarathy and
Charles Rosenberg and Jure Leskovec",
title = "{MultiBiSage}: a {Web}-Scale Recommendation System
Using Multiple Bipartite Graphs at {Pinterest}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "781--789",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574262",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574262",
abstract = "Graph Convolutional Networks (GCN) can efficiently
integrate graph structure and node features to learn
high-quality node embeddings. At Pinterest, we have
developed and deployed PinSage, a data-efficient GCN
that learns pin embeddings from the Pin- \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeakis:2022:TEF,
author = "Alexandros Zeakis and Dimitrios Skoutas and Dimitris
Sacharidis and Odysseas Papapetrou and Manolis
Koubarakis",
title = "{TokenJoin}: Efficient Filtering for Set Similarity
Join with Maximum Weighted Bipartite Matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "790--802",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574263",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574263",
abstract = "Set similarity join is an important problem with many
applications in data discovery, cleaning and
integration. To increase robustness, fuzzy set
similarity join calculates the similarity of two sets
based on maximum weighted bipartite matching instead
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kayali:2022:QSC,
author = "Moe Kayali and Dan Suciu",
title = "Quasi-Stable Coloring for Graph Compression:
Approximating Max-Flow, Linear Programs, and
Centrality",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "803--815",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574264",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574264",
abstract = "We propose quasi-stable coloring, an approximate
version of stable coloring. Stable coloring, also
called color refinement, is a well-studied technique in
graph theory for classifying vertices, which can be
used to build compact, lossless \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pujol:2022:MAD,
author = "David Pujol and Albert Sun and Brandon Fain and Ashwin
Machanavajjhala",
title = "Multi-Analyst Differential Privacy for Online Query
Answering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "816--828",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574265",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574265",
abstract = "Most differentially private mechanisms are designed
for the use of a single analyst. In reality, however,
there are often multiple stakeholders with different
and possibly conflicting priorities that must share the
same privacy loss budget. This \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gubner:2022:EVM,
author = "Tim Gubner and Peter Boncz",
title = "{Excalibur}: a Virtual Machine for Adaptive
Fine-grained {JIT}-Compiled Query Execution based on
{VOILA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "829--841",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574266",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574266",
abstract = "In recent years, hardware has become increasingly
diverse, in terms of features as well as performance.
This poses a problem for complex software in general
and database systems in particular. To achieve
top-notch performance, we need to exploit \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qin:2022:ADO,
author = "Lianke Qin and Rajesh Jayaram and Elaine Shi and Zhao
Song and Danyang Zhuo and Shumo Chu",
title = "{Adore}: Differentially Oblivious Relational Database
Operators",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "842--855",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574267",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574267",
abstract = "There has been a recent effort in applying
differential privacy on memory access patterns to
enhance data privacy. This is called differential
obliviousness. Differential obliviousness is a
promising direction because it provides a principled
trade-off \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Laddad:2022:KCC,
author = "Shadaj Laddad and Conor Power and Mae Milano and Alvin
Cheung and Natacha Crooks and Joseph M. Hellerstein",
title = "Keep {CALM} and {CRDT} On",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "856--863",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574268",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574268",
abstract = "Despite decades of research and practical experience,
developers have few tools for programming reliable
distributed applications without resorting to expensive
coordination techniques. Conflict-free replicated
datatypes (CRDTs) are a promising line of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2022:MLS,
author = "Kejing Lu and Yoshiharu Ishikawa and Chuan Xiao",
title = "{MQH}: Locality Sensitive Hashing on Multi-level
Quantization Errors for Point-to-Hyperplane Distances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "864--876",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574269",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574269",
abstract = "Point-to-hyperplane nearest neighbor search (P2HNNS)
is a fundamental problem which has many applications in
data mining and machine learning. In this paper, we
propose a provable Locality-Sensitive-Hashing (LSH)
scheme based on multi-level quantization \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Szarnyas:2022:LSN,
author = "G{\'a}bor Sz{\'a}rnyas and Jack Waudby and Benjamin A.
Steer and D{\'a}vid Szak{\'a}llas and Altan Birler and
Mingxi Wu and Yuchen Zhang and Peter Boncz",
title = "The {LDBC} Social Network Benchmark: Business
Intelligence Workload",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "877--890",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574270",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574270",
abstract = "The Social Network Benchmark's Business Intelligence
workload (SNB BI) is a comprehensive graph OLAP
benchmark targeting analytical data systems capable of
supporting graph workloads. This paper marks the
finalization of almost a decade of research in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{An:2022:MCM,
author = "Shuai An and Yang Cao",
title = "Making Cache Monotonic and Consistent",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "891--904",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574271",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574271",
abstract = "We propose monotonic consistent caching (MCC), a cache
scheme for applications that demand consistency and
monotonicity. MCC warrants that a transaction-like
request always sees a consistent view of the backend
database and observed writes over the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wei:2022:SPE,
author = "Ziyun Wei and Immanuel Trummer",
title = "{SkinnerMT}: Parallelizing for Efficiency and
Robustness in Adaptive Query Processing on Multicore
Platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "905--917",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574272",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574272",
abstract = "SkinnerMT is an adaptive query processing engine,
specialized for multi-core platforms. SkinnerMT
features different strategies for parallel processing
that allow users to trade between average run time and
performance robustness. First, SkinnerMT \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ding:2022:EAQ,
author = "Dujian Ding and Sihem Amer-Yahia and Laks Lakshmanan",
title = "On Efficient Approximate Queries over Machine Learning
Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "918--931",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574273",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574273",
abstract = "The question of answering queries over ML predictions
has been gaining attention in the database community.
This question is challenging because finding high
quality answers by invoking an oracle such as a human
expert or an expensive deep neural \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Khatiwada:2022:IDL,
author = "Aamod Khatiwada and Roee Shraga and Wolfgang
Gatterbauer and Ren{\'e}e J. Miller",
title = "Integrating Data Lake Tables",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "932--945",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574274",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574274",
abstract = "We have made tremendous strides in providing tools for
data scientists to discover new tables useful for their
analyses. But despite these advances, the proper
integration of discovered tables has been
under-explored. An interesting semantics for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kang:2022:PTS,
author = "Hongbo Kang and Yiwei Zhao and Guy E. Blelloch and
Laxman Dhulipala and Yan Gu and Charles McGuffey and
Phillip B. Gibbons",
title = "{PIM-Tree}: a Skew-Resistant Index for
Processing-in-Memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "4",
pages = "946--958",
month = dec,
year = "2022",
CODEN = "????",
DOI = "https://doi.org/10.14778/3574245.3574275",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:39 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3574245.3574275",
abstract = "The performance of today's in-memory indexes is
bottlenecked by the memory latency/bandwidth wall.
Processing-in-memory (PIM) is an emerging approach that
potentially mitigates this bottleneck, by enabling
low-latency memory access whose aggregate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:ATR,
author = "Mingxuan Li and Yazhe Wang and Shuai Ma and Chao Liu
and Dongdong Huo and Yu Wang and Zhen Xu",
title = "Auto-Tuning with Reinforcement Learning for
Permissioned Blockchain Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1000--1012",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579076",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579076",
abstract = "In a permissioned blockchain, performance dictates its
development, which is substantially influenced by its
parameters. However, research on auto-tuning for better
performance has somewhat stagnated because of the
difficulty posed by distributed \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xie:2023:PSH,
author = "Minhui Xie and Youyou Lu and Qing Wang and Yangyang
Feng and Jiaqiang Liu and Kai Ren and Jiwu Shu",
title = "{PetPS}: Supporting Huge Embedding Models with
Persistent Memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1013--1022",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579077",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579077",
abstract = "Embedding models are effective for learning
high-dimensional sparse data. Traditionally, they are
deployed in DRAM parameter servers (PS) for online
inference access. However, the ever-increasing model
capacity makes this practice suffer from both high
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rabbani:2023:EVS,
author = "Kashif Rabbani and Matteo Lissandrini and Katja Hose",
title = "Extraction of Validating Shapes from Very Large
Knowledge Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1023--1032",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579078",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579078",
abstract = "Knowledge Graphs (KGs) represent heterogeneous domain
knowledge on the Web and within organizations. There
exist shapes constraint languages to define validating
shapes to ensure the quality of the data in KGs.
Existing techniques to extract validating \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pang:2023:AFM,
author = "Pu Pang and Gang Deng and Kaihao Bai and Quan Chen and
Shixuan Sun and Bo Liu and Yu Xu and Hongbo Yao and
Zhengheng Wang and Xiyu Wang and Zheng Liu and Zhuo
Song and Yong Yang and Tao Ma and Minyi Guo",
title = "{Async-Fork}: Mitigating Query Latency Spikes Incurred
by the Fork-based Snapshot Mechanism from the {OS}
Level",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1033--1045",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579079",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579079",
abstract = "In-memory key-value stores (IMKVSes) serve many online
applications. They generally adopt the fork-based
snapshot mechanism to support data backup. However,
this method can result in query latency spikes because
the engine is out-of-service for queries \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:CPJ,
author = "Qichen Wang and Xiao Hu and Binyang Dai and Ke Yi",
title = "Change Propagation Without Joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1046--1058",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579080",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579080",
abstract = "We revisit the classical change propagation framework
for query evaluation under updates. The standard
framework takes a query plan and materializes the
intermediate views, which incurs high polynomial costs
in both space and time, with the join \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xie:2023:FFF,
author = "Yuexiang Xie and Zhen Wang and Dawei Gao and Daoyuan
Chen and Liuyi Yao and Weirui Kuang and Yaliang Li and
Bolin Ding and Jingren Zhou",
title = "{FederatedScope}: a Flexible Federated Learning
Platform for Heterogeneity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1059--1072",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579081",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579081",
abstract = "Although remarkable progress has been made by existing
federated learning (FL) platforms to provide
infrastructures for development, these platforms may
not well tackle the challenges brought by various types
of heterogeneity. To fill this gap, in this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:AAC,
author = "Boyang Li and Yurong Cheng and Ye Yuan and Yi Yang and
QianQian Jin and Guoren Wang",
title = "{ACTA}: Autonomy and Coordination Task Assignment in
Spatial Crowdsourcing Platforms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1073--1085",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579082",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579082",
abstract = "Spatial platforms have become increasingly important
in people's daily lives. Task assignment is a critical
problem in these platforms that matches real-time
orders to suitable workers. Most studies only focus on
independent platforms that are in a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Um:2023:FAD,
author = "Taegeon Um and Byungsoo Oh and Byeongchan Seo and
Minhyeok Kweun and Goeun Kim and Woo-Yeon Lee",
title = "{FastFlow}: Accelerating Deep Learning Model Training
with Smart Offloading of Input Data Pipeline",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1086--1099",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579083",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579083",
abstract = "When training a deep learning (DL) model, input data
are pre-processed on CPUs and transformed into tensors,
which are then fed into GPUs for gradient computations
of model training. Expensive GPUs must be fully
utilized during training to accelerate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:FFM,
author = "Xi Zhao and Bolong Zheng and Xiaomeng Yi and Xiaofan
Luan and Charles Xie and Xiaofang Zhou and Christian S.
Jensen",
title = "{FARGO}: Fast Maximum Inner Product Search via Global
Multi-Probing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1100--1112",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579084",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579084",
abstract = "Maximum inner product search (MIPS) in
high-dimensional spaces has wide applications but is
computationally expensive due to the curse of
dimensionality. Existing studies employ asymmetric
transformations that reduce the MIPS problem to a
nearest \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kiefer:2023:ODP,
author = "Martin Kiefer and Ilias Poulakis and Eleni Tzirita
Zacharatou and Volker Markl",
title = "Optimistic Data Parallelism for {FPGA}-Accelerated
Sketching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1113--1125",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579085",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579085",
abstract = "Sketches are a popular approximation technique for
large datasets and high-velocity data streams. While
custom FPGA-based hardware has shown admirable
throughput at sketching, the state-of-the-art exploits
data parallelism by fully replicating resources
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arcolezi:2023:RCM,
author = "H{\'e}ber H. Arcolezi and S{\'e}bastien Gambs and
Jean-Fran{\c{c}}ois Couchot and Catuscia Palamidessi",
title = "On the Risks of Collecting Multidimensional Data Under
Local Differential Privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1126--1139",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579086",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579086",
abstract = "The private collection of multiple statistics from a
population is a fundamental statistical problem. One
possible approach to realize this is to rely on the
local model of differential privacy (LDP). Numerous LDP
protocols have been developed for the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chatzakis:2023:OJL,
author = "Manos Chatzakis and Panagiota Fatourou and Eleftherios
Kosmas and Themis Palpanas and Botao Peng",
title = "{Odyssey}: a Journey in the Land of Distributed Data
Series Similarity Search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1140--1153",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579087",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579087",
abstract = "This paper presents Odyssey, a novel distributed
data-series processing framework that efficiently
addresses the critical challenges of exhibiting good
speedup and ensuring high scalability in data series
processing by taking advantage of the full \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2023:AER,
author = "Lanting Fang and Kaiyu Feng and Jie Gui and Shanshan
Feng and Aiqun Hu",
title = "Anonymous Edge Representation for Inductive Anomaly
Detection in Dynamic Bipartite Graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1154--1167",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579088",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579088",
abstract = "The activities in many real-world applications, such
as e-commerce and online education, are usually modeled
as a dynamic bipartite graph that evolves over time. It
is a critical task to detect anomalies inductively in a
dynamic bipartite graph. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2023:STR,
author = "Junyong Yang and Ming Zhong and Yuanyuan Zhu and
Tieyun Qian and Mengchi Liu and Jeffrey Xu Yu",
title = "Scalable Time-Range $k$-Core Query on Temporal
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1168--1180",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579089",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579089",
abstract = "Querying cohesive subgraphs on temporal graphs with
various time constraints has attracted intensive
research interests recently. In this paper, we study a
novel Temporal k-Core Query (TCQ) problem: given a
time interval, find all distinct k-cores that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2023:HPR,
author = "Erkang Zhu and Silu Huang and Surajit Chaudhuri",
title = "High-Performance Row Pattern Recognition Using Joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1181--1194",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579090",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579090",
abstract = "The SQL standard introduced MATCH\_RECOGNIZE in 2016
for row pattern recognition. Since then,
MATCH\_RECOGNIZE has been supported by several leading
relational systems, they implemented this function using
Non-Deterministic Finite Automaton (NFA). While NFA
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Luo:2023:HGA,
author = "Kelin Luo and Alexandre M. Florio and Syamantak Das
and Xiangyu Guo",
title = "A Hierarchical Grouping Algorithm for the
Multi-Vehicle Dial-a-Ride Problem",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "5",
pages = "1195--1207",
month = jan,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3579075.3579091",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Mar 11 08:12:40 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3579075.3579091",
abstract = "Ride-sharing is an essential aspect of modern urban
mobility. In this paper, we consider a classical
problem in ride-sharing --- the Multi-Vehicle
Dial-a-Ride Problem (Multi-Vehicle DaRP). Given a fleet
of vehicles with a fixed capacity stationed at
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2023:LAD,
author = "Xiaoxuan Liu and Shuxian Wang and Mengzhu Sun and
Sicheng Pan and Ge Li and Siddharth Jha and Cong Yan
and Junwen Yang and Shan Lu and Alvin Cheung",
title = "Leveraging Application Data Constraints to Optimize
Database-Backed {Web} Applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1208--1221",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583141",
abstract = "Exploiting the relationships among data is a classical
query optimization technique. As persistent data is
increasingly being created and maintained
programmatically, prior work that infers data
relationships from data statistics misses an important
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gruber:2023:BCD,
author = "Ferdinand Gruber and Maximilian Bandle and Alexis
Engelke and Thomas Neumann and Jana Giceva",
title = "Bringing Compiling Databases to {RISC} Architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1222--1234",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583142",
abstract = "Current hardware development greatly influences the
design decisions of modern database systems. For many
modern performance-focused database systems, query
compilation emerged as an integral part and different
approaches for code generation evolved, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cha:2023:BLH,
author = "Hokeun Cha and Xiangpeng Hao and Tianzheng Wang and
Huanchen Zhang and Aditya Akella and Xiangyao Yu",
title = "{B$^{\rm link}$-hash}: an Adaptive Hybrid Index for
In-Memory Time-Series Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1235--1248",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583143",
abstract = "High-speed data ingestion is critical in time-series
workloads that are driven by the growth of Internet of
Things (IoT) applications. We observe that traditional
tree-based indexes encounter severe scalability
bottlenecks for time-series workloads that insert
monotonically increasing timestamp keys into an index;
all insertions go to a small memory region that sees
extremely high contention.\par
In this work, we present a new index design,
Blink-hash, that enhances a tree-based index with hash
leaf nodes to mitigate the contention of monotonic
insertions --- insertions go to random locations within
a hash node (which is much larger than a B+-tree node)
to reduce conflicts. We develop further optimizations
(median approximation and lazy split) to accelerate
hash node splits. We also develop structure adaptation
optimizations to dynamically convert a hash node to
B+-tree nodes for good scan performance. Our evaluation
shows that Blink-hash achieves up to 91.3$ \times $
higher throughput than conventional indexes in a
time-series workload that monotonically inserts
timestamps into an index, while showing comparable scan
performance to a well-optimized B+-tree.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:DSE,
author = "Wentao Huang and Yunhong Ji and Xuan Zhou and
Bingsheng He and Kian-Lee Tan",
title = "A Design Space Exploration and Evaluation for
Main-Memory Hash Joins in Storage Class Memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1249--1263",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583144",
abstract = "In this paper, we seek to perform a rigorous
experimental study of main-memory hash joins in storage
class memory (SCM). In particular, we perform a design
space exploration in real SCM for two state-of-the-art
join algorithms: partitioned hash join \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:EBB,
author = "Kaile Huang and Si Liu and Zhenge Chen and Hengfeng
Wei and David Basin and Haixiang Li and Anqun Pan",
title = "Efficient Black-Box Checking of Snapshot Isolation in
Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1264--1276",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583145",
abstract = "Snapshot isolation (SI) is a prevalent weak isolation
level that avoids the performance penalty imposed by
serializability and simultaneously prevents various
undesired data anomalies. Nevertheless, SI anomalies
have recently been found in production \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:DPV,
author = "Zitao Li and Tianhao Wang and Ninghui Li",
title = "Differentially Private Vertical Federated Clustering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1277--1290",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583146",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583146",
abstract = "In many applications, multiple parties have private
data regarding the same set of users but on disjoint
sets of attributes, and a server wants to leverage the
data to train a model. To enable model learning while
protecting the privacy of the data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:PCT,
author = "Fuheng Zhao and Punnal Ismail Khan and Divyakant
Agrawal and Amr {El Abbadi} and Arpit Gupta and Zaoxing
Liu",
title = "{Panakos}: Chasing the Tails for Multidimensional Data
Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1291--1304",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583147",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583147",
abstract = "System operators are often interested in extracting
different feature streams from multi-dimensional data
streams; and reporting their distributions at regular
intervals, including the heavy hitters that contribute
to the tail portion of the feature \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Furst:2023:VOM,
author = "Jonathan F{\"u}rst and Mauricio Fadel Argerich and Bin
Cheng",
title = "{VersaMatch}: Ontology Matching with Weak
Supervision",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1305--1318",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583148",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583148",
abstract = "Ontology matching is crucial to data integration for
across-silo data sharing and has been mainly addressed
with heuristic and machine learning (ML) methods. While
heuristic methods are often inflexible and hard to
extend to new domains, ML methods rely \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2023:RRT,
author = "Yushi Sun and Hao Xin and Lei Chen",
title = "{RECA}: Related Tables Enhanced Column Semantic Type
Annotation Framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1319--1331",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583149",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583149",
abstract = "Understanding the semantics of tabular data is of
great importance in various downstream applications,
such as schema matching, data cleaning, and data
integration. Column semantic type annotation is a
critical task in the semantic understanding of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:ZWT,
author = "Yiming Li and Yanyan Shen and Lei Chen and Mingxuan
Yuan",
title = "{Zebra}: When Temporal Graph Neural Networks Meet
Temporal Personalized {PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1332--1345",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583150",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583150",
abstract = "Temporal graph neural networks (T-GNNs) are
state-of-the-art methods for learning representations
over dynamic graphs. Despite the superior performance,
T-GNNs still suffer from high computational complexity
caused by the tedious recursive temporal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Feng:2023:EAC,
author = "Su Feng and Boris Glavic and Oliver Kennedy",
title = "Efficient Approximation of Certain and Possible
Answers for Ranking and Window Queries over Uncertain
Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1346--1358",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583151",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583151",
abstract = "Uncertainty arises naturally in many application
domains due to, e.g., data entry errors and ambiguity
in data cleaning. Prior work in incomplete and
probabilistic databases has investigated the semantics
and efficient evaluation of ranking and top-k
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yue:2023:GEV,
author = "Cong Yue and Tien Tuan Anh Dinh and Zhongle Xie and
Meihui Zhang and Gang Chen and Beng Chin Ooi and
Xiaokui Xiao",
title = "{GlassDB}: an Efficient Verifiable Ledger Database
System Through Transparency",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1359--1371",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583152",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583152",
abstract = "Verifiable ledger databases protect data history
against malicious tampering. Existing systems, such as
blockchains and certificate transparency, are based on
transparency logs --- a simple abstraction allowing
users to verify that a log maintained by \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:EDT,
author = "Qian Zhang and Jingyao Li and Hongyao Zhao and
Quanqing Xu and Wei Lu and Jinliang Xiao and Fusheng
Han and Chuanhui Yang and Xiaoyong Du",
title = "Efficient Distributed Transaction Processing in
Heterogeneous Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1372--1385",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583153",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583153",
abstract = "Countrywide and worldwide business, like gaming and
social networks, drives the popularity of
inter-data-center transactions. To support
inter-data-center transaction processing and data
center fault tolerance simultaneously, existing
protocols suffer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jiang:2023:ASE,
author = "Zhiguo Jiang and Hanhua Chen and Hai Jin",
title = "{Auxo}: a Scalable and Efficient Graph Stream
Summarization Structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1386--1398",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583154",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583154",
abstract = "A graph stream refers to a continuous stream of edges,
forming a huge and fast-evolving graph. The vast volume
and high update speed of a graph stream bring stringent
requirements for the data management structure,
including sublinear space cost, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2023:OOS,
author = "Xiao He and Ye Li and Jian Tan and Bin Wu and Feifei
Li",
title = "{OneShotSTL}: One-Shot Seasonal-Trend Decomposition
For Online Time Series Anomaly Detection And
Forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1399--1412",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583155",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583155",
abstract = "Seasonal-trend decomposition is one of the most
fundamental concepts in time series analysis that
supports various downstream tasks, including time
series anomaly detection and forecasting. However,
existing decomposition methods rely on batch \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{vanRenen:2023:CAB,
author = "Alexander van Renen and Viktor Leis",
title = "Cloud Analytics Benchmark",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1413--1425",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583156",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583156",
abstract = "The cloud facilitates the transition to a
service-oriented perspective. This affects cloud-native
data management in general, and data analytics in
particular. Instead of managing a multi-node database
cluster on-premise, end users simply send queries
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Alhomssi:2023:SRS,
author = "Adnan Alhomssi and Viktor Leis",
title = "Scalable and Robust Snapshot Isolation for
High-Performance Storage Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1426--1438",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583157",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583157",
abstract = "MVCC-based snapshot isolation promises that read
queries can proceed without interfering with concurrent
writes. However, as we show experimentally, in existing
implementations a single long-running query can easily
cause transactional throughput to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:FFS,
author = "Xiang Li and Fabing Li and Mingyu Gao",
title = "{Flare}: a Fast, Secure, and Memory-Efficient
Distributed Analytics Framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1439--1452",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583158",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583158",
abstract = "As big data processing in the cloud becomes prevalent
today, data privacy on such public platforms raises
critical concerns. Hardware-based trusted execution
environments (TEEs) provide promising and practical
platforms for low-cost privacy-preserving \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{An:2023:NSB,
author = "Mijin An and Jonghyeok Park and Tianzheng Wang and
Beomseok Nam and Sang-Won Lee",
title = "{NV-SQL}: Boosting {OLTP} Performance with
Non-Volatile {DIMMs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1453--1465",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583159",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583159",
abstract = "When running OLTP workloads, relational DBMSs with
flash SSDs still suffer from the durability overhead.
Heavy writes to SSD not only limit the performance but
also shorten the storage lifespan. To mitigate the
durability overhead, this paper proposes a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2023:LLR,
author = "Rong Zhu and Wei Chen and Bolin Ding and Xingguang
Chen and Andreas Pfadler and Ziniu Wu and Jingren
Zhou",
title = "{Lero}: a Learning-to-Rank Query Optimizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1466--1479",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583160",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583160",
abstract = "A recent line of works apply machine learning
techniques to assist or rebuild cost-based query
optimizers in DBMS. While exhibiting superiority in
some benchmarks, their deficiencies, e.g., unstable
performance, high training cost, and slow model
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2023:DCS,
author = "Kitaek Lee and Insoon Jo and Jaechan Ahn and Hyuk Lee
and Hwang Lee and Woong Sul and Hyungsoo Jung",
title = "Deploying Computational Storage for {HTAP DBMSs} Takes
More Than Just Computation Offloading",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1480--1493",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583161",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583161",
abstract = "Hybrid transactional/analytical processing (HTAP)
would overload database systems. To alleviate
performance interference between transactions and
analytics, recent research pursues the potential of
in-storage processing (ISP) using commodity \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tang:2023:TPC,
author = "Dixin Tang and Alan Fekete and Indranil Gupta and
Aditya G. Parameswaran",
title = "Transactional Panorama: a Conceptual Framework for
User Perception in Analytical Visual Interfaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1494--1506",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583162",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583162",
abstract = "Many tools empower analysts and data scientists to
consume analysis results in a visual interface. When
the underlying data changes, these results need to be
updated, but this update can take a long time---all
while the user continues to explore the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paulsen:2023:SSY,
author = "Derek Paulsen and Yash Govind and AnHai Doan",
title = "{Sparkly}: a Simple yet Surprisingly Strong {TF\slash
IDF} Blocker for Entity Matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1507--1519",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583163",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583163",
abstract = "Blocking is a major task in entity matching. Numerous
blocking solutions have been developed, but as far as
we can tell, blocking using the well-known tf/idf
measure has received virtually no attention. Yet, when
we experimented with tf/idf blocking \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Negi:2023:RQD,
author = "Parimarjan Negi and Ziniu Wu and Andreas Kipf and
Nesime Tatbul and Ryan Marcus and Sam Madden and Tim
Kraska and Mohammad Alizadeh",
title = "Robust Query Driven Cardinality Estimation under
Changing Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1520--1533",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583164",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583164",
abstract = "Query driven cardinality estimation models learn from
a historical log of queries. They are lightweight,
having low storage requirements, fast inference and
training, and are easily adaptable for any kind of
query. Unfortunately, such models can suffer \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fu:2023:CTR,
author = "Han Fu and Chang Liu and Bin Wu and Feifei Li and Jian
Tan and Jianling Sun",
title = "{CatSQL}: Towards Real World Natural Language to {SQL}
Applications",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1534--1547",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583165",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583165",
abstract = "Natural language to SQL (NL2SQL) techniques provide a
convenient interface to access databases, especially
for non-expert users, to conduct various data
analytics. Existing methods often employ either a
rule-base approach or a deep learning based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Azizi:2023:EGB,
author = "Ilias Azizi and Karima Echihabi and Themis Palpanas",
title = "{ELPIS}: Graph-Based Similarity Search for Scalable
Data Science",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1548--1559",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583166",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583166",
abstract = "The recent popularity of learned embeddings has fueled
the growth of massive collections of high-dimensional
(high-d) vectors that model complex data. Finding
similar vectors in these collections is at the core of
many important and practical data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Duffy:2023:DKV,
author = "Carl Duffy and Jaehoon Shim and Sang-Hoon Kim and
Jin-Soo Kim",
title = "{Dotori}: a Key--Value {SSD} Based {KV} Store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1560--1572",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583167",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583167",
abstract = "Key-value SSDs (KVSSDs) represent a major shift in the
storage stack design, with numerous potential benefits.
Despite this, their lack of native features critical to
operation in real world scenarios hinders their
adoption, and these benefits go \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pujol:2023:PPG,
author = "David Pujol and Amir Gilad and Ashwin
Machanavajjhala",
title = "{PreFair}: Privately Generating Justifiably Fair
Synthetic Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1573--1586",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583168",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583168",
abstract = "When a database is protected by Differential Privacy
(DP), its usability is limited in scope. In this
scenario, generating a synthetic version of the data
that mimics the properties of the private data allows
users to perform any operation on the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shraga:2023:EDC,
author = "Roee Shraga and Ren{\'e}e J. Miller",
title = "Explaining Dataset Changes for Semantic Data
Versioning with {Explain-Da-V}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "6",
pages = "1587--1600",
month = feb,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3583140.3583169",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 1 07:43:11 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3583140.3583169",
abstract = "In multi-user environments in which data science and
analysis is collaborative, multiple versions of the
same datasets are generated. While managing and storing
data versions has received some attention in the
research literature, the semantic nature of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Budiu:2023:DAI,
author = "Mihai Budiu and Tej Chajed and Frank McSherry and
Leonid Ryzhyk and Val Tannen",
title = "{DBSP}: Automatic Incremental View Maintenance for
Rich Query Languages",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1601--1614",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587137",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587137",
abstract = "Incremental view maintenance (IVM) has long been a
central problem in database theory. Many solutions have
been proposed for restricted classes of database
languages, such as the relational algebra, or Datalog.
These techniques do not naturally \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liang:2023:SSP,
author = "Ling Liang and Jilan Lin and Zheng Qu and Ishtiyaque
Ahmad and Fengbin Tu and Trinabh Gupta and Yufei Ding
and Yuan Xie",
title = "{SPG}: Structure-Private Graph Database via
{SqueezePIR}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1615--1628",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587138",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587138",
abstract = "Many relational data in our daily life are represented
as graphs, making graph application an important
workload. Because of the large scale of graph datasets,
moving graph data to the cloud becomes a popular
option. To keep the confidential and private \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:IES,
author = "Jingyuan Zhang and Ao Wang and Xiaolong Ma and
Benjamin Carver and Nicholas John Newman and Ali Anwar
and Lukas Rupprecht and Vasily Tarasov and Dimitrios
Skourtis and Feng Yan and Yue Cheng",
title = "{InfiniStore}: Elastic Serverless Cloud Storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1629--1642",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587139",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587139",
abstract = "Cloud object storage such as AWS S3 is cost-effective
and highly elastic but relatively slow, while
high-performance cloud storage such as AWS ElastiCache
is expensive and provides limited elasticity. We
present a new cloud storage service called \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2023:DGE,
author = "Peng Fang and Arijit Khan and Siqiang Luo and Fang
Wang and Dan Feng and Zhenli Li and Wei Yin and Yuchao
Cao",
title = "Distributed Graph Embedding with Information-Oriented
Random Walks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1643--1656",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587140",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587140",
abstract = "Graph embedding maps graph nodes to low-dimensional
vectors, and is widely adopted in machine learning
tasks. The increasing availability of billion-edge
graphs underscores the importance of learning efficient
and effective embeddings on large graphs, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2023:SSV,
author = "Shuyuan Zheng and Yang Cao and Masatoshi Yoshikawa",
title = "Secure {Shapley} Value for Cross-Silo Federated
Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1657--1670",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587141",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587141",
abstract = "The Shapley value (SV) is a fair and principled metric
for contribution evaluation in cross-silo federated
learning (cross-silo FL), wherein organizations, i.e.,
clients, collaboratively train prediction models with
the coordination of a parameter \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:SSF,
author = "Xiang Li and Nuozhou Sun and Yunqian Luo and Mingyu
Gao",
title = "{SODA}: a Set of Fast Oblivious Algorithms in
Distributed Secure Data Analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1671--1684",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587142",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587142",
abstract = "Cloud systems are now a prevalent platform to host
large-scale big-data analytics applications such as
machine learning and relational database. However, data
privacy remains as a critical concern for public cloud
systems. Existing trusted hardware \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hong:2023:GSB,
author = "Zicong Hong and Song Guo and Enyuan Zhou and Wuhui
Chen and Huawei Huang and Albert Zomaya",
title = "{GriDB}: Scaling Blockchain Database via Sharding and
Off-Chain Cross-Shard Mechanism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1685--1698",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587143",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587143",
abstract = "Blockchain databases have attracted widespread
attention but suffer from poor scalability due to
underlying non-scalable blockchains. While blockchain
sharding is necessary for a scalable blockchain
database, it poses a new challenge named on-chain
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Jian:2023:SAS,
author = "Xun Jian and Zhiyuan Li and Lei Chen",
title = "{SUFF}: Accelerating Subgraph Matching with Historical
Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1699--1711",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587144",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587144",
abstract = "Subgraph matching is a fundamental problem in graph
theory and has wide applications in areas like
sociology, chemistry, and social networks. Due to its
NP-hardness, the basic approach is a brute-force search
over the whole search space. Some pruning \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2023:WDM,
author = "Haochen He and Erci Xu and Shanshan Li and Zhouyang
Jia and Si Zheng and Yue Yu and Jun Ma and Xiangke
Liao",
title = "When Database Meets New Storage Devices: Understanding
and Exposing Performance Mismatches via
Configurations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1712--1725",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587145",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587145",
abstract = "NVMe SSD hugely boosts the I/O speed, with up to GB/s
throughput and microsecond-level latency.
Unfortunately, DBMS users can often find their
high-performanced storage devices tend to deliver
less-than-expected or even worse performance when
compared \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2023:SAD,
author = "Grace Fan and Jin Wang and Yuliang Li and Dan Zhang
and Ren{\'e}e J. Miller",
title = "Semantics-Aware Dataset Discovery from Data Lakes with
Contextualized Column-Based Representation Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1726--1739",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587146",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587146",
abstract = "Dataset discovery from data lakes is essential in many
real application scenarios. In this paper, we propose
Starmie, an end-to-end framework for dataset discovery
from data lakes (with table union search as the main
use case). Our proposed framework \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mortensen:2023:MEM,
author = "Kasper Overgaard Mortensen and Fatemeh Zardbani and
Mohammad Ahsanul Haque and Steinn Ymir Agustsson and
Davide Mottin and Philip Hofmann and Panagiotis
Karras",
title = "Marigold: Efficient $k$-Means Clustering in High
Dimensions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1740--1748",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587147",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587147",
abstract = "How can we efficiently and scalably cluster
high-dimensional data? The k -means algorithm clusters
data by iteratively reducing intra-cluster Euclidean
distances until convergence. While it finds
applications from recommendation engines to image
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sabek:2023:CLM,
author = "Ibrahim Sabek and Tim Kraska",
title = "The Case for Learned In-Memory Joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1749--1762",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587148",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587148",
abstract = "In-memory join is an essential operator in any
database engine. It has been extensively investigated
in the database literature. In this paper, we study
whether exploiting the CDF-based learned models to
boost the join performance is practical. To the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:EEB,
author = "Ruiyuan Li and Zheng Li and Yi Wu and Chao Chen and Yu
Zheng",
title = "{Elf}: Erasing-Based Lossless Floating-Point
Compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1763--1776",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587149",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587149",
abstract = "There are a prohibitively large number of
floating-point time series data generated at an
unprecedentedly high rate. An efficient, compact and
lossless compression for time series data is of great
importance for a wide range of scenarios. Most existing
lossless floating-point compression methods are based
on the XOR operation, but they do not fully exploit the
trailing zeros, which usually results in an
unsatisfactory compression ratio. This paper proposes
an Erasing-based Lossless Floating-point compression
algorithm, i.e., \pkg{Elf}. The main idea of \pkg{Elf}
is to erase the last few bits (i.e., set them to zero)
of floating-point values, so the XORed values are
supposed to contain many trailing zeros. The challenges
of the erasing-based method are three-fold. First, how
to quickly determine the erased bits? Second, how to
losslessly recover the original data from the erased
ones? Third, how to compactly encode the erased data?
Through rigorous mathematical analysis, \pkg{Elf} can
directly determine the erased bits and restore the
original values without losing any precision. To
further improve the compression ratio, we propose a
novel encoding strategy for the XORed values with many
trailing zeros. \pkg{Elf} works in a streaming fashion.
It takes only $ O(N) $ (where $N$ is the length of a
time series) in time and $ O(1)$ in space, and achieves
a notable compression ratio with a theoretical
guarantee. Extensive experiments using 22 datasets show
the powerful performance of \pkg{Elf} compared with 9
advanced competitors.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:LLO,
author = "Tianyi Chen and Jun Gao and Hedui Chen and Yaofeng
Tu",
title = "{LOGER}: a Learned Optimizer Towards Generating
Efficient and Robust Query Execution Plans",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1777--1789",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587150",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587150",
abstract = "Query optimization based on deep reinforcement
learning (DRL) has become a hot research topic
recently. Despite the achieved promising progress, DRL
optimizers still face great challenges of robustly
producing efficient plans, due to the vast search
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Martens:2023:RPG,
author = "Wim Martens and Matthias Niewerth and Tina Popp and
Carlos Rojas and Stijn Vansummeren and Domagoj Vrgoc",
title = "Representing Paths in Graph Database Pattern
Matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "7",
pages = "1790--1803",
month = mar,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3587136.3587151",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue May 9 09:08:30 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3587136.3587151",
abstract = "Modern graph database query languages such as GQL,
SQL/PGQ, and their academic predecessor G-Core promote
paths to first-class citizens in the sense that their
pattern matching facility can return paths, as opposed
to only nodes and edges. This is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:ZVE,
author = "Xiling Li and Chenkai Weng and Yongxin Xu and Xiao
Wang and Jennie Rogers",
title = "{ZKSQL}: Verifiable and Efficient Query Evaluation
with Zero-Knowledge Proofs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1804--1816",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594513",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594513",
abstract = "Individuals and organizations are using databases to
store personal information at an unprecedented rate.
This creates a quandary for data providers. They are
responsible for protecting the privacy of individuals
described in their database. On the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Piao:2023:CGE,
author = "Chengzhi Piao and Tingyang Xu and Xiangguo Sun and Yu
Rong and Kangfei Zhao and Hong Cheng",
title = "Computing Graph Edit Distance via Neural Graph
Matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1817--1829",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594514",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594514",
abstract = "Graph edit distance (GED) computation is a fundamental
NP-hard problem in graph theory. Given a graph pair
(G$_1$, G$_2$), GED is defined as the minimum number of
primitive operations converting G$_1$ to G$_2$. Early
studies focus on search-based inexact algorithms
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schaler:2023:BUE,
author = "Christine Sch{\"a}ler and Thomas H{\"u}tter and Martin
Sch{\"a}ler",
title = "Benchmarking the Utility of $w$-Event Differential
Privacy Mechanisms --- When Baselines Become Mighty
Competitors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1830--1842",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594515",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594515",
abstract = "The $w$-event framework is the current standard for
ensuring differential privacy on continuously monitored
data streams. Following the proposition of $w$-event
differential privacy, various mechanisms to implement
the framework are proposed. Their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Augustine:2023:CGA,
author = "Eriq Augustine and Lise Getoor",
title = "Collective Grounding: Applying Database Techniques to
Grounding Templated Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1843--1855",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594516",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594516",
abstract = "The process of instantiating, or ``grounding'', a
first-order model is a fundamental component of
reasoning in logic. It has been widely studied in the
context of theorem proving, database theory, and
artificial intelligence. Within the relational
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Adams:2023:EEP,
author = "Jan Niklas Adams and Cameron Pitsch and Tobias
Brockhoff and Wil M. P. van der Aalst",
title = "An Experimental Evaluation of Process Concept Drift
Detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1856--1869",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594517",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594517",
abstract = "Process mining provides techniques to learn models
from event data. These models can be descriptive (e.g.,
Petri nets) or predictive (e.g., neural networks). The
learned models offer operational support to process
owners by conformance checking, process \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Vitagliano:2023:PDL,
author = "Gerardo Vitagliano and Mazhar Hameed and Lan Jiang and
Lucas Reisener and Eugene Wu and Felix Naumann",
title = "{Pollock}: a Data Loading Benchmark",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1870--1882",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594518",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594518",
abstract = "Any system at play in a data-driven project has a
fundamental requirement: the ability to load data. The
de-facto standard format to distribute and consume raw
data is csv. Yet, the plain text and flexible nature of
this format make such files often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiao:2023:APL,
author = "Yingtai Xiao and Guanhong Wang and Danfeng Zhang and
Daniel Kifer",
title = "Answering Private Linear Queries Adaptively Using the
Common Mechanism",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1883--1896",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594519",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594519",
abstract = "When analyzing confidential data through a privacy
filter, a data scientist often needs to decide which
queries will best support their intended analysis. For
example, an analyst may wish to study noisy two-way
marginals in a dataset produced by a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Du:2023:LLD,
author = "Yuntao Du and Yujia Hu and Zhikun Zhang and Ziquan
Fang and Lu Chen and Baihua Zheng and Yunjun Gao",
title = "{LDPTrace}: Locally Differentially Private Trajectory
Synthesis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1897--1909",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594520",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594520",
abstract = "Trajectory data has the potential to greatly benefit a
wide-range of real-world applications, such as tracking
the spread of the disease through people's movement
patterns and providing personalized location-based
services based on travel preference. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kitsios:2023:SPH,
author = "Xenophon Kitsios and Panagiotis Liakos and Katia
Papakonstantinopoulou and Yannis Kotidis",
title = "{Sim-Piece}: Highly Accurate Piecewise Linear
Approximation through Similar Segment Merging",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1910--1922",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594521",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594521",
abstract = "Approximating series of timestamped data points using
a sequence of line segments with a maximum error
guarantee is a fundamental data compression problem,
termed as piecewise linear approximation (PLA). Due to
the increasing need to analyze massive \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Marinelli:2023:TMF,
author = "Eugenio Marinelli and Yiqing Yan and Virginie Magnone
and Charlotte Dumargne and Pascal Barbry and Thomas
Heinis and Raja Appuswamy",
title = "Towards Migration-Free {``Just-in-Case''} Data
Archival for Future Cloud Data Lakes Using Synthetic
{DNA}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1923--1929",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594522",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594522",
abstract = "Given the growing adoption of AI, cloud data lakes are
facing the need to support cost-effective
``just-in-case'' data archival over long time periods
to meet regulatory compliance requirements.
Unfortunately, current media technologies suffer from
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dong:2023:FGR,
author = "Zhiyuan Dong and Zhaoguo Wang and Xiaodong Zhang and
Xian Xu and Changgeng Zhao and Haibo Chen and Aurojit
Panda and Jinyang Li",
title = "Fine-Grained Re-Execution for Efficient Batched Commit
of Distributed Transactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1930--1943",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594523",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594523",
abstract = "Distributed transaction systems incur extensive
cross-node communication to execute and commit
serializable OLTP transactions. As a result, their
performance greatly suffers. Caching data at nodes that
execute transactions can cut down remote reads.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2023:LDT,
author = "Wenfei Fan and Resul Tugay and Yaoshu Wang and Min Xie
and Muhammad Asif Ali",
title = "Learning and Deducing Temporal Orders",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1944--1957",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594524",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594524",
abstract = "This paper studies how to determine temporal orders on
attribute values in a set of tuples that pertain to the
same entity, in the absence of complete timestamps. We
propose a creator-critic framework to learn and deduce
temporal orders by combining \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:BBG,
author = "Xu Chen and Zhen Wang and Shuncheng Liu and Yaliang Li
and Kai Zeng and Bolin Ding and Jingren Zhou and Han Su
and Kai Zheng",
title = "{BASE}: Bridging the Gap between Cost and Latency for
Query Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1958--1966",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594525",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594525",
abstract = "Some recent works have shown the advantages of
reinforcement learning (RL) based learned query
optimizers. These works often use the cost (i.e., the
estimation of cost model) or the latency (i.e.,
execution time) as guidance signals for training their
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lemiesz:2023:EFO,
author = "Jakub Lemiesz",
title = "Efficient Framework for Operating on Data Sketches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1967--1978",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594526",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594526",
abstract = "We study the problem of analyzing massive data streams
based on concise data sketches. Recently, a number of
papers have investigated how to estimate the results of
set-theory operations based on sketches. In this paper
we present a framework that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:TEI,
author = "Xi Zhao and Yao Tian and Kai Huang and Bolong Zheng
and Xiaofang Zhou",
title = "Towards Efficient Index Construction and Approximate
Nearest Neighbor Search in High-Dimensional Spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1979--1991",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594527",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594527",
abstract = "The approximate nearest neighbor (ANN) search in
high-dimensional spaces is a fundamental but
computationally very expensive problem. Many methods
have been designed for solving the ANN problem, such as
LSH-based methods and graph-based methods. The
LSH-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2023:LIC,
author = "Zhaoyan Sun and Xuanhe Zhou and Guoliang Li",
title = "Learned Index: a Comprehensive Experimental
Evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "1992--2004",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594528",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594528",
abstract = "Indexes can improve query-processing performance by
avoiding full table scans. Although traditional indexes
(e.g., B+-tree) have been widely used, learned indexes
are proposed to adopt machine learning models to reduce
the query latency and index size. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:LIG,
author = "Yanping Zhang and Johes Bater and Kartik Nayak and
Ashwin Machanavajjhala",
title = "{Longshot}: Indexing Growing Databases Using {MPC} and
Differential Privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "2005--2018",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594529",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594529",
abstract = "In this work, we propose Longshot, a novel design for
secure outsourced database systems that supports ad-hoc
queries through the use of secure multi-party
computation and differential privacy. By combining
these two techniques, we build and maintain \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Paparrizos:2023:ASS,
author = "John Paparrizos and Kaize Wu and Aaron Elmore and
Christos Faloutsos and Michael J. Franklin",
title = "Accelerating Similarity Search for Elastic Measures: a
Study and New Generalization of Lower Bounding
Distances",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "2019--2032",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594530",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594530",
abstract = "Similarity search is a core analytical task, and its
performance critically depends on the choice of
distance measure. For time-series querying, elastic
measures achieve state-of-the-art accuracy but are
computationally expensive. Thus, fast lower \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2023:ALA,
author = "Chenyuan Wu and Bhavana Mehta and Mohammad Javad Amiri
and Ryan Marcus and Boon Thau Loo",
title = "{AdaChain}: a Learned Adaptive Blockchain",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "2033--2046",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594531",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594531",
abstract = "This paper presents AdaChain, a learning-based
blockchain framework that adaptively chooses the best
permissioned blockchain architecture to optimize
effective throughput for dynamic transaction workloads.
AdaChain addresses the challenge in Blockchain
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2023:ICS,
author = "Yingli Zhou and Yixiang Fang and Wensheng Luo and
Yunming Ye",
title = "Influential Community Search over Large Heterogeneous
Information Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "8",
pages = "2047--2060",
month = apr,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3594512.3594532",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri Jun 23 11:11:42 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3594512.3594532",
abstract = "Recently, the topic of influential community search
has gained much attention. Given a graph, it aims to
find communities of vertices with high importance
values from it. Existing works mainly focus on
conventional homogeneous networks, where vertices
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arafat:2023:NBH,
author = "Naheed Anjum Arafat and Arijit Khan and Arpit Kumar
Rai and Bishwamittra Ghosh",
title = "Neighborhood-Based Hypergraph Core Decomposition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2061--2074",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598582",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598582",
abstract = "We propose neighborhood-based core decomposition: a
novel way of decomposing hypergraphs into hierarchical
neighborhood-cohesive subhypergraphs. Alternative
approaches to decomposing hypergraphs, e.g., reduction
to clique or bipartite graphs, are not \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Layne:2023:TSG,
author = "Janet Layne and Justin Carpenter and Edoardo Serra and
Francesco Gullo",
title = "Temporal {SIR-GN}: Efficient and Effective Structural
Representation Learning for Temporal Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2075--2089",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598583",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598583",
abstract = "Node representation learning (NRL) generates numerical
vectors (embeddings) for the nodes of a graph.
Structural NRL specifically assigns similar node
embeddings for those nodes that exhibit similar
structural roles. This is in contrast with its
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Haas:2023:WMN,
author = "Gabriel Haas and Viktor Leis",
title = "What Modern {NVMe} Storage Can Do, and How to Exploit
it: High-Performance {I/O} for High-Performance Storage
Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2090--2102",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598584",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598584",
abstract = "NVMe SSDs based on flash are cheap and offer high
throughput. Combining several of these devices into a
single server enables 10 million I/O operations per
second or more. Our experiments show that existing
out-of-memory database systems and storage engines
achieve only a fraction of this performance. In this
work, we demonstrate that it is possible to close the
performance gap between hardware and software through
an I/O optimized storage engine design. In a heavy
out-of-memory setting, where the dataset is 10 times
larger than main memory, our system can achieve more
than 1 million TPC-C transactions per second.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Banakar:2023:WES,
author = "Vinay Banakar and Kan Wu and Yuvraj Patel and Kimberly
Keeton and Andrea C. Arpaci-Dusseau and Remzi H.
Arpaci-Dusseau",
title = "{WiscSort}: External Sorting for Byte-Addressable
Storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2103--2116",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598585",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598585",
abstract = "We present WiscSort, a new approach to
high-performance concurrent sorting for existing and
future byte-addressable storage (BAS) devices. WiscSort
carefully reduces writes, exploits random reads by
splitting keys and values during sorting, and performs
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ayad:2023:TIL,
author = "Lorraine A. K. Ayad and Grigorios Loukides and Solon
P. Pissis",
title = "Text Indexing for Long Patterns: Anchors are All you
Need",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2117--2131",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598586",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598586",
abstract = "In many real-world database systems, a large fraction
of the data is represented by strings: sequences of
letters over some alphabet. This is because strings can
easily encode data arising from different sources. It
is often crucial to represent such \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Afroozeh:2023:FCL,
author = "Azim Afroozeh and Peter Boncz",
title = "The {FastLanes} Compression Layout: Decoding $> 100$
Billion Integers per Second with Scalar Code",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2132--2144",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598587",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598587",
abstract = "The open-source FastLanes project aims to improve big
data formats, such as Parquet, ORC and columnar
database formats, in multiple ways. In this paper, we
significantly accelerate decoding of all common
Light-Weight Compression (LWC) schemes: DICT, FOR,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yue:2023:VAP,
author = "Cong Yue and Meihui Zhang and Changhao Zhu and Gang
Chen and Dumitrel Loghin and Beng Chin Ooi",
title = "{VeriBench}: Analyzing the Performance of Database
Systems with Verifiability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2145--2157",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598588",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598588",
abstract = "Database systems are paying more attention to data
security in recent years. Immutable systems such as
blockchains, verifiable databases, and ledger databases
are equipped with various verifiability mechanisms to
protect data. Such systems often adopt \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:TDL,
author = "Jiangneng Li and Zheng Wang and Gao Cong and Cheng
Long and Han Mao Kiah and Bin Cui",
title = "Towards Designing and Learning Piecewise Space-Filling
Curves",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2158--2171",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598589",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598589",
abstract = "To index multi-dimensional data, space-filling curves
(SFCs) have been used to map the data to one dimension,
and then a one-dimensional indexing method such as the
B-tree is used to index the mapped data. The existing
SFCs all adopt a single mapping \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2023:MQB,
author = "Xiaoke Zhu and Yang Liu and Shuhao Liu and Wenfei
Fan",
title = "{MiniGraph}: Querying Big Graphs with a Single
Machine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2172--2185",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598590",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598590",
abstract = "This paper presents MiniGraph, an out-of-core system
for querying big graphs with a single machine. As
opposed to previous single-machine graph systems,
MiniGraph proposes a pipelined architecture to overlap
I/O and CPU operations, and improves multi-core \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Choi:2023:BEC,
author = "Yunyoung Choi and Kunsoo Park and Hyunjoon Kim",
title = "{BICE}: Exploring Compact Search Space by Using
Bipartite Matching and Cell-Wide Verification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2186--2198",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598591",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598591",
abstract = "Subgraph matching is the problem of searching for all
embeddings of a query graph in a data graph, and
subgraph query processing (also known as subgraph
search) is to find all the data graphs that contain a
query graph as subgraphs. Extensive research \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tian:2023:MDT,
author = "Anxin Tian and Alexander Zhou and Yue Wang and Lei
Chen",
title = "Maximal {D}-Truss Search in Dynamic Directed Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2199--2211",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598592",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598592",
abstract = "Community search (CS) aims at personalized subgraph
discovery which is the key to understanding the
organisation of many real-world networks. CS in
undirected networks has attracted significant attention
from researchers, including many solutions for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:DDD,
author = "Pengfei Li and Hua Lu and Rong Zhu and Bolin Ding and
Long Yang and Gang Pan",
title = "{DILI}: a Distribution-Driven Learned Index",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2212--2224",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598593",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598593",
abstract = "Targeting in-memory one-dimensional search keys, we
propose a novel DIstribution-driven Learned Index tree
(DILI), where a concise and computation-efficient
linear regression model is used for each node. An
internal node's key range is equally divided \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeakis:2023:PTE,
author = "Alexandros Zeakis and George Papadakis and Dimitrios
Skoutas and Manolis Koubarakis",
title = "Pre-Trained Embeddings for Entity Resolution: an
Experimental Analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2225--2238",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598594",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598594",
abstract = "Many recent works on Entity Resolution (ER) leverage
Deep Learning techniques involving language models to
improve effectiveness. This is applied to both main
steps of ER, i.e., blocking and matching. Several
pre-trained embeddings have been tested, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2023:DGN,
author = "Yanping Zheng and Zhewei Wei and Jiajun Liu",
title = "Decoupled Graph Neural Networks for Large Dynamic
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2239--2247",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598595",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598595",
abstract = "Real-world graphs, such as social networks, financial
transactions, and recommendation systems, often
demonstrate dynamic behavior. This phenomenon, known as
graph stream, involves the dynamic changes of nodes and
the emergence and disappearance of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zardbani:2023:AIO,
author = "Fatemeh Zardbani and Nikos Mamoulis and Stratos Idreos
and Panagiotis Karras",
title = "Adaptive Indexing of Objects with Spatial Extent",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2248--2260",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598596",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598596",
abstract = "Can we quickly explore large multidimensional data in
main memory? Adaptive indexing responds to this need by
building an index incrementally, in response to
queries; in its default form, it indexes a single
attribute or, in the presence of several \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:LNF,
author = "Xu Chen and Haitian Chen and Zibo Liang and Shuncheng
Liu and Jinghong Wang and Kai Zeng and Han Su and Kai
Zheng",
title = "{LEON}: a New Framework for {ML}-Aided Query
Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2261--2273",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598597",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598597",
abstract = "Query optimization has long been a fundamental yet
challenging topic in the database field. With the
prosperity of machine learning (ML), some recent works
have shown the advantages of reinforcement learning
(RL) based learned query optimizer. However, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Faria:2023:TIT,
author = "Nuno Faria and Jos{\'e} Pereira and Ana Nunes Alonso
and Ricardo Vila{\c{c}}a and Yunus Koning and Niels
Nes",
title = "{TiQuE}: Improving the Transactional Performance of
Analytical Systems for True Hybrid Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2274--2288",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598598",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598598",
abstract = "Transactions have been a key issue in database
management for a long time and there are a plethora of
architectures and algorithms to support and implement
them. The current state-of-the-art is focused on
storage management and is tightly coupled with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bang:2023:SRQ,
author = "Jaeho Bang and Gaurav Tarlok Kakkar and Pramod
Chunduri and Subrata Mitra and Joy Arulraj",
  title =        "{Seiden}: Revisiting Query Processing in Video Database
Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2289--2301",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598599",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598599",
abstract = "State-of-the-art video database management systems
(VDBMSs) often use lightweight proxy models to
accelerate object retrieval and aggregate queries. The
key assumption underlying these systems is that the
proxy model is an order of magnitude faster than
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kossmann:2023:ETL,
author = "Ferdi Kossmann and Ziniu Wu and Eugenie Lai and Nesime
Tatbul and Lei Cao and Tim Kraska and Sam Madden",
title = "Extract-Transform-Load for Video Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2302--2315",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598600",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598600",
abstract = "Social media, self-driving cars, and traffic cameras
produce video streams at large scales and cheap cost.
However, storing and querying video at such scales is
prohibitively expensive. We propose to treat
large-scale video analytics as a data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sudhir:2023:PED,
author = "Sivaprasad Sudhir and Wenbo Tao and Nikolay Laptev and
Cyrille Habis and Michael Cafarella and Samuel Madden",
title = "{Pando}: Enhanced Data Skipping with Logical Data
Partitioning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2316--2329",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598601",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598601",
abstract = "With enormous volumes of data, quickly retrieving data
that is relevant to a query is essential for achieving
high performance. Modern cloud-based database systems
often partition the data into blocks and employ various
techniques to skip irrelevant \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Maliszewski:2023:CLJ,
author = "Kajetan Maliszewski and Jorge-Arnulfo Quian{\'e}-Ruiz
and Volker Markl",
title = "Cracking-Like Join for Trusted Execution
Environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2330--2343",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598602",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598602",
abstract = "Data processing on non-trusted infrastructures, such
as the public cloud, has become increasingly popular,
despite posing risks to data privacy. However, the
existing cloud DBMSs either lack sufficient privacy
guarantees or underperform. In this paper, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Calikyilmaz:2023:OQA,
author = "Umut {\c{C}}alikyilmaz and Sven Groppe and Jinghua
Groppe and Tobias Winker and Stefan Prestel and Farida
Shagieva and Daanish Arya and Florian Preis and Le
Gruenwald",
title = "Opportunities for Quantum Acceleration of Databases:
Optimization of Queries and Transaction Schedules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2344--2353",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598603",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598603",
abstract = "The capabilities of quantum computers, such as the
number of supported qubits and maximum circuit depth,
have grown exponentially in recent years. Commercially
relevant applications that take advantage of quantum
computing are expected to be available \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Miao:2023:SSD,
author = "Xupeng Miao and Yining Shi and Zhi Yang and Bin Cui
and Zhihao Jia",
title = "{SDPipe}: a Semi-Decentralized Framework for
Heterogeneity-Aware Pipeline-parallel Training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2354--2363",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598604",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598604",
abstract = "The increasing size of both deep learning models and
training data necessitates the ability to scale out
model training through pipeline-parallel training,
which combines pipelined model parallelism and data
parallelism. However, most of them assume an \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2023:LCP,
author = "Bohyun Lee and Mijin An and Sang-Won Lee",
title = "{LRU-C}: Parallelizing Database {I/Os} for Flash
{SSDs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2364--2376",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598605",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598605",
abstract = "The conventional database buffer managers have two
inherent sources of I/O serialization: read stall and
mutex conflict. The serialized I/O makes storage and
CPU under-utilized, limiting transaction throughput and
latency. Such harm stands out on flash \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:WYF,
author = "Zixuan Chen and Panagiotis Manolios and Mirek
Riedewald",
title = "Why Not Yet: Fixing a Top-$k$ Ranking that is Not Fair
to Individuals",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "9",
pages = "2377--2390",
month = may,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3598581.3598606",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:00 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3598581.3598606",
abstract = "This work considers why-not questions in the context
of top-k queries and score-based ranking functions.
Following the popular linear scalarization approach for
multi-objective optimization, we study rankings based
on the weighted sum of multiple \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sharma:2023:ITS,
author = "Shantanu Sharma and Yin Li and Sharad Mehrotra and
Nisha Panwar and Komal Kumari and Swagnik
Roychoudhury",
title = "Information-Theoretically Secure and Highly Efficient
Search and Row Retrieval",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2391--2403",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603582",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603582",
abstract = "Information-theoretic or unconditional security
provides the highest level of security --- independent
of the computational capability of an adversary.
Secret-sharing techniques achieve information-theoretic
security by splitting a secret into multiple \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kato:2023:OOF,
author = "Fumiyuki Kato and Yang Cao and Masatoshi Yoshikawa",
title = "{Olive}: Oblivious Federated Learning on Trusted
Execution Environment against the Risk of
Sparsification",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2404--2417",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603583",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603583",
abstract = "Combining Federated Learning (FL) with a Trusted
Execution Environment (TEE) is a promising approach for
realizing privacy-preserving FL, which has garnered
significant academic attention in recent years.
Implementing the TEE on the server side enables
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Luo:2023:TEF,
author = "Chengyang Luo and Qing Liu and Yunjun Gao and Lu Chen
and Ziheng Wei and Congcong Ge",
title = "{Task}: an Efficient Framework for Instant
Error-Tolerant Spatial Keyword Queries on Road
Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2418--2430",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603584",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603584",
abstract = "Instant spatial keyword queries return the results as
soon as users type in some characters instead of a
complete keyword, which allow users to query the
geo-textual data in a type-as-you-search manner.
However, the existing methods of instant spatial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kassaie:2023:ACI,
author = "Besat Kassaie and Frank Wm. Tompa",
title = "Autonomously Computable Information Extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2431--2443",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603585",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603585",
abstract = "Most optimization techniques deployed in information
extraction systems assume that source documents are
static. Instead, extracted relations can be considered
to be materialized views defined by a language built on
regular expressions. Using this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Koutsoukos:2023:NIV,
author = "Dimitrios Koutsoukos and Raghav Bhartia and Michal
Friedman and Ana Klimovic and Gustavo Alonso",
title = "{NVM}: Is it Not Very Meaningful for Databases?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2444--2457",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603586",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603586",
abstract = "Persistent or Non Volatile Memory (PMEM) offers
expanded memory capacity and faster access to
persistent storage. However, there is no comprehensive
empirical analysis of existing database engines under
different PMEM modes, to understand how databases
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dong:2023:DJT,
author = "Yuyang Dong and Chuan Xiao and Takuma Nozawa and
Masafumi Enomoto and Masafumi Oyamada",
title = "{DeepJoin}: Joinable Table Discovery with Pre-Trained
Language Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2458--2470",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603587",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603587",
abstract = "Due to the usefulness in data enrichment for data
analysis tasks, joinable table discovery has become an
important operation in data lake management. Existing
approaches target equi-joins, the most common way of
combining tables for creating a unified \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2023:FPP,
author = "Yuncheng Wu and Naili Xing and Gang Chen and Tien Tuan
Anh Dinh and Zhaojing Luo and Beng Chin Ooi and Xiaokui
Xiao and Meihui Zhang",
title = "{Falcon}: a Privacy-Preserving and Interpretable
Vertical Federated Learning System",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2471--2484",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603588",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603588",
abstract = "Federated learning (FL) enables multiple data owners
to collaboratively train machine learning (ML) models
without disclosing their raw data. In the vertical
federated learning (VFL) setting, the collaborating
parties have data from the same set of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2023:ESE,
author = "Haotian Gao and Cong Yue and Tien Tuan Anh Dinh and
Zhiyong Huang and Beng Chin Ooi",
title = "Enabling Secure and Efficient Data Analytics Pipeline
Evolution with Trusted Execution Environment",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2485--2498",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603589",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603589",
abstract = "Modern data analytics pipelines are highly dynamic, as
they are constantly monitored and fine-tuned by both
data engineers and scientists. Recent systems managing
pipelines ease creating, deploying, and tracking their
evolution. However, privacy \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Doraiswamy:2023:CGD,
author = "Harish Doraiswamy and Vikas Kalagi and Karthik
Ramachandra and Jayant R. Haritsa",
title = "A Case for Graphics-Driven Query Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2499--2511",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603590",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603590",
abstract = "Over the past decade, the database research community
has directed considerable attention towards harnessing
the power of GPUs in query processing engines. The
proposed techniques have primarily focused on devising
customized low-level mechanisms that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tian:2023:EER,
author = "Wei Tian and Jieming Shi and Siqiang Luo and Hui Li
and Xike Xie and Yuanhang Zou",
title = "Effective and Efficient Route Planning Using
Historical Trajectories on Road Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2512--2524",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603591",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603591",
abstract = "We study route planning that utilizes historical
trajectories to predict a realistic route from a source
to a destination on a road network at given departure
time. Route planning is a fundamental task in many
location-based services. It is challenging \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lampropoulos:2023:AIH,
author = "Konstantinos Lampropoulos and Fatemeh Zardbani and
Nikos Mamoulis and Panagiotis Karras",
title = "Adaptive Indexing in High-Dimensional Metric Spaces",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2525--2537",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603592",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603592",
abstract = "Similarity search in high-dimensional metric spaces is
routinely used in many applications including
content-based image retrieval, bioinformatics, data
mining, and recommender systems. Search can be
accelerated by the use of an index. However, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2023:PCS,
author = "Sen Gao and Hongchao Qin and Rong-Hua Li and Bingsheng
He",
title = "Parallel Colorful $h$-Star Core Maintenance in Dynamic
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2538--2550",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603593",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603593",
abstract = "The higher-order structure cohesive subgraph mining is
an important operator in many graph analysis tasks.
                 Recently, the colorful $h$-star core model has been
                 proposed as an effective alternative to $h$-clique based
cohesive subgraph models, in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:MFM,
author = "Jia Li and Wenyue Zhao and Nikos Ntarmos and Yang Cao
and Peter Buneman",
title = "{MITra}: a Framework for Multi-Instance Graph
Traversal",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2551--2564",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603594",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603594",
abstract = "This paper presents MITra, a framework for composing
multi-instance graph algorithms that traverse from
multiple source vertices simultaneously over a single
thread. Underlying MITra is a model of multi-instance
traversal that uniformly captures \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:CEB,
author = "Jiazun Chen and Yikuan Xia and Jun Gao",
title = "{CommunityAF}: an Example-Based Community Search
Method via Autoregressive Flow",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2565--2577",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603595",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603595",
abstract = "Example-based community search utilizes hidden
patterns of given examples rather than explicit rules,
reducing users' burden and enhancing flexibility.
However, existing works face challenges such as low
scalability, high training cost, and improper
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2023:ABA,
author = "Yiming Lin and Yeye He and Surajit Chaudhuri",
title = "{Auto-BI}: Automatically Build {BI}-Models Leveraging
Local Join Prediction and Global Schema Graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2578--2590",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603596",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603596",
abstract = "Business Intelligence (BI) is crucial in modern
enterprises and billion-dollar business. Traditionally,
technical experts like database administrators would
manually prepare BI-models (e.g., in star or snowflake
schemas) that join tables in data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:TDC,
author = "Yuemin Zhang and Qingqing Ye and Rui Chen and Haibo Hu
and Qilong Han",
title = "Trajectory Data Collection with Local Differential
Privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2591--2604",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603597",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603597",
abstract = "Trajectory data collection is a common task with many
applications in our daily lives. Analyzing trajectory
data enables service providers to enhance their
services, which ultimately benefits users. However,
directly collecting trajectory data may give \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2023:LNM,
author = "Jian Gao and Xin Cao and Xin Yao and Gong Zhang and
Wei Wang",
title = "{LMSFC}: a Novel Multidimensional Index Based on
Learned Monotonic Space Filling Curves",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2605--2617",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603598",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603598",
abstract = "The recently proposed learned indexes have attracted
much attention as they can adapt to the actual data and
query distributions to attain better search efficiency.
Based on this technique, several existing works build
up indexes for multi-dimensional \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rong:2023:SDC,
author = "Kexin Rong and Mihai Budiu and Athinagoras
Skiadopoulos and Lalith Suresh and Amy Tai",
title = "Scaling a Declarative Cluster Manager Architecture
with Query Optimization Techniques",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2618--2631",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603599",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603599",
abstract = "Cluster managers play a crucial role in data centers
by distributing workloads among infrastructure
resources. Declarative Cluster Management (DCM) is a
new cluster management architecture that enables users
to express placement policies declaratively \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Singh:2023:CLT,
author = "Mukul Singh and Jos{\'e} Cambronero S{\'a}nchez and
Sumit Gulwani and Vu Le and Carina Negreanu and
Mohammad Raza and Gust Verbruggen",
title = "{Cornet}: Learning Table Formatting Rules By Example",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2632--2644",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603600",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603600",
abstract = "Spreadsheets are widely used for table manipulation
and presentation. Stylistic formatting of these tables
is an important property for presentation and analysis.
As a result, popular spreadsheet software, such as
Excel, supports automatically formatting tables based
on rules. Unfortunately, writing such formatting rules
can be challenging for users as it requires knowledge
of the underlying rule language and data logic. We
present Cornet, a system that tackles the novel problem
of automatically learning such formatting rules from
user-provided formatted cells. Cornet takes inspiration
from advances in inductive programming and combines
symbolic rule enumeration with a neural ranker to learn
conditional formatting rules. To motivate and evaluate
our approach, we extracted tables with over 450K unique
formatting rules from a corpus of over 1.8M real
worksheets. Since we are the first to introduce the
task of automatically learning conditional formatting
rules, we compare Cornet to a wide range of symbolic
and neural baselines adapted from related domains. Our
results show that Cornet accurately learns rules across
varying setups. Additionally, we show that in some
cases Cornet can find rules that are shorter than those
written by users and can also discover rules in
spreadsheets that users have manually formatted.
Furthermore, we present two case studies investigating
the generality of our approach by extending Cornet to
related data tasks (e.g., filtering) and generalizing
to conditional formatting over multiple columns.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zuo:2023:AAR,
author = "Chaoji Zuo and Dong Deng",
title = "{ARKGraph}: All-Range Approximate
{$K$}-Nearest-Neighbor Graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2645--2658",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603601",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603601",
abstract = "Given a collection of vectors, the approximate
K-nearest-neighbor graph (KGraph for short) connects
every vector to its approximate K-nearest-neighbors
(KNN for short). KGraph plays an important role in high
dimensional data visualization, semantic \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Youngmann:2023:CDI,
author = "Brit Youngmann and Michael Cafarella and Babak Salimi
and Anna Zeng",
title = "Causal Data Integration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2659--2665",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603602",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603602",
abstract = "Causal inference is fundamental to empirical
scientific discoveries in natural and social sciences;
however, in the process of conducting causal inference,
data management problems can lead to false discoveries.
Two such problems are (i) not having all \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Martini:2023:MFI,
author = "Michael Martini and Daniel Schuster and Wil M. P. van
der Aalst",
title = "Mining Frequent Infix Patterns from Concurrency-Aware
Process Execution Variants",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2666--2678",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603603",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603603",
abstract = "Event logs, as considered in process mining, document
a large number of individual process executions.
Moreover, each process execution consists of various
executed activities. To cope with the vast amount of
process executions in event logs, the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pedreira:2023:CDM,
author = "Pedro Pedreira and Orri Erling and Konstantinos
Karanasos and Scott Schneider and Wes McKinney and
Satya R. Valluri and Mohamed Zait and Jacques Nadeau",
title = "The Composable Data Management System Manifesto",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "10",
pages = "2679--2685",
month = jun,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3603581.3603604",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 9 10:33:02 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3603581.3603604",
abstract = "The requirement for specialization in data management
systems has evolved faster than our software
development practices. After decades of organic growth,
this situation has created a siloed landscape composed
of hundreds of products developed and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schmitt:2023:TLS,
author = "Daniel Schmitt and Daniel Kocher and Nikolaus Augsten
and Willi Mann and Alexander Miller",
title = "A Two-Level Signature Scheme for Stable Set Similarity
Joins",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2686--2698",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611480",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611480",
abstract = "We study the set similarity join problem, which
retrieves all pairs of similar sets from two
collections of sets for a given distance function.
Existing exact solutions employ a signature-based
filter-verification framework: If two sets are similar,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rodriguez:2023:SRD,
author = "Olivier Rodriguez and Federico Ulliana and Marie-Laure
Mugnier",
title = "Scalable Reasoning on Document Stores via
Instance-Aware Query Rewriting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2699--2713",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611481",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611481",
abstract = "Data trees, typically encoded in JSON, are ubiquitous
in data-driven applications. This ubiquity makes urgent
the development of novel techniques for querying
heterogeneous JSON data in a flexible manner. We
propose a rule language for JSON, called \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:EVS,
author = "Enhao Zhang and Maureen Daum and Dong He and Brandon
Haynes and Ranjay Krishna and Magdalena Balazinska",
title = "{EQUI-VOCAL}: Synthesizing Queries for Compositional
Video Events from Limited User Interactions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2714--2727",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611482",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611482",
abstract = "We introduce EQUI-VOCAL: a new system that
automatically synthesizes queries over videos from
limited user interactions. The user only provides a
handful of positive and negative examples of what they
are looking for. EQUI-VOCAL utilizes these initial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:LBG,
author = "Yuhao Zhang and Arun Kumar",
title = "{Lotan}: Bridging the Gap between {GNNs} and Scalable
Graph Analytics Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2728--2741",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611483",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611483",
abstract = "Recent advances in Graph Neural Networks (GNNs) have
changed the landscape of modern graph analytics. The
complexity of GNN training and the scalability
challenges have also sparked interest from the systems
community, with efforts to build systems that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kraft:2023:EAT,
author = "Peter Kraft and Qian Li and Xinjing Zhou and Peter
Bailis and Michael Stonebraker and Matei Zaharia and
Xiangyao Yu",
title = "{Epoxy}: {ACID} Transactions across Diverse Data
Stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2742--2754",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611484",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611484",
abstract = "Developers are increasingly building applications that
incorporate multiple data stores, for example to manage
heterogeneous data. Often, these require transactional
safety for operations across stores, but few systems
support such guarantees. To solve \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bother:2023:AVH,
author = "Maximilian B{\"o}ther and Lawrence Benson and Ana
Klimovic and Tilmann Rabl",
title = "Analyzing Vectorized Hash Tables across {CPU}
Architectures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2755--2768",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611485",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611485",
abstract = "Data processing systems often leverage vector
instructions to achieve higher performance. When
applying vector instructions, an often overlooked data
structure is the hash table, even though it is
fundamental in data processing systems for operations
such as indexing, aggregating, and joining. In this
paper, we characterize and evaluate three fundamental
vectorized hashing schemes, vectorized linear probing
(VLP), vectorized fingerprinting (VFP), and
bucket-based comparison (BBC). We implement these
hashing schemes on the x86, ARM, and Power CPU
architectures, as modern database systems must provide
efficient implementations for multiple platforms due to
the continuously increasing hardware heterogeneity. We
present various implementation variants and
platform-specific optimizations, which we evaluate for
integer keys, string keys, large payloads, skewed
distributions, and multiple threads. Our extensive
evaluation and comparison to three scalar hashing
schemes on four servers shows that BBC outperforms
scalar linear probing by a factor of more than 2x,
while also scaling well to high load factors. We find
that vectorized hashing schemes come with caveats that
need to be considered, such as the increased
engineering overhead, differences between CPUs, and
differences between vector ISAs, such as AVX and
AVX-512, which impact performance. We conclude with key
findings for vectorized hashing scheme
implementations.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Durner:2023:ECO,
author = "Dominik Durner and Viktor Leis and Thomas Neumann",
title = "Exploiting Cloud Object Storage for High-Performance
Analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2769--2782",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611486",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611486",
abstract = "Elasticity of compute and storage is crucial for
analytical cloud database systems. All cloud vendors
provide disaggregated object stores, which can be used
as storage backend for analytical query engines. Until
recently, local storage was unavoidable \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Karapiperis:2023:RBS,
author = "Dimitrios Karapiperis and Christos Tjortjis and
Vassilios S. Verykios",
title = "A Randomized Blocking Structure for Streaming Record
Linkage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2783--2791",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611487",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611487",
abstract = "A huge amount of data, in terms of streams, are
collected nowadays via a variety of sources, such as
sensors, mobile devices, or even raw log files. The
unprecedented rate at which these data are generated
and collected calls for novel record linkage \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Riveros:2023:RNR,
author = "Cristian Riveros and Nicol{\'a}s {Van Sint Jan} and
Domagoj Vrgoc",
title = "{REmatch}: a Novel Regex Engine for Finding All
Matches",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2792--2804",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611488",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611488",
abstract = "In this paper, we present the REmatch system for
information extraction. REmatch is based on a recently
proposed enumeration algorithm for evaluating regular
expressions with capture variables supporting the
all-match semantics. It tells a story of what it takes
to make a theoretically optimal algorithm work in
practice. As we show here, a naive implementation of
the original algorithm would have a hard time dealing
with realistic workloads. We thus develop a new
algorithm and a series of optimizations that make
REmatch as fast or faster than many popular RegEx
engines while at the same time being able to return all
the outputs: a task that most other engines tend to
struggle with.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:AAO,
author = "Junxiong Wang and Immanuel Trummer and Ahmet Kara and
Dan Olteanu",
title = "{ADOPT}: Adaptively Optimizing Attribute Orders for
Worst-Case Optimal Join Algorithms via Reinforcement
Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2805--2817",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611489",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611489",
abstract = "The performance of worst-case optimal join algorithms
depends on the order in which the join attributes are
processed. Selecting good orders before query execution
is hard, due to the large space of possible orders and
unreliable execution cost \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hu:2023:TSM,
author = "Zheng Hu and Weiguo Zheng and Xiang Lian",
title = "Triangular Stability Maximization by Influence Spread
over Social Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2818--2831",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611490",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611490",
abstract = "In many real-world applications such as social network
analysis and online advertising/marketing, one of the
most important and popular problems is called influence
maximization (IM), which finds a set of k seed users
that maximize the expected number \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guan:2023:CSE,
author = "Haoquan Guan and Ziling Chen and Shaoxu Song",
title = "{CORE-Sketch}: On Exact Computation of Median Absolute
Deviation with Limited Space",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2832--2844",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611491",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611491",
abstract = "Median absolute deviation (MAD), the median of the
absolute deviations from the median, has been found
useful in various applications such as outlier
detection. Together with median, MAD is more robust to
abnormal data than mean and standard deviation
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lulf:2023:FSC,
author = "Christian L{\"u}lf and Denis Mayr Lima Martins and
Marcos Antonio Vaz Salles and Yongluan Zhou and Fabian
Gieseke",
title = "Fast Search-by-Classification for Large-Scale
Databases Using Index-Aware Decision Trees and Random
Forests",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2845--2857",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611492",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611492",
abstract = "The vast amounts of data collected in various domains
pose great challenges to modern data exploration and
analysis. To find ``interesting'' objects in large
databases, users typically define a query using
positive and negative example objects and train a
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Calautti:2023:SOC,
author = "Marco Calautti and Mostafa Milani and Andreas Pieris",
title = "Semi-Oblivious Chase Termination for Linear
Existential Rules: an Experimental Study",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2858--2870",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611493",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611493",
abstract = "The chase procedure is a fundamental algorithmic tool
in databases that allows us to reason with constraints,
such as existential rules, with a plethora of
applications. It takes as input a database and a set of
constraints, and iteratively completes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2023:AIC,
author = "Kukjin Lee and Anshuman Dutt and Vivek Narasayya and
Surajit Chaudhuri",
title = "Analyzing the Impact of Cardinality Estimation on
Execution Plans in {Microsoft SQL} Server",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2871--2883",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611494",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611494",
abstract = "Cardinality estimation is widely believed to be one of
the most important causes of poor query plans. Prior
studies evaluate the impact of cardinality estimation
on plan quality on a set of Select-Project-Join queries
on PostgreSQL DBMS. Our empirical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lee:2023:WLZ,
author = "Jongsung Lee and Donguk Kim and Jae W. Lee",
title = "{WALTZ}: Leveraging Zone Append to Tighten the Tail
Latency of {LSM} Tree on {ZNS SSD}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2884--2896",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611495",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611495",
abstract = "We propose WALTZ, an LSM tree-based key-value store on
the emerging Zoned Namespace (ZNS) SSD. The key
contribution of WALTZ is to leverage the zone append
command, which is a recent addition to ZNS SSD
specifications, to provide tight tail latency. The
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Russo:2023:AAQ,
author = "Matthew Russo and Tatsunori Hashimoto and Daniel Kang
and Yi Sun and Matei Zaharia",
title = "Accelerating Aggregation Queries on Unstructured
Streams of Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2897--2910",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611496",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611496",
abstract = "Analysts and scientists are interested in querying
streams of video, audio, and text to extract
quantitative insights. For example, an urban planner
may wish to measure congestion by querying the live
feed from a traffic camera. Prior work has used deep
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bai:2023:QIS,
author = "Qiushi Bai and Sadeem Alsudais and Chen Li",
title = "{QueryBooster}: Improving {SQL} Performance Using
Middleware Services for Human-Centered Query
Rewriting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2911--2924",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611497",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611497",
abstract = "SQL query performance is critical in database
applications, and query rewriting is a technique that
transforms an original query into an equivalent query
with a better performance. In a wide range of
database-supported systems, there is a unique problem
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2023:CRA,
author = "Jiongli Zhu and Sainyam Galhotra and Nazanin Sabri and
Babak Salimi",
title = "Consistent Range Approximation for Fair Predictive
Modeling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2925--2938",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611498",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611498",
abstract = "This paper proposes a novel framework for certifying
the fairness of predictive models trained on biased
data. It draws from query answering for incomplete and
inconsistent databases to formulate the problem of
consistent range approximation (CRA) of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yin:2023:SMW,
author = "Haoteng Yin and Muhan Zhang and Jianguo Wang and Pan
Li",
title = "{SUREL+}: Moving from Walks to Sets for Scalable
Subgraph-Based Graph Representation Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2939--2948",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611499",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611499",
abstract = "Subgraph-based graph representation learning (SGRL)
has recently emerged as a powerful tool in many
prediction tasks on graphs due to its advantages in
model expressiveness and generalization ability. Most
previous SGRL models face computational issues
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:ESN,
author = "Hanzhi Wang and Zhewei Wei",
title = "Estimating Single-Node {PageRank} in {$ \tilde
{O}(\min (d_t, \sqrt {m})) $} Time",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2949--2961",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611500",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611500",
abstract = "PageRank is a famous measure of graph centrality that
has numerous applications in practice. The problem of
computing a single node's PageRank has been the subject
of extensive research over a decade. However, existing
methods still incur large time complexities despite
years of efforts. Even on undirected graphs where
several valuable properties held by PageRank scores,
the problem of locally approximating the PageRank score
of a target node remains a challenging task. Two
commonly adopted techniques, Monte-Carlo based random
walks and backward push, both cost $O(n)$ time in the
worst-case scenario, which hinders existing methods
from achieving a sublinear time complexity like
$O(\sqrt{m})$ on an undirected graph with $n$ nodes and
$m$ edges.\par
In this paper, we focus on the problem of single-node
PageRank computation on undirected graphs. We propose a
novel algorithm, SetPush, for estimating single-node
PageRank specifically on undirected graphs. With
non-trivial analysis, we prove that our SetPush
achieves the $\tilde{O}(\min(d_t, \sqrt{m}))$ time
complexity for estimating the target node $t$'s
PageRank with constant relative error and constant
failure probability on undirected graphs. We conduct
comprehensive experiments to demonstrate the
effectiveness of SetPush.",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:SAQ,
author = "Yunjia Zhang and Yannis Chronis and Jignesh M. Patel
and Theodoros Rekatsinas",
title = "Simple Adaptive Query Processing vs. Learned Query
Optimizers: Observations and Analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2962--2975",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611501",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611501",
abstract = "There have been many decades of work on optimizing
query processing in database management systems.
Recently, modern machine learning (ML), and
specifically reinforcement learning (RL), has gained
increased attention as a means to develop a query
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2023:BTO,
author = "Helen Xu and Amanda Li and Brian Wheatman and Manoj
Marneni and Prashant Pandey",
title = "{BP-Tree}: Overcoming the Point-Range Operation
Tradeoff for In-Memory {B}-Trees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2976--2989",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611502",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611502",
abstract = "B-trees are the go-to data structure for in-memory
indexes in databases and storage systems. B-trees
support both point operations (i.e., inserts and finds)
and range operations (i.e., iterators and maps).
However, there is an inherent tradeoff between \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lv:2023:HXT,
author = "Ge Lv and Chen Jason Zhang and Lei Chen",
title = "{HENCE-X}: Toward Heterogeneity-Agnostic Multi-Level
Explainability for Deep Graph Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "2990--3003",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611503",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611503",
abstract = "Deep graph networks (DGNs) have demonstrated their
outstanding effectiveness on both heterogeneous and
homogeneous graphs. However, their black-box nature does
not allow human users to understand their working
mechanisms. Recently, extensive efforts have \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2023:ARE,
author = "Haitao Yuan and Sai Wang and Zhifeng Bao and
Shangguang Wang",
title = "Automatic Road Extraction with Multi-Source Data
Revisited: Completeness, Smoothness and
Discrimination",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3004--3017",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611504",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611504",
abstract = "Extracting roads from multi-source data, such as
aerial images and vehicle trajectories, is an important
way to maintain road networks in the field of urban
computing. In this paper, we revisit the problem of
road extraction and aim to boost its \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fent:2023:ABQ,
author = "Philipp Fent and Guido Moerkotte and Thomas Neumann",
title = "Asymptotically Better Query Optimization Using Indexed
Algebra",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3018--3030",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611505",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611505",
abstract = "Query optimization is essential for the efficient
execution of queries. The necessary analysis, if we can
and should apply optimizations and transform the query
plan, is already challenging. Traditional techniques
focus on the availability of columns at \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Skavantzos:2023:NPG,
author = "Philipp Skavantzos and Sebastian Link",
title = "Normalizing Property Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3031--3043",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611506",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611506",
abstract = "Normalization aims at minimizing sources of potential
data inconsistency and costs of update maintenance
incurred by data redundancy. For relational databases,
different classes of dependencies cause data redundancy
and have resulted in proposals such \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2023:DDC,
author = "Chunwei Liu and Anna Pavlenko and Matteo Interlandi
and Brandon Haynes",
title = "A Deep Dive into Common Open Formats for Analytical
{DBMSs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3044--3056",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611507",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611507",
abstract = "This paper evaluates the suitability of Apache Arrow,
Parquet, and ORC as formats for subsumption in an
analytical DBMS. We systematically identify and explore
the high-level features that are important to support
efficient querying in modern OLAP DBMSs \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:SDP,
author = "Zezhou Huang and Jiaxiang Liu and Daniel Gbenga Alabi
and Raul Castro Fernandez and Eugene Wu",
title = "{Saibot}: a Differentially Private Data Search
Platform",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3057--3070",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611508",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611508",
abstract = "Recent data search platforms use ML task-based utility
measures rather than metadata-based keywords, to search
large dataset corpora. Requesters submit a training
dataset, and these platforms search for augmentations
---join or union-compatible datasets---. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:JGT,
author = "Zezhou Huang and Rathijit Sen and Jiaxiang Liu and
Eugene Wu",
title = "{JoinBoost}: Grow Trees over Normalized Data Using
Only {SQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "11",
pages = "3071--3084",
month = jul,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611479.3611509",
ISSN = "2150-8097",
bibdate = "Fri Aug 25 07:25:43 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611479.3611509",
abstract = "Although dominant for tabular data, ML libraries that
train tree models over normalized databases (e.g.,
LightGBM, XGBoost) require the data to be denormalized
as a single table, materialized, and exported. This
process is not scalable, slow, and poses \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tatemura:2023:PPP,
author = "Junichi Tatemura and Tao Zou and Jagan
Sankaranarayanan and Yanlai Huang and Jim Chen and Yupu
Zhang and Kevin Lai and Hao Zhang and Gokul Nath Babu
Manoharan and Goetz Graefe and Divyakant Agrawal and
Brad Adelberg and Shilpa Kolhar and Indrajit Roy",
title = "Progressive Partitioning for Parallelized Query
Execution in {Google}'s {Napa}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3475--3487",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611541",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611541",
abstract = "Napa holds Google's critical data warehouses in
log-structured merge trees for real-time data ingestion
and sub-second response for billions of queries per
day. These \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Depoutovitch:2023:TMB,
author = "Alex Depoutovitch and Chong Chen and Per-{\AA}ke Larson
and Jack Ng and Shu Lin and Guanzhu Xiong and Paul Lee
and Emad Boctor and Samiao Ren and Lengdong Wu and
Yuchen Zhang and Calvin Sun",
title = "{Taurus MM}: Bringing Multi-Master to the Cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3488--3500",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611542",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611542",
abstract = "A single-master database has limited update capacity
because a single node handles all updates. A
multi-master database potentially has higher update
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mao:2023:SCN,
author = "Yancan Mao and Zhanghao Chen and Yifan Zhang and Meng
Wang and Yong Fang and Guanghui Zhang and Rui Shi and
Richard T. B. Ma",
title = "{StreamOps}: Cloud-Native Runtime Management for
Streaming Services in {ByteDance}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3501--3514",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611543",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611543",
abstract = "Stream processing is widely used for real-time data
processing and decision-making, leading to tens of
thousands of streaming jobs deployed in ByteDance
cloud. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Anneser:2023:ALQ,
author = "Christoph Anneser and Nesime Tatbul and David Cohen
and Zhenggang Xu and Prithviraj Pandian and Nikolay
Laptev and Ryan Marcus",
title = "{AutoSteer}: Learned Query Optimization for Any {SQL}
Database",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3515--3527",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611544",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611544",
abstract = "This paper presents AutoSteer, a learning-based
solution that automatically drives query optimization
in any SQL database that exposes tunable optimizer
knobs. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:KRT,
author = "Jianjun Chen and Rui Shi and Heng Chen and Li Zhang
and Ruidong Li and Wei Ding and Liya Fan and Hao Wang
and Mu Xiong and Yuxiang Chen and Benchao Dong and
Kuankuan Guo and Yuanjin Lin and Xiao Liu and Haiyang
Shi and Peipei Wang and Zikang Wang and Yemeng Yang and
Junda Zhao and Dongyan Zhou and Zhikai Zuo and Yuming
Liang",
title = "{Krypton}: Real-Time Serving and Analytical {SQL}
Engine at {ByteDance}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3528--3542",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611545",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611545",
abstract = "In recent years, at ByteDance, we have started seeing
more and more business scenarios that require
performing real-time data serving besides complex Ad
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zou:2023:EVE,
author = "Yuanhang Zou and Zhihao Ding and Jieming Shi and
Shuting Guo and Chunchen Su and Yafei Zhang",
title = "{EmbedX}: a Versatile, Efficient and Scalable Platform
to Embed Both Graphs and High-Dimensional Sparse Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3543--3556",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611546",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611546",
abstract = "In modern online services, it is of growing importance
to process web-scale graph data and high-dimensional
sparse data together into embeddings for downstream
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Saxena:2023:SAG,
author = "Mohit Saxena and Benjamin Sowell and Daiyan Alamgir
and Nitin Bahadur and Bijay Bisht and Santosh
Chandrachood and Chitti Keswani and G. Krishnamoorthy
and Austin Lee and Bohou Li and Zach Mitchell and
Vaibhav Porwal and Maheedhar Reddy Chappidi and Brian
Ross and Noritaka Sekiyama and Omer Zaki and Linchi
Zhang and Mehul A. Shah",
title = "The Story of {AWS Glue}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3557--3569",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611547",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611547",
abstract = "AWS Glue is Amazon's serverless data integration cloud
service that makes it simple and cost effective to
extract, clean, enrich, load, and organize data.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:TGE,
author = "Yang Li and Huaijun Jiang and Yu Shen and Yide Fang
and Xiaofeng Yang and Danqing Huang and Xinyi Zhang and
Wentao Zhang and Ce Zhang and Peng Chen and Bin Cui",
title = "Towards General and Efficient Online Tuning for
{Spark}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3570--3583",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611548",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611548",
abstract = "The distributed data analytic system --- Spark is a
common choice for processing massive volumes of
heterogeneous data, while it is challenging to tune its
parameters \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:CBP,
author = "Jiashu Zhang and Wen Jiang and Bo Tang and Haoxiang Ma
and Lixun Cao and Zhongbin Jiang and Yuanyuan Nie and
Fan Wang and Lei Zhang and Yuming Liang",
title = "{CDSBen}: Benchmarking the Performance of Storage
Services in Cloud-Native Database System at
{ByteDance}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3584--3596",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611549",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611549",
abstract = "In this work, we focus on the performance benchmarking
problem of storage services in cloud-native database
systems, which are widely used in various cloud
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2023:FBR,
author = "Xuanhe Zhou and Cheng Chen and Kunyi Li and Bingsheng
He and Mian Lu and Qiaosheng Liu and Wei Huang and
Guoliang Li and Zhao Zheng and Yuqiang Chen",
title = "{FEBench}: a Benchmark for Real-Time Relational Data
Feature Extraction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3597--3609",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611550",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611550",
abstract = "As the use of online AI inference services rapidly
expands in various applications (e.g., fraud detection
in banking, product recommendation in e-commerce),
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiao:2023:MDF,
author = "Fei Xiao and Yuncheng Wu and Meihui Zhang and Gang
Chen and Beng Chin Ooi",
title = "{MINT}: Detecting Fraudulent Behaviors from
Time-Series Relational Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3610--3623",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611551",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611551",
abstract = "The e-commerce platforms, such as Shopee, have
accumulated a huge volume of time-series relational
data, which contains useful information on
differentiating \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ahmad:2023:MPS,
author = "Shafi Ahmad and Dillidorai Arumugam and Srdan Bozovic
and Elnata Degefa and Sailesh Duvvuri and Steven Gott
and Nitish Gupta and Joachim Hammer and Nivedita
Kaluskar and Raghav Kaushik and Rakesh Khanduja and
Prasad Mujumdar and Gaurav Malhotra and Pankaj Naik and
Nikolas Ogg and Krishna Kumar Parthasarthy and Raghu
Ramakrishnan and Vlad Rodriguez and Rahul Sharma and
Jakub Szymaszek and Andreas Wolter",
title = "{Microsoft Purview}: a System for Central Governance
of Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3624--3635",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611552",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611552",
abstract = "Modern data estates are spread across data located on
premises, on the edge and in one or more public clouds,
spread across various sources like multiple relational
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2023:AAI,
author = "Liang Lin and Yuhan Li and Bin Wu and Huijun Mai and
Renjie Lou and Jian Tan and Feifei Li",
title = "{Anser}: Adaptive Information Sharing Framework of
{AnalyticDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3636--3648",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611553",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611553",
abstract = "The surge in data analytics has fostered burgeoning
demand for AnalyticDB on Alibaba Cloud, which has well
served thousands of customers from \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Brucke:2023:TAI,
author = "Christoph Br{\"u}cke and Philipp H{\"a}rtling and
Rodrigo D. Escobar Palacios and Hamesh Patel and Tilmann
Rabl",
title = "{TPCx-AI} --- An Industry Standard Benchmark for
Artificial Intelligence and Machine Learning Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3649--3661",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611554",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611554",
abstract = "Artificial intelligence (AI) and machine learning (ML)
techniques have existed for years, but new hardware
trends and advances in model training and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Psallidas:2023:OEE,
author = "Fotis Psallidas and Ashvin Agrawal and Chandru Sugunan
and Khaled Ibrahim and Konstantinos Karanasos and
Jes{\'u}s Camacho-Rodr{\'{\i}}guez and Avrilia Floratou
and Carlo Curino and Raghu Ramakrishnan",
title = "{OneProvenance}: Efficient Extraction of Dynamic
Coarse-Grained Provenance from Database Query Event
Logs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3662--3675",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611555",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611555",
abstract = "Provenance encodes information that connects datasets,
their generation workflows, and associated metadata
(e.g., who or when executed a query). As \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Srinivasan:2023:TEB,
author = "V. Srinivasan and Andrew Gooding and Sunil Sayyaparaju
and Thomas Lopatic and Kevin Porter and Ashish Shinde
and B. Narendran",
title = "Techniques and Efficiencies from Building a Real-Time
{DBMS}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3676--3688",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611556",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611556",
abstract = "This paper describes a variety of techniques from over
a decade of developing Aerospike (formerly Citrusleaf),
a real-time DBMS that is being used in some of the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:RTW,
author = "Jiaqi Wang and Tianyi Li and Anni Wang and Xiaoze Liu
and Lu Chen and Jie Chen and Jianye Liu and Junyang Wu
and Feifei Li and Yunjun Gao",
title = "Real-Time Workload Pattern Analysis for Large-Scale
Cloud Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3689--3701",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611557",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611557",
abstract = "Hosting database services on cloud systems has become
a common practice. This has led to the increasing
volume of database workloads, which provides the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:BDA,
author = "Jiang Li and Qi Xie and Yan Ma and Jian Ma and
Kunshang Ji and Yizhong Zhang and Chaojun Zhang and
Yixiu Chen and Gangsheng Wu and Jie Zhang and Kaidi
Yang and Xinyi He and Qiuyang Shen and Yanting Tao and
Haiwei Zhao and Penghui Jiao and Chengfei Zhu and David
Qian and Cheng Xu",
title = "Big Data Analytic Toolkit: a General-Purpose, Modular,
and Heterogeneous Acceleration Toolkit for Data
Analytical Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3702--3714",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611558",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611558",
abstract = "Query compilation and hardware acceleration are
important technologies for optimizing the performance
of data processing engines. There have been many works
on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shen:2023:LTC,
author = "Chunhui Shen and Qianyu Ouyang and Feibo Li and
Zhipeng Liu and Longcheng Zhu and Yujie Zou and Qing Su
and Tianhuan Yu and Yi Yi and Jianhong Hu and Cen Zheng
and Bo Wen and Hanbang Zheng and Lunfan Xu and Sicheng
Pan and Bin Wu and Xiao He and Ye Li and Jian Tan and
Sheng Wang and Dan Pei and Wei Zhang and Feifei Li",
title = "{Lindorm TSDB}: a Cloud-Native Time-Series Database
for Large-Scale Monitoring Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3715--3727",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611559",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611559",
abstract = "Internet services supported by large-scale distributed
systems have become essential for our daily life. To
ensure the stability and high quality of services,
diverse \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2023:OPH,
author = "Zhifeng Yang and Quanqing Xu and Shanyan Gao and
Chuanhui Yang and Guoping Wang and Yuzhong Zhao and
Fanyu Kong and Hao Liu and Wanhong Wang and Jinliang
Xiao",
title = "{OceanBase Paetica}: a Hybrid Shared-Nothing\slash
Shared-Everything Database for Supporting Single
Machine and Distributed Cluster",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3728--3740",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611560",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611560",
abstract = "In the ongoing evolution of the OceanBase database
system, it is essential to enhance its adaptability to
small-scale enterprises. The OceanBase database system
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yao:2023:SEU,
author = "Yuanyuan Yao and Dimeng Li and Hailiang Jie and
Tianyi Li and Jie Chen and Jiaqi Wang
and Feifei Li and Yunjun Gao",
title = "{SimpleTS}: an Efficient and Universal Model Selection
Framework for Time Series Forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3741--3753",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611561",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611561",
abstract = "Time series forecasting, that predicts events through
a sequence of time, has received increasing attention
in past decades. The diverse range of time series
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2023:PSC,
author = "Xinjun Yang and Yingqiang Zhang and Hao Chen and Chuan
Sun and Feifei Li and Wenchao Zhou",
title = "{PolarDB-SCC}: a Cloud-Native Database Ensuring Low
Latency for Strongly Consistent Reads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3754--3767",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611562",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611562",
abstract = "A classic design of cloud-native databases adopts an
architecture that consists of one read/write (RW) node
and one or more read-only (RO) nodes. In such a design,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yamada:2023:SUT,
author = "Hiroyuki Yamada and Toshihiro Suzuki and Yuji Ito and
Jun Nemoto",
title = "{ScalarDB}: Universal Transaction Manager for
Polystores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3768--3780",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611563",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611563",
abstract = "This paper presents ScalarDB, a universal transaction
manager that achieves distributed transactions across
multiple disparate databases. ScalarDB \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nie:2023:APS,
author = "Xiaonan Nie and Yi Liu and Fangcheng Fu and Jinbao Xue
and Dian Jiao and Xupeng Miao and Yangyu Tao and Bin
Cui",
title = "{Angel-PTM}: a Scalable and Economical Large-Scale
Pre-Training System in {Tencent}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3781--3794",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611564",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611564",
abstract = "Recent years have witnessed the unprecedented
achievements of large-scale pre-trained models,
especially Transformer models. Many products and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:EEE,
author = "Ji You Li and Jiachi Zhang and Wenchao Zhou and Yuhang
Liu and Shuai Zhang and Zhuoming Xue and Ding Xu and
Hua Fan and Fangyuan Zhou and Feifei Li",
title = "{Eigen}: End-to-End Resource Optimization for
Large-Scale Databases on the Cloud",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3795--3807",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611565",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611565",
abstract = "Increasingly, cloud database vendors host large-scale
geographically distributed clusters to provide cloud
database services. When managing the clusters, we
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pan:2023:MUA,
author = "Zhicheng Pan and Yihang Wang and Yingying Zhang and
Sean Bin Yang and Yunyao Cheng and Peng Chen and
Chenjuan Guo and Qingsong Wen and Xiduo Tian and
Yunliang Dou and Zhiqiang Zhou and Chengcheng Yang and
Aoying Zhou and Bin Yang",
title = "{MagicScaler}: Uncertainty-Aware, Predictive
Autoscaling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3808--3821",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611566",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611566",
abstract = "Predictive autoscaling is a key enabler for optimizing
cloud resource allocation in Alibaba Cloud's computing
platforms, which dynamically adjust the Elastic Compute
Service \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Povzner:2023:KCN,
author = "Anna Povzner and Prince Mahajan and Jason Gustafson
and Jun Rao and Ismael Juma and Feng Min and Shriram
Sridharan and Nikhil Bhatia and Gopi Attaluri and
Adithya Chandra and Stanislav Kozlovski and Rajini
Sivaram and Lucas Bradstreet and Bob Barrett and
Dhruvil Shah and David Jacot and David Arthur and Ron
Dagostino and Colin McCabe and Manikumar Reddy Obili
and Kowshik Prakasam and Jose Garcia Sancio and Vikas
Singh and Alok Nikhil and Kamal Gupta",
title = "{Kora}: a Cloud-Native Event Streaming Platform for
{Kafka}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3822--3834",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611567",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611567",
abstract = "Event streaming is an increasingly critical
infrastructure service used in many industries and
there is growing demand for cloud-native solutions.
Confluent \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pasupuleti:2023:ASE,
author = "Krishna Kantikiran Pasupuleti and Jiakun Li and Hong
Su and Mohamed Ziauddin",
title = "Automatic {SQL} Error Mitigation in {Oracle}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3835--3847",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611568",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611568",
abstract = "Despite best coding practices, software bugs are
inevitable in a large codebase. In traditional
databases, when errors occur during query processing,
they \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:PFE,
author = "Yanli Zhao and Andrew Gu and Rohan Varma and Liang Luo
and Chien-Chin Huang and Min Xu and Less Wright and
Hamid Shojanazeri and Myle Ott and Sam Shleifer and
Alban Desmaison and Can Balioglu and Pritam Damania and
Bernard Nguyen and Geeta Chauhan and Yuchen Hao and
Ajit Mathews and Shen Li",
title = "{PyTorch FSDP}: Experiences on Scaling Fully Sharded
Data Parallel",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "12",
pages = "3848--3860",
month = aug,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3611540.3611569",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon Sep 18 10:22:20 MDT 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3611540.3611569",
abstract = "It is widely acknowledged that large models have the
potential to deliver superior performance across a
broad range of domains. Despite the remarkable
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qiu:2023:DDO,
author = "Lina Qiu and Georgios Kellaris and Nikos Mamoulis and
Kobbi Nissim and George Kollios",
title = "{Doquet}: Differentially Oblivious Range and Join
Queries with Private Data Structures",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4160--4173",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625055",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625055",
abstract = "Most cloud service providers offer limited data
privacy guarantees, discouraging clients from using
them for managing their sensitive data. Cloud providers
may use servers with Trusted Execution Environments
(TEEs) to protect outsourced data, while \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chiosa:2023:AAC,
author = "Monica Chiosa and Thomas B. Preu{\ss}er and Michaela
Blott and Gustavo Alonso",
title = "{AMNES}: Accelerating the Computation of Data
Correlation Using {FPGAs}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
                 pages =        "4174--4187",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625056",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625056",
abstract = "A widely used approach to characterize input data in
both databases and ML is computing the correlation
between attributes. The operation is supported by all
major database engines and ML platforms. However, it is
an expensive operation as the number of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Daum:2023:VPY,
author = "Maureen Daum and Enhao Zhang and Dong He and Stephen
Mussmann and Brandon Haynes and Ranjay Krishna and
Magdalena Balazinska",
title = "{VOCALExplore}: Pay-as-You-Go Video Data Exploration
and Model Building",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4188--4201",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625057",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625057",
abstract = "We introduce VOCALExplore, a system designed to
support users in building domain-specific models over
video datasets. VOCALExplore supports interactive
labeling sessions and trains models using user-supplied
labels. VOCALExplore maximizes model quality \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arora:2023:FRA,
author = "Pankaj Arora and Surajit Chaudhuri and Sudipto Das and
Junfeng Dong and Cyril George and Ajay Kalhan and Arnd
Christian K{\"o}nig and Willis Lang and Changsong Li
and Feng Li and Jiaqi Liu and Lukas M. Maas and Akshay
Mata and Ishai Menache and Justin Moeller and Vivek
Narasayya and Matthaios Olma and Morgan Oslake and
Elnaz Rezai and Yi Shan and Manoj Syamala and Shize Xu
and Vasileios Zois",
title = "Flexible Resource Allocation for Relational
Database-as-a-Service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4202--4215",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625058",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625058",
abstract = "Oversubscription is an essential cost management
strategy for cloud database providers, and its
importance is magnified by the emerging paradigm of
serverless databases. In contrast to general purpose
techniques used for oversubscription in hypervisors,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gu:2023:SEA,
author = "Rong Gu and Han Li and Haipeng Dai and Wenjie Huang
and Jie Xue and Meng Li and Jiaqi Zheng and Haoran Cai
and Yihua Huang and Guihai Chen",
title = "{ShadowAQP}: Efficient Approximate Group-by and Join
Query via Attribute-Oriented Sample Size Allocation and
Data Generation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4216--4229",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625059",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625059",
abstract = "Approximate query processing (AQP) is one of the key
techniques to cope with big data querying problem on
account that it obtains approximate answers
efficiently. To address non-trivial sample selection
and heavy sampling cost issues in AQP, we propose
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2023:ODP,
author = "Rui Liu and Kwanghyun Park and Fotis Psallidas and
Xiaoyong Zhu and Jinghui Mo and Rathijit Sen and Matteo
Interlandi and Konstantinos Karanasos and Yuanyuan Tian
and Jes{\'u}s Camacho-Rodr{\'\i}guez",
title = "Optimizing Data Pipelines for Machine Learning in
Feature Stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4230--4239",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625060",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625060",
abstract = "Data pipelines (i.e., converting raw data to features)
are critical for machine learning (ML) models, yet
their development and management is time-consuming.
Feature stores have recently emerged as a new
``DBMS-for-ML'' with the premise of enabling data
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Angles:2023:SSE,
author = "Renzo Angles and Georg Gottlob and Aleksandar
Pavlovi{\'c} and Reinhard Pichler and Emanuel
Sallinger",
title = "{SparqLog}: a System for Efficient Evaluation of
{SPARQL 1.1} Queries via {Datalog}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4240--4253",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625061",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625061",
abstract = "Over the past decade, Knowledge Graphs have received
enormous interest both from industry and from academia.
Research in this area has been driven, above all, by
the Database (DB) community and the Semantic Web (SW)
community. However, there still \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Konig:2023:SLC,
author = "Arnd Christian K{\"o}nig and Yi Shan and Karan Newatia
and Luke Marshall and Vivek Narasayya",
title = "Solver-In-The-Loop Cluster Resource Management for
Database-as-a-Service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4254--4267",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625062",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625062",
abstract = "In Database-as-a-Service (DBaaS) clusters, resource
management is a complex optimization problem that
assigns tenants to nodes, subject to various
constraints and objectives. Tenants share resources
within a node, however, their resource demands can
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Henneberg:2023:REH,
author = "Justus Henneberg and Felix Schuhknecht",
title = "{RTIndeX}: Exploiting Hardware-Accelerated {GPU}
Raytracing for Database Indexing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4268--4281",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625063",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625063",
abstract = "Data management on GPUs has become increasingly
relevant due to a tremendous rise in processing power
and available GPU memory. Similar to main-memory
systems, there is a need for performant GPU-resident
index structures to speed up query processing.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lian:2023:CCT,
author = "Jinqing Lian and Xinyi Zhang and Yingxia Shao and
Zenglin Pu and Qingfeng Xiang and Yawen Li and Bin
Cui",
title = "{ContTune}: Continuous Tuning by Conservative
{Bayesian} Optimization for Distributed Stream Data
Processing Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4282--4295",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625064",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625064",
abstract = "The past decade has seen rapid growth of distributed
stream data processing systems. Under these systems, a
stream application is realized as a Directed Acyclic
Graph (DAG) of operators, where the level of
parallelism of each operator has a substantial
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Melissourgos:2023:SUS,
author = "Dimitrios Melissourgos and Haibo Wang and Shigang Chen
and Chaoyi Ma and Shiping Chen",
title = "Single Update Sketch with Variable Counter Structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4296--4309",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625065",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625065",
abstract = "Per-flow size measurement is key to many streaming
applications and management systems, particularly in
high-speed networks. Performing such measurement on the
data plane of a network device at the line rate
requires on-chip memory and computing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Trummer:2023:CLL,
author = "Immanuel Trummer",
title = "Can Large Language Models Predict Data Correlations
from Column Names?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4310--4323",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625066",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625066",
abstract = "Recent publications suggest using natural language
analysis on database schema elements to guide tuning
and profiling efforts. The underlying hypothesis is
that state-of-the-art language processing methods,
so-called language models, are able to extract
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chamani:2023:GTO,
author = "Javad Ghareh Chamani and Ioannis Demertzis and
Dimitrios Papadopoulos and Charalampos Papamanthou and
Rasool Jalili",
title = "{GraphOS}: Towards Oblivious Graph Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4324--4338",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625067",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625067",
abstract = "We propose GraphOS, a system that allows a client that
owns a graph database to outsource it to an untrusted
server for storage and querying. It relies on
doubly-oblivious primitives and trusted hardware to
achieve a very strong privacy and efficiency \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:COC,
author = "Kefei Wang and Feng Chen",
title = "{Catalyst}: Optimizing Cache Management for Large
In-memory Key-value Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "16",
number = "13",
pages = "4339--4352",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3625054.3625068",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:04 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3625054.3625068",
abstract = "In-memory key-value cache systems, such as Memcached
and Redis, are essential in today's data centers. A key
mission of such cache systems is to identify the most
valuable data for caching. To achieve this, the current
system design keeps track of each \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2023:DDL,
author = "Bolong Zheng and Yongyong Gao and Jingyi Wan and
Lingsen Yan and Long Hu and Bo Liu and Yunjun Gao and
Xiaofang Zhou and Christian S. Jensen",
title = "{DecLog}: Decentralized Logging in Non-Volatile Memory
for Time Series Database Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "1--14",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617839",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617839",
abstract = "Growing demands for the efficient processing of
extreme-scale time series workloads call for more
capable time series database management systems
(TSDBMS). Specifically, to maintain consistency and
durability of transaction processing, systems employ
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:EDW,
author = "Fangyuan Zhang and Mengxu Jiang and Sibo Wang",
title = "Efficient Dynamic Weighted Set Sampling and Its
Extension",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "15--27",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617840",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617840",
                 abstract =     "Given a weighted set S of n elements, weighted set
                 sampling (WSS) samples an element in S so that each
                 element a$_i$ is sampled with a probability
                 proportional to its weight w ( a$_i$ ). The classic
                 alias method pre-processes an index in O ( n ) time
                 with O ( n ) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2023:ZLI,
author = "Yiming Lin and Sharad Mehrotra",
title = "{ZIP}: Lazy Imputation during Query Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "28--40",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617841",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617841",
abstract = "This paper develops a query-time missing value
imputation framework, entitled ZIP, that modifies
relational operators to be imputation aware in order to
minimize the joint cost of imputing and query
processing. The modified operators use a cost-based
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:FTA,
author = "Xunkai Li and Zhengyu Wu and Wentao Zhang and Yinlin
Zhu and Rong-Hua Li and Guoren Wang",
title = "{FedGTA}: Topology-Aware Averaging for Federated Graph
Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "41--50",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617842",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617842",
abstract = "Federated Graph Learning (FGL) is a distributed
machine learning paradigm that enables collaborative
training on large-scale subgraphs across multiple local
systems. Existing FGL studies fall into two categories:
(i) FGL Optimization, which improves \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chang:2023:HPM,
author = "Xueqin Chang and Xiangyu Ke and Lu Chen and Congcong
Ge and Ziheng Wei and Yunjun Gao",
title = "Host Profit Maximization: Leveraging Performance
Incentives and User Flexibility",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "51--64",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617843",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617843",
abstract = "The social network host has knowledge of the network
structure and user characteristics and can earn a
profit by providing merchants with viral marketing
campaigns. We investigate the problem of host profit
maximization by leveraging performance \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Patwa:2023:DPP,
author = "Shweta Patwa and Danyu Sun and Amir Gilad and Ashwin
Machanavajjhala and Sudeepa Roy",
title = "{DP-PQD}: Privately Detecting Per-Query Gaps in
Synthetic Data Generated by Black-Box Mechanisms",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "1",
pages = "65--78",
month = sep,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3617838.3617844",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 5 08:24:06 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3617838.3617844",
abstract = "Synthetic data generation methods, and in particular,
private synthetic data generation methods, are gaining
popularity as a means to make copies of sensitive
databases that can be shared widely for research and
data analysis. Some of the fundamental \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wei:2023:CSP,
author = "Ruidi Wei and Florian Kerschbaum",
title = "Cryptographically Secure Private Record Linkage using
Locality-Sensitive Hashing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "79--91",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626293",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626293",
abstract = "Private record linkage (PRL) is the problem of
identifying pairs of records that approximately match
across datasets in a secure, privacy-preserving manner.
Two-party PRL specifically allows each of the parties
to obtain records from the other party, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Arora:2023:LME,
author = "Simran Arora and Brandon Yang and Sabri Eyuboglu and
Avanika Narayan and Andrew Hojel and Immanuel Trummer
and Christopher R{\'e}",
title = "Language Models Enable Simple Systems for Generating
Structured Views of Heterogeneous Data Lakes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "92--105",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626294",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626294",
abstract = "A long standing goal in the data management community
is developing systems that input documents and output
queryable tables without user effort. Given the sheer
variety of potential documents, state-of-the art
systems make simplifying assumptions and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:QRD,
author = "Jinyang Li and Yuval Moskovitch and Julia Stoyanovich
and H. V. Jagadish",
title = "Query Refinement for Diversity Constraint
Satisfaction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "106--118",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626295",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626295",
abstract = "Diversity, group representation, and similar needs
often apply to query results, which in turn require
constraints on the sizes of various subgroups in the
result set. Traditional relational queries only specify
conditions as part of the query predicate \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:EEL,
author = "Zhaoheng Li and Pranav Gor and Rahul Prabhu and Hui Yu
and Yuzhou Mao and Yongjoo Park",
title = "{ElasticNotebook}: Enabling Live Migration for
Computational Notebooks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "119--133",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626296",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626296",
abstract = "Computational notebooks (e.g., Jupyter, Google Colab)
are widely used for interactive data science and
machine learning. In those frameworks, users can start
a session, then execute cells (i.e., a set of
statements) to create variables, train models,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:BNL,
author = "Kecheng Huang and Zhaoyan Shen and Zili Shao and Tong
Zhang and Feng Chen",
title = "Breathing New Life into an Old Tree: Resolving Logging
Dilemma of {B$^+$}-tree on Modern Computational Storage
Drives",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "134--147",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626297",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626297",
abstract = "Having dominated databases and various data management
systems for decades, B$^+$-tree is infamously subject
to a logging dilemma: One could improve B$^+$-tree
speed performance by equipping it with a larger log,
which nevertheless will degrade its crash \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeng:2023:EEC,
author = "Xinyu Zeng and Yulong Hui and Jiahong Shen and Andrew
Pavlo and Wes McKinney and Huanchen Zhang",
title = "An Empirical Evaluation of Columnar Storage Formats",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "148--161",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626298",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626298",
abstract = "Columnar storage is a core component of a modern data
analytics system. Although many database management
systems (DBMSs) have proprietary storage formats, most
provide extensive support to open-source storage
formats such as Parquet and ORC to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2023:EGA,
author = "Yichao Yuan and Haojie Ye and Sanketh Vedula and Wynn
Kaza and Nishil Talati",
title = "{Everest}: {GPU}-Accelerated System for Mining
Temporal Motifs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "162--174",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626299",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626299",
abstract = "Temporal motif mining is the task of finding the
occurrences of subgraph patterns within a large input
temporal graph that obey the specified structural and
temporal constraints. Despite its utility in several
critical application domains that demand \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2023:BSB,
author = "Xueyi Wu and Yuanyuan Xu and Wenjie Zhang and Ying
Zhang",
title = "Billion-Scale Bipartite Graph Embedding: a
Global-Local Induced Approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "175--183",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626300",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626300",
abstract = "Bipartite graph embedding (BGE), as the fundamental
task in bipartite network analysis, is to map each node
to compact low-dimensional vectors that preserve
intrinsic properties. The existing solutions towards
BGE fall into two groups: metric-based \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ni:2023:UAP,
author = "Wangze Ni and Pengze Chen and Lei Chen and Peng Cheng
and Chen Jason Zhang and Xuemin Lin",
title = "Utility-Aware Payment Channel Network Rebalance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "184--196",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626301",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626301",
abstract = "The payment channel network (PCN) is a promising
solution to increase the throughput of blockchains.
However, unidirectional transactions can deplete a
user's deposits in a payment channel (PC), reducing the
success ratio of transactions (SRoT). To \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:AAB,
author = "Pengfei Li and Wenqing Wei and Rong Zhu and Bolin Ding
and Jingren Zhou and Hua Lu",
title = "{ALECE}: an Attention-based Learned Cardinality
Estimator for {SPJ} Queries on Dynamic Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "197--210",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626302",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626302",
abstract = "For efficient query processing, DBMS query optimizers
have for decades relied on delicate cardinality
estimation methods. In this work, we propose an
Attention-based LEarned Cardinality Estimator (ALECE
for short) for SPJ queries. The core idea is to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xia:2023:FLE,
author = "Haojun Xia and Zhen Zheng and Yuchao Li and Donglin
Zhuang and Zhongzhu Zhou and Xiafei Qiu and Yong Li and
Wei Lin and Shuaiwen Leon Song",
title = "{Flash-LLM}: Enabling Cost-Effective and
Highly-Efficient Large Generative Model Inference with
Unstructured Sparsity",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "211--224",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626303",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626303",
abstract = "With the fast growth of parameter size, it becomes
increasingly challenging to deploy large generative
models as they typically require large GPU memory
consumption and massive computation. Unstructured model
pruning has been a common approach to reduce \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Howard:2023:CCF,
author = "Heidi Howard and Fritz Alder and Edward Ashton and
Amaury Chamayou and Sylvan Clebsch and Manuel Costa and
Antoine Delignat-Lavaud and C{\'e}dric Fournet and
Andrew Jeffery and Matthew Kerner and Fotios Kounelis
and Markus A. Kuppe and Julien Maffre and Mark
Russinovich and Christoph M. Wintersteiger",
title = "{Confidential Consortium Framework}: Secure
Multiparty Applications with Confidentiality, Integrity,
and High Availability",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "225--240",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626304",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626304",
abstract = "Confidentiality, integrity protection, and high
availability, abbreviated to CIA, are essential
properties for trustworthy data systems. The rise of
cloud computing and the growing demand for multiparty
applications however means that building modern
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Duan:2023:VVL,
author = "Sijing Duan and Feng Lyu and Xin Zhu and Yi Ding and
Haotian Wang and Desheng Zhang and Xue Liu and Yaoxue
Zhang and Ju Ren",
title = "{VeLP}: Vehicle Loading Plan Learning from Human
Behavior in Nationwide Logistics System",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "241--249",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626305",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626305",
abstract = "For a nationwide logistics transportation system, it
is critical to make the vehicle loading plans (i.e.,
given many packages, deciding vehicle types and
numbers) at each sorting and distribution center. This
task is currently completed by dispatchers \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Naik:2023:RQS,
author = "Aaditya Naik and Aalok Thakkar and Adam Stein and
Rajeev Alur and Mayur Naik",
title = "Relational Query Synthesis $ \bowtie $ Decision Tree
Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "2",
pages = "250--263",
month = oct,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3626292.3626306",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Tue Dec 12 09:42:35 MST 2023",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3626292.3626306",
abstract = "We study the problem of synthesizing a core fragment
of relational queries called select-project-join (SPJ)
queries from input-output examples. Search-based
synthesis techniques are suited to synthesizing
projections and joins by navigating the network
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yao:2023:RRA,
author = "Feng Yao and Qian Tao and Wenyuan Yu and Yanfeng Zhang
and Shufeng Gong and Qiange Wang and Ge Yu and Jingren
Zhou",
title = "{RAGraph}: a Region-Aware Framework for
Geo-Distributed Graph Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "264--277",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632094",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632094",
abstract = "In many global businesses of multinational
enterprises, graph-structure data is usually
geographically distributed in different regions to
support low-latency services. Geo-distributed graph
processing suffers from the Wide Area Networks (WANs)
with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2023:SDB,
author = "Qiuru Lin and Sai Wu and Junbo Zhao and Jian Dai and
Meng Shi and Gang Chen and Feifei Li",
title = "{SmartLite}: a {DBMS-Based} Serving System for {DNN}
Inference in Resource-Constrained Environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "278--291",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632095",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632095",
abstract = "Many IoT applications require the use of multiple deep
neural networks (DNNs) to perform various tasks on
low-cost edge devices with limited computation
resources. However, existing DNN model serving
platforms, such as TensorFlow Serving and TorchServe,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2023:BMC,
author = "Shiwen Wu and Qiyu Wu and Honghua Dong and Wen Hua and
Xiaofang Zhou",
title = "Blocker and Matcher Can Mutually Benefit: a
Co-Learning Framework for Low-Resource Entity
Resolution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "292--304",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632096",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632096",
abstract = "Entity resolution (ER) approaches typically consist of
a blocker and a matcher. They share the same goal and
cooperate in different roles: the blocker first quickly
removes obvious non-matches, and the matcher
subsequently determines whether the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ang:2023:TTS,
author = "Yihao Ang and Qiang Huang and Yifan Bao and Anthony K.
H. Tung and Zhiyong Huang",
title = "{TSGBench}: Time Series Generation Benchmark",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "305--318",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632097",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632097",
abstract = "Synthetic Time Series Generation (TSG) is crucial in a
range of applications, including data augmentation,
anomaly detection, and privacy preservation. Although
significant strides have been made in this field,
existing methods exhibit three key \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Punter:2023:OEM,
author = "Wieger R. Punter and Odysseas Papapetrou and Minos
Garofalakis",
title = "{OmniSketch}: Efficient Multi-Dimensional
High-Velocity Stream Analytics with Arbitrary
Predicates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "319--331",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632098",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632098",
abstract = "A key need in different disciplines is to perform
analytics over fast-paced data streams, similar in
nature to the traditional OLAP analytics in relational
databases --- i.e., with filters and aggregates.
Storing unbounded streams, however, is not a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chung:2023:MBK,
author = "Kai Hiu Chung and Alexander Zhou and Yue Wang and Lei
Chen",
title = "Maximum Balanced $ (k, \epsilon)$-Bitruss Detection in
Signed Bipartite Graph",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "332--344",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632099",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632099",
abstract = "Signed bipartite graphs represent relationships
between two sets of entities, including both positive
and negative interactions, allowing for a more
comprehensive modeling of real-world networks. In this
work, we focus on the detection of cohesive \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:MVI,
author = "Xiao Li and Huan Li and Hua Lu and Christian S. Jensen
and Varun Pandey and Volker Markl",
title = "Missing Value Imputation for Multi-Attribute Sensor
Data Streams via Message Propagation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "345--358",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632100",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632100",
abstract = "Sensor data streams occur widely in various real-time
applications in the context of the Internet of Things
(IoT). However, sensor data streams feature missing
values due to factors such as sensor failures,
communication errors, or depleted batteries. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:IID,
author = "Yuhang Chen and Chaoyun Zhang and Minghua Ma and
Yudong Liu and Ruomeng Ding and Bowen Li and Shilin He
and Saravan Rajmohan and Qingwei Lin and Dongmei
Zhang",
title = "{ImDiffusion}: Imputed Diffusion Models for
Multivariate Time Series Anomaly Detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "359--372",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632101",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632101",
abstract = "Anomaly detection in multivariate time series data is
of paramount importance for large-scale systems.
However, accurately detecting anomalies in such data
poses significant challenges due to the need for
precise data modeling capability. Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2023:CIP,
author = "Dajun Sun and Wei Dong and Ke Yi",
title = "Confidence Intervals for Private Query Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "373--385",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632102",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632102",
abstract = "Whenever randomness is involved in query processing,
confidence intervals are commonly returned to the user
to indicate the statistical significance of the query
answer. However, this problem has not been explicitly
addressed under differential privacy, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liang:2023:SBF,
author = "Zhiyu Liang and Jianfeng Zhang and Chen Liang and
Hongzhi Wang and Zheng Liang and Lujia Pan",
title = "A Shapelet-Based Framework for Unsupervised
Multivariate Time Series Representation Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "386--399",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632103",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632103",
abstract = "Recent studies have shown great promise in
unsupervised representation learning (URL) for
multivariate time series, because URL has the
capability in learning generalizable representation for
many downstream tasks without using inaccessible
labels. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2023:FSE,
author = "Letong Wang and Xiangyun Ding and Yan Gu and Yihan
Sun",
title = "Fast and Space-Efficient Parallel Algorithms for
Influence Maximization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "400--413",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632104",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632104",
abstract = "Influence Maximization (IM) is a crucial problem in
data science. The goal is to find a fixed-size set of
highly influential seed vertices on a network to
maximize the influence spread along the edges. While IM
is NP-hard on commonly used diffusion \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:TEF,
author = "Yile Chen and Gao Cong and Cuauhtemoc Anda",
title = "{TERI}: an Effective Framework for Trajectory Recovery
with Irregular Time Intervals",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "414--426",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632105",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632105",
abstract = "The proliferation of trajectory data has facilitated
various applications in urban spaces, such as travel
time estimation, traffic monitoring, and flow
prediction. These applications require a substantial
volume of high-quality trajectories as the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:DGS,
author = "Yuhan Chen and Haojie Ye and Sanketh Vedula and Alex
Bronstein and Ronald Dreslinski and Trevor Mudge and
Nishil Talati",
title = "Demystifying Graph Sparsification Algorithms in Graph
Properties Preservation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "427--440",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632106",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632106",
abstract = "Graph sparsification is a technique that approximates
a given graph by a sparse graph with a subset of
vertices and/or edges. The goal of an effective
sparsification algorithm is to maintain specific graph
properties relevant to the downstream task while
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cao:2023:GDS,
author = "Jiashen Cao and Rathijit Sen and Matteo Interlandi and
Joy Arulraj and Hyesoon Kim",
title = "{GPU} Database Systems Characterization and
Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "441--454",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632107",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632107",
abstract = "GPUs offer massive parallelism and high-bandwidth
memory access, making them an attractive option for
accelerating data analytics in database systems.
However, while modern GPUs possess more resources than
ever before (e.g., higher DRAM bandwidth), \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:NDG,
author = "Chaoyi Chen and Dechao Gao and Yanfeng Zhang and
Qiange Wang and Zhenbo Fu and Xuecang Zhang and Junhua
Zhu and Yu Gu and Ge Yu",
title = "{NeutronStream}: a Dynamic {GNN} Training Framework
with Sliding Window for Graph Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "455--468",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632108",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632108",
abstract = "Existing Graph Neural Network (GNN) training
frameworks have been designed to help developers easily
create performant GNN implementations. However, most
existing GNN frameworks assume that the input graphs
are static, but ignore that most real-world \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hildred:2023:CLL,
author = "Joshua Hildred and Michael Abebe and Khuzaima
Daudjee",
title = "{Caerus}: Low-Latency Distributed Transactions for
Geo-Replicated Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "469--482",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632109",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632109",
abstract = "Distributed deterministic database systems achieve
high transaction throughput for geographically
replicated data. Supporting transactions with ACID
guarantees requires deterministic databases to order
transactions globally to dictate execution order. In
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:EEA,
author = "Aoqian Zhang and Shuqing Deng and Dongping Cui and Ye
Yuan and Guoren Wang",
title = "An Experimental Evaluation of Anomaly Detection in
Time Series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "483--496",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632110",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632110",
abstract = "Anomaly detection in time series data has been studied
for decades in both statistics and computer science.
Various algorithms have been proposed for different
scenarios, such as fraud detection, environmental
monitoring, manufacturing, and healthcare. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Singh:2023:FAE,
author = "Mukul Singh and Jos{\'e} Cambronero and Sumit Gulwani
and Vu Le and Carina Negreanu and Elnaz Nouri and
Mohammad Raza and Gust Verbruggen",
title = "{FormaT5}: Abstention and Examples for Conditional
Table Formatting with Natural Language",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "497--510",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632111",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632111",
abstract = "Formatting is an important property in tables for
visualization, presentation, and analysis. Spreadsheet
software allows users to automatically format their
tables by writing data-dependent conditional formatting
(CF) rules. Writing such rules is often \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schonberger:2023:QID,
author = "Manuel Sch{\"o}nberger and Immanuel Trummer and
Wolfgang Mauerer",
title = "Quantum-Inspired Digital Annealing for Join Ordering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "511--524",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632112",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632112",
abstract = "Finding the optimal join order (JO) is one of the most
important problems in query optimisation, and has been
extensively considered in research and practise. As it
involves huge search spaces, approximation approaches
and heuristics are commonly used, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Musleh:2023:KSB,
author = "Mashaal Musleh and Mohamed F. Mokbel",
title = "{Kamel}: a Scalable {BERT}-Based System for Trajectory
Imputation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "525--538",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632113",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632113",
abstract = "Numerous important applications rely on detailed
trajectory data. Yet, unfortunately, trajectory
datasets are typically sparse with large spatial and
temporal gaps between each two points, which is a major
hurdle for their accuracy. This paper presents
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:ETL,
author = "Xinyi Zhang and Hong Wu and Yang Li and Zhengju Tang
and Jian Tan and Feifei Li and Bin Cui",
title = "An Efficient Transfer Learning Based Configuration
Adviser for Database Tuning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "539--552",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632114",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632114",
abstract = "In recent years, a wide spectrum of database tuning
systems have emerged to automatically optimize database
performance. However, these systems require a
significant number of workload runs to deliver a
satisfactory level of database performance, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Petralia:2023:ATT,
author = "Adrien Petralia and Philippe Charpentier and Themis
Palpanas",
title = "{ADF \& TransApp}: a Transformer-Based Framework for
Appliance Detection Using Smart Meter Consumption
Series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "553--562",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632115",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632115",
abstract = "Over the past decade, millions of smart meters have
been installed by electricity suppliers worldwide,
allowing them to collect a large amount of electricity
consumption data, albeit sampled at a low frequency
(one point every 30min). One of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wooders:2023:RAA,
author = "Sarah Wooders and Xiangxi Mo and Amit Narang and Kevin
Lin and Ion Stoica and Joseph M. Hellerstein and
Natacha Crooks and Joseph E. Gonzalez",
title = "{RALF}: Accuracy-Aware Scheduling for Feature Store
Maintenance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "563--576",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632116",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632116",
abstract = "Feature stores (also sometimes referred to as
embedding stores) are becoming ubiquitous in model
serving systems: downstream applications query these
stores for auxiliary inputs at inference-time. Stored
features are derived by featurizing rapidly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2023:ALH,
author = "Kaisong Huang and Tianzheng Wang and Qingqing Zhou and
Qingzhong Meng",
title = "The Art of Latency Hiding in Modern Database Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "577--590",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632117",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632117",
abstract = "Modern database engines must well use multicore CPUs,
large main memory and fast storage devices to achieve
high performance. A common theme is hiding latencies
such that more CPU cycles can be dedicated to ``real''
work, improving overall throughput. Yet \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Najafi:2023:MSN,
author = "Mohammad Matin Najafi and Chenhao Ma and Xiaodong Li
and Reynold Cheng and Laks V. S. Lakshmanan",
title = "{MOSER}: Scalable Network {Motif} Discovery Using
Serial Test",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "591--603",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632118",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632118",
abstract = "Given a graph G, a motif (e.g., 3-node clique) is a
fundamental building block for G. Recently, motif-based
graph analysis has attracted much attention due to its
efficacy in tasks such as clustering, ranking, and link
prediction. These tasks require \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:CMP,
author = "Dongxiang Zhang and Teng Ma and Junnan Hu and Yijun
Bei and Kian-Lee Tan and Gang Chen",
title = "Co-Movement Pattern Mining from Videos",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "3",
pages = "604--616",
month = nov,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3632093.3632119",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:36:59 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3632093.3632119",
abstract = "Co-movement pattern mining from GPS trajectories has
been an intriguing subject in spatial-temporal data
mining. In this paper, we extend this research line by
migrating the data source from GPS sensors to
surveillance cameras, and presenting the first
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ge:2023:EAS,
author = "Qian Ge and Yu Liu and Yinghao Zhao and Yuetian Sun
and Lei Zou and Yuxing Chen and Anqun Pan",
title = "Efficient and Accurate {SimRank}-Based Similarity
Joins: Experiments, Analysis, and Improvement",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "617--629",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636219",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636219",
abstract = "SimRank-based similarity joins, which mainly include
                 threshold-based and top-k similarity joins, are
important types of all-pair SimRank queries. Although a
line of related algorithms have been proposed recently,
they still fall short of providing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2023:ERN,
author = "Wentao Li and Maolin Cai and Min Gao and Dong Wen and
Lu Qin and Wei Wang",
title = "Expanding Reverse Nearest Neighbors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "630--642",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636220",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636220",
abstract = "In a graph, the reverse nearest neighbors (RNN) of
vertex f refer to the set of vertices that consider f
as their nearest neighbor. When f represents a facility
like a subway station, its RNN comprises potential
users who prefer the nearest facility. In \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:ESO,
author = "Fuheng Zhao and Divyakant Agrawal and Amr {El Abbadi}
and Ahmed Metwally and Claire Mathieu and Michel de
Rougemont",
title = "Errata for {``SpaceSaving$ \pm $: an Optimal Algorithm
for Frequency Estimation and Frequent Items in the
Bounded-Deletion Model''}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
  pages =        "643--643",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636221",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
note = "See \cite{Zhao:2022:SPO}.",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636221",
abstract = "This errata article points out an implicit assumption
in the work of four of us published in VLDB 2022. The
                 SpaceSaving$ \pm $ algorithm in bounded deletion data
stream presented in the paper implicitly assumed
deletions happen after all insertions. When \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Siddiqui:2023:CET,
author = "Tarique Siddiqui and Vivek Narasayya and Marius
Dumitru and Surajit Chaudhuri",
title = "Cache-Efficient Top-$k$ Aggregation over High
Cardinality Large Datasets",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "644--656",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636222",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636222",
abstract = "Top-k aggregation queries are widely used in data
analytics for summarizing and identifying important
groups from large amounts of data. These queries are
usually processed by first computing exact aggregates
for all groups and then selecting the groups \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cai:2023:ETB,
author = "Xinwei Cai and Xiangyu Ke and Kai Wang and Lu Chen and
Tianming Zhang and Qing Liu and Yunjun Gao",
title = "Efficient Temporal Butterfly Counting and Enumeration
on Temporal Bipartite Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "657--670",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636223",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636223",
abstract = "Bipartite graphs characterize relationships between
two different sets of entities, like actor-movie,
user-item, and author-paper. The butterfly, a
4-vertices 4-edges (2,2)-biclique, is the simplest
cohesive motif in a bipartite graph and is the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhong:2023:TTB,
author = "Tianxiong Zhong and Zhiwei Zhang and Guo Lu and Ye
Yuan and Yu-Ping Wang and Guoren Wang",
title = "{TVM}: a Tile-based Video Management Framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "671--684",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636224",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636224",
abstract = "With the exponential growth of video data, there is a
pressing need for efficient video analysis technology.
Modern query frameworks aim to accelerate queries by
reducing the frequency of calls to expensive deep
neural networks, which often overlook the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:SCR,
author = "Yi Zhang and Jan Deriu and George
Katsogiannis-Meimarakis and Catherine Kosten and
Georgia Koutrika and Kurt Stockinger",
title = "{ScienceBenchmark}: a Complex Real-World Benchmark for
Evaluating Natural Language to {SQL} Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "685--698",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636225",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636225",
abstract = "Natural Language to SQL systems (NL-to-SQL) have
recently shown improved accuracy (exceeding 80\%) for
natural language to SQL query translation due to the
emergence of transformer-based language models, and the
                 popularity of the Spider benchmark. However,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2023:DMS,
author = "Lu Chen and Chengfei Liu and Rui Zhou and Kewen Liao
and Jiajie Xu and Jianxin Li",
title = "Densest Multipartite Subgraph Search in Heterogeneous
Information Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "699--711",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636226",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636226",
abstract = "Cohesive multipartite subgraphs (CMS) in heterogeneous
information networks (HINs) uncover closely connected
vertex groups of multiple types, enhancing real
applications like community search and anomaly
detection. However, existing works for HINs pay
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nagrecha:2023:SOD,
author = "Kabir Nagrecha and Arun Kumar",
title = "{Saturn}: an Optimized Data System for
Multi-Large-Model Deep Learning Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "712--725",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636227",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636227",
abstract = "Large models such as GPT-3 and ChatGPT have
transformed deep learning (DL), powering applications
that have captured the public's imagination. Such
models must be trained on multiple GPUs due to their
size and computational load, driving the development
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cai:2023:BTF,
author = "Miao Cai and Junru Shen and Yifan Yuan and Zhihao Qu
and Baoliu Ye",
title = "{BonsaiKV}: Towards Fast, Scalable, and Persistent
Key--Value Stores with Tiered, Heterogeneous Memory
System",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "726--739",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636228",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636228",
abstract = "Emerging NUMA/CXL-based tiered memory systems with
heterogeneous memory devices such as DRAM and NVMM
deliver ultrafast speed, large capacity, and data
persistence all at once, offering great promise to
high-performance in-memory key-value stores. To
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Reiner:2023:SEC,
author = "Silvan Reiner and Michael Grossniklaus",
title = "Sample-Efficient Cardinality Estimation Using
Geometric Deep Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "740--752",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636229",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636229",
abstract = "In database systems, accurate cardinality estimation
is a cornerstone of effective query optimization. In
this context, estimators that use machine learning have
shown significant promise. Despite their potential, the
effectiveness of these learned \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:MTS,
author = "Kai Zhao and Chenjuan Guo and Yunyao Cheng and Peng
Han and Miao Zhang and Bin Yang",
title = "Multiple Time Series Forecasting with Dynamic Graph
Modeling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "753--765",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636230",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636230",
abstract = "Multiple time series forecasting plays an essential
role in many applications. Solutions based on graph
neural network (GNN) that deliver state-of-the-art
forecasting performance use the relation graph which
can capture historical correlations among time
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2023:WGA,
author = "Yunyao Cheng and Peng Chen and Chenjuan Guo and Kai
Zhao and Qingsong Wen and Bin Yang and Christian S.
Jensen",
title = "Weakly Guided Adaptation for Robust Time Series
Forecasting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "766--779",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636231",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636231",
abstract = "Robust multivariate time series forecasting is crucial
in many cyberphysical and Internet of Things
applications. Existing state-of-the-art robust
forecasting models decompose time series into
independent functions covering trends and
periodicities. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2023:ACA,
author = "Rui Yang and Evgenios M. Kornaropoulos and Yue Cheng",
title = "Algorithmic Complexity Attacks on Dynamic Learned
Indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "780--793",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636232",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636232",
abstract = "Learned Index Structures (LIS) view a sorted index as
a model that learns the data distribution, takes a data
element key as input, and outputs the predicted
position of the key. The original LIS can only handle
lookup operations with no support for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2023:MDC,
author = "Jiaqi Zhu and Shaofeng Cai and Fang Deng and Beng Chin
Ooi and Wenqiao Zhang",
title = "{METER}: a Dynamic Concept Adaptation Framework for
Online Anomaly Detection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "794--807",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636233",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636233",
abstract = "Real-time analytics and decision-making require online
anomaly detection (OAD) to handle drifts in data
streams efficiently and effectively. Unfortunately,
existing approaches are often constrained by their
limited detection capacity and slow adaptation
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2023:EAL,
author = "Hailin Zhang and Penghao Zhao and Xupeng Miao and
Yingxia Shao and Zirui Liu and Tong Yang and Bin Cui",
title = "Experimental Analysis of Large-Scale Learnable Vector
Storage Compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "808--822",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636234",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636234",
abstract = "Learnable embedding vector is one of the most
important applications in machine learning, and is
widely used in various database-related domains.
However, the high dimensionality of sparse data in
recommendation tasks and the huge volume of corpus in
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhao:2023:CSC,
author = "Yue Zhao and Zhaodonghui Li and Gao Cong",
title = "A Comparative Study and Component Analysis of Query
Plan Representation Techniques in {ML4DB} Studies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "823--835",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636235",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636235",
abstract = "Query plan is widely used as input in machine learning
for databases (ML4DB) research, with query plan
representation as a critical step. However, existing
studies typically focus on one task, and propose a
novel design to represent query plans along \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhuang:2023:TGD,
author = "Zeyang Zhuang and Penghui Li and Pingchuan Ma and Wei
Meng and Shuai Wang",
title = "Testing Graph Database Systems via Graph-Aware
Metamorphic Relations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "836--848",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636236",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636236",
abstract = "Graph database systems (GDBs) have supported many
important real-world applications such as social
networks, logistics, and path planning. Meanwhile,
logic bugs are also prevalent in GDBs, leading to
incorrect results and severe consequences. However,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cong:2023:OCE,
author = "Tianji Cong and Madelon Hulsebos and Zhenjie Sun and
Paul Groth and H. V. Jagadish",
title = "Observatory: Characterizing Embeddings of Relational
Tables",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "849--862",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636237",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636237",
abstract = "Language models and specialized table embedding models
have recently demonstrated strong performance on many
tasks over tabular data. Researchers and practitioners
are keen to leverage these models in many new
application contexts; but limited \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2023:FAD,
author = "Taeyoon Kim and ChanHo Park and Mansur Mukimbekov and
Heelim Hong and Minseok Kim and Ze Jin and Changdae Kim
and Ji-Yong Shin and Myeongjae Jeon",
title = "{FusionFlow}: Accelerating Data Preprocessing for
Machine Learning with {CPU--GPU} Cooperation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "863--876",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636238",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636238",
abstract = "Data augmentation enhances the accuracy of DL models
by diversifying training samples through a sequence of
data transformations. While recent advancements in data
augmentation have demonstrated remarkable efficacy,
they often rely on computationally \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mohr-Daurat:2023:BAD,
author = "Hubert Mohr-Daurat and Xuan Sun and Holger Pirk",
title = "{BOSS} --- an Architecture for Database Kernel
Composition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "877--890",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636239",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636239",
abstract = "Composable Database System Research has yielded
components such as Apache Arrow for Storage, Meta's
Velox for processing and Apache Calcite for query
planning. What is lacking, however, is a design for a
general, efficient and easy-to-use architecture to
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhi:2023:CBC,
author = "Xiangyu Zhi and Xiao Yan and Bo Tang and Ziyao Yin and
Yanchao Zhu and Minqi Zhou",
title = "{CoroGraph}: Bridging Cache Efficiency and Work
Efficiency for Graph Algorithm Execution",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "891--903",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636240",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636240",
abstract = "Many systems are designed to run graph algorithms
efficiently in memory but they achieve only cache
efficiency or work efficiency. We tackle this
fundamental trade-off in existing systems by designing
CoroGraph, a system that attains both cache \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2023:MSO,
author = "Audrey Cheng and Jack Waudby and Hugo Firth and
Natacha Crooks and Ion Stoica",
title = "Mammoths are Slow: The Overlooked Transactions of
Graph Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "904--911",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636241",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636241",
abstract = "This paper argues for better concurrency control to
support mammoth transactions, which read and write to
many items. While these requests are prevalent on graph
data, few systems support them efficiently. Currently,
developers must make the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2023:VVS,
author = "Enyuan Zhou and Song Guo and Zicong Hong and Christian
S. Jensen and Yang Xiao and Dalin Zhang and Jinwen
Liang and Qingqi Pei",
title = "{VeriDKG}: a Verifiable {SPARQL} Query Engine for
Decentralized Knowledge Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "4",
pages = "912--925",
month = dec,
year = "2023",
CODEN = "????",
DOI = "https://doi.org/10.14778/3636218.3636242",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Mar 20 07:37:01 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3636218.3636242",
abstract = "The ability to decentralize knowledge graphs (KG) is
important to exploit the full potential of the Semantic
Web and realize the Web 3.0 vision. However,
decentralization also renders KGs more prone to attacks
with adverse effects on data integrity and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Weng:2024:EEP,
author = "Lianggui Weng and Rong Zhu and Di Wu and Bolin Ding
and Bolong Zheng and Jingren Zhou",
title = "{Eraser}: Eliminating Performance Regression on
Learned Query Optimizer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "926--938",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641205",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641205",
abstract = "Efficient query optimization is crucial for database
management systems. Recently, machine learning models
have been applied in query optimizers to generate
better plans, but the unpredictable performance
regressions prevent them from being truly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:HNB,
author = "Chao Zhang and Guoliang Li and Tao Lv",
title = "{HyBench}: a New Benchmark for {HTAP} Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "939--951",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641206",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641206",
abstract = "In this paper, we propose, HyBench, a new benchmark
for HTAP databases. First, we generate the testing data
by simulating a representative HTAP application. We
particularly develop a time-dependent generation phase
and an anomaly generation phase for \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tae:2024:FFA,
author = "Ki Hyun Tae and Hantian Zhang and Jaeyoung Park and
Kexin Rong and Steven Euijong Whang",
title = "{Falcon}: Fair Active Learning Using Multi-Armed
Bandits",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "952--965",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641207",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641207",
abstract = "Biased data can lead to unfair machine learning
models, highlighting the importance of embedding
fairness at the beginning of data analysis,
particularly during dataset curation and labeling. In
response, we propose Falcon, a scalable fair active
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2024:BSC,
author = "Honghu Wu and Xiangrong Zhu and Wei Hu",
title = "A Blockchain System for Clustered Federated Learning
with Peer-to-Peer Knowledge Transfer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "966--979",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641208",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641208",
abstract = "Federated Learning (FL) is a novel distributed,
privacy-preserving machine learning paradigm.
Conventional FL suffers from drawbacks such as single
point of failure and client drift. Blockchain is a
distributed computing architecture famous for
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2024:PSD,
author = "Rong Zhu and Lianggui Weng and Wenqing Wei and Di Wu
and Jiazhen Peng and Yifan Wang and Bolin Ding and Defu
Lian and Bolong Zheng and Jingren Zhou",
title = "{PilotScope}: Steering Databases with Machine Learning
Drivers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "980--993",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641209",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641209",
abstract = "Learned databases, or AI4DB techniques, have rapidly
developed in the last decade. Deploying machine
learning (ML) and AI4DB algorithms into actual
databases is the gold standard to examine their
performance in practice. However, due to the complexity
of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:TSO,
author = "Yishuai Li and Yunfeng Zhu and Chao Shi and Guanhua
Zhang and Jianzhong Wang and Xiaolu Zhang",
title = "Timestamp as a Service, Not an Oracle",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "994--1006",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641210",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641210",
abstract = "We present a logical timestamping mechanism for
ordering transactions in distributed databases,
eliminating the single point of failure (SPoF) that
bother existing timestamp ``oracles''. The main
innovation is a bipartite client-server architecture,
where \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xing:2024:DDI,
author = "Junjie Xing and Xinyu Wang and H. V. Jagadish",
title = "Data-Driven Insight Synthesis for Multi-Dimensional
Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1007--1019",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641211",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641211",
abstract = "Exploratory data analysis can uncover interesting data
insights from data. Current methods utilize
``interestingness measures'' designed based on system
designers' perspectives, thus inherently restricting
                 the insights to their defined scope. These systems,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xing:2024:DNM,
author = "Naili Xing and Shaofeng Cai and Gang Chen and Zhaojing
Luo and Beng Chin Ooi and Jian Pei",
title = "Database Native Model Selection: Harnessing Deep
Neural Networks in Database Systems",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1020--1033",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641212",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641212",
abstract = "The growing demand for advanced analytics beyond
statistical aggregation calls for database systems that
support effective model selection of deep neural
networks (DNNs). However, existing model selection
strategies are based on either training-based
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:QSD,
author = "Kaiyu Chen and Dong Wen and Wenjie Zhang and Ying
Zhang and Xiaoyang Wang and Xuemin Lin",
title = "Querying Structural Diversity in Streaming Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1034--1046",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641213",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641213",
abstract = "Structural diversity of a vertex refers to the
diversity of connections within its neighborhood and
has been applied in various fields such as viral
marketing and user engagement. The paper studies
querying the structural diversity of a vertex for any
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gou:2024:LSE,
author = "Xiangyang Gou and Xinyi Ye and Lei Zou and Jeffrey Xu
Yu",
title = "{LM-SRPQ}: Efficiently Answering Regular Path Query in
Streaming Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1047--1059",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641214",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641214",
abstract = "Regular path query (RPQ) is a basic operation for
graph data analysis, and persistent RPQ in streaming
graphs is a new-emerging research topic. In this paper,
we propose a novel algorithm for persistent RPQ in
streaming graphs, named LM-SRPQ. It solves \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2024:EET,
author = "Shihong Gao and Yiming Li and Yanyan Shen and Yingxia
Shao and Lei Chen",
title = "{ETC}: Efficient Training of Temporal Graph Neural
Networks over Large-Scale Dynamic Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1060--1072",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641215",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641215",
abstract = "Dynamic graphs play a crucial role in various
real-world applications, such as link prediction and
node classification on social media and e-commerce
platforms. Temporal Graph Neural Networks (T-GNNs) have
emerged as a leading approach for handling \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2024:TFS,
author = "Chenyuan Wu and Mohammad Javad Amiri and Haoyun Qin
and Bhavana Mehta and Ryan Marcus and Boon Thau Loo",
title = "Towards Full Stack Adaptivity in Permissioned
Blockchains",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1073--1080",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641216",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641216",
abstract = "This paper articulates our vision for a learning-based
untrustworthy distributed database. We focus on
permissioned blockchain systems as an emerging instance
of untrustworthy distributed databases and argue that
as novel smart contracts, modern hardware, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Han:2024:BLC,
author = "Jindong Han and Weijia Zhang and Hao Liu and Tao Tao
and Naiqiang Tan and Hui Xiong",
title = "{BigST}: Linear Complexity Spatio-Temporal Graph
Neural Network for Traffic Forecasting on Large-Scale
Road Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1081--1090",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641217",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641217",
abstract = "Spatio-Temporal Graph Neural Network (STGNN) has been
used as a common workhorse for traffic forecasting.
However, most of them require prohibitive quadratic
computational complexity to capture long-range
spatio-temporal dependencies, thus hindering \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Min:2024:SWO,
author = "Xinhao Min and Kai Lu and Pengyu Liu and Jiguang Wan
and Changsheng Xie and Daohui Wang and Ting Yao and
Huatao Wu",
title = "{SepHash}: a Write-Optimized Hash Index On
Disaggregated Memory via Separate Segment Structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1091--1104",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641218",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641218",
abstract = "Disaggregated memory separates compute and memory
resources into independent pools connected by fast RDMA
(Remote Direct Memory Access) networks, which can
improve memory utilization, reduce cost, and enable
elastic scaling of compute and memory \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tang:2024:XBM,
author = "Dahai Tang and Jiali Wang and Rong Chen and Lei Wang
and Wenyuan Yu and Jingren Zhou and Kenli Li",
title = "{XGNN}: Boosting Multi-{GPU} {GNN} Training via Global
{GNN} Memory Store",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1105--1118",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641219",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641219",
abstract = "GPUs are commonly utilized to accelerate GNN training,
particularly on a multi-GPU server with high-speed
interconnects (e.g., NVLink and NVSwitch). However, the
rapidly increasing scale of graphs poses a challenge to
applying GNN to real-world \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tao:2024:CEP,
author = "Youming Tao and Cheng-Long Wang and Miao Pan and
Dongxiao Yu and Xiuzhen Cheng and Di Wang",
title = "Communication Efficient and Provable Federated
Unlearning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1119--1131",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641220",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641220",
abstract = "We study federated unlearning, a novel problem to
eliminate the impact of specific clients or data points
on the global model learned via federated learning
(FL). This problem is driven by the right to be
forgotten and the privacy challenges in FL. We
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gao:2024:TSE,
author = "Dawei Gao and Haibin Wang and Yaliang Li and Xiuyu Sun
and Yichen Qian and Bolin Ding and Jingren Zhou",
title = "Text-to-{SQL} Empowered by Large Language Models: a
Benchmark Evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1132--1145",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641221",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641221",
abstract = "Large language models (LLMs) have emerged as a new
paradigm for Text-to-SQL task. However, the absence of
a systematical benchmark inhibits the development of
designing effective, efficient and economic LLM-based
Text-to-SQL solutions. To address this \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mai:2024:SPQ,
author = "Anh L. Mai and Pengyu Wang and Azza Abouzied and
Matteo Brucato and Peter J. Haas and Alexandra Meliou",
title = "Scaling Package Queries to a Billion Tuples via
Hierarchical Partitioning and Customized Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "5",
pages = "1146--1158",
month = jan,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3641204.3641222",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:34 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3641204.3641222",
abstract = "A package query returns a package---a multiset of
tuples---that maximizes or minimizes a linear objective
function subject to linear constraints, thereby
enabling in-database decision support. Prior work has
established the equivalence of package queries
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2024:MIM,
author = "Yuhao Deng and Chengliang Chai and Lei Cao and Nan
Tang and Jiayi Wang and Ju Fan and Ye Yuan and Guoren
Wang",
title = "{MisDetect}: Iterative Mislabel Detection using Early
Loss",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1159--1172",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648161",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648161",
abstract = "Supervised machine learning (ML) models trained on
data with mislabeled instances often produce inaccurate
results due to label errors. Traditional methods of
detecting mislabeled instances rely on data proximity,
where an instance is considered \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2024:CMA,
author = "Wenfei Fan and Muyang Liu and Shuhao Liu and Chao
Tian",
title = "Capturing More Associations by Referencing External
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1173--1186",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648162",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648162",
abstract = "This paper studies association rule discovery in a
graph $G_1$ by referencing an external graph $G_2$ with
overlapping information. The objective is to enrich
$G_1$ with relevant properties and links from $G_2$. As
a testbed, we consider Graph Association Rules
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lin:2024:QEQ,
author = "Longlong Lin and Pingpeng Yuan and Rong-Hua Li and
Chunxue Zhu and Hongchao Qin and Hai Jin and Tao Jia",
title = "{QTCS}: Efficient Query-Centered Temporal Community
Search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1187--1199",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648163",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648163",
abstract = "Temporal community search is an important task in
graph analysis, which has been widely used in many
practical applications. However, existing methods
suffer from two major defects: (i) they only require
that the target result contains the query vertex q,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fu:2024:DAD,
author = "Jie Fu and Qingqing Ye and Haibo Hu and Zhili Chen and
Lulu Wang and Kuncan Wang and Xun Ran",
title = "{DPSUR}: Accelerating Differentially Private
Stochastic Gradient Descent Using Selective Update and
Release",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1200--1213",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648164",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648164",
abstract = "Machine learning models are known to memorize private
data to reduce their training loss, which can be
inadvertently exploited by privacy attacks such as
model inversion and membership inference. To protect
against these attacks, differential privacy (DP)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Erben:2024:HCW,
author = "Alexander Erben and Ruben Mayer and Hans-Arno
Jacobsen",
title = "How Can We Train Deep Learning Models Across Clouds
and Continents? {An} Experimental Study",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1214--1226",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648165",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648165",
abstract = "This paper aims to answer the question: Can deep
learning models be cost-efficiently trained on a global
market of spot VMs spanning different data centers and
cloud providers? To provide guidance, we extensively
evaluate the cost and throughput \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Park:2024:ASA,
author = "Jeongmin Brian Park and Vikram Sharma Mailthody and
Zaid Qureshi and Wen-mei Hwu",
title = "Accelerating Sampling and Aggregation Operations in
{GNN} Frameworks with {GPU} Initiated Direct Storage
Accesses",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1227--1240",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648166",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648166",
abstract = "Graph Neural Networks (GNNs) are emerging as a
powerful tool for learning from graph-structured data
and performing sophisticated inference tasks in various
application domains. Although GNNs have been shown to
be effective on modest-sized graphs, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2024:CEG,
author = "Hao Yuan and Yajiong Liu and Yanfeng Zhang and Xin Ai
and Qiange Wang and Chaoyi Chen and Yu Gu and Ge Yu",
title = "Comprehensive Evaluation of {GNN} Training Systems: a
Data Management Perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1241--1254",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648167",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648167",
abstract = "Many Graph Neural Network (GNN) training systems have
emerged recently to support efficient GNN training.
Since GNNs embody complex data dependencies between
training samples, the training of GNNs should address
distinct challenges different from DNN \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chan:2024:LFH,
author = "Tsz Nam Chan and Rui Zang and Bojian Zhu and Leong Hou
U. and Dingming Wu and Jianliang Xu",
title = "{LION}: Fast and High-Resolution Network Kernel
Density Visualization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1255--1268",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648168",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648168",
abstract = "Network Kernel Density Visualization (NKDV) has often
been used in a wide range of applications, e.g.,
criminology, transportation science, and urban
planning. However, NKDV is computationally expensive,
which cannot be scalable to large-scale datasets
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:PBP,
author = "Zitao Li and Bolin Ding and Liuyi Yao and Yaliang Li
and Xiaokui Xiao and Jingren Zhou",
title = "Performance-Based Pricing for Federated Learning via
Auction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1269--1282",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648169",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648169",
abstract = "Many machine learning techniques rely on plenty of
training data. However, data are often possessed
unequally by different entities, with a large
proportion of data being held by a small number of
data-rich entities. It can be challenging to
incentivize \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Diao:2024:OIO,
author = "Yiqun Diao and Yutong Yang and Qinbin Li and Bingsheng
He and Mian Lu",
title = "{OEBench}: Investigating Open Environment Challenges
in Real-World Relational Data Streams",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1283--1296",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648170",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648170",
abstract = "How to get insights from relational data streams in a
timely manner is a hot research topic. Data streams can
present unique challenges, such as distribution drifts,
outliers, emerging classes, and changing features,
which have recently been described as \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xie:2024:IMV,
author = "Jiadong Xie and Zehua Chen and Deming Chu and Fan
Zhang and Xuemin Lin and Zhihong Tian",
title = "Influence Maximization via Vertex Countering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1297--1309",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648171",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648171",
abstract = "Competitive viral marketing considers the product
competition of multiple companies, where each user may
adopt one product and propagate the product to other
users. Existing studies focus on a traditional seeding
strategy where a company only selects \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:ODA,
author = "Tingting Wang and Shixun Huang and Zhifeng Bao and J.
Shane Culpepper and Volkan Dedeoglu and Reza
Arablouei",
title = "Optimizing Data Acquisition to Enhance Machine
Learning Performance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1310--1323",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648172",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648172",
abstract = "In this paper, we study how to acquire labeled data
points from a large data pool to enrich a training set
for enhancing supervised machine learning (ML)
performance. The state-of-the-art solution is the
clustering-based training set selection (CTS)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:MSC,
author = "Xin Chen and Jieming Shi and You Peng and Wenqing Lin
and Sibo Wang and Wenjie Zhang",
title = "Minimum Strongly Connected Subgraph Collection in
Dynamic Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1324--1336",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648173",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648173",
abstract = "Real-world directed graphs are dynamically changing,
and it is important to identify and maintain the strong
connectivity information between nodes, which is useful
in numerous applications. Given an input graph G, we
study a new problem, minimum \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2024:FDF,
author = "Junhao Zhu and Yuren Mao and Lu Chen and Congcong Ge
and Ziheng Wei and Yunjun Gao",
title = "{FusionQuery}: On-demand Fusion Queries over
Multi-source Heterogeneous Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1337--1349",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648174",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648174",
abstract = "Centralised data management systems (e.g., data lakes)
support queries over multi-source heterogeneous data.
However, the query results from multiple sources
commonly involve between-source conflicts, which makes
query results unreliable and confusing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Justen:2024:PAN,
author = "David Justen and Daniel Ritter and Campbell Fraser and
Andrew Lamb and Allison Lee and Thomas Bodner and Mhd
Yamen Haddad and Steffen Zeuch and Volker Markl and
Matthias Boehm",
title = "{POLAR}: Adaptive and Non-invasive Join Order
Selection via Plans of Least Resistance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1350--1363",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648175",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648175",
abstract = "Join ordering and query optimization are crucial for
query performance but remain challenging due to unknown
or changing characteristics of query intermediates,
especially for complex queries with many joins. Over
the past two decades, a spectrum of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:DAG,
author = "Zhiyuan Li and Xun Jian and Yue Wang and Yingxia Shao
and Lei Chen",
title = "{DAHA}: Accelerating {GNN} Training with Data and
Hardware Aware Execution Planning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1364--1376",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648176",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648176",
abstract = "Graph neural networks (GNNs) have been gaining a
reputation for effective modeling of graph data. Yet,
it is challenging to train GNNs efficiently. Many
frameworks have been proposed but most of them suffer
from high batch preparation cost and data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2024:FSB,
author = "Ziyi Lu and Qiang Cao and Hong Jiang and Yuxing Chen
and Jie Yao and Anqun Pan",
title = "{FluidKV}: Seamlessly Bridging the Gap between
Indexing Performance and Memory-Footprint on Ultra-Fast
Storage",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1377--1390",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648177",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648177",
abstract = "Our extensive experiments reveal that existing
key-value stores (KVSs) achieve high performance at the
expense of a huge memory footprint that is often
impractical or unacceptable. Even with the emerging
ultra-fast byte-addressable persistent memory (PM),
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shah:2024:HDC,
author = "Vraj Shah and Thomas Parashos and Arun Kumar",
title = "How Do Categorical Duplicates Affect {ML}? {A} New
Benchmark and Empirical Analyses",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1391--1404",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648178",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648178",
abstract = "The tedious grunt work involved in data preparation
(prep) before ML reduces ML user productivity. It is
also a roadblock to industrial-scale cloud AutoML
workflows that build ML models for millions of
datasets. One important data prep step for ML is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cui:2024:CUF,
author = "Pengjie Cui and Haotian Liu and Bo Tang and Ye Yuan",
title = "{CGgraph}: an Ultra-Fast Graph Processing System on
Modern Commodity {CPU--GPU} Co-processor",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1405--1417",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648179",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648179",
abstract = "In recent years, many CPU-GPU heterogeneous graph
processing systems have been developed in both academic
and industrial to facilitate large-scale graph
processing in various applications, e.g., social
networks and biological networks. However, the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:FCD,
author = "Xinyu Chen and Jiannan Tian and Ian Beaver and Cynthia
Freeman and Yan Yan and Jianguo Wang and Dingwen Tao",
title = "{FCBench}: Cross-Domain Benchmarking of Lossless
Compression for Floating-Point Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1418--1431",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648180",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648180",
abstract = "While both the database and high-performance computing
(HPC) communities utilize lossless compression methods
to minimize floating-point data size, a disconnect
persists between them. Each community designs and
assesses methods in a domain-specific \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hurst:2024:PFA,
author = "Aaron Hurst and Daniel E. Lucani and Qi Zhang",
title = "{PairwiseHist}: Fast, Accurate and Space-Efficient
Approximate Query Processing with Data Compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1432--1445",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648181",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648181",
abstract = "Exponential growth in data collection is creating
significant challenges for data storage and analytics
latency. Approximate Query Processing (AQP) has long
been touted as a solution for accelerating analytics on
large datasets, however, there is still \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:MAD,
author = "Huayi Zhang and Binwei Yan and Lei Cao and Samuel
Madden and Elke Rundensteiner",
title = "{MetaStore}: Analyzing Deep Learning Meta-Data at
Scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1446--1459",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648182",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648182",
abstract = "The process of training deep learning models produces
a huge amount of meta-data, including but not limited
to losses, hidden feature embeddings, and gradients.
Model diagnosis tools have been developed to analyze
losses and feature embeddings with the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lv:2024:RES,
author = "Yangming Lv and Kai Zhang and Ziming Wang and Xiaodong
Zhang and Rubao Lee and Zhenying He and Yinan Jing and
X. Sean Wang",
title = "{RTScan}: Efficient Scan with Ray Tracing Cores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1460--1472",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648183",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648183",
abstract = "Indexing is a core technique for accelerating
predicate evaluation in databases. After many years of
effort, the indexing performance has reached its peak
on the existing hardware infrastructure. We propose to
use ray tracing (RT) cores to move the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2024:FRM,
author = "Kezhao Huang and Haitian Jiang and Minjie Wang and
Guangxuan Xiao and David Wipf and Xiang Song and Quan
Gan and Zengfeng Huang and Jidong Zhai and Zheng
Zhang",
title = "{FreshGNN}: Reducing Memory Access via Stable
Historical Embeddings for Graph Neural Network
Training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1473--1486",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648184",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648184",
abstract = "A key performance bottleneck when training graph
neural network (GNN) models on large, real-world graphs
is loading node features onto a GPU. Due to limited GPU
memory, expensive data movement is necessary to
facilitate the storage of these features on \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2024:SBA,
author = "Ying Zheng and Kian-Lee Tan",
title = "Sorting on Byte-Addressable Storage: The Resurgence of
Tree Structure",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1487--1500",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648185",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648185",
abstract = "The tree structure is notably popular for storage and
indexing; however, tree-based sorting such as tree sort
is rarely used in practice. Nevertheless, with the
advent of byte-addressable storage (BAS), the tree
structure captures our attention with its \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chatziliadis:2024:EPD,
author = "Xenofon Chatziliadis and Eleni Tzirita Zacharatou and
Alphan Eracar and Steffen Zeuch and Volker Markl",
title = "Efficient Placement of Decomposable Aggregation
Functions for Stream Processing over Large
Geo-Distributed Topologies",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1501--1514",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648186",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648186",
abstract = "A recent trend in stream processing is offloading the
computation of decomposable aggregation functions (DAF)
from cloud nodes to geo-distributed fog/edge devices to
decrease latency and improve energy efficiency.
However, deploying DAFs on low-end \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hou:2024:AEB,
author = "Jiamin Hou and Zhanhao Zhao and Zhouyu Wang and Wei Lu
and Guodong Jin and Dong Wen and Xiaoyong Du",
title = "{AeonG}: an Efficient Built-in Temporal Support in
Graph Databases",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "6",
pages = "1515--1527",
month = feb,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3648160.3648187",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Mon May 6 06:22:36 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3648160.3648187",
abstract = "Real-world graphs are often dynamic and evolve over
time. It is crucial for storing and querying a graph's
evolution in graph databases. However, existing works
either suffer from high storage overhead or lack
efficient temporal query support, or both. \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2024:RIT,
author = "Tao Yu and Zhaonian Zou and Weihua Sun and Yu Yan",
title = "Refactoring Index Tuning Process with Benefit
Estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1528--1541",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654622",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654622",
abstract = "Index tuning is a challenging task aiming to improve
query performance by selecting the most effective
indexes for a database and a workload. Existing
automatic index tuning methods typically rely on
``what-if tools'' to evaluate the benefit of an index
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:LSY,
author = "Xunkai Li and Meihao Liao and Zhengyu Wu and Daohan Su
and Wentao Zhang and Rong-Hua Li and Guoren Wang",
title = "{LightDiC}: a Simple Yet Effective Approach for
Large-Scale Digraph Representation Learning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1542--1551",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654623",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654623",
abstract = "Most existing graph neural networks (GNNs) are limited
to undirected graphs, whose restricted scope of the
captured relational information hinders their
expressive capabilities and deployment. Compared with
undirected graphs, directed graphs (digraphs)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kuang:2024:EDD,
author = "Shulei Kuang and Honghui Yang and Zijing Tan and Shuai
Ma",
title = "Efficient Differential Dependency Discovery",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1552--1564",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654624",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654624",
abstract = "Differential dependencies (DDs) are proposed to
specify constraints on the differences between values,
where the semantics of difference can be ``similar'',
``dissimilar'' and beyond. DDs subsume functional
dependencies (FDs), and find valuable applications
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lehmann:2024:YLQ,
author = "Claude Lehmann and Pavel Sulimov and Kurt Stockinger",
title = "Is Your Learned Query Optimizer Behaving As You
Expect? {A} Machine Learning Perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1565--1577",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654625",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654625",
abstract = "The current boom of learned query optimizers (LQO) can
be explained not only by the general continuous
improvement of deep learning (DL) methods but also by
the straightforward formulation of a query optimization
problem (QOP) as a machine learning (ML) \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:MCK,
author = "Zhuoxing Zhang and Sebastian Link",
title = "Mixed Covers of Keys and Functional Dependencies for
Maintaining the Integrity of Data under Updates",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1578--1590",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654626",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654626",
abstract = "Covers for a set of functional dependencies (FDs) are
fundamental for many areas of data management, such as
integrity maintenance, query optimization, database
design, and data cleaning. When declaring integrity
constraints, keys enjoy native support in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2024:OSH,
author = "Yuhao Deng and Yu Wang and Lei Cao and Lianpeng Qiao
and Yuping Wang and Jingzhe Xu and Yizhou Yan and
Samuel Madden",
title = "Outlier Summarization via Human Interpretable Rules",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1591--1604",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654627",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654627",
abstract = "Outlier detection is crucial for preventing financial
fraud, network intrusions, and device failures. Users
often expect systems to automatically summarize and
interpret outlier detection results to reduce human
effort and convert outliers into \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2024:NEE,
author = "Haitao Yuan and Gao Cong and Guoliang Li",
title = "{Nuhuo}: an Effective Estimation Model for Traffic Speed
Histogram Imputation on a Road Network",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1605--1617",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654628",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654628",
abstract = "Traffic speed histograms show the distribution of
traffic speeds over a certain period. Traffic speed
might not be recorded continuously, leading to missing
histograms for some links on a road network. However,
accurate imputation of missing histograms \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ravikumar:2024:IPP,
author = "Deepak Ravikumar and Alex Yeo and Yiwen Zhu and Aditya
Lakra and Harsha Nagulapalli and Santhosh Ravindran and
Steve Suh and Niharika Dutta and Andrew Fogarty and
Yoonjae Park and Sumeet Khushalani and Arijit Tarafdar
and Kunal Parekh and Subru Krishnan",
title = "Intelligent Pooling: Proactive Resource Provisioning
in Large-scale Cloud Service",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1618--1627",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654629",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654629",
abstract = "The proliferation of big data and analytic workloads
has driven the need for cloud compute and cluster-based
job processing. With Apache Spark, users can process
terabytes of data at ease with hundreds of parallel
executors. Providing low latency access \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ye:2024:EES,
author = "Yutong Ye and Xiang Lian and Mingsong Chen",
title = "Efficient Exact Subgraph Matching via {GNN}-Based Path
Dominance Embedding",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1628--1641",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654630",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654630",
abstract = "The classic problem of exact subgraph matching returns
those subgraphs in a large-scale data graph that are
isomorphic to a given query graph, which has gained
increasing importance in many real-world applications
such as social network analysis, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:LDH,
author = "Zijia Wang and Haoran Liu and Chen Lin and Zhifeng Bao
and Guoliang Li and Tianqing Wang",
title = "Leveraging Dynamic and Heterogeneous Workload
Knowledge to Boost the Performance of Index Advisors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1642--1654",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654631",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654631",
abstract = "Current index advisors often struggle to balance
efficiency and effectiveness when dealing with workload
shifts. This arises from ignorance of the continual
similarity and distant variety in workloads. This paper
proposes a novel learning-based index \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ertl:2024:UPM,
author = "Otmar Ertl",
title = "{UltraLogLog}: a Practical and More Space-Efficient
Alternative to {HyperLogLog} for Approximate Distinct
Counting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1655--1668",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654632",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654632",
abstract = "Since its invention HyperLogLog has become the
standard algorithm for approximate distinct counting.
Due to its space efficiency and suitability for
distributed systems, it is widely used and also
implemented in numerous databases. This work presents
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Gong:2024:RTI,
author = "Zengyang Gong and Yuxiang Zeng and Lei Chen",
title = "Real-Time Insertion Operator for Shared Mobility on
Time-Dependent Road Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1669--1682",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654633",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654633",
abstract = "One of the most important challenges in shared
mobility services ( e.g., ride-sharing and parcel
delivery) is planning routes for workers by considering
real road conditions. To tackle this challenge, the
``insertion operator'', which computes the optimal
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2024:XTM,
author = "Dayi Fan and Rubao Lee and Xiaodong Zhang",
title = "{X-TED}: Massive Parallelization of Tree Edit
Distance",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1683--1696",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654634",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654634",
abstract = "The tree edit distance (TED) has been found in a wide
spectrum of applications in artificial intelligence,
bioinformatics, and other areas, which serves as a
metric to quantify the dissimilarity between two trees.
As applications continue to scale in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Shin:2024:CES,
author = "Wonseok Shin and Siwoo Song and Kunsoo Park and
Wook-Shin Han",
title = "Cardinality Estimation of Subgraph Matching: a
Filtering-Sampling Approach",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1697--1709",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654635",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654635",
abstract = "Subgraph counting is a fundamental problem in
understanding and analyzing graph structured data, yet
computationally challenging. This calls for an accurate
and efficient algorithm for Subgraph Cardinality
Estimation, which is to estimate the number of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liang:2024:ERS,
author = "Qi Liang and Dian Ouyang and Fan Zhang and Jianye Yang
and Xuemin Lin and Zhihong Tian",
title = "Efficient Regular Simple Path Queries under Transitive
Restricted Expressions",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1710--1722",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654636",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654636",
abstract = "There are two fundamental problems in regular simple
path queries (RSPQs). One is the reachability problem
which asks whether there exists a simple path between
the source and the target vertex matching the given
regular expression, and the other is the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhong:2024:MSD,
author = "Shuhan Zhong and Sizhe Song and Weipeng Zhuo and
Guanyao Li and Yang Liu and S.-H. Gary Chan",
title = "A Multi-Scale Decomposition {MLP}-Mixer for Time
Series Analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1723--1736",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654637",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654637",
abstract = "Time series data, including univariate and
multivariate ones, are characterized by unique
composition and complex multi-scale temporal
variations. They often require special consideration of
decomposition and multi-scale modeling to analyze.
Existing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xia:2024:PSS,
author = "Haocheng Xia and Xiang Li and Junyuan Pang and Jinfei
Liu and Kui Ren and Li Xiong",
title = "{P-Shapley}: {Shapley} Values on Probabilistic
Classifiers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1737--1750",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654638",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654638",
abstract = "The Shapley value provides a unique approach to
equitably gauge each player's contribution within a
coalition and has extensive applications with various
utility functions. In data valuation for machine
learning, particularly for classification tasks,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{He:2024:OVS,
author = "Wenjia He and Ibrahim Sabek and Yuze Lou and Michael
Cafarella",
title = "Optimizing Video Selection {LIMIT} Queries with
Commonsense Knowledge",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1751--1764",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654639",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654639",
abstract = "Video is becoming a major part of contemporary data
collection. It is increasingly important to process
video selection queries --- selecting videos that
contain target objects. Advances in neural networks
allow us to detect the objects in an image, and
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huo:2024:ZZT,
author = "Nan Huo and Reynold Cheng and Ben Kao and Wentao Ning
and Nur Al Hasan Haldar and Xiaodong Li and Jinyang Li
and Mohammad Matin Najafi and Tian Li and Ge Qu",
title = "{ZeroEA}: a Zero-Training Entity Alignment Framework
via Pre-Trained Language Model",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1765--1774",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654640",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654640",
abstract = "Entity alignment (EA), a crucial task in knowledge
graph (KG) research, aims to identify equivalent
entities across different KGs to support downstream
tasks like KG integration, text-to-SQL, and
question-answering systems. Given rich semantic
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2024:EGR,
author = "Xueli Liu and Bowen Dong and Wenzhi Fu and Nannan Wu
and Xin Wang and Wenjun Wang",
title = "Extending Graph Rules with Oracles",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "7",
pages = "1775--1787",
month = mar,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3654621.3654641",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Fri May 31 09:17:13 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3654621.3654641",
abstract = "This paper proposes a class of graph rules for
deducing associations between entities, referred to as
Graph Rules with Oracles and denoted by GROs. As
opposed to previous graph rules, GROs support oracle
functions to import (a) external knowledge, and (b).
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mei:2024:FME,
author = "Junyi Mei and Shixuan Sun and Chao Li and Cheng Xu and
Cheng Chen and Yibo Liu and Jing Wang and Cheng Zhao
and Xiaofeng Hou and Minyi Guo and Bingsheng He and
Xiaoliang Cong",
title = "{FlowWalker}: a Memory-Efficient and High-Performance
{GPU}-Based Dynamic Graph Random Walk Framework",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1788--1801",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659438",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659438",
abstract = "Dynamic graph random walk (DGRW) emerges as a
practical tool for capturing structural relations
within a graph. Effectively executing DGRW on GPU
presents certain challenges. First, existing sampling
methods demand a pre-processing buffer, causing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kim:2024:ASK,
author = "Minsu Kim and Jinwoo Hwang and Guseul Heo and Seiyeon
Cho and Divya Mahajan and Jongse Park",
title = "Accelerating String-Key Learned Index Structures via
Memoization-Based Incremental Training",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1802--1815",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659439",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659439",
abstract = "Learned indexes use machine learning models to learn
the mappings between keys and their corresponding
positions in key-value indexes. These indexes use the
mapping information as training data. Learned indexes
require frequent retrainings of their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liao:2024:TBC,
author = "Xuankun Liao and Qing Liu and Xin Huang and Jianliang
Xu",
title = "Truss-Based Community Search over Streaming Directed
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1816--1829",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659440",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659440",
abstract = "Community search aims to retrieve dense subgraphs that
contain the query vertices. While many effective
community models and algorithms have been proposed in
the literature, none of them address the unique
challenges posed by streaming graphs, where \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Salazar-Diaz:2024:IDM,
author = "Ricardo Salazar-D{\'\i}az and Boris Glavic and Tilmann
Rabl",
title = "{InferDB}: In-Database Machine Learning Inference
Using Indexes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1830--1842",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659441",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659441",
abstract = "The performance of inference with machine learning
(ML) models and its integration with analytical query
processing have become critical bottlenecks for data
analysis in many organizations. An ML inference
pipeline typically consists of a preprocessing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wei:2024:AAM,
author = "Fei Wei and Ergute Bao and Xiaokui Xiao and Yin Yang
and Bolin Ding",
title = "{AAA}: an Adaptive Mechanism for Locally
Differentially Private Mean Estimation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1843--1855",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659442",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659442",
abstract = "Local differential privacy (LDP) is a strong privacy
standard that has been adopted by popular software
systems, including Chrome, iOS, MacOS, and Windows. The
main idea is that each individual perturbs their own
data locally, and only submits the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2024:AMP,
author = "Yangshen Deng and Muxi Yan and Bo Tang",
title = "Accelerating {Merkle} {Patricia} Trie with {GPU}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1856--1869",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659443",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659443",
abstract = "Merkle Patricia Trie (MPT) is a type of trie structure
that offers efficient lookup and insert operators for
immutable data systems that require multi-version
access and tamper-evident controls, such as blockchains
and verifiable databases. The \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:PAS,
author = "Shaowei Wang and Yun Peng and Jin Li and Zikai Wen and
Zhipeng Li and Shiyu Yu and Di Wang and Wei Yang",
title = "Privacy Amplification via Shuffling: Unified,
Simplified, and Tightened",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1870--1883",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659444",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659444",
abstract = "The shuffle model of differential privacy provides
promising privacy-utility balances in decentralized,
privacy-preserving data analysis. However, the current
analyses of privacy amplification via shuffling lack
both tightness and generality. To address \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Song:2024:DMR,
author = "Jiansen Song and Wensheng Dou and Yu Gao and Ziyu Cui
and Yingying Zheng and Dong Wang and Wei Wang and Jun
Wei and Tao Huang",
title = "Detecting Metadata-Related Logic Bugs in Database
Systems via Raw Database Construction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1884--1897",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659445",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659445",
abstract = "Database Management Systems (DBMSs) are widely used to
efficiently store and retrieve data. DBMSs usually
support various metadata, e.g., integrity constraints
for ensuring data integrity and indexes for locating
data. DBMSs can further utilize these \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wu:2024:ZHD,
author = "Biao Wu and Qiang Huang and Anthony K. H. Tung",
title = "From Zero to Hero: Detecting Leaked Data through
Synthetic Data Injection and Model Querying",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1898--1910",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659446",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659446",
abstract = "Safeguarding the Intellectual Property (IP) of data
has become critically important as machine learning
applications continue to proliferate, and their success
heavily relies on the quality of training data. While
various mechanisms exist to secure data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:OOD,
author = "Guanduo Chen and Zhenying He and Meng Li and Siqiang
Luo",
title = "{Oasis}: an Optimal Disjoint Segmented Learned Range
Filter",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1911--1924",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659447",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659447",
abstract = "The learning-enhanced data structure has inspired the
development of the range filter, bringing significantly
better false positive rate (FPR) than traditional
non-learned range filters. Its core idea is to employ
piece-wise linear functions that \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2024:LBD,
author = "Yuhao Deng and Chengliang Chai and Lei Cao and Qin
Yuan and Siyuan Chen and Yanrui Yu and Zhaoze Sun and
Junyi Wang and Jiajun Li and Ziqi Cao and Kaisen Jin
and Chi Zhang and Yuqing Jiang and Yuanfang Zhang and
Yuping Wang and Ye Yuan and Guoren Wang and Nan Tang",
title = "{LakeBench}: a Benchmark for Discovering Joinable and
Unionable Tables in Data Lakes",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1925--1938",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659448",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659448",
abstract = "Discovering tables from poorly maintained data lakes
is a significant challenge in data management. Two key
tasks are identifying joinable and unionable tables,
crucial for data integration, analysis, and machine
learning. However, there's a lack of a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lao:2024:GMR,
author = "Jiale Lao and Yibo Wang and Yufei Li and Jianping Wang
and Yunjia Zhang and Zhiyuan Cheng and Wanghu Chen and
Mingjie Tang and Jianguo Wang",
title = "{GPTuner}: a Manual-Reading Database Tuning System via
{GPT}-Guided {Bayesian} Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1939--1952",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659449",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659449",
abstract = "Modern database management systems (DBMS) expose
hundreds of configurable knobs to control system
behaviours. Determining the appropriate values for
these knobs to improve DBMS performance is a
long-standing problem in the database community. As
there is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ermshaus:2024:RCS,
author = "Arik Ermshaus and Patrick Sch{\"a}fer and Ulf Leser",
title = "Raising the {ClaSS} of Streaming Time Series
Segmentation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1953--1966",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659450",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659450",
abstract = "Ubiquitous sensors today emit high frequency streams
of numerical measurements that reflect properties of
human, animal, industrial, commercial, and natural
processes. Shifts in such processes, e.g. caused by
external events or internal state changes, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:FLS,
author = "Qiyan Li and Jeffrey Xu Yu",
title = "Fast Local Subgraph Counting",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1967--1980",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659451",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659451",
abstract = "We study local subgraph counting queries, $Q = (p,
o)$, to count how many times a given $k$-node pattern
graph $p$ appears around every node $\upsilon$ in a
data graph $G$ when the given center node $o$ in $p$ maps
to $\upsilon$. Such local subgraph counting becomes
important in GNNs \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:RER,
author = "Yunjia Zhang and Jordan Henkel and Avrilia Floratou
and Joyce Cahoon and Shaleen Deep and Jignesh M.
Patel",
title = "{ReAcTable}: Enhancing {ReAct} for Table Question
Answering",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1981--1994",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659452",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659452",
abstract = "Table Question Answering (TQA) presents a substantial
challenge at the intersection of natural language
processing and data analytics. This task involves
answering natural language (NL) questions on top of
tabular data, demanding proficiency in logical
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ai:2024:NRS,
author = "Xin Ai and Qiange Wang and Chunyu Cao and Yanfeng
Zhang and Chaoyi Chen and Hao Yuan and Yu Gu and Ge
Yu",
title = "{NeutronOrch}: Rethinking Sample-Based {GNN} Training
under {CPU--GPU} Heterogeneous Environments",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "1995--2008",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659453",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659453",
abstract = "Graph Neural Networks (GNNs) have shown exceptional
performance across a wide range of applications.
Current frameworks leverage CPU-GPU heterogeneous
environments for GNN model training, incorporating
mini-batch and sampling techniques to mitigate GPU
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2024:RED,
author = "Zifan Liu and Shaleen Deep and Anna Fariha and Fotis
Psallidas and Ashish Tiwari and Avrilia Floratou",
title = "{Rapidash}: Efficient Detection of Constraint
Violations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2009--2021",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659454",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659454",
abstract = "Denial Constraint (DC) is a well-established formalism
that captures a wide range of integrity constraints
commonly encountered, including candidate keys,
functional dependencies, and ordering constraints,
among others. Given their significance, there
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Mohapatra:2024:DPD,
author = "Shubhankar Mohapatra and Jianqiao Zong and Florian
Kerschbaum and Xi He",
title = "Differentially Private Data Generation with Missing
Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2022--2035",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659455",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659455",
abstract = "Despite several works that succeed in generating
synthetic data with differential privacy (DP)
guarantees, they are inadequate for generating
high-quality synthetic data when the input data has
missing values. In this work, we formalize the problems
of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Su:2024:EYA,
author = "Zhaoyuan Su and Ammar Ahmed and Zirui Wang and Ali
Anwar and Yue Cheng",
title = "Everything You Always Wanted to Know About Storage
Compressibility of Pre-Trained {ML} Models but Were
Afraid to Ask",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2036--2049",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659456",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659456",
abstract = "As the number of pre-trained machine learning (ML)
models is growing exponentially, data reduction tools
are not catching up. Existing data reduction techniques
are not specifically designed for pre-trained model
(PTM) dataset files. This is largely due \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Li:2024:FFF,
author = "Haoyang Li and Shimin Di and Calvin Hong Yi Li and Lei
Chen and Xiaofang Zhou",
title = "Fight Fire with Fire: Towards Robust Graph Neural
Networks on Dynamic Graphs via Actively Defense",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2050--2063",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659457",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659457",
abstract = "Graph neural networks (GNNs) have achieved great
success on various graph tasks. However, recent studies
have revealed that GNNs are vulnerable to injective
attacks. Due to the openness of platforms, attackers
can inject malicious nodes with carefully \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zirak:2024:SLB,
author = "Farzaneh Zirak and Farhana Choudhury and Renata
Borovica-Gajic",
title = "{SeLeP}: Learning Based Semantic Prefetching for
Exploratory Database Workloads",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2064--2076",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659458",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659458",
abstract = "Prefetching is a crucial technique employed in
traditional databases to enhance interactivity,
particularly in the context of data exploration. Data
exploration is a query processing paradigm in which
users search for insights buried in the data, often
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:CEF,
author = "Yiwei Chen and Kaiyu Li and Guoliang Li and Yong
Wang",
title = "Contributions Estimation in Federated Learning: a
Comprehensive Experimental Evaluation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2077--2090",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659459",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659459",
abstract = "Federated Learning (FL) provides a privacy-preserving
and decentralized approach to collaborative machine
learning for multiple FL clients. The contribution
estimation mechanism in FL is extensively studied
within the database community, which aims to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Maroulis:2024:VAT,
author = "Stavros Maroulis and Vassilis Stamatopoulos and George
Papastefanatos and Manolis Terrovitis",
title = "Visualization-Aware Time Series Min-Max Caching with
Error Bound Guarantees",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2091--2103",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659460",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659460",
abstract = "This paper addresses the challenges in interactive
visual exploration of large multi-variate time series
data. Traditional data reduction techniques may improve
latency but can distort visualizations.
State-of-the-art methods aimed at 100\% accurate
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kayali:2024:CFM,
author = "Moe Kayali and Anton Lykov and Ilias Fountalis and
Nikolaos Vasiloglou and Dan Olteanu and Dan Suciu",
title = "{Chorus}: Foundation Models for Unified Data Discovery
and Exploration",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2104--2114",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659461",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659461",
abstract = "We apply foundation models to data discovery and
exploration tasks. Foundation models are large language
models (LLMs) that show promising performance on a
range of diverse tasks unrelated to their training. We
show that these models are highly \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Leis:2024:CND,
author = "Viktor Leis and Christian Dietrich",
title = "Cloud-Native Database Systems and Unikernels:
Reimagining {OS} Abstractions for Modern Hardware",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "8",
pages = "2115--2122",
month = apr,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3659437.3659462",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Jun 1 06:18:48 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3659437.3659462",
abstract = "This paper explores the intersection of operating
systems and database systems, focusing on the potential
of specialized kernels for cloud-native database
systems. Although the idea of custom, DBMS-optimized OS
kernels is old, it is largely unrealized \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xiong:2024:CEC,
author = "Haoran Xiong and Hang Zhang and Zeyu Wang and Zhenying
He and Peng Wang and X. Sean Wang",
title = "{CIVET}: Exploring Compact Index for Variable-Length
Subsequence Matching on Time Series",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2123--2135",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665845",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665845",
abstract = "Nowadays the demands for managing and analyzing
substantially increasing collections of time series are
becoming more challenging. Subsequence matching, as a
core subroutine in time series analysis, has drawn
significant research attention. Most of the \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kittivorawong:2024:SGV,
author = "Chanwut Kittivorawong and Yongming Ge and Yousef Helal
and Alvin Cheung",
title = "{Spatialyze}: a Geospatial Video Analytics System with
Spatial-Aware Optimizations",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2136--2148",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665846",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665846",
abstract = "Videos that are shot using commodity hardware such as
phones and surveillance cameras record various metadata
such as time and location. We encounter such geospatial
videos on a daily basis and such videos have been
growing in volume significantly. Yet, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yin:2024:OMS,
author = "Hanyan Yin and Dongxie Wen and Jiajun Li and Zhewei
Wei and Xiao Zhang and Zengfeng Huang and Feifei Li",
title = "Optimal Matrix Sketching over Sliding Windows",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2149--2161",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665847",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665847",
abstract = "Matrix sketching, aimed at approximating a matrix $ A
\in R^{N \times d} $ consisting of vector streams of
length $N$ with a smaller sketching matrix $ B \in R^{l
\times d}, l \ll N$, has garnered increasing attention
in fields such as large-scale data analytics and
machine learning. A \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Baca:2024:WFE,
author = "Radim Baca",
title = "Window Function Expression: Let the Self-Join Enter",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2162--2174",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665848",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665848",
abstract = "Window function expressions (WFEs) became part of the
SQL:2003 standard, and since then, they have often been
implemented in database systems (DBS). They are
especially essential to OLAP DBSs, and people use them
daily. Even though WFEs are a heavily \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kakaraparthy:2024:SSD,
author = "Aarati Kakaraparthy and Jignesh M. Patel",
title = "{SplitDF}: Splitting Dataframes for Memory-Efficient
Data Analysis",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2175--2184",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665849",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665849",
abstract = "Dataframe is a popular construct in data analysis
libraries that offers a tabular view of the data.
However, data within a dataframe often has redundancy,
which can lead to high memory utilization of data
analysis libraries. Inspired by the process of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Daliri:2024:SMI,
author = "Majid Daliri and Juliana Freire and Christopher Musco
and A{\'e}cio Santos and Haoxiang Zhang",
title = "Sampling Methods for Inner Product Sketching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2185--2197",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665850",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665850",
abstract = "Recently, Bessa et al. (PODS 2023) showed that
sketches based on coordinated weighted sampling
theoretically and empirically outperform popular linear
sketching methods like Johnson-Lindentrauss projection
and CountSketch for the ubiquitous problem of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Hu:2024:DDI,
author = "Han Hu and Jiye Qiu and Hongzhi Wang and Bin Liang and
Songling Zou",
title = "{DIDS}: Double Indices and Double Summarizations for
Fast Similarity Search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2198--2211",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665851",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665851",
abstract = "Data series has been one of the significant data forms
in various applications. It becomes imperative to
devise a data series index that supports both
approximate and exact similarity searches for large
data series collections in high-dimensional metric
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Xu:2024:IGC,
author = "Qian Xu and Juan Yang and Feng Zhang and Zheng Chen
and Jiawei Guan and Kang Chen and Ju Fan and Youren
Shen and Ke Yang and Yu Zhang and Xiaoyong Du",
title = "Improving Graph Compression for Efficient
Resource-Constrained Graph Analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2212--2226",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665852",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665852",
abstract = "Recent studies have shown the promise of directly
processing compressed graphs. However, its benefits
have been limited by high peak-memory usage and
unbearably long compression time. In this paper, we
introduce Laconic, a novel rule-based graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:EUC,
author = "Jianwei Wang and Kai Wang and Xuemin Lin and Wenjie
Zhang and Ying Zhang",
title = "Efficient Unsupervised Community Search with
Pre-Trained Graph Transformer",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2227--2240",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665853",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665853",
abstract = "Community search has aroused widespread interest in
the past decades. Among existing solutions, the
learning-based models exhibit outstanding performance
in terms of accuracy by leveraging labels to (1) train
the model for community score learning, and (2)
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wei:2024:LLS,
author = "Jiuqi Wei and Botao Peng and Xiaodong Lee and Themis
Palpanas",
title = "{DET-LSH}: a Locality-Sensitive Hashing Scheme with
Dynamic Encoding Tree for Approximate Nearest Neighbor
Search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2241--2254",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665854",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665854",
abstract = "Locality-sensitive hashing (LSH) is a well-known
solution for approximate nearest neighbor (ANN) search
in high-dimensional spaces due to its robust
theoretical guarantee on query accuracy. Traditional
LSH-based methods mainly focus on improving the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2024:BEA,
author = "Haoyu Liu and Siqiang Luo",
title = "{BIRD}: Efficient Approximation of Bidirectional
Hidden Personalized {PageRank}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2255--2268",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665855",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665855",
abstract = "In bipartite graph analysis, similarity measures play
a pivotal role in various applications. Among existing
metrics, the Bidirectional Hidden Personalized PageRank
(BHPP) stands out for its superior query quality.
However, the computational expense of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yu:2024:GGP,
author = "Zihao Yu and Ningyi Liao and Siqiang Luo",
title = "{GENTI}: {GPU}-Powered Walk-Based Subgraph Extraction
for Scalable Representation Learning on Dynamic
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2269--2278",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665856",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665856",
abstract = "Graph representation learning is an emerging task for
effectively embedding graph-structured data with
learned features. Among them, Subgraph-based GRL (SGRL)
methods have demonstrated better scalability and
expressiveness for large-scale tasks. The core
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Feuer:2024:ANF,
author = "Benjamin Feuer and Yurong Liu and Chinmay Hegde and
Juliana Freire",
title = "{ArcheType}: a Novel Framework for Open-Source Column
Type Annotation Using Large Language Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2279--2292",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665857",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665857",
abstract = "Existing deep-learning approaches to semantic column
type annotation (CTA) have important shortcomings: they
rely on semantic types which are fixed at training
time; require a large number of training samples per
                 type; incur high run-time inference costs; \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chang:2024:TSM,
author = "Yanchuan Chang and Egemen Tanin and Gao Cong and
Christian S. Jensen and Jianzhong Qi",
title = "Trajectory Similarity Measurement: an Efficiency
Perspective",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2293--2306",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665858",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665858",
abstract = "Trajectories that capture object movement have
numerous applications, in which similarity computation
between trajectories often plays a key role.
Traditionally, trajectory similarity is quantified by
means of non-learned measures, e.g., Hausdorff, that
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wheatman:2024:BUF,
author = "Brian Wheatman and Xiaojun Dong and Zheqi Shen and
Laxman Dhulipala and Jakub Lacki and Prashant Pandey
and Helen Xu",
title = "{BYO}: a Unified Framework for Benchmarking
Large-Scale Graph Containers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2307--2320",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665859",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665859",
abstract = "A fundamental building block in any graph algorithm is
                 a graph container --- a data structure used to
represent the graph. Ideally, a graph container enables
efficient access to the underlying graph, has low space
usage, and supports updating the graph \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhu:2024:SVD,
author = "Yizheng Zhu and Yuncheng Wu and Zhaojing Luo and Beng
Chin Ooi and Xiaokui Xiao",
title = "Secure and Verifiable Data Collaboration with Low-Cost
Zero-Knowledge Proofs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2321--2334",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665860",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665860",
abstract = "Federated Learning (FL) emerges as a viable solution
to facilitate data collaboration, enabling multiple
clients to collaboratively train a machine learning
(ML) model under the supervision of a central server
while ensuring the confidentiality of their \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Nagda:2024:RDD,
author = "Heena Nagda and Shubhendra Pal Singhal and Mohammad
Javad Amiri and Boon Thau Loo",
title = "{Rashnu}: Data-Dependent Order-Fairness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2335--2348",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665861",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665861",
  abstract =     "Distributed data management systems use State Machine
Replication (SMR) to provide fault tolerance. The SMR
algorithm enables Byzantine Fault-Tolerant (BFT)
protocols to guarantee safety and liveness despite the
malicious failure of nodes. However, SMR \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Huang:2024:SBA,
author = "Yuchuan Huang and Mohamed F. Mokbel",
title = "{Sparcle}: Boosting the Accuracy of Data Cleaning
Systems through Spatial Awareness",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2349--2362",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665862",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665862",
abstract = "Though data cleaning systems have earned great success
and wide spread in both academia and industry, they
fall short when trying to clean spatial data. The main
reason is that state-of-the-art data cleaning systems
mainly rely on functional dependency \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qiu:2024:TTC,
author = "Xiangfei Qiu and Jilin Hu and Lekui Zhou and Xingjian
Wu and Junyang Du and Buang Zhang and Chenjuan Guo and
Aoying Zhou and Christian S. Jensen and Zhenli Sheng
and Bin Yang",
title = "{TFB}: Towards Comprehensive and Fair Benchmarking of
Time Series Forecasting Methods",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2363--2377",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665863",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665863",
abstract = "Time series are generated in diverse domains such as
economic, traffic, health, and energy, where
forecasting of future values has numerous important
applications. Not surprisingly, many forecasting
methods are being proposed. To ensure progress, it is
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Liu:2024:FFG,
author = "Chengjun Liu and Zhuo Peng and Weiguo Zheng and Lei
Zou",
title = "{FSM}: a Fine-Grained Splitting and Merging Framework
for Dual-Balanced Graph Partition",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2378--2391",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665864",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665864",
abstract = "Partitioning a large graph into smaller subgraphs by
minimizing the number of cutting vertices and edges,
namely cut size or replication factor, plays a crucial
role in distributed graph processing tasks. However,
many prior works have primarily focused \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Marchesin:2024:ERE,
author = "Stefano Marchesin and Gianmaria Silvello",
title = "Efficient and Reliable Estimation of Knowledge Graph
Accuracy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "9",
pages = "2392--2403",
month = may,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3665844.3665865",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:54 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3665844.3665865",
abstract = "Data accuracy is a central dimension of data quality,
especially when dealing with Knowledge Graphs (KGs).
Auditing the accuracy of KGs is essential to make
informed decisions in entity-oriented services or
applications. However, manually evaluating the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2024:BID,
author = "Wei Zhou and Chen Lin and Xuanhe Zhou and Guoliang
Li",
title = "Breaking It Down: an In-Depth Study of Index
Advisors",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2405--2418",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675035",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675035",
abstract = "Index advisors aim to improve workload performance by
judiciously selecting an appropriate set of indexes.
Various heuristic-based and learning-based methods have
been proposed. However, there lacks a comprehensive
assessment of existing index advisors, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Deng:2024:AMC,
author = "Wen Deng and Weiguo Zheng and Hong Cheng",
title = "Accelerating Maximal Clique Enumeration via Graph
Reduction",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
  pages =        "2419--2431",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675036",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675036",
abstract = "As a fundamental task in graph data management,
maximal clique enumeration (MCE) has attracted
extensive attention from both academic and industrial
communities due to its wide range of applications.
However, MCE is very challenging as the number of
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bai:2024:PPB,
author = "Jiyang Bai and Peixiang Zhao",
title = "{Poligras}: Policy-Based Graph Summarization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2432--2444",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675037",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675037",
abstract = "Large graphs are ubiquitous. Their sizes, rates of
growth, and complexity, however, have significantly
outpaced human capabilities to ingest and make sense of
them. As a cost-effective graph simplification
technique, graph summarization is aimed to \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zheng:2024:SSW,
author = "Leqian Zheng and Lei Xu and Cong Wang and Sheng Wang
and Yuke Hu and Zhan Qin and Feifei Li and Kui Ren",
title = "{SWAT}: a System-Wide Approach to Tunable Leakage
Mitigation in Encrypted Data Stores",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2445--2458",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675038",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675038",
abstract = "Numerous studies have underscored the significant
privacy risks associated with various leakage patterns
in encrypted data stores. While many solutions have
been proposed to mitigate these leakages, they either
(1) incur substantial overheads, (2) focus \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:TTI,
author = "Kai Wang and Yuwei Xu and Siqiang Luo",
title = "{TIGER}: Training Inductive Graph Neural Network for
Large-Scale Knowledge Graph Reasoning",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2459--2472",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675039",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675039",
abstract = "Knowledge Graph (KG) Reasoning plays a vital role in
various applications by predicting missing facts from
existing knowledge. Inductive KG reasoning approaches
based on Graph Neural Networks (GNNs) have shown
impressive performance, particularly when \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:ISW,
author = "Chao Zhang and Angela Bonifati and M. Tamer {\"O}zsu",
title = "Incremental Sliding Window Connectivity over Streaming
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2473--2486",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675040",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675040",
abstract = "We study index-based processing for connectivity
queries within sliding windows on streaming graphs.
These queries, which determine whether two vertices
belong to the same connected component, are fundamental
operations in real-time graph data processing
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cai:2024:CEC,
author = "Qingpeng Cai and Kaiping Zheng and H. V. Jagadish and
Beng Chin Ooi and James Yip",
title = "{CohortNet}: Empowering Cohort Discovery for
Interpretable Healthcare Analytics",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2487--2500",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675041",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675041",
abstract = "Cohort studies are of significant importance in the
field of healthcare analytics. However, existing
methods typically involve manual, labor-intensive, and
expert-driven pattern definitions or rely on simplistic
clustering techniques that lack medical \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:EIM,
author = "Jinghao Wang and Yanping Wu and Xiaoyang Wang and Ying
Zhang and Lu Qin and Wenjie Zhang and Xuemin Lin",
title = "Efficient Influence Minimization via Node Blocking",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2501--2513",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675042",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675042",
abstract = "Given a graph G, a budget k and a misinformation seed
set S, Influence Minimization (IMIN) via node blocking
aims to find a set of k nodes to be blocked such that
the expected spread of S is minimized. This problem
finds important applications in \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2024:DBD,
author = "Xuanhe Zhou and Guoliang Li and Zhaoyan Sun and
Zhiyuan Liu and Weize Chen and Jianming Wu and Jiesi
Liu and Ruohang Feng and Guoyang Zeng",
title = "{D-Bot}: Database Diagnosis System using Large
Language Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2514--2527",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675043",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675043",
abstract = "Database administrators (DBAs) play an important role
in managing database systems. However, it is hard and
tedious for DBAs to manage vast database instances and
give timely response (waiting for hours is intolerable
in many online cases). In addition, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Qiao:2024:BFS,
author = "Yiming Qiao and Yihan Gao and Huanchen Zhang",
title = "{Blitzcrank}: Fast Semantic Compression for In-Memory
Online Transaction Processing",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2528--2540",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675044",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675044",
abstract = "We present Blitzcrank, a high-speed semantic
compressor designed for OLTP databases. Previous
solutions are inadequate for compressing row-stores:
they suffer from either low compression factor due to a
coarse compression granularity or suboptimal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:SSS,
author = "Zhihao Chen and Tianji Yang and Yixiao Zheng and Zhao
Zhang and Cheqing Jin and Aoying Zhou",
title = "{Spectrum}: Speedy and Strictly-Deterministic Smart
Contract Transactions for Blockchain Ledgers",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2541--2554",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675045",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675045",
abstract = "Today, blockchain ledgers utilize concurrent
deterministic execution schemes to scale up. However,
ordering fairness is not preserved in these schemes:
although they ensure all replicas achieve the same
serial order, this order does not always align with
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:FCG,
author = "Zihao Zhang and Huiqi Hu and Xuan Zhou and Yaofeng Tu
and Weining Qian and Aoying Zhou",
title = "Fast Commitment for Geo-Distributed Transactions via
Decentralized Co-Coordinators",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2555--2567",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675046",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675046",
abstract = "In a geo-distributed database, data shards and their
respective replicas are deployed in distinct
datacenters across multiple regions, enabling
regional-level disaster recovery and the ability to
serve global users locally. However, transaction
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lerner:2024:CRS,
author = "Alberto Lerner and Gustavo Alonso",
title = "{CXL} and the Return of Scale-Up Database Engines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2568--2575",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675047",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675047",
abstract = "The trend toward specialized processing devices such
as TPUs, DPUs, GPUs, and FPGAs has exposed the
weaknesses of PCIe in interconnecting these devices and
their hosts. Several attempts have been proposed to
improve, augment, or downright replace PCIe, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2024:IAC,
author = "Shuheng Fang and Kangfei Zhao and Yu Rong and Zhixun
Li and Jeffrey Xu Yu",
title = "Inductive Attributed Community Search: To Learn
Communities Across Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2576--2589",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675048",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675048",
abstract = "Attributed community search (ACS) aims to identify
subgraphs satisfying both structure cohesiveness and
attribute homogeneity in attributed graphs, for a given
query that contains query nodes and query attributes.
Previously, algorithmic approaches deal \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yuan:2024:ELC,
author = "Long Yuan and Xia Li and Zi Chen and Xuemin Lin and
Xiang Zhao and Wenjie Zhang",
title = "{I/O} Efficient Label-Constrained Reachability Queries
in Large Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2590--2602",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675049",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675049",
abstract = "Computing the reachability between two vertices in a
graph is a fundamental problem in graph data analysis.
Most of the existing works assume that the edges in the
graph have no labels, but in many real application
scenarios, edges naturally come with \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Lu:2024:DSR,
author = "Baotong Lu and Kaisong Huang and Chieh-Jan Mike Liang
and Tianzheng Wang and Eric Lo",
title = "{DEX}: Scalable Range Indexing on Disaggregated
Memory",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2603--2616",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675050",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675050",
abstract = "Memory disaggregation can potentially allow
memory-optimized range indexes such as B+-trees to
scale beyond one machine while attaining high hardware
utilization and low cost. Designing scalable indexes on
disaggregated memory, however, is challenging
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ni:2024:ADR,
author = "Wei Ni and Xiaoye Miao and Xiangyu Zhao and Yangyang
Wu and Shuwei Liang and Jianwei Yin",
title = "Automatic Data Repair: Are We Ready to Deploy?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2617--2630",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675051",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675051",
abstract = "Data quality is paramount in today's data-driven
world, especially in the era of generative AI. Dirty
data with errors and inconsistencies usually leads to
flawed insights, unreliable decision-making, and biased
or low-quality outputs from generative \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chang:2024:BHM,
author = "Chaokun Chang and Eric Lo and Chunxiao Ye",
title = "{Biathlon}: Harnessing Model Resilience for
Accelerating {ML} Inference Pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2631--2640",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675052",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675052",
abstract = "Machine learning inference pipelines commonly
encountered in data science and industries often
require real-time responsiveness due to their
user-facing nature. However, meeting this requirement
becomes particularly challenging when certain input
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zeng:2024:DSD,
author = "Yuanyuan Zeng and Chenhao Ma and Yixiang Fang",
title = "Distributed Shortest Distance Labeling on Large-Scale
Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2641--2653",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675053",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675053",
abstract = "Distance labeling approaches are widely adopted to
speed up the shortest-distance query performance. Due
to the explosive growth of data graphs, a single
machine can hardly satisfy the requirements of both
computational power and memory capacity, which
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Luo:2024:EPD,
author = "Wensheng Luo and Yixiang Fang and Chunxu Lin and
Yingli Zhou",
title = "Efficient Parallel {D}-Core Decomposition at Scale",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2654--2667",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675054",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675054",
abstract = "Directed graphs are prevalent in social networks, web
networks, and communication networks. A well-known
concept of the directed graph is the D-core, or ( k, l
)-core, which is the maximal subgraph in which each
vertex has an in-degree not less than k and \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Pellegrina:2024:EDS,
author = "Leonardo Pellegrina and Fabio Vandin",
title = "Efficient Discovery of Significant Patterns with
Few-Shot Resampling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "10",
pages = "2668--2680",
month = jun,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3675034.3675055",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Wed Aug 7 06:07:55 MDT 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3675034.3675055",
abstract = "Significant pattern mining is a fundamental task in
mining transactional data, requiring to identify
patterns significantly associated with the value of a
given feature, the target. In several applications,
such as biomedicine, basket market analysis, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:RBP,
author = "Qixu Chen and Raymond Chi-Wing Wong",
title = "Robust Best Point Selection under Unreliable User
Feedback",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2681--2693",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681955",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681955",
abstract = "The task of finding a user's utility function
(representing the user's preference) by asking them to
compare pairs of points through a series of questions,
each requiring him/her to compare 2 points for choosing
a more preferred one, to find the best \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Cheng:2024:TOT,
author = "Audrey Cheng and Aaron Kabcenell and Jason Chan and
Xiao Shi and Peter Bailis and Natacha Crooks and Ion
Stoica",
title = "Towards Optimal Transaction Scheduling",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2694--2707",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681956",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681956",
abstract = "Maximizing transaction throughput is key to
high-performance database systems, which focus on
minimizing data access conflicts to improve
performance. However, finding efficient schedules that
reduce conflicts remains an open problem. For
efficiency, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Campos:2024:QDE,
author = "David Campos and Bin Yang and Tung Kieu and Miao Zhang
and Chenjuan Guo and Christian S. Jensen",
title = "{QCore}: Data-Efficient, On-Device Continual
Calibration for Quantized Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2708--2721",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681957",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681957",
abstract = "We are witnessing an increasing availability of
streaming data that may contain valuable information on
the underlying processes. It is thus attractive to be
able to deploy machine learning models, e.g., for
classification, on edge devices near sensors \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:EAP,
author = "Yalong Zhang and Rong-Hua Li and Qi Zhang and Hongchao
Qin and Lu Qin and Guoren Wang",
title = "Efficient Algorithms for Pseudoarboricity Computation
in Large Static and Dynamic Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2722--2734",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681958",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681958",
abstract = "The arboricity $a(G)$ of a graph $G$ is defined as the
minimum number of edge-disjoint forests that the edge
set of $G$ can be partitioned into. It is a fundamental
metric and has been widely used in many graph analysis
applications. However, computing $a(G)$ is \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:RPB,
author = "Meng Chen and Kai Zhang and Zhenying He and Yinan Jing
and X. Sean Wang",
title = "{RoarGraph}: a Projected Bipartite Graph for Efficient
Cross-Modal Approximate Nearest Neighbor Search",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2735--2749",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681959",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681959",
abstract = "Approximate Nearest Neighbor Search (ANNS) is a
fundamental and critical component in many
applications, including recommendation systems and
large language model-based applications. With the
advancement of multimodal neural models, which
transform data \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fan:2024:CSL,
author = "Ju Fan and Zihui Gu and Songyue Zhang and Yuxin Zhang
and Zui Chen and Lei Cao and Guoliang Li and Samuel
Madden and Xiaoyong Du and Nan Tang",
title = "Combining Small Language Models and Large Language
Models for Zero-Shot {NL2SQL}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2750--2763",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681960",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681960",
abstract = "Zero-shot natural language to SQL (NL2SQL) aims to
generalize pretrained NL2SQL models to new environments
(e.g., new databases and new linguistic phenomena)
without any annotated NL2SQL samples from these
environments. Existing approaches either use \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guliyev:2024:DGD,
author = "Rustam Guliyev and Aparajita Haldar and Hakan
Ferhatosmanoglu",
title = "{D3-GNN}: Dynamic Distributed Dataflow for Streaming
Graph Neural Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2764--2777",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681961",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681961",
abstract = "Graph Neural Network (GNN) models on streaming graphs
entail algorithmic challenges to continuously capture
its dynamic state, as well as systems challenges to
optimize latency, memory, and throughput during both
inference and training. We present D3-GNN, \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Su:2024:DBO,
author = "Yunxiang Su and Shaoxu Song and Xiangdong Huang and
Chen Wang and Jianmin Wang",
title = "Distance-Based Outlier Query Optimization in {Apache
IoTDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2778--2790",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681962",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681962",
abstract = "While outlier detection has been widely studied over
streaming data, the query of outliers in time series
databases was largely overlooked. Apache IoTDB, an
open-source time series database, employs LSM-tree
based storage to support intensive writing \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2024:TMF,
author = "Jianye Yang and Sheng Fang and Zhaoquan Gu and Ziyi Ma
and Xuemin Lin and Zhihong Tian",
title = "{TC-Match}: Fast Time-Constrained Continuous Subgraph
Matching",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2791--2804",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681963",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681963",
abstract = "Continuously monitoring structural patterns in
streaming graphs is a critical task in many real-time
graph-based applications. In this paper, we study the
problem of time-constrained continuous subgraph
matching (shorted as TCSM) over streaming graphs.
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wornow:2024:AEF,
author = "Michael Wornow and Avanika Narayan and Krista
Opsahl-Ong and Quinn McIntyre and Nigam Shah and
Christopher R{\'e}",
title = "Automating the Enterprise with Foundation Models",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2805--2812",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681964",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681964",
abstract = "Automating enterprise workflows could unlock \$4
trillion/year in productivity gains. Despite being of
interest to the data management community for decades,
the ultimate vision of end-to-end workflow automation
has remained elusive. Current solutions \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Tian:2024:EIT,
author = "Anxin Tian and Alexander Zhou and Yue Wang and Xun
Jian and Lei Chen",
title = "Efficient Index for Temporal Core Queries over
Bipartite Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2813--2825",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681965",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681965",
abstract = "Many real-world binary relations can be modelled as
bipartite graphs, which can be inherently temporal and
each edge is associated with a timestamp. The $(\alpha,
\beta)$-core, a popular structure that requires minimum
degrees over two layers of vertices, is useful
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Kato:2024:UFF,
author = "Fumiyuki Kato and Li Xiong and Shun Takagi and Yang
Cao and Masatoshi Yoshikawa",
title = "{Uldp-FL}: Federated Learning with Across-Silo
User-Level Differential Privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2826--2839",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681966",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681966",
abstract = "Differentially Private Federated Learning (DP-FL) has
garnered attention as a collaborative machine learning
approach that ensures formal privacy. Most DP-FL
approaches ensure DP at the record-level within each
silo for cross-silo FL. However, a single \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Yang:2024:EFI,
author = "Junyong Yang and Ming Zhong and Yuanyuan Zhu and
Tieyun Qian and Mengchi Liu and Jeffrey Xu Yu",
title = "Evolution Forest Index: Towards Optimal Temporal
$k$-Core Component Search via Time-Topology Isomorphic
Computation",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2840--2853",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681967",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681967",
abstract = "For a temporal graph like transaction network, finding
a densely connected subgraph that contains a vertex
like a suspicious account during a period is valuable.
Thus, we study the Temporal $k$-Core Component Search
(TCCS) problem, which aims to find a \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ma:2024:EDP,
author = "Yuxin Ma and Ping Gong and Tianming Wu and Jiawei Yi
and Chengru Yang and Cheng Li and Qirong Peng and
Guiming Xie and Yongcheng Bao and Haifeng Liu and
Yinlong Xu",
title = "Eliminating Data Processing Bottlenecks in {GNN}
Training over Large Graphs via Two-level Feature
Compression",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2854--2866",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681968",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681968",
abstract = "Training GNNs over large graphs faces a severe data
processing bottleneck, involving both sampling and
feature loading. To tackle this issue, we introduce
F$^2$CGT, a fast GNN training system incorporating
feature compression. To avoid potential accuracy
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Rumbaugh:2024:TSI,
author = "Douglas B. Rumbaugh and Dong Xie and Zhuoyue Zhao",
title = "Towards Systematic Index Dynamization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2867--2879",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681969",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681969",
abstract = "There is significant interest in examining large
datasets using complex domain-specific queries. In many
cases, these queries can be accelerated using
specialized indexes. Unfortunately, the development of
a practical index is difficult, because \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Treder-Tschechlov:2024:ECB,
author = "Dennis Treder-Tschechlov and Manuel Fritz and Holger
Schwarz and Bernhard Mitschang",
title = "Ensemble Clustering Based on Meta-Learning and
Hyperparameter Optimization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2880--2892",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681970",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681970",
abstract = "Efficient clustering algorithms, such as $k$-Means, are
often used in practice because they scale well for
large datasets. However, they are only able to detect
simple data characteristics. Ensemble clustering can
overcome this limitation by combining \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Guo:2024:ESR,
author = "Chenjuan Guo and Ronghui Xu and Bin Yang and Ye Yuan
and Tung Kieu and Yan Zhao and Christian S. Jensen",
title = "Efficient Stochastic Routing in Path-Centric Uncertain
Road Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2893--2905",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681971",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681971",
abstract = "The availability of massive vehicle trajectory data
enables the modeling of road-network constrained
movement as travel-cost distributions rather than just
single-valued costs, thereby capturing the inherent
uncertainty of movement and enabling improved
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Bonifati:2024:TPG,
author = "Angela Bonifati and Filip Murlak and Yann Ramusat",
title = "Transforming Property Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2906--2918",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681972",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681972",
abstract = "In this paper, we study a declarative framework for
specifying transformations of property graphs. In order
to express such transformations, we leverage queries
formulated in the Graph Pattern Calculus (GPC), which
is an abstraction of the common core of \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2024:LLM,
author = "Yushi Sun and Hao Xin and Kai Sun and Yifan Ethan Xu
and Xiao Yang and Xin Luna Dong and Nan Tang and Lei
Chen",
title = "Are Large Language Models a Good Replacement of
Taxonomies?",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2919--2932",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681973",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681973",
abstract = "Large language models (LLMs) demonstrate an impressive
ability to internalize knowledge and answer natural
language questions. Although previous studies validate
that LLMs perform well on general knowledge while
presenting poor performance on long-tail \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhang:2024:EAD,
author = "Yalong Zhang and Rong-Hua Li and Qi Zhang and Hongchao
Qin and Guoren Wang",
title = "Efficient Algorithms for Density Decomposition on
Large Static and Dynamic Graphs",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2933--2945",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681974",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681974",
abstract = "Locally-densest subgraph (LDS) decomposition is a
fundamental decomposition in graph analysis that finds
numerous applications in various domains, including
community detection, fraud detection, graph querying,
and graph visualization. However, the LDS \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Zhou:2024:EMM,
author = "Yingli Zhou and Yixiang Fang and Chenhao Ma and Tianci
Hou and Xin Huang",
title = "Efficient Maximal {Motif}-Clique Enumeration over
Large Heterogeneous Information Networks",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2946--2959",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681975",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681975",
abstract = "In the heterogeneous information network (HIN), a
motif-clique is a ``complete graph'' for a given motif
(or a small connected graph) that could capture the
desired relationship in the motif. The maximal
motif-cliques of HINs have found various \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sheng:2024:OCR,
author = "Zeang Sheng and Wentao Zhang and Yangyu Tao and Bin
Cui",
title = "{OUTRE}: an {OUT-of-Core De-REdundancy} {GNN} Training
Framework for Massive Graphs within A Single Machine",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2960--2973",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681976",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681976",
abstract = "Sampling-based Graph Neural Networks (GNNs) have
become the de facto standard for handling various graph
learning tasks on large-scale graphs. As the graph size
grows larger and even exceeds the standard host memory
size of a single machine, out-of-core \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Fang:2024:RSA,
author = "Chenguang Fang and Zijie Chen and Shaoxu Song and
Xiangdong Huang and Chen Wang and Jianmin Wang",
title = "On Reducing Space Amplification with Multi-Column
Compaction in {Apache IoTDB}",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2974--2986",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681977",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681977",
abstract = "Log-structured merge trees (LSM-trees) are commonly
employed as the storage engines for write-intensive
workloads in modern time series databases including
Apache IoTDB. Following append-only principle,
LSM-trees can handle intensive writes and updates,
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Schmidl:2024:AUH,
author = "Sebastian Schmidl and Felix Naumann and Thorsten
Papenbrock",
title = "{AutoTSAD}: Unsupervised Holistic Anomaly Detection
for Time Series Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "2987--3002",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681978",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681978",
abstract = "Detecting anomalous subsequences in time series data
is one of the key tasks in time series analytics,
having applications in environmental monitoring,
preventive healthcare, predictive maintenance, and many
further areas. Data scientists have developed
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Chen:2024:EWB,
author = "Zheng Chen and Feng Zhang and Yang Chen and Xiaokun
Fang and Guanyu Feng and Xiaowei Zhu and Wenguang Chen
and Xiaoyong Du",
title = "Enabling Window-Based Monotonic Graph Analytics with
Reusable Transitional Results for Pattern-Consistent
Queries",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3003--3016",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681979",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681979",
abstract = "Evolving graphs consisting of slices are large and
constantly changing. For example, in Alipay, the graph
generates hundreds of millions of new transaction
records every day. Analyzing the graph within a
temporary window is time-consuming due to the
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Ramos:2024:WAS,
author = "Maria Ramos and Jo{\~a}o Azevedo and Kyle Kingsbury
and Jos{\'e} Pereira and T{\^a}nia Esteves and Ricardo
Macedo and Jo{\~a}o Paulo",
title = "When Amnesia Strikes: Understanding and Reproducing
Data Loss Bugs with Fault Injection",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3017--3030",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681980",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681980",
abstract = "We present LazyFS, a new fault injection tool that
simplifies the debugging and reproduction of complex
data durability bugs experienced by databases,
key-value stores, and other data-centric systems in
crashes. Our tool simulates persistence properties
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Wang:2024:PTA,
author = "Leixia Wang and Qingqing Ye and Haibo Hu and Xiaofeng
Meng",
title = "{PriPL-Tree}: Accurate Range Query for Arbitrary
Distribution under Local Differential Privacy",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3031--3044",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681981",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681981",
abstract = "Answering range queries in the context of Local
Differential Privacy (LDP) is a widely studied problem
in Online Analytical Processing (OLAP). Existing LDP
solutions all assume a uniform data distribution within
each domain partition, which may not align \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Sun:2024:WWS,
author = "Yu Sun and Jingyu Zhu and Xiao Xu and Xian Xu and
Yuyao Sun and Shaoxu Song and Xiang Li and Xiaojie
Yuan",
title = "{Win-Win}: On Simultaneous Clustering and Imputing
over Incomplete Data",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3045--3057",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681982",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681982",
abstract = "Although clustering methods have shown promising
performance in various applications, they cannot
effectively handle incomplete data. Existing studies
often impute missing values first before clustering
analysis and conduct these two processes \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Takagi:2024:HDP,
author = "Shun Takagi and Li Xiong and Fumiyuki Kato and Yang
Cao and Masatoshi Yoshikawa",
title = "{HRNet}: Differentially Private Hierarchical and
Multi-Resolution Network for Human Mobility Data
Synthesization",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3058--3071",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681983",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681983",
abstract = "Human mobility data offers valuable insights for many
applications such as urban planning and pandemic
response, but its use also raises privacy concerns. In
this paper, we introduce the Hierarchical and
Multi-Resolution Network (HRNet), a novel deep
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}
@Article{Dong:2024:EMI,
author = "Sijie Dong and Qitong Wang and Soror Sahri and Themis
Palpanas and Divesh Srivastava",
title = "Efficiently Mitigating the Impact of Data Drift on
Machine Learning Pipelines",
journal = j-PROC-VLDB-ENDOWMENT,
volume = "17",
number = "11",
pages = "3072--3081",
month = jul,
year = "2024",
CODEN = "????",
DOI = "https://doi.org/10.14778/3681954.3681984",
ISSN = "2150-8097",
ISSN-L = "2150-8097",
bibdate = "Sat Nov 9 16:34:53 MST 2024",
bibsource = "https://www.math.utah.edu/pub/tex/bib/vldbe.bib",
URL = "https://dl.acm.org/doi/10.14778/3681954.3681984",
abstract = "Despite the increasing success of Machine Learning
(ML) techniques in real-world applications, their
maintenance over time remains challenging. In
particular, the prediction accuracy of deployed ML
models can suffer due to significant changes between
\ldots{}",
acknowledgement = ack-nhfb,
ajournal = "Proc. VLDB Endowment",
fjournal = "Proceedings of the VLDB Endowment",
journal-URL = "https://dl.acm.org/loi/pvldb",
}