@comment{
  Levin_Finley_2017: journal article entry.
  The abstractNote field is not a standard BibTeX field and is ignored by
  standard styles. It carries the abstract as plain text: section headings
  (Objective, Introduction, Methods, Results, Conclusions), the two figure
  captions, and the keyword list. Bracketed numbers such as [1] and [2] are
  the abstract's own reference markers, not citation keys in this file.
}
@article{Levin_Finley_2017,
  author       = {Levin, Drew and Finley, Patrick},
  title        = {A Spatial Biosurveillance Synthetic Data Generator in {R}},
  journal      = {Online Journal of Public Health Informatics},
  volume       = {9},
  number       = {1},
  year         = {2017},
  month        = may,
  doi          = {10.5210/ojphi.v9i1.7583},
  url          = {https://ojphi.org/ojs/index.php/ojphi/article/view/7583},
  abstractNote = {Objective: To develop a spatially accurate biosurveillance
    synthetic data generator for the testing, evaluation, and comparison of
    new outbreak detection techniques.
    Introduction: Development of new methods for the rapid detection of
    emerging disease outbreaks is a research priority in the field of
    biosurveillance. Because real-world data are often proprietary in nature,
    scientists must utilize synthetic data generation methods to evaluate new
    detection methodologies. Colizza et. al. have shown that epidemic spread
    is dependent on the airline transportation network [1], yet current data
    generators do not operate over network structures. Here we present a new
    spatial data generator that models the spread of contagion across a
    network of cities connected by airline routes. The generator is developed
    in the R programming language and produces data compatible with the
    popular `surveillance' software package.
    Methods: Colizza et. al. demonstrate the power-law relationships between
    city population, air traffic, and degree distribution [1]. We generate a
    transportation network as a Chung-Lu random graph [2] that preserves
    these scale-free relationships (Figure 1). First, given a power-law
    exponent and a desired number of cities, a probability mass function
    (PMF) is generated that mirrors the expected degree distribution for the
    given power-law relationship. Values are then sampled from this PMF to
    generate an expected degree (number of connected cities) for each city in
    the network. Edges (airline connections) are added to the network
    probabilistically as described in [2]. Unconnected graph components are
    each joined to the largest component using linear preferential
    attachment. Finally, city sizes are calculated based on an observed
    three-quarter power-law scaling relationship with the sampled degree
    distribution. Each city is represented as a customizable stochastic
    compartmental SIR model. Transportation between cities is modeled similar
    to [2]. An infection is initialized in a single random city and infection
    counts are recorded in each city for a fixed period of time. A consistent
    fraction of the modeled infection cases are recorded as daily clinic
    visits. These counts are then added onto statically generated baseline
    data for each city to produce a full synthetic data set. Alternatively,
    data sets can be generated using real-world networks, such as the one
    maintained by the International Air Transport Association.
    Results: Dynamics such as the number of cities, degree distribution
    power-law exponent, traffic flow, and disease kinetics can be customized.
    In the presented example (Figure 2) the outbreak spreads over a 20 city
    transportation network. Infection spreads rapidly once the more populated
    hub cities are infected. Cities that are multiple flights away from the
    initially infected city are infected late in the process. The generator
    is capable of creating data sets of arbitrary size, length, and
    connectivity to better mirror a diverse set of observed network types.
    Conclusions: New computational methods for outbreak detection and
    surveillance must be compared to established approaches. Outbreak
    mitigation strategies require a realistic model of human transportation
    behavior to best evaluate impact. These actions require test data that
    accurately reflect the complexity of the real-world data they would be
    applied to. The outbreak data generated here represents the complexity of
    modern transportation networks and are made to be easily integrated with
    established software packages to allow for rapid testing and deployment.
    Figure 1: Randomly generated scale-free transportation network with a
    power-law degree exponent of $\lambda = 1.8$. City and link sizes are
    scaled to reflect their weight.
    Figure 2: An example of observed daily outbreak-related clinic visits
    across a randomly generated network of 20 cities. Each city is colored by
    the number of flights required to reach the city from the initial
    infection location. These generated counts are then added onto baseline
    data to create a synthetic data set for experimentation.
    Keywords: Simulation; Network; Spatial; Synthetic; Data},
}