@article{Chen_Dubrawski_2017,
  author        = {Chen, Jieshi and Dubrawski, Artur},
  title         = {Identification of Sufferers of Rare Diseases Using Medical Claims Data},
  journal       = {Online Journal of Public Health Informatics},
  volume        = {9},
  number        = {1},
  year          = {2017},
  month         = may,
  doi           = {10.5210/ojphi.v9i1.7607},
  url           = {https://ojphi.org/ojs/index.php/ojphi/article/view/7607},
  affiliation   = {Auton Lab, Carnegie Mellon University, Pittsburgh, PA, USA},
  copyright     = {ISDS Annual Conference Proceedings 2017. This is an Open Access
                   article distributed under the terms of the Creative Commons
                   Attribution-Noncommercial 3.0 Unported License
                   (http://creativecommons.org/licenses/by-nc/3.0/), permitting all
                   non-commercial use, distribution, and reproduction in any medium,
                   provided the original work is properly cited.},
  abstractNote  = {Objective: To identify sufferers of a rare and hard to diagnose
                   diseases by detecting sequential patterns in historical medical
                   claims.
                   Introduction: Patients who suffer from rare diseases can be hard
                   to diagnose for prolonged periods of time. In the process, they
                   are often subjected to tentative treatments for ailments they do
                   not have, risking an escalation of their actual condition and
                   side effects from therapies they do not need. An early and
                   accurate detection of these cases would enable follow-ups for
                   precise diagnoses, mitigating the costs of unnecessary care and
                   improving patients' outcomes.
                   Methods: A sequential rule learning algorithm [1] was applied to
                   a medical claim dataset of about 1,700 patients, who are
                   pre-selected to have medical histories indicative of Gaucher
                   Disease (GD) but only 25 of these patients were confirmed
                   positives. About 168,000 medical claims and 142,000
                   pharmaceutical claims were featurized into sequences of
                   asynchronous events and regularly sampled time series as inputs
                   for the model, such that an occurrence of a certain diagnosis
                   code in a medical claim was counted as one event along the
                   timeline of the patient's medical history. Similar method was
                   applied to other key attributes of claims data including
                   procedure codes, National Drug Codes, Diagnosis Related
                   Groupers, etc. These types of events as well as their temporal
                   statistics, e.g. moving frequencies, peaks, change points, etc.,
                   formed the input feature space for the algorithm which was
                   trained to adjudicate each test case and estimate their
                   likelihood of having GD. A random forest algorithm was also
                   applied to the same feature set to comparatively evaluate the
                   utility of sequential aspects of data. The models were evaluated
                   with 10-fold cross-validation.
                   Results: Figure 1 shows the Receiver Operating Characteristic
                   (ROC) curves of the temporal rule model with Area Under the
                   Curve score exceeding 81\% and significantly outperforming the
                   random forest and default models. Considering the practical
                   costs to perform follow-up genetic tests, we prefer a model
                   achieving high positive recall at low risk of false detection.
                   Our model correctly identifies more than 25\% of known positive
                   cases well within 0.1\% of the false positive rate, while the
                   performance of a more popular alternative is indistinguishable
                   from random. This demonstrates the utility of sequential
                   structure of medical claims in identifying patients who suffer
                   from rare diseases. Our algorithm infers from data highly
                   interpretable rules it uses in case adjudication. Figure 2
                   illustrates one of them. The root node of the case adjudication
                   tree (Event.7969) reflects the ICD-9 diagnosis code of ``Other
                   nonspecific abnormal findings''. Among the 14 patients that have
                   this particular ICD-9 code present in their claim history, 36\%
                   are confirmed GD sufferers. Compared to default prevalence in
                   our pre-selected data set of 1.47\%, this rule lifts the
                   estimated likelihood of GD 25 times. The rule further develops
                   into two children nodes. The left child node adds the condition
                   of having any outpatient claim observed within 43 claims
                   recorded nearby the occurrence of the root node event. It
                   isolates 5 patients all of whom are GD-positive. The right child
                   shows that 3 patients without Event.7969 in their claim history
                   but prescribed NDC 62756-0137-02 (Gabapentin by Sun
                   Pharmaceutical Industries Ltd.) are all GD-positive. This is
                   just one example of a simple and easy to implement business rule
                   that is capable of identifying previously undiagnosed sufferers
                   of rare diseases.
                   Conclusions: Our model successfully utilizes sequential
                   relationships among events recorded in medical claims data and
                   reveals interpretable patterns that can identify sufferers of
                   rare diseases with high confidence. The algorithm scales well to
                   large volumes of medical claims data and it remains sensitive in
                   despite of a very low prevalence of target cases in data.
                   Figure caption: ROC diagrams of models trained to identify GD
                   patients shown with decimal logarithmic scale of the false
                   positive rate axis.},
  internal-note = {abstractNote was rebuilt as plain text from an export that
                   contained absolutely positioned HTML div markup. Page furniture
                   was dropped from the abstract: running header ``ISDS 2016
                   Conference Abstracts'', page number 38 (marked ``page number not
                   for citation purposes''), and the repeated title and author
                   lines. The licence footer was moved to the copyright field and
                   the affiliation line to the affiliation field. Section headings
                   were joined inline and the superscript reference marker 1 was
                   written as [1].},
}