Toner T, Pancholi R, Miller P, Forster T, Coleman H, Overton I (2023). “Strategies and techniques for quality control and semantic enrichment with multimodal data: a case study in colorectal cancer with eHDPrep.” GigaScience, 12. ISSN 2047-217X, doi:10.1093/gigascience/giad030, giad030, https://academic.oup.com/gigascience/article-pdf/doi/10.1093/gigascience/giad030/50383140/giad030.pdf, https://doi.org/10.1093/gigascience/giad030.

Corresponding BibTeX entry:

  @Article{toner2023ehdprep,
    author = {Toner, Tom M. and Pancholi, Rashi and Miller, Paul and
      Forster, Thorsten and Coleman, Helen G. and Overton, Ian M.},
    title = {Strategies and techniques for quality control and semantic
      enrichment with multimodal data: a case study in colorectal
      cancer with {eHDPrep}},
    journal = {GigaScience},
    volume = {12},
    pages = {giad030},
    year = {2023},
    month = may,
    abstract = {Integration of data from multiple domains can greatly
      enhance the quality and applicability of knowledge generated in
      analysis workflows. However, working with health data is
      challenging, requiring careful preparation in order to support
      meaningful interpretation and robust results. Ontologies
      encapsulate relationships between variables that can enrich the
      semantic content of health datasets to enhance interpretability
      and inform downstream analyses. We developed an R package for
      electronic health data preparation, 'eHDPrep', demonstrated upon
      a multimodal colorectal cancer dataset (661 patients, 155
      variables; Colo-661); a further demonstrator is taken from The
      Cancer Genome Atlas (459 patients, 94 variables; TCGA-COAD).
      eHDPrep offers user-friendly methods for quality control,
      including internal consistency checking and redundancy removal
      with information-theoretic variable merging. Semantic enrichment
      functionality is provided, enabling generation of new informative
      ``meta-variables'' according to ontological common ancestry between
      variables, demonstrated with SNOMED CT and the Gene Ontology in
      the current study. eHDPrep also facilitates numerical encoding,
      variable extraction from free text, completeness analysis, and
      user review of modifications to the dataset. eHDPrep provides
      effective tools to assess and enhance data quality, laying the
      foundation for robust performance and interpretability in
      downstream analyses. Application to multimodal colorectal cancer
      datasets resulted in improved data quality, structuring, and
      robust encoding, as well as enhanced semantic information. We
      make eHDPrep available as an R package from CRAN
      (https://cran.r-project.org/package=eHDPrep) and GitHub
      (https://github.com/overton-group/eHDPrep).},
    issn = {2047-217X},
    doi = {10.1093/gigascience/giad030},
    url = {https://doi.org/10.1093/gigascience/giad030},
    eprint =
      {https://academic.oup.com/gigascience/article-pdf/doi/10.1093/gigascience/giad030/50383140/giad030.pdf},
  }