# Source code for mednet.data.classify.indian

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Indian database for TB detection (a.k.a. Dataset A/Dataset B).

The Indian collection database has been established to foster research in
computer-aided diagnosis of pulmonary diseases with a special focus on
pulmonary tuberculosis (TB).  This database is also known as the "Dataset
A/Dataset B" database.

* Database reference: :cite:p:`noauthor_tbxpredict_2014`
* Split references: :cite:p:`noauthor_tbxpredict_2014` with 20% of train set for the validation
  set

.. important:: **Raw data organization**

    The Indian_ base datadir, which you should configure following the
    :ref:`mednet.setup` instructions, must contain at least these two
    subdirectories:

    - ``DatasetA/`` (directory containing the dataset A images in JPG format)
    - ``DatasetB/`` (directory containing the dataset B images in DICOM format)

Data specifications:

* Raw data input (on disk):

  * JPG RGB 8-bit depth images with "inverted" grayscale scale, with varying
    resolution of at least 1024 x 1024 pixels per sample
  * Samples: 156 images and associated labels

* Output image:  Use the same transforms and specifications as for
  :py:mod:`.classify.shenzhen`

This module contains the base declaration of common data modules and raw-data
loaders for this database. All configured splits inherit from this definition.
"""

import importlib.resources.abc
import pathlib

from ..datamodule import CachingDataModule
from ..split import JSONDatabaseSplit
from .shenzhen import RawDataLoader

# Last dotted component of this module's import path (the whole name when
# there is no dot at all).
DATABASE_SLUG = __name__.rpartition(".")[2]
"""Pythonic name of this database."""

CONFIGURATION_KEY_DATADIR = f"datadir.{DATABASE_SLUG}"
"""Configuration-file key holding the root directory of this database."""



# [docs]
class DataModule(CachingDataModule):
    """Data module for the Indian TB detection database (Dataset A/Dataset B).

    The split is read from a JSON description through
    :py:class:`JSONDatabaseSplit`, and samples are loaded with the
    ``RawDataLoader`` imported from the ``shenzhen`` module, configured with
    this database's own data-directory key.

    Parameters
    ----------
    split_path
        Path or traversable (resource) with the JSON split description to load.
    """

    def __init__(self, split_path: pathlib.Path | importlib.resources.abc.Traversable):
        database_split = JSONDatabaseSplit(split_path)
        raw_data_loader = RawDataLoader(config_variable=CONFIGURATION_KEY_DATADIR)

        # The split name is whatever precedes the (up to) two right-most
        # dot-separated components of the file name.
        split_name, *_ = split_path.name.rsplit(".", 2)

        super().__init__(
            database_split=database_split,
            raw_data_loader=raw_data_loader,
            database_name=DATABASE_SLUG,
            split_name=split_name,
            task="classification",
            num_classes=1,
        )