Skip to content

Image parser

C pynasonde.digisonde.parsers.image_parserIonogramImageExtractor

Parses Ionogram (.png) format files, extract data from left table.

pynasonde.digisonde.parsers.image_parser

Utilities to extract tabular metadata from ionogram image files.

This module contains:class:IonogramImageExtractor which uses OpenCV and Tesseract OCR to extract textual parameter tables and header metadata from ionogram images. The functionality is intentionally focused and small so it can be used in documentation examples.

IonogramImageExtractor

Bases: object

Extractor for ionogram images using OpenCV + Tesseract OCR.

The extractor offers helpers to read date/time information from the filename and to OCR specific regions of the image to parse parameter tables and header fields into pandas DataFrames.

Source code in pynasonde/digisonde/parsers/image_parser.py
class IonogramImageExtractor(object):
    """Extractor for ionogram images using OpenCV + Tesseract OCR.

    The extractor offers helpers to read date/time information from the
    filename and to OCR specific regions of the image to parse
    parameter tables and header fields into pandas DataFrames.
    """

    def __init__(
        self,
        filepath: str,
        extract_time_from_name: bool = True,
        date: dt.datetime = None,
        filestr_date_format: str = "ion%y%m%d_%H%M%S.png",
    ):
        """Create an IonogramImageExtractor.

        Parameters:
            filepath: str
                Path to the ionogram image file.
            extract_time_from_name: bool, optional
                If True (default), attempt to parse the timestamp from the
                filename using ``filestr_date_format``.
            date: datetime.datetime, optional
                Manually provided date; if None and
                ``extract_time_from_name`` is True the date will be parsed
                from the filename.
            filestr_date_format: str, optional
                Format string used when parsing the filename timestamp.
        """
        self.filepath = filepath
        self.extract_time_from_name = extract_time_from_name
        self.date = date
        self.file_ext = filepath.split(".")[-1]
        self.filestr_date_format = filestr_date_format
        if extract_time_from_name:
            self.date = dt.datetime.strptime(
                filepath.split("/")[-1], self.filestr_date_format
            )
            logger.info(f"Parsed date from file name: {self.date}")
        return

    def extract_text(
        self,
        crop_axis: np.array = np.array([[50, 650], [0, 220]]),
        cv_props: dict = dict(
            thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        ),
        OCR_custom_config: str = r"--oem 3 --psm 6",
    ) -> str:
        """Extract text from a cropped region of the image using Tesseract OCR.

        Parameters:
            crop_axis: numpy.ndarray, optional
                2x2 array specifying the crop region as [[y1, y2], [x1, x2]].
            cv_props: dict, optional
                OpenCV thresholding parameters (keys: 'thresh', 'maxval', 'type').
            OCR_custom_config: str, optional
                Tesseract configuration string (e.g. '--oem 3 --psm 6').

        Returns:
            Text extracted from the cropped region.
        """
        # Load the image
        img = cv2.imread(self.filepath)
        # Crop the left table region (adjust these values as needed)
        # These coordinates are (y1:y2, x1:x2)
        cropped = img[
            crop_axis[0, 0] : crop_axis[0, 1], crop_axis[1, 0] : crop_axis[1, 1]
        ]
        # Optional: Convert to grayscale and apply threshold for better OCR
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(
            gray, cv_props["thresh"], cv_props["maxval"], cv_props["type"]
        )

        # OCR extraction
        text = pytesseract.image_to_string(thresh, config=OCR_custom_config)
        return text

    def parse_artist_params_table(
        self,
        crop_axis: np.array = np.array([[50, 650], [0, 220]]),
        cv_props: dict = dict(
            thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        ),
        OCR_custom_config: str = r"--oem 3 --psm 6",
        lines_to_extracted: int = -8,
        word_filtes_for_table_values: dict = {"N/A": "nan", ":": "."},
    ) -> pd.DataFrame:
        """Parse a left-side artist-parameters table from the ionogram image.

        The function OCRs a cropped region, extracts line-wise
        key/value pairs, applies simple text replacements and converts
        values to float where possible. The result is returned as a
        single-row:class:`pandas.DataFrame`.

        Parameters:
            crop_axis: numpy.ndarray, optional
                Crop region as [[y1, y2], [x1, x2]].
            cv_props: dict, optional
                OpenCV thresholding parameters.
            OCR_custom_config: str, optional
                Tesseract config string.
            lines_to_extracted: int, optional
                Number of lines to keep from OCR output (negative values
                trim from the end).
            word_filtes_for_table_values: dict, optional
                Mapping of substrings to replace in extracted values before
                conversion.

        Returns:
            Single-row DataFrame with parsed parameter names and values.
        """
        text = self.extract_text(crop_axis, cv_props, OCR_custom_config)
        # Extract all individual parameters
        record = dict()
        if len(text) > 0:
            lines = text.split("\n")
            # Filter all lines based on empty lines
            lines = [l for l in lines if len(l) > 0][:lines_to_extracted]
            for line in lines:
                words = list(filter(None, line.split(" ")))
                if len(words) >= 2:
                    for fw in word_filtes_for_table_values.keys():
                        words[1] = words[1].replace(
                            fw, word_filtes_for_table_values[fw]
                        )
                    record[words[0]] = float(words[1])
        record = pd.DataFrame.from_dict([record])
        logger.info(f"Parsed records: \n {record}")
        return record

    def extract_header(
        self,
        crop_axis: np.array = np.array([[0, 50], [100, 800]]),
        cv_props: dict = dict(
            thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        ),
        OCR_custom_config: str = r"--oem 3 --psm 6",
        word_filtes_for_table_values: dict = {",": "", ":": "."},
    ) -> pd.DataFrame:
        """Extract and parse header fields from an ionogram image.

        Parameters:
            crop_axis: numpy.ndarray, optional
                Crop region for the header area.
            cv_props: dict, optional
                OpenCV thresholding parameters.
            OCR_custom_config: str, optional
                Tesseract config string.
            word_filtes_for_table_values: dict, optional
                Mapping of substrings to replace in both header keys and
                values.

        Returns:
            Single-row DataFrame with header keys and values when found.
        """
        text = self.extract_text(crop_axis, cv_props, OCR_custom_config)

        # Extract all individual parameters
        record = dict()
        if len(text) > 0:
            logger.debug(f"Extracted text: \n {text}")
            lines = text.split("\n")
            # Filter all lines based on empty lines
            lines = [l for l in lines if len(l) > 0]
            if len(lines) >= 2:
                header_columns = list(filter(None, lines[0].split(" ")))
                header_values = list(filter(None, lines[1].split(" ")))

                for fw in word_filtes_for_table_values.keys():
                    header_columns, header_values = (
                        [
                            w.replace(fw, word_filtes_for_table_values[fw])
                            for w in header_columns
                        ],
                        [
                            w.replace(fw, word_filtes_for_table_values[fw])
                            for w in header_values
                        ],
                    )
                record = dict(zip(header_columns, header_values))

        record = pd.DataFrame.from_dict([record])
        logger.info(f"Parsed records: \n {record}")
        return record

__init__(filepath, extract_time_from_name=True, date=None, filestr_date_format='ion%y%m%d_%H%M%S.png')

Create an IonogramImageExtractor.

Parameters:

Name Type Description Default
filepath str

str Path to the ionogram image file.

required
extract_time_from_name bool

bool, optional If True (default), attempt to parse the timestamp from the filename using filestr_date_format.

True
date dt.datetime

datetime.datetime, optional Manually provided date; if None and extract_time_from_name is True the date will be parsed from the filename.

None
filestr_date_format str

str, optional Format string used when parsing the filename timestamp.

'ion%y%m%d_%H%M%S.png'
Source code in pynasonde/digisonde/parsers/image_parser.py
def __init__(
    self,
    filepath: str,
    extract_time_from_name: bool = True,
    date: dt.datetime = None,
    filestr_date_format: str = "ion%y%m%d_%H%M%S.png",
):
    """Create an IonogramImageExtractor.

    Parameters:
        filepath: str
            Path to the ionogram image file.
        extract_time_from_name: bool, optional
            If True (default), attempt to parse the timestamp from the
            filename using ``filestr_date_format``.
        date: datetime.datetime, optional
            Manually provided date; if None and
            ``extract_time_from_name`` is True the date will be parsed
            from the filename.
        filestr_date_format: str, optional
            Format string used when parsing the filename timestamp.
    """
    self.filepath = filepath
    self.extract_time_from_name = extract_time_from_name
    self.date = date
    self.file_ext = filepath.split(".")[-1]
    self.filestr_date_format = filestr_date_format
    if extract_time_from_name:
        self.date = dt.datetime.strptime(
            filepath.split("/")[-1], self.filestr_date_format
        )
        logger.info(f"Parsed date from file name: {self.date}")
    return

extract_text(crop_axis=np.array([[50, 650], [0, 220]]), cv_props=dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU), OCR_custom_config='--oem 3 --psm 6')

Extract text from a cropped region of the image using Tesseract OCR.

Parameters:

Name Type Description Default
crop_axis np.array

numpy.ndarray, optional 2x2 array specifying the crop region as [[y1, y2], [x1, x2]].

np.array([[50, 650], [0, 220]])
cv_props dict

dict, optional OpenCV thresholding parameters (keys: 'thresh', 'maxval', 'type').

dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
OCR_custom_config str

str, optional Tesseract configuration string (e.g. '--oem 3 --psm 6').

'--oem 3 --psm 6'

Returns:

Type Description
str

Text extracted from the cropped region.

Source code in pynasonde/digisonde/parsers/image_parser.py
def extract_text(
    self,
    crop_axis: np.array = np.array([[50, 650], [0, 220]]),
    cv_props: dict = dict(
        thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    ),
    OCR_custom_config: str = r"--oem 3 --psm 6",
) -> str:
    """Extract text from a cropped region of the image using Tesseract OCR.

    Parameters:
        crop_axis: numpy.ndarray, optional
            2x2 array specifying the crop region as [[y1, y2], [x1, x2]].
        cv_props: dict, optional
            OpenCV thresholding parameters (keys: 'thresh', 'maxval', 'type').
        OCR_custom_config: str, optional
            Tesseract configuration string (e.g. '--oem 3 --psm 6').

    Returns:
        Text extracted from the cropped region.
    """
    # Load the image
    img = cv2.imread(self.filepath)
    # Crop the left table region (adjust these values as needed)
    # These coordinates are (y1:y2, x1:x2)
    cropped = img[
        crop_axis[0, 0] : crop_axis[0, 1], crop_axis[1, 0] : crop_axis[1, 1]
    ]
    # Optional: Convert to grayscale and apply threshold for better OCR
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(
        gray, cv_props["thresh"], cv_props["maxval"], cv_props["type"]
    )

    # OCR extraction
    text = pytesseract.image_to_string(thresh, config=OCR_custom_config)
    return text

parse_artist_params_table(crop_axis=np.array([[50, 650], [0, 220]]), cv_props=dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU), OCR_custom_config='--oem 3 --psm 6', lines_to_extracted=-8, word_filtes_for_table_values={'N/A': 'nan', ':': '.'})

Parse a left-side artist-parameters table from the ionogram image.

The function OCRs a cropped region, extracts line-wise key/value pairs, applies simple text replacements and converts values to float where possible. The result is returned as a single-row:class:pandas.DataFrame.

Parameters:

Name Type Description Default
crop_axis np.array

numpy.ndarray, optional Crop region as [[y1, y2], [x1, x2]].

np.array([[50, 650], [0, 220]])
cv_props dict

dict, optional OpenCV thresholding parameters.

dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
OCR_custom_config str

str, optional Tesseract config string.

'--oem 3 --psm 6'
lines_to_extracted int

int, optional Number of lines to keep from OCR output (negative values trim from the end).

-8
word_filtes_for_table_values dict

dict, optional Mapping of substrings to replace in extracted values before conversion.

{'N/A': 'nan', ':': '.'}

Returns:

Type Description
pd.DataFrame

Single-row DataFrame with parsed parameter names and values.

Source code in pynasonde/digisonde/parsers/image_parser.py
def parse_artist_params_table(
    self,
    crop_axis: np.array = np.array([[50, 650], [0, 220]]),
    cv_props: dict = dict(
        thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    ),
    OCR_custom_config: str = r"--oem 3 --psm 6",
    lines_to_extracted: int = -8,
    word_filtes_for_table_values: dict = {"N/A": "nan", ":": "."},
) -> pd.DataFrame:
    """Parse a left-side artist-parameters table from the ionogram image.

    The function OCRs a cropped region, extracts line-wise
    key/value pairs, applies simple text replacements and converts
    values to float where possible. The result is returned as a
    single-row:class:`pandas.DataFrame`.

    Parameters:
        crop_axis: numpy.ndarray, optional
            Crop region as [[y1, y2], [x1, x2]].
        cv_props: dict, optional
            OpenCV thresholding parameters.
        OCR_custom_config: str, optional
            Tesseract config string.
        lines_to_extracted: int, optional
            Number of lines to keep from OCR output (negative values
            trim from the end).
        word_filtes_for_table_values: dict, optional
            Mapping of substrings to replace in extracted values before
            conversion.

    Returns:
        Single-row DataFrame with parsed parameter names and values.
    """
    text = self.extract_text(crop_axis, cv_props, OCR_custom_config)
    # Extract all individual parameters
    record = dict()
    if len(text) > 0:
        lines = text.split("\n")
        # Filter all lines based on empty lines
        lines = [l for l in lines if len(l) > 0][:lines_to_extracted]
        for line in lines:
            words = list(filter(None, line.split(" ")))
            if len(words) >= 2:
                for fw in word_filtes_for_table_values.keys():
                    words[1] = words[1].replace(
                        fw, word_filtes_for_table_values[fw]
                    )
                record[words[0]] = float(words[1])
    record = pd.DataFrame.from_dict([record])
    logger.info(f"Parsed records: \n {record}")
    return record

extract_header(crop_axis=np.array([[0, 50], [100, 800]]), cv_props=dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU), OCR_custom_config='--oem 3 --psm 6', word_filtes_for_table_values={',': '', ':': '.'})

Extract and parse header fields from an ionogram image.

Parameters:

Name Type Description Default
crop_axis np.array

numpy.ndarray, optional Crop region for the header area.

np.array([[0, 50], [100, 800]])
cv_props dict

dict, optional OpenCV thresholding parameters.

dict(thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
OCR_custom_config str

str, optional Tesseract config string.

'--oem 3 --psm 6'
word_filtes_for_table_values dict

dict, optional Mapping of substrings to replace in both header keys and values.

{',': '', ':': '.'}

Returns:

Type Description
pd.DataFrame

Single-row DataFrame with header keys and values when found.

Source code in pynasonde/digisonde/parsers/image_parser.py
def extract_header(
    self,
    crop_axis: np.array = np.array([[0, 50], [100, 800]]),
    cv_props: dict = dict(
        thresh=180, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    ),
    OCR_custom_config: str = r"--oem 3 --psm 6",
    word_filtes_for_table_values: dict = {",": "", ":": "."},
) -> pd.DataFrame:
    """Extract and parse header fields from an ionogram image.

    Parameters:
        crop_axis: numpy.ndarray, optional
            Crop region for the header area.
        cv_props: dict, optional
            OpenCV thresholding parameters.
        OCR_custom_config: str, optional
            Tesseract config string.
        word_filtes_for_table_values: dict, optional
            Mapping of substrings to replace in both header keys and
            values.

    Returns:
        Single-row DataFrame with header keys and values when found.
    """
    text = self.extract_text(crop_axis, cv_props, OCR_custom_config)

    # Extract all individual parameters
    record = dict()
    if len(text) > 0:
        logger.debug(f"Extracted text: \n {text}")
        lines = text.split("\n")
        # Filter all lines based on empty lines
        lines = [l for l in lines if len(l) > 0]
        if len(lines) >= 2:
            header_columns = list(filter(None, lines[0].split(" ")))
            header_values = list(filter(None, lines[1].split(" ")))

            for fw in word_filtes_for_table_values.keys():
                header_columns, header_values = (
                    [
                        w.replace(fw, word_filtes_for_table_values[fw])
                        for w in header_columns
                    ],
                    [
                        w.replace(fw, word_filtes_for_table_values[fw])
                        for w in header_values
                    ],
                )
            record = dict(zip(header_columns, header_values))

    record = pd.DataFrame.from_dict([record])
    logger.info(f"Parsed records: \n {record}")
    return record