# Source code for iqdma.utilities_dvha_stats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# utilities.py
"""Common functions for DVHA-Stats. Copied to limit required libraries."""
#
# Copyright (c) 2020 Dan Cutright
# This file is part of DVHA-Stats, released under a MIT license.
#    See the file LICENSE included with this distribution, also
#    available at https://github.com/cutright/DVHA-Stats

import numpy as np
from os.path import isfile, splitext
import csv


def apply_dtype(value, dtype):
    """Convert value with the provided data type

    Parameters
    ----------
    value : any
        Value to be converted
    dtype : callable, None
        Python built-in types, e.g., int, float, str, etc. However, dtype
        could be any callable that raises a ValueError or TypeError on
        failure. If None, ``value`` is returned unchanged.

    Returns
    -------
    any
        The return of dtype(value), or numpy.nan if the conversion raises
        a ValueError or TypeError
    """
    if dtype is None:
        return value
    try:
        return dtype(value)
    except (ValueError, TypeError):
        # Built-in converters reject inputs such as None or containers with
        # TypeError rather than ValueError; treat both as "not convertible".
        return np.nan


def csv_to_dict(csv_file_path, delimiter=",", dtype=None, header_row=True):
    """Read in a csv file, return data as a dictionary

    Parameters
    ----------
    csv_file_path : str
        File path to the CSV file to be processed.
    delimiter : str
        Specify the delimiter used in the csv file (default = ',')
    dtype : callable, type, optional
        Optionally force values to a type (e.g., float, int, str, etc.).
        Values are left as the strings read from the file if None.
    header_row : bool, optional
        If True, the first row is interpreted as column keys (stripped of
        surrounding whitespace), otherwise column indices of the first row
        will be used

    Returns
    -------
    dict
        CSV data as a dict of column lists, keyed by the first row values
        (or by column index if ``header_row`` is False). An empty file
        returns an empty dict.
    """
    # newline="" is what the csv module expects, so that it can handle
    # line endings (including those embedded in quoted fields) itself.
    with open(csv_file_path, "r", newline="") as fp:
        rows = list(csv.reader(fp, delimiter=delimiter))

    # An empty file has neither a header nor a first row to size columns by
    if not rows:
        return {}

    if header_row:
        keys = [key.strip() for key in rows[0]]
        rows = rows[1:]
    else:
        keys = list(range(len(rows[0])))

    # apply_dtype is a no-op without a dtype; decide once, not per cell
    convert = dtype is not None

    data_dict = {key: [] for key in keys}
    for row in rows:
        for c, value in enumerate(row):
            if convert:
                value = apply_dtype(value, dtype)
            data_dict[keys[c]].append(value)

    return data_dict


def dict_to_array(data, key_order=None):
    """Convert a dict of data to a numpy array

    Parameters
    ----------
    data : dict
        Dictionary of data to be converted to np.array. Each value is
        treated as one column of the output.
    key_order : None, list of str
        Optionally the order of columns. Defaults to the iteration order
        of ``data``.

    Returns
    -------
    dict
        A dictionary with keys of 'data' and 'var_names', pointing to a
        numpy array and list of column keys, respectively
    """
    # Copy into a fresh list so the result never aliases the caller's object
    var_names = list(data) if key_order is None else list(key_order)
    columns = [data[key] for key in var_names]
    # Each dict value is a column, so transpose the stacked columns
    return {"data": np.asarray(columns).T, "var_names": var_names}


def import_data(data, var_names=None):
    """Generalized data importer for np.ndarray, dict, and csv file

    Parameters
    ----------
    data : numpy.array, dict, str
        Input data (2-D) with N rows of observations and
        p columns of variables.  The CSV file must have a header row
        for column names.
    var_names : list of str, optional
        If data is a numpy array, optionally provide the column names.
        Column indices are used if not provided.

    Returns
    -------
    np.ndarray, list
        A tuple: data as an array and variable names as a list

    Raises
    ------
    NotImplementedError
        If data is not a numpy array, a dict, or a path to an existing
        .csv file
    """
    if isinstance(data, np.ndarray):
        if var_names is None:
            var_names = list(range(data.shape[1]))
        return data, var_names

    if isinstance(data, dict):
        converted = dict_to_array(data)
        return converted["data"], converted["var_names"]

    if isinstance(data, str) and isfile(data):
        # Compare the extension case-insensitively so "DATA.CSV" is accepted
        if splitext(data)[1].lower() == ".csv":
            converted = dict_to_array(csv_to_dict(data, dtype=float))
            return converted["data"], converted["var_names"]

    msg = "Invalid data provided - must be a numpy array, dict, or .csv file"
    raise NotImplementedError(msg)


def get_sorted_indices(list_data):
    """Get original indices of a list after sorting

    Parameters
    ----------
    list_data : list
        Any python sortable list

    Returns
    -------
    list
        list_data indices of sorted(list_data). Equal values keep their
        original relative order, since Python's sort is stable.
    """
    # Pair each value with its position, then order the pairs by value only
    indexed = sorted(enumerate(list_data), key=lambda pair: pair[1])
    return [position for position, _ in indexed]


def sort_2d_array(arr, index, mode="col"):
    """Sort a 2-D numpy array

    Parameters
    ----------
    arr : np.ndarray
        Input 2-D array to be sorted
    index : int, list
        Index of column or row to sort arr.  If list, will sort by each index
        in the order provided (the first index is the primary sort key).
    mode : str
        Either 'col' or 'row'. With 'col', rows are reordered by the values
        in the indexed column(s); with 'row', columns are reordered by the
        values in the indexed row(s).

    Returns
    -------
    np.ndarray
        A sorted copy of arr; the input array is not modified

    Raises
    ------
    NotImplementedError
        If mode is neither 'col' nor 'row'
    """
    if mode not in {"col", "row"}:
        msg = (
            "Unsupported sort_2d_array mode, "
            "must be either 'col' or 'row' - got %s" % mode
        )
        raise NotImplementedError(msg)

    if not isinstance(index, (list, tuple)):
        index = [index]

    by_col = mode == "col"

    # Apply the keys from least to most significant; a stable sort keeps
    # the ordering of earlier (less significant) passes among ties.
    for i in reversed(index):
        sort_by = arr[:, i] if by_col else arr[i, :]
        order = sort_by.argsort(kind="mergesort")
        # A column key permutes rows; a row key must permute columns
        arr = arr[order] if by_col else arr[:, order]
    return arr


def is_numeric(val):
    """Check if value is numeric (float or int)

    Parameters
    ----------
    val : any
        Any value

    Returns
    -------
    bool
        Returns True if float(val) succeeds, False if it raises a
        ValueError or TypeError
    """
    try:
        float(val)
    except (ValueError, TypeError):
        # float() raises TypeError (not ValueError) for None, lists, etc.
        return False
    return True