Source code for iqdma.utilities_dvha_stats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# utilities.py
"""Common functions for DVHA-Stats. Copied to limit required libraries."""
#
# Copyright (c) 2020 Dan Cutright
# This file is part of DVHA-Stats, released under a MIT license.
#    See the file LICENSE included with this distribution, also
#    available at https://github.com/cutright/DVHA-Stats

import numpy as np
from os.path import isfile, splitext
import csv


def apply_dtype(value, dtype):
    """Cast ``value`` with ``dtype``, falling back to NaN on failure

    Parameters
    ----------
    value : any
        Value to be converted
    dtype : callable, None
        Typically a built-in type such as int, float, or str, but any
        callable that signals failure by raising a ValueError is accepted.
        If None, ``value`` is passed through untouched.

    Returns
    -------
    any
        ``value`` itself when ``dtype`` is None, otherwise ``dtype(value)``,
        or numpy.nan if the conversion raised a ValueError
    """
    if dtype is not None:
        try:
            return dtype(value)
        except ValueError:
            # Only ValueError is treated as "could not convert"; any other
            # exception raised by dtype propagates to the caller.
            return np.nan
    return value
def csv_to_dict(csv_file_path, delimiter=",", dtype=None, header_row=True):
    """Read in a csv file, return data as a dictionary

    Parameters
    ----------
    csv_file_path : str
        File path to the CSV file to be processed.
    delimiter : str
        Specify the delimiter used in the csv file (default = ',')
    dtype : callable, type, optional
        Optionally force values to a type (e.g., float, int, str, etc.).
        Values that fail conversion with a ValueError become numpy.nan
        (see ``apply_dtype``). If None, values are kept as the strings
        read from the file.
    header_row : bool, optional
        If True, the first row is interpreted as column keys (stripped of
        surrounding whitespace), otherwise integer column indices are used

    Returns
    -------
    dict
        Column data as lists, keyed by the header values or by column
        index. An empty file yields an empty dict.

    Raises
    ------
    IndexError
        If a data row has more values than there are keys
    """
    # newline="" hands line-ending handling to the csv module, which is
    # required for newlines embedded in quoted fields to survive intact.
    with open(csv_file_path, "r", newline="") as fp:
        rows = list(csv.reader(fp, delimiter=delimiter))

    # Nothing to parse: avoid StopIteration / IndexError on an empty file
    if not rows:
        return {}

    if header_row:
        keys = [key.strip() for key in rows[0]]
        rows = rows[1:]
    else:
        # Column count is taken from the first row
        keys = list(range(len(rows[0])))

    data_dict = {key: [] for key in keys}
    for row in rows:
        for c, value in enumerate(row):
            if dtype is not None:
                value = apply_dtype(value, dtype)
            data_dict[keys[c]].append(value)
    return data_dict
def dict_to_array(data, key_order=None):
    """Stack the columns of a dict into a 2-D numpy array

    Parameters
    ----------
    data : dict
        Mapping of variable name to its sequence of values.
    key_order : None, list of str
        Optional column order. When omitted, the dict's own key order
        is used.

    Returns
    -------
    dict
        A dictionary with keys of 'data' and 'var_names', pointing to a
        numpy array (one column per variable) and the list of variable
        names, respectively
    """
    if key_order is None:
        columns = list(data.keys())
    else:
        columns = key_order

    # Each dict entry becomes a row here; transposing below turns the
    # variables into columns.
    stacked = np.asarray([data[name] for name in columns])
    return {"data": stacked.T, "var_names": columns}
def import_data(data, var_names=None):
    """Generalized data importer for np.ndarray, dict, and csv file

    Parameters
    ----------
    data : numpy.array, dict, str
        Input data (2-D) with N rows of observations and
        p columns of variables. A str is treated as a path to a CSV file,
        which must have a header row for column names; its values are
        read as floats.
    var_names : list of str, optional
        If data is a numpy array, optionally provide the column names.
        Ignored for dict and CSV input, where names come from the data.

    Returns
    -------
    np.ndarray, list
        A tuple: data as an array and variable names as a list

    Raises
    ------
    NotImplementedError
        If data is not a numpy array, a dict, or a path to an existing
        .csv file
    """
    if isinstance(data, np.ndarray):
        if var_names is None:
            # Default to integer column indices
            var_names = list(range(data.shape[1]))
        return data, var_names

    if isinstance(data, dict):
        converted = dict_to_array(data)
        return converted["data"], converted["var_names"]

    # Extension is compared case-insensitively so that e.g. "DATA.CSV"
    # is accepted as well.
    if (
        isinstance(data, str)
        and isfile(data)
        and splitext(data)[1].lower() == ".csv"
    ):
        converted = dict_to_array(csv_to_dict(data, dtype=float))
        return converted["data"], converted["var_names"]

    msg = "Invalid data provided - must be a numpy array, dict, or .csv file"
    raise NotImplementedError(msg)
def get_sorted_indices(list_data):
    """Return the indices that would sort a list

    Parameters
    ----------
    list_data : list
        Any python sortable list

    Returns
    -------
    list
        Positions into list_data, ordered so that reading list_data at
        those positions gives sorted(list_data). Equal values keep their
        original relative order.
    """
    # Pair every value with its position, order the pairs by value only,
    # then keep just the positions.
    ordered_pairs = sorted(enumerate(list_data), key=lambda pair: pair[1])
    return [position for position, _ in ordered_pairs]
def sort_2d_array(arr, index, mode="col"):
    """Sort a 2-D numpy array

    Parameters
    ----------
    arr : np.ndarray
        Input 2-D array to be sorted
    index : int, list
        Index of column or row to sort arr. If list (or tuple), will sort
        by each index in the order provided, i.e. the first index is the
        primary sort key. An empty list leaves the order unchanged.
    mode : str
        Either 'col' or 'row'. With 'col' the rows of arr are reordered
        by the values in the given column(s); with 'row' the columns of
        arr are reordered by the values in the given row(s).

    Returns
    -------
    np.ndarray
        The sorted array. The input array is not modified.

    Raises
    ------
    NotImplementedError
        If mode is neither 'col' nor 'row'
    """
    if mode not in {"col", "row"}:
        msg = (
            "Unsupported sort_2d_array mode, "
            "must be either 'col' or 'row' - got %s" % mode
        )
        raise NotImplementedError(msg)

    keys = list(index) if isinstance(index, (list, tuple)) else [index]

    # Sorting by row is sorting by column on the transpose. This applies
    # the permutation to the column axis rather than the row axis.
    work = arr if mode == "col" else arr.T

    # Apply the keys from least to most significant. Every pass is stable
    # so the ordering established by earlier passes survives among ties.
    for key in reversed(keys):
        order = work[:, key].argsort(kind="mergesort")
        work = work[order]

    return work if mode == "col" else work.T
def is_numeric(val):
    """Check if value is numeric (float or int)

    Parameters
    ----------
    val : any
        Any value

    Returns
    -------
    bool
        True if float(val) succeeds, False if it raises a ValueError
        (e.g., a non-numeric string) or a TypeError (e.g., None or a
        list). Note that strings such as 'nan' and 'inf' are accepted
        by float and therefore count as numeric.
    """
    try:
        float(val)
    except (TypeError, ValueError):
        # TypeError covers objects float() cannot accept at all, so that
        # arbitrary input yields False instead of an exception.
        return False
    return True