# Source code for logomaker.src.validate

from __future__ import division
import numpy as np
import pandas as pd
from logomaker.src.error_handling import check, handle_errors



# [docs]
@handle_errors
def validate_matrix(df, matrix_type=None, allow_nan=False):
    """
    Checks to make sure that the input dataframe, df, represents a valid
    matrix, i.e., an object that can be displayed as a logo.

    All checks and any renormalization are performed on a copy; the
    dataframe passed in by the caller is never modified.

    parameters
    ----------

    df: (dataframe)
        A pandas dataframe where each row represents an (integer) position
        and each column represents a (single) character.

    matrix_type: (None or str)
        If 'probability', validates df as a probability matrix, i.e., all
        elements are >= 0 and rows are normalized (rows whose sums are not
        close to 1 are renormalized, with a printed notice). If
        'information', validates df as an information matrix, i.e., all
        elements >= 0. If None, no value-range checks are performed.

    allow_nan: (bool)
        Whether to allow NaN entries in the matrix.

    returns
    -------
    out_df: (dataframe)
        A cleaned-up copy of df whose index is named 'pos' and, for
        matrix_type='probability', whose rows sum to one.

    raises
    ------
    Every failed validation is reported through check(); the exception
    type is whatever check() raises (defined in
    logomaker.src.error_handling, not visible here).
    """

    # check that df is a dataframe
    check(isinstance(df, pd.DataFrame),
          'df needs to be a valid pandas dataframe, '
          'df entered: %s' % type(df))

    # create copy of df so we don't overwrite the user's data
    out_df = df.copy()

    # check that type is valid
    check(matrix_type in {None, 'probability', 'information'},
          'matrix_type = %s; must be None, "probability", or "information"' %
          matrix_type)

    # check that allow_nan is boolean
    check(isinstance(allow_nan, bool),
          'allow_nan must be of type bool; is type %s.' % type(allow_nan))

    if not allow_nan:
        # make sure all entries are finite numbers
        check(np.isfinite(out_df.values).all(),
              'some matrix elements are not finite. '
              'Set allow_nan=True to allow this.')

    # make sure the matrix has at least one row and one column
    check(out_df.shape[0] >= 1, 'df has zero rows. Needs multiple rows.')
    check(out_df.shape[1] >= 1, 'df has zero columns. Needs multiple columns.')

    # check that every column label is a single character
    for i, col in enumerate(out_df.columns):

        # Labels are judged by their string form, so a non-str label such
        # as the integer 5 is accepted as the character '5'. (A separate
        # isinstance(col, str) test after this conversion could never
        # fail, so none is made.)
        col = str(col)
        check(len(col) == 1,
              'column %d is %s and has length %d; ' % (i, repr(col), len(col))
              + 'must have length 1.')

    # 2025.01.19 Fix for Issue #36 - Columns are deliberately *not* sorted
    # alphabetically, as that would nullify the 'fixed_order' option in
    # Logo().

    # name out_df.index as 'pos'
    out_df.index.name = 'pos'

    # try casting df.index as type int; pandas may signal failure with
    # either TypeError or ValueError (e.g. for non-numeric string labels),
    # so both are routed to the same check() message
    try:
        int_index = out_df.index.astype(int)
    except (TypeError, ValueError):
        int_index = None
    check(int_index is not None,
          'could not convert df.index to type int. Check that '
          'all positions have integer numerical values.')

    # make sure that df.index values have not changed
    check(all(int_index == out_df.index),
          'could not convert df.index values to int without changing '
          'some values. Make sure that df.index values are integers.')

    # check that all index values are unique
    check(out_df.index.is_unique,
          'not all values of df.index are unique. Make sure all are unique.')

    # if type is 'information', make sure elements are nonnegative
    if matrix_type == 'information':
        check((out_df.values >= 0).all(), 'not all values in df are >=0.')

    # if type is 'probability', make sure elements are valid probabilities
    elif matrix_type == 'probability':

        # make sure all values are non-negative
        check((out_df.values >= 0).all(), 'not all values in df are >=0.')

        # one sum per row (position)
        row_sums = out_df.sum(axis=1)

        # if any sums are close to zero, abort: such rows cannot be
        # reliably normalized
        check(not np.isclose(row_sums.values, 0.0).any(),
              'some rows in df sum to nearly zero.')

        # if any sums are not close to one, renormalize all rows
        if not np.isclose(row_sums.values, 1.0).all():
            print('in validate_matrix(): Row sums in df are not close to 1. '
                  'Renormalizing rows...')
            # Build a new frame rather than assigning in place: this keeps
            # the caller's df untouched, keeps the 'pos' index name, and
            # lets integer-valued input become float without an
            # incompatible-dtype assignment.
            out_df = out_df.div(row_sums, axis=0)

    # nothing more to check if matrix_type is None

    # return cleaned-up out_df
    return out_df



@handle_errors
def validate_probability_mat(df):
    """
    Verifies that the input dataframe df indeed represents a
    probability matrix. Renormalizes df with a text warning if it is not
    already normalized. Throws an error if df cannot be reliably normalized.

    parameters
    ----------

    df: (dataframe)
        A pandas dataframe where each row represents an (integer) position
        and each column represents a (single) character.

    returns
    -------
    prob_df: (dataframe)
        A cleaned-up and normalized version of df (if possible). This is
        the dataframe returned by validate_matrix(), or a renormalized
        frame derived from it.

    raises
    ------
    Failed validations are reported through check(); the exception type is
    whatever check() raises (defined in logomaker.src.error_handling, not
    visible here).
    """

    # Validate as a matrix. Make sure this contains no NaN values
    prob_df = validate_matrix(df, allow_nan=False)

    # Make sure all values are non-negative
    check((prob_df.values >= 0).all(),
          'not all values in df are >=0.')

    # One sum per row (position)
    row_sums = prob_df.sum(axis=1)

    # If any sums are close to zero, abort: such rows cannot be reliably
    # normalized
    check(not np.isclose(row_sums.values, 0.0).any(),
          'some rows in prob_df sum to nearly zero.')

    # If any sums are not close to one, renormalize all rows
    if not np.isclose(row_sums.values, 1.0).all():
        print('in validate_probability_mat(): '
              'Row sums in df are not close to 1. '
              'Renormalizing rows...')
        # Divide into a new frame instead of assigning through .loc, so
        # that integer-valued input (e.g. a counts matrix) becomes float
        # without an incompatible-dtype in-place assignment.
        prob_df = prob_df.div(row_sums, axis=0)

    # Return validated probability matrix to user
    return prob_df


@handle_errors
def validate_numeric(value, name, min_val=None, max_val=None, allow_none=False,
                     min_inclusive=True, max_inclusive=True):
    """Check that a parameter is numeric and, optionally, within bounds.

    Parameters
    ----------
    value : object
        The candidate value.
    name : str
        Name of the parameter; used only to build error messages.
    min_val : float, optional
        Lower bound. No lower-bound check is made when None.
    max_val : float, optional
        Upper bound. No upper-bound check is made when None.
    allow_none : bool, optional
        If True, a value of None is accepted and returned unchanged.
    min_inclusive : bool, optional
        If True (default) the lower bound itself is allowed (>=);
        otherwise the value must be strictly greater.
    max_inclusive : bool, optional
        If True (default) the upper bound itself is allowed (<=);
        otherwise the value must be strictly smaller.

    Returns
    -------
    float or None
        ``float(value)`` once every check has passed, or None when
        ``allow_none`` is set and ``value`` is None.
    """
    # None short-circuits all other checks, but only when explicitly allowed
    if value is None and allow_none:
        return None

    accepted_types = (int, float, np.number)
    is_numeric = isinstance(value, accepted_types)
    check(is_numeric, f'type({name}) = {type(value)}; must be numeric')

    # Bounds are compared against the float form; messages show the
    # value as it was passed in.
    as_float = float(value)

    # lower bound
    if min_val is not None:
        if min_inclusive:
            ok = as_float >= min_val
            msg = f'{name} = {value} must be >= {min_val}'
        else:
            ok = as_float > min_val
            msg = f'{name} = {value} must be > {min_val}'
        check(ok, msg)

    # upper bound
    if max_val is not None:
        if max_inclusive:
            ok = as_float <= max_val
            msg = f'{name} = {value} must be <= {max_val}'
        else:
            ok = as_float < max_val
            msg = f'{name} = {value} must be < {max_val}'
        check(ok, msg)

    return as_float