from __future__ import division
import numpy as np
import pandas as pd
from logomaker.src.error_handling import check, handle_errors
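# NOTE(review): stray "[docs]" marker (looks like a documentation-page artifact) commented out; as a bare expression it would fail on import.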
@handle_errors
def validate_matrix(df, matrix_type=None, allow_nan=False):
    """
    Checks to make sure that the input dataframe, df, represents a valid
    matrix, i.e., an object that can be displayed as a logo.

    The dataframe passed in by the caller is never modified: every check
    and every clean-up step operates on a copy, which is what is returned.

    parameters
    ----------

    df: (dataframe)
        A pandas dataframe where each row represents an (integer) position
        and each column represents a (single) character.

    matrix_type: (None or str)
        If 'probability', validates df as a probability matrix, i.e., all
        elements are >= 0 and no row sums to (nearly) zero; rows that do
        not sum to 1 are renormalized. If 'information', validates df as
        an information matrix, i.e., all elements >= 0. If None, no
        value-range checks are performed.

    allow_nan: (bool)
        Whether to allow NaN (non-finite) entries in the matrix.

    returns
    -------
    out_df: (dataframe)
        A cleaned-up copy of df: its index is named 'pos' and, for
        matrix_type='probability', its rows are normalized to sum to 1.
    """

    # check that df is a dataframe
    check(isinstance(df, pd.DataFrame),
          'df needs to be a valid pandas dataframe, '
          'df entered: %s' % type(df))

    # work on a copy so we never overwrite the user's data
    out_df = df.copy()

    # check that type is valid (tuple membership also copes with
    # unhashable inputs, which a set lookup would not)
    check(matrix_type in (None, 'probability', 'information'),
          'matrix_type = %s; must be None, "probability", or "information"' %
          (matrix_type,))

    # check that allow_nan is boolean
    check(isinstance(allow_nan, bool),
          'allow_nan must be of type bool; is type %s.' % type(allow_nan))

    if not allow_nan:
        # make sure all entries are finite numbers
        check(np.isfinite(out_df.values).all(),
              'some matrix elements are not finite. '
              'Set allow_nan=True to allow this.')

    # make sure the matrix has at least one row and one column
    check(out_df.shape[0] >= 1, 'df has zero rows. Needs multiple rows.')
    check(out_df.shape[1] >= 1, 'df has zero columns. Needs multiple columns.')

    # check that all column names have length 1 once rendered as a string.
    # The label is converted with str() first, so any label whose string
    # form is a single character (e.g. the integer 7) is accepted.
    for i, col in enumerate(out_df.columns):
        col = str(col)
        check(len(col) == 1,
              'column %d is %s and has length %d; ' % (i, repr(col), len(col))
              + 'must have length 1.')

    # 2025.01.19 Fix for Issue #36 - Columns are deliberately *not* sorted
    # alphabetically, as this nullifies the 'fixed_order' option in Logo()

    # name out_df.index as 'pos'
    out_df.index.name = 'pos'

    # try casting the index to int. Non-numeric labels raise ValueError
    # and some index types raise TypeError; both mean "not integers".
    try:
        int_index = out_df.index.astype(int)
    except (TypeError, ValueError):
        int_index = None
    check(int_index is not None,
          'could not convert df.index to type int. Check that '
          'all positions have integer numerical values.')

    # make sure that index values are unchanged by the cast (e.g. 1.5 -> 1)
    check((int_index == out_df.index).all(),
          'could not convert df.index values to int without changing '
          'some values. Make sure that df.index values are integers.')

    # check that all index values are unique
    check(out_df.index.is_unique,
          'not all values of df.index are unique. Make sure all are unique.')

    # if type is 'information', make sure elements are nonnegative
    if matrix_type == 'information':
        check((out_df.values >= 0).all(), 'not all values in df are >=0.')

    # if type is 'probability', make sure elements are valid probabilities
    elif matrix_type == 'probability':

        # make sure all values are non-negative
        check((out_df.values >= 0).all(), 'not all values in df are >=0.')

        # row sums (one value per position)
        sums = out_df.sum(axis=1).values

        # if any row sums are close to zero, normalization is unreliable
        check(not np.isclose(sums, 0.0).any(),
              'some rows in df sum to nearly zero.')

        # if any sums are not close to one, renormalize all rows
        if not np.isclose(sums, 1.0).all():
            print('in validate_matrix(): Row sums in df are not close to 1. '
                  'Renormalizing rows...')
            # div() builds a new float frame, so integer-typed input is
            # handled and the caller's df is left untouched
            out_df = out_df.div(sums, axis=0)

    # return cleaned-up out_df
    return out_df
@handle_errors
def validate_probability_mat(df):
    """
    Verifies that the input dataframe df indeed represents a
    probability matrix. Renormalizes df with a text warning if it is not
    already normalized. Throws an error if df cannot be reliably normalized.

    parameters
    ----------

    df: (dataframe)
        A pandas dataframe where each row represents an (integer) position
        and each column represents a (single) character.

    returns
    -------
    prob_df: (dataframe)
        A cleaned-up and normalized version of df (if possible). This is
        a separate dataframe from the one passed in.
    """

    # Validate as a matrix. Make sure this contains no NaN values.
    # validate_matrix hands back a copy, so df itself is not altered below.
    prob_df = validate_matrix(df, allow_nan=False)

    # Make sure all values are non-negative
    check((prob_df.values >= 0).all(),
          'not all values in df are >=0.')

    # Row sums (one value per position)
    sums = prob_df.sum(axis=1).values

    # If any row sums are close to zero, abort: dividing by them would
    # not give a meaningful normalization
    check(not np.isclose(sums, 0.0).any(),
          'some rows in prob_df sum to nearly zero.')

    # If any sums are not close to one, renormalize all rows
    if not np.isclose(sums, 1.0).all():
        print('in validate_probability_mat(): '
              'Row sums in df are not close to 1. '
              'Renormalizing rows...')
        # div() returns a new float frame; assigning floats into the
        # existing frame would depend on dtype upcasting when the input
        # holds integers (e.g. raw counts)
        prob_df = prob_df.div(sums, axis=0)

    # Return validated probability matrix to user
    return prob_df
@handle_errors
def validate_numeric(value, name, min_val=None, max_val=None, allow_none=False,
                     min_inclusive=True, max_inclusive=True):
    """Check that a parameter is numeric and, optionally, within bounds.

    Parameters
    ----------
    value : object
        The value being validated.
    name : str
        Name of the parameter, used only to build error messages.
    min_val : float, optional
        Lower bound; no lower-bound check is made when None.
    max_val : float, optional
        Upper bound; no upper-bound check is made when None.
    allow_none : bool, optional
        When True, a value of None is accepted and returned as None.
    min_inclusive : bool, optional
        If True (default) the value may equal min_val; otherwise it must
        be strictly greater.
    max_inclusive : bool, optional
        If True (default) the value may equal max_val; otherwise it must
        be strictly smaller.

    Returns
    -------
    float or None
        The value converted to float, or None when value is None and
        allow_none is set.
    """
    # None bypasses every other check, but only when explicitly permitted
    if value is None and allow_none:
        return None

    # Type check comes first so the float() conversion below is safe
    is_number = isinstance(value, (int, float, np.number))
    check(is_number, f'type({name}) = {type(value)}; must be numeric')

    number = float(value)

    # Lower bound: inclusive or strict, depending on min_inclusive
    if min_val is not None and min_inclusive:
        check(number >= min_val,
              f'{name} = {value} must be >= {min_val}')
    elif min_val is not None:
        check(number > min_val,
              f'{name} = {value} must be > {min_val}')

    # Upper bound: inclusive or strict, depending on max_inclusive
    if max_val is not None and max_inclusive:
        check(number <= max_val,
              f'{name} = {value} must be <= {max_val}')
    elif max_val is not None:
        check(number < max_val,
              f'{name} = {value} must be < {max_val}')

    return number