Source code for logomaker.src.matrix

from __future__ import division
import numpy as np
import pandas as pd

# Logomaker imports
from logomaker.src.error_handling import check, handle_errors
from logomaker.src.validate import validate_matrix

# Specifies built-in character alphabets
ALPHABET_DICT = {
    'dna': 'ACGT',
    'rna': 'ACGU',
    'protein': 'ACDEFGHIKLMNPQRSTVWY'
}

# Specifies IUPAC string transformations
IUPAC_DICT = {
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    'R': 'AG',
    'Y': 'CT',
    'S': 'GC',
    'W': 'AT',
    'K': 'GT',
    'M': 'AC',
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ACT',
    'V': 'ACG',
    'N': 'ACGT'
}

# Set constants
SMALL = np.finfo(float).tiny
MATRIX_TYPES = {'counts', 'probability', 'weight', 'information'}


[docs] @handle_errors def transform_matrix(df, center_values=False, normalize_values=False, from_type=None, to_type=None, background=None, pseudocount=1): """ Performs transformations on a matrix. There are three types of transformations that can be performed: 1. Center values: Subtracts the mean from each row in df. This is common for weight matrices or energy matrices. To do this, set center_values=True. 2. Normalize values: Divides each row by the sum of the row. This is needed for probability matrices. To do this, set normalize_values=True. 3. From/To transformations: Transforms from one type of matrix (e.g. 'counts') to another type of matrix (e.g. 'information'). To do this, set from_type and to_type arguments. Here are the mathematical formulas invoked by From/To transformations: from_type='counts' -> to_type='probability': P_ic = (N_ic + l)/(N_i + C*l), N_i = sum_c(N_ic) from_type='probability' -> to_type='weight': W_ic = log_2(P_ic / Q_ic) from_type='weight' -> to_type='probability': P_ic = Q_ic * 2^(W_ic) from_type='probability' -> to_type='information': I_ic = P_ic * sum_d(P_id * log2(P_id / W_id)) from_type='information' -> to_type='probability': P_ic = I_ic / sum_d(I_id) notation: i = position c, d = character l = pseudocount C = number of characters N_ic = counts matrix element P_ic = probability matrix element Q_ic = background probability matrix element W_ic = weight matrix element I_ic = information matrix element Using these five 1-step transformations, 2-step transformations are also enabled, e.g., from_type='counts' -> to_type='information'. parameters ---------- df: (dataframe) The matrix to be transformed. center_values: (bool) Whether to center matrix values, i.e., subtract the mean from each row. normalize_values: (bool) Whether to normalize each row, i.e., divide each row by the sum of that row. from_type: (str) Type of input matrix. Must be one of 'counts', 'probability', 'weight', or 'information'. to_type: (str) Type of output matrix. Must be one of 'probability', 'weight', or 'information'. Can be 'counts' ONLY if from_type is 'counts' too. background: (array, or df) Specification of background probabilities. If array, should be the same length as df.columns and correspond to the probability of each column's character. If df, should be a probability matrix the same shape as df. pseudocount: (number >= 0) Pseudocount to use when transforming from a counts matrix to a probability matrix. returns ------- out_df: (dataframe) Transformed matrix """ # validate matrix dataframe df = validate_matrix(df) # validate center_values check(isinstance(center_values, bool), 'type(center_values) = %s must be of type bool' % type(center_values)) # validate normalize_values check(isinstance(normalize_values, bool), 'type(normalize_values) = %s must be of type bool' % type(normalize_values)) # validate from_type check((from_type in MATRIX_TYPES) or (from_type is None), 'from_type = %s must be None or in %s' % (from_type, MATRIX_TYPES)) # validate to_type check((to_type in MATRIX_TYPES) or (to_type is None), 'to_type = %s must be None or in %s' % (to_type, MATRIX_TYPES)) # validate background check(isinstance(background, (type([]), np.ndarray, pd.DataFrame)) or (background is None), 'type(background) = %s must be None or array-like or a dataframe.' % type(background)) # validate pseudocount check(isinstance(pseudocount, (int, float)), 'type(pseudocount) = %s must be a number' % type(pseudocount)) check(pseudocount >= 0, 'pseudocount=%s must be >= 0' % pseudocount) # If centering values, do that if center_values is True: check((from_type is None) and (to_type is None), "If center_values is True, both from_type and to_type" "must be None. Here, from_type=%s, to_type=%s" % (from_type, to_type)) # Do centering out_df = _center_matrix(df) # Otherwise, if normalizing values, do that elif normalize_values is True: check((from_type is None) and (to_type is None), "If normalize_values is True, both from_type and to_type" "must be None. Here, from_type=%s, to_type=%s" % (from_type, to_type)) # Do centering out_df = _normalize_matrix(df) # otherwise, if to_type == from_type, just return matrix # Note, this is the only way that to_type='counts' is valid elif from_type == to_type: out_df = df.copy() # Otherwise, we're converting from one type of matrix to another. Do this. else: # Check that from_type and to_type are not None check((from_type is not None) and (to_type is not None), 'Unless center_values is True or normalize_values is True,' 'Neither from_type (=%s) nor to_type (=%s) can be None.' % (from_type, to_type)) # Check that to_type != 'counts' check(to_type != 'counts', "Can only have to_type='counts' if " "from_type='counts'. Here, however, " "from_type='%s'" % from_type) # If converting from a probability matrix if from_type == 'probability': # ... to a weight matrix if to_type == 'weight': out_df = _probability_mat_to_weight_mat(df, background) # ... to an information matrix elif to_type == 'information': out_df = _probability_mat_to_information_mat(df, background) # This should never execute else: assert False, 'THIS SHOULD NEVER EXECUTE' # Otherwise, convert to probability matrix, then call function again else: # If converting from a counts matrix, # convert to probability matrix first if from_type == 'counts': prob_df = _counts_mat_to_probability_mat(df, pseudocount) # If converting from a weight matrix, # convert to probability matrix first elif from_type == 'weight': prob_df = _weight_mat_to_probability_mat(df, background) # If converting from an information matrix, # convert to probability matrix first elif from_type == 'information': prob_df = _information_mat_to_probability_mat(df, background) # This should never execute else: assert False, 'THIS SHOULD NEVER EXECUTE' # Now that we have the probability matrix, # onvert to user-specified to_type out_df = transform_matrix(prob_df, from_type='probability', to_type=to_type, background=background) # Validate and return out_df = validate_matrix(out_df) return out_df
def _counts_mat_to_probability_mat(counts_df, pseudocount=1.0): """ Converts a counts matrix to a probability matrix """ # Validate matrix before use counts_df = validate_matrix(counts_df) # Check pseudocount value check(pseudocount >= 0, "pseudocount must be >= 0.") # Compute prob_df prob_df = counts_df.copy().astype(float) # 25.01.19 Issue #41 - Need to specify float to avoid FutureWarning. vals = counts_df.values + pseudocount prob_df.loc[:, :] = vals / vals.sum(axis=1)[:, np.newaxis] prob_df = _normalize_matrix(prob_df) # Validate and return prob_df = validate_matrix(prob_df, matrix_type='probability') return prob_df def _probability_mat_to_weight_mat(prob_df, background=None): """ Converts a probability matrix to a weight matrix """ # Validate matrix before use prob_df = validate_matrix(prob_df, matrix_type='probability') # Get background matrix bg_df = _get_background_mat(prob_df, background) # Compute weight_df; SMALL is a regularization factor to make sure # np.log2 doesn't throw an error. weight_df = prob_df.copy() weight_df.loc[:, :] = np.log2(prob_df + SMALL) - np.log2(bg_df + SMALL) # Validate and return weight_df = validate_matrix(weight_df) return weight_df def _weight_mat_to_probability_mat(weight_df, background=None): """ Converts a weight matrix to a probability matrix """ # Validate matrix before use weight_df = validate_matrix(weight_df) # Get background matrix bg_df = _get_background_mat(weight_df, background) # Compute prob_df prob_df = weight_df.copy() prob_df.loc[:, :] = bg_df.values * np.power(2, weight_df.values) # Normalize matrix. Needed if matrix is centered. prob_df = _normalize_matrix(prob_df) # Validate and return prob_df = validate_matrix(prob_df, matrix_type='probability') return prob_df def _probability_mat_to_information_mat(prob_df, background=None): """ Converts a probability matrix to an information matrix """ # Validate matrix before use prob_df = validate_matrix(prob_df, matrix_type='probability') # Get background matrix bg_df = _get_background_mat(prob_df, background) # Compute info_df info_df = prob_df.copy() fg_vals = prob_df.values bg_vals = bg_df.values tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL)) info_vec = tmp_vals.sum(axis=1) info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis] # Validate and return info_df = validate_matrix(info_df, matrix_type='information') return info_df def _information_mat_to_probability_mat(info_df, background=None): """ Converts an information matrix to an probability matrix """ # Validate matrix before use info_df = validate_matrix(info_df, matrix_type='information') # Get background matrix bg_df = _get_background_mat(info_df, background) # This is a little subtle. If any rows of info_df are zero, # _normalize_matrix() cannot be relied on. But in this case, # we know that the corresponding row of prob_df should just # reflect background. So we replace rows that are zero with the # corresponding bg_df row. zero_indices = np.isclose(info_df.sum(axis=1), 0.0) info_df.loc[zero_indices, :] = bg_df.loc[zero_indices, :] # Just need to normalize matrix prob_df = _normalize_matrix(info_df) # Validate and return prob_df = validate_matrix(prob_df, matrix_type='probability') return prob_df def _normalize_matrix(df): """ Normalizes a matrix df to a probability matrix prob_df """ # Validate matrix df = validate_matrix(df) # Make sure all df values are greater than or equal to zero check(all(df.values.ravel() >= 0), 'Some data frame entries are negative.') # Check to see if values sum to one sums = df.sum(axis=1).values # If any sums are close to zero, abort check(not any(np.isclose(sums, 0.0)), 'Some columns in df sum to nearly zero.') # Create normalized version of input matrix prob_df = df.copy() prob_df.loc[:, :] = df.values / sums[:, np.newaxis] # Validate and return probability matrix prob_df = validate_matrix(prob_df, matrix_type='probability') return prob_df def _center_matrix(df): """ Centers each row of a matrix about zero by subtracting out the mean. """ # Validate matrix df = validate_matrix(df) # Compute mean value of each row means = df.mean(axis=1).values # Subtract out means out_df = df.copy() out_df.loc[:, :] = df.values - means[:, np.newaxis] # Validate and return out_df = validate_matrix(out_df) return out_df def _get_background_mat(df, background): """ Creates a background matrix given a background specification. There are three possiblities: 1. background is None => out_df represents a uniform background 2. background is a vector => this vector is normalized then used as the entries of the rows of out_df. Vector must be the same length as the number of columns in df 3. background is a dataframe => it is then normalized and use as out_df. In this case, background must have the same rows and cols as df """ # Get dimensions of df num_pos, num_cols = df.shape # Create background using df as template bg_df = df.copy() # If background is not specified, use uniform background if background is None: bg_df.loc[:, :] = 1 / num_cols # If background is array-like elif isinstance(background, (np.ndarray, list, tuple)): background = np.array(background) check(len(background) == num_cols, 'df and background have mismatched dimensions.') bg_df.loc[:, :] = background[np.newaxis, :] bg_df = _normalize_matrix(bg_df) # If background is a dataframe elif isinstance(background, pd.core.frame.DataFrame): bg_df = validate_matrix(background) check(all(df.index == bg_df.index), 'Error: df and bg_mat have different indexes.') check(all(df.columns == bg_df.columns), 'Error: df and bg_mat have different columns.') bg_df = _normalize_matrix(bg_df) # validate as probability matrix bg_df = validate_matrix(bg_df, matrix_type='probability') return bg_df
[docs] @handle_errors def alignment_to_matrix(sequences, counts=None, to_type='counts', background=None, characters_to_ignore='.-', center_weights=False, pseudocount=1.0): """ Generates matrix from a sequence alignment parameters ---------- sequences: (list of strings) A list of sequences, all of which must be the same length counts: (None or list of numbers) If not None, must be a list of numbers the same length os sequences, containing the (nonnegative) number of times that each sequence was observed. If None, defaults to 1. to_type: (str) The type of matrix to output. Must be 'counts', 'probability', 'weight', or 'information' background: (array, or df) Specification of background probabilities. If array, should be the same length as df.columns and correspond to the probability of each column's character. If df, should be a probability matrix the same shape as df. characters_to_ignore: (str) Characters to ignore within sequences. This is often needed when creating matrices from gapped alignments. center_weights: (bool) Whether to subtract the mean of each row, but only if to_type=='weight'. pseudocount: (number >= 0.0) Pseudocount to use when converting from counts to probabilities. returns ------- out_df: (dataframe) A matrix of the requested type. """ # validate inputs # Make sure sequences is list-like check(isinstance(sequences, (list, tuple, np.ndarray, pd.Series)), 'sequences must be a list, tuple, np.ndarray, or pd.Series.') sequences = list(sequences) # Make sure sequences has at least 1 element check(len(sequences) > 0, 'sequences must have length > 0.') # Make sure all elements are sequences check(all(isinstance(seq, str) for seq in sequences), 'sequences must all be of type string') # validate characters_to_ignore check(isinstance(characters_to_ignore, str), 'type(seq) = %s must be of type str' % type(characters_to_ignore)) # validate center_weights check(isinstance(center_weights, bool), 'type(center_weights) = %s; must be bool.' % type(center_weights)) # Get sequence length L = len(sequences[0]) # Make sure all sequences are the same length check(all([len(s) == L for s in sequences]), 'all elements of sequences must have the same length.') # validate counts as list-like check(isinstance(counts, (list, tuple, np.ndarray, pd.Series)) or (counts is None), 'counts must be None or a list, tuple, np.ndarray, or pd.Series.') # make sure counts has the same length as sequences if counts is None: counts = np.ones(len(sequences)) else: check(len(counts) == len(sequences), 'counts must be the same length as sequences;' 'len(counts) = %d; len(sequences) = %d' % (len(counts), len(sequences))) # 2025.01.19 Fix for Issue #25 - Cast counts to numpy array counts = np.array(counts, dtype=int) # validate background check(isinstance(background, (type([]), np.ndarray, pd.DataFrame)) or (background is None), 'type(background) = %s must be None or array-like or a dataframe.' % type(background)) # Define valid types valid_types = MATRIX_TYPES.copy() # Check that to_type is valid check(to_type in valid_types, 'to_type=%s; must be in %s' % (to_type, valid_types)) # Create a 2D array of characters char_array = np.array([np.array(list(seq)) for seq in sequences]) # Get list of unique characters unique_characters = np.unique(char_array.ravel()) unique_characters.sort() # Remove characters to ignore columns = [c for c in unique_characters if not c in characters_to_ignore] index = list(range(L)) counts_df = pd.DataFrame(data=0, columns=columns, index=index) # Sum of the number of occurrences of each character at each position for c in columns: tmp_mat = (char_array == c).astype(float) * counts[:, np.newaxis] counts_df.loc[:, c] = tmp_mat.sum(axis=0).T # Convert counts matrix to matrix of requested type out_df = transform_matrix(counts_df, from_type='counts', to_type=to_type, pseudocount=pseudocount, background=background) # Center values only if center_weights is True and to_type is 'weight' if center_weights and to_type == 'weight': out_df = transform_matrix(out_df, center_values=True) return out_df
[docs] @handle_errors def sequence_to_matrix(seq, cols=None, alphabet=None, is_iupac=False, to_type='probability', center_weights=False): """ Generates a matrix from a sequence. With default keyword arguments, this is a one-hot-encoded version of the sequence provided. Alternatively, is_iupac=True allows users to get matrix models based in IUPAC motifs. parameters ---------- seq: (str) Sequence from which to construct matrix. cols: (str or array-like or None) The characters to use for the matrix columns. If None, cols is constructed from the unqiue characters in seq. Overriden by alphabet and is_iupac. alphabet: (str or None) The alphabet used to determine the columns of the matrix. Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols. is_iupac: (bool) If True, it is assumed that the sequence represents an IUPAC DNA string. In this case, cols is overridden, and alphabet must be None. to_type: (str) The type of matrix to output. Must be 'probability', 'weight', or 'information' center_weights: (bool) Whether to subtract the mean of each row, but only if to_type='weight'. returns ------- seq_df: (dataframe) the matrix returned to the user. """ # Define valid types valid_types = MATRIX_TYPES.copy() valid_types.remove('counts') # validate seq check(isinstance(seq, str), 'type(seq) = %s must be of type str' % type(seq)) # validate center_weights check(isinstance(center_weights, bool), 'type(center_weights) = %s; must be bool.' % type(center_weights)) # If cols is None, set to list of unique characters in sequence if cols is None: cols = list(set(seq)) cols.sort() # Otherwise, validate cols else: cols_types = (str, list, set, np.ndarray) check(isinstance(cols, cols_types), 'cols = %s must be None or a string, set, list, or np.ndarray') # If alphabet is specified, override cols if alphabet is not None: # Validate alphabet valid_alphabets = list(ALPHABET_DICT.keys()) check(alphabet in valid_alphabets, 'alphabet = %s; must be in %s.' % (alphabet, valid_alphabets)) # Set cols cols = list(ALPHABET_DICT[alphabet]) # validate to_type check(to_type in valid_types, 'invalid to_type=%s; to_type must be in %s' % (to_type, valid_types)) # validate is_iupac check(isinstance(is_iupac, bool), 'type(is_iupac) = %s; must be bool.' % type(is_iupac)) # If is_iupac, override alphabet and cols if is_iupac: # Check that alphabet has not been specified check(alphabet is None, 'must have alphabet=None if is_iupac=True') cols = list(ALPHABET_DICT['dna']) # Initialize counts dataframe L = len(seq) index = list(range(L)) counts_df = pd.DataFrame(data=0.0, columns=cols, index=index) # If is_iupac, fill counts_df: if is_iupac: # Get list of valid IUPAC characters iupac_characters = list(IUPAC_DICT.keys()) # Iterate over sequence positions for i, c in enumerate(seq): # Check that c is in the set of valid IUPAC characters check(c in iupac_characters, 'character %s at position %d is not a valid IUPAC character;' 'must be one of %s' % (c, i, iupac_characters)) # Fill in a count for each possible base bs = IUPAC_DICT[c] for b in bs: counts_df.loc[i, b] = 1.0 # Otherwise, fill counts the normal way else: # Iterate over sequence positions for i, c in enumerate(seq): # Check that c is in columns check(c in cols, 'character %s at position %d is not in cols=%s' % (c, i, cols)) # Increment counts_df counts_df.loc[i, c] = 1.0 # Convert to requested type out_df = transform_matrix(counts_df, pseudocount=0, from_type='counts', to_type=to_type) # Center values only if center_weights is True and to_type is 'weight' if center_weights and to_type == 'weight': out_df = transform_matrix(out_df, center_values=True) return out_df
[docs] @handle_errors def saliency_to_matrix(seq, values, cols=None, alphabet=None): """ Takes a sequence string and an array of values values and outputs a values dataframe. The returned dataframe is a L by C matrix where C is the number ofcharacters and L is sequence length. If matrix is denoted as S, i indexes positions and c indexes characters, then S_ic will be non-zero (equal to the value in the values array at position p) only if character c occurs at position p in sequence. All other elements of S are zero. example usage: saliency_mat = logomaker.saliency_to_matrix(sequence,values) logomaker.Logo(saliency_mat) parameters ---------- seq: (str or array-like list of single characters) sequence for which values matrix is constructed values: (array-like list of numbers) array of values values for each character in sequence cols: (str or array-like or None) The characters to use for the matrix columns. If None, cols is constructed from the unqiue characters in seq. Overridden by alphabet and is_iupac. alphabet: (str or None) The alphabet used to determine the columns of the matrix. Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols. returns ------- saliency_df: (dataframe) values matrix in the form of a dataframe """ # try to convert seq to str; throw exception if fail if isinstance(seq, (list, np.ndarray, pd.Series)): try: seq = ''.join([str(x) for x in seq]) except: check(False, 'could not convert %s to type str' % repr(str)) else: try: seq = str(seq) except: check(False, 'could not convert %s to type str' % repr(str)) # validate seq check(isinstance(seq, str), 'type(seq) = %s must be of type str' % type(seq)) # validate values: check that it is a list or array check(isinstance(values, (type([]), np.ndarray, pd.Series)), 'type(values) = %s must be of type list' % type(values)) # cast values as a list just to be sure what we're working with values = list(values) # check length of seq and values are equal check(len(seq) == len(values), 'length of seq and values list must be equal.') # If cols is None, set to list of unique characters in sequence if cols is None: cols = list(set(seq)) cols.sort() # Otherwise, validate cols else: cols_types = (str, list, set, np.ndarray) check(isinstance(cols, cols_types), 'cols = %s must be None or a string, set, list, or np.ndarray') # perform additional checks to validate cols check(len(set(cols))==len(set(seq)), 'length of set of unique characters must be equal for "cols " and "seq"') check(set(cols) == set(seq), 'unique characters for "cols" and "seq" must be equal.') # If alphabet is specified, override cols if alphabet is not None: # Validate alphabet valid_alphabets = list(ALPHABET_DICT.keys()) check(alphabet in valid_alphabets, 'alphabet = %s; must be in %s.' % (alphabet, valid_alphabets)) # Set cols cols = list(ALPHABET_DICT[alphabet]) # turn seq into binary one-hot encoded matrix. ohe_sequence = sequence_to_matrix(seq, cols=cols) # multiply values list with one-hot encoded seq to get # values matrix or dataframe saliency_df = ohe_sequence.copy() saliency_df.loc[:, :] = ohe_sequence.values * \ np.array(values)[:, np.newaxis] return saliency_df