from __future__ import division
import numpy as np
import pandas as pd
# Logomaker imports
from logomaker.src.error_handling import check, handle_errors
from logomaker.src.validate import validate_matrix
# Specifies built-in character alphabets
ALPHABET_DICT = {
'dna': 'ACGT',
'rna': 'ACGU',
'protein': 'ACDEFGHIKLMNPQRSTVWY'
}
# Specifies IUPAC string transformations
IUPAC_DICT = {
'A': 'A',
'C': 'C',
'G': 'G',
'T': 'T',
'R': 'AG',
'Y': 'CT',
'S': 'GC',
'W': 'AT',
'K': 'GT',
'M': 'AC',
'B': 'CGT',
'D': 'AGT',
'H': 'ACT',
'V': 'ACG',
'N': 'ACGT'
}
# Set constants
SMALL = np.finfo(float).tiny
MATRIX_TYPES = {'counts', 'probability', 'weight', 'information'}
def _counts_mat_to_probability_mat(counts_df, pseudocount=1.0):
"""
Converts a counts matrix to a probability matrix
"""
# Validate matrix before use
counts_df = validate_matrix(counts_df)
# Check pseudocount value
check(pseudocount >= 0, "pseudocount must be >= 0.")
# Compute prob_df
prob_df = counts_df.copy().astype(float) # 25.01.19 Issue #41 - Need to specify float to avoid FutureWarning.
vals = counts_df.values + pseudocount
prob_df.loc[:, :] = vals / vals.sum(axis=1)[:, np.newaxis]
prob_df = _normalize_matrix(prob_df)
# Validate and return
prob_df = validate_matrix(prob_df, matrix_type='probability')
return prob_df
def _probability_mat_to_weight_mat(prob_df, background=None):
"""
Converts a probability matrix to a weight matrix
"""
# Validate matrix before use
prob_df = validate_matrix(prob_df, matrix_type='probability')
# Get background matrix
bg_df = _get_background_mat(prob_df, background)
# Compute weight_df; SMALL is a regularization factor to make sure
# np.log2 doesn't throw an error.
weight_df = prob_df.copy()
weight_df.loc[:, :] = np.log2(prob_df + SMALL) - np.log2(bg_df + SMALL)
# Validate and return
weight_df = validate_matrix(weight_df)
return weight_df
def _weight_mat_to_probability_mat(weight_df, background=None):
"""
Converts a weight matrix to a probability matrix
"""
# Validate matrix before use
weight_df = validate_matrix(weight_df)
# Get background matrix
bg_df = _get_background_mat(weight_df, background)
# Compute prob_df
prob_df = weight_df.copy()
prob_df.loc[:, :] = bg_df.values * np.power(2, weight_df.values)
# Normalize matrix. Needed if matrix is centered.
prob_df = _normalize_matrix(prob_df)
# Validate and return
prob_df = validate_matrix(prob_df, matrix_type='probability')
return prob_df
def _probability_mat_to_information_mat(prob_df, background=None):
"""
Converts a probability matrix to an information matrix
"""
# Validate matrix before use
prob_df = validate_matrix(prob_df, matrix_type='probability')
# Get background matrix
bg_df = _get_background_mat(prob_df, background)
# Compute info_df
info_df = prob_df.copy()
fg_vals = prob_df.values
bg_vals = bg_df.values
tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL))
info_vec = tmp_vals.sum(axis=1)
info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis]
# Validate and return
info_df = validate_matrix(info_df, matrix_type='information')
return info_df
def _information_mat_to_probability_mat(info_df, background=None):
"""
Converts an information matrix to an probability matrix
"""
# Validate matrix before use
info_df = validate_matrix(info_df, matrix_type='information')
# Get background matrix
bg_df = _get_background_mat(info_df, background)
# This is a little subtle. If any rows of info_df are zero,
# _normalize_matrix() cannot be relied on. But in this case,
# we know that the corresponding row of prob_df should just
# reflect background. So we replace rows that are zero with the
# corresponding bg_df row.
zero_indices = np.isclose(info_df.sum(axis=1), 0.0)
info_df.loc[zero_indices, :] = bg_df.loc[zero_indices, :]
# Just need to normalize matrix
prob_df = _normalize_matrix(info_df)
# Validate and return
prob_df = validate_matrix(prob_df, matrix_type='probability')
return prob_df
def _normalize_matrix(df):
"""
Normalizes a matrix df to a probability matrix prob_df
"""
# Validate matrix
df = validate_matrix(df)
# Make sure all df values are greater than or equal to zero
check(all(df.values.ravel() >= 0),
'Some data frame entries are negative.')
# Check to see if values sum to one
sums = df.sum(axis=1).values
# If any sums are close to zero, abort
check(not any(np.isclose(sums, 0.0)),
'Some columns in df sum to nearly zero.')
# Create normalized version of input matrix
prob_df = df.copy()
prob_df.loc[:, :] = df.values / sums[:, np.newaxis]
# Validate and return probability matrix
prob_df = validate_matrix(prob_df, matrix_type='probability')
return prob_df
def _center_matrix(df):
"""
Centers each row of a matrix about zero by subtracting out the mean.
"""
# Validate matrix
df = validate_matrix(df)
# Compute mean value of each row
means = df.mean(axis=1).values
# Subtract out means
out_df = df.copy()
out_df.loc[:, :] = df.values - means[:, np.newaxis]
# Validate and return
out_df = validate_matrix(out_df)
return out_df
def _get_background_mat(df, background):
"""
Creates a background matrix given a background specification. There
are three possiblities:
1. background is None => out_df represents a uniform background
2. background is a vector => this vector is normalized then used as
the entries of the rows of out_df. Vector must be the same length
as the number of columns in df
3. background is a dataframe => it is then normalized and use as out_df.
In this case, background must have the same rows and cols as df
"""
# Get dimensions of df
num_pos, num_cols = df.shape
# Create background using df as template
bg_df = df.copy()
# If background is not specified, use uniform background
if background is None:
bg_df.loc[:, :] = 1 / num_cols
# If background is array-like
elif isinstance(background, (np.ndarray, list, tuple)):
background = np.array(background)
check(len(background) == num_cols,
'df and background have mismatched dimensions.')
bg_df.loc[:, :] = background[np.newaxis, :]
bg_df = _normalize_matrix(bg_df)
# If background is a dataframe
elif isinstance(background, pd.core.frame.DataFrame):
bg_df = validate_matrix(background)
check(all(df.index == bg_df.index),
'Error: df and bg_mat have different indexes.')
check(all(df.columns == bg_df.columns),
'Error: df and bg_mat have different columns.')
bg_df = _normalize_matrix(bg_df)
# validate as probability matrix
bg_df = validate_matrix(bg_df, matrix_type='probability')
return bg_df
[docs]
@handle_errors
def alignment_to_matrix(sequences,
counts=None,
to_type='counts',
background=None,
characters_to_ignore='.-',
center_weights=False,
pseudocount=1.0):
"""
Generates matrix from a sequence alignment
parameters
----------
sequences: (list of strings)
A list of sequences, all of which must be the same length
counts: (None or list of numbers)
If not None, must be a list of numbers the same length os sequences,
containing the (nonnegative) number of times that each sequence was
observed. If None, defaults to 1.
to_type: (str)
The type of matrix to output. Must be 'counts', 'probability',
'weight', or 'information'
background: (array, or df)
Specification of background probabilities. If array, should be the
same length as df.columns and correspond to the probability of each
column's character. If df, should be a probability matrix the same
shape as df.
characters_to_ignore: (str)
Characters to ignore within sequences. This is often needed when
creating matrices from gapped alignments.
center_weights: (bool)
Whether to subtract the mean of each row, but only if to_type=='weight'.
pseudocount: (number >= 0.0)
Pseudocount to use when converting from counts to probabilities.
returns
-------
out_df: (dataframe)
A matrix of the requested type.
"""
# validate inputs
# Make sure sequences is list-like
check(isinstance(sequences, (list, tuple, np.ndarray, pd.Series)),
'sequences must be a list, tuple, np.ndarray, or pd.Series.')
sequences = list(sequences)
# Make sure sequences has at least 1 element
check(len(sequences) > 0, 'sequences must have length > 0.')
# Make sure all elements are sequences
check(all(isinstance(seq, str) for seq in sequences),
'sequences must all be of type string')
# validate characters_to_ignore
check(isinstance(characters_to_ignore, str),
'type(seq) = %s must be of type str' % type(characters_to_ignore))
# validate center_weights
check(isinstance(center_weights, bool),
'type(center_weights) = %s; must be bool.' % type(center_weights))
# Get sequence length
L = len(sequences[0])
# Make sure all sequences are the same length
check(all([len(s) == L for s in sequences]),
'all elements of sequences must have the same length.')
# validate counts as list-like
check(isinstance(counts, (list, tuple, np.ndarray, pd.Series)) or
(counts is None),
'counts must be None or a list, tuple, np.ndarray, or pd.Series.')
# make sure counts has the same length as sequences
if counts is None:
counts = np.ones(len(sequences))
else:
check(len(counts) == len(sequences),
'counts must be the same length as sequences;'
'len(counts) = %d; len(sequences) = %d' %
(len(counts), len(sequences)))
# 2025.01.19 Fix for Issue #25 - Cast counts to numpy array
counts = np.array(counts, dtype=int)
# validate background
check(isinstance(background, (type([]), np.ndarray, pd.DataFrame)) or
(background is None),
'type(background) = %s must be None or array-like or a dataframe.' %
type(background))
# Define valid types
valid_types = MATRIX_TYPES.copy()
# Check that to_type is valid
check(to_type in valid_types,
'to_type=%s; must be in %s' % (to_type, valid_types))
# Create a 2D array of characters
char_array = np.array([np.array(list(seq)) for seq in sequences])
# Get list of unique characters
unique_characters = np.unique(char_array.ravel())
unique_characters.sort()
# Remove characters to ignore
columns = [c for c in unique_characters if not c in characters_to_ignore]
index = list(range(L))
counts_df = pd.DataFrame(data=0, columns=columns, index=index)
# Sum of the number of occurrences of each character at each position
for c in columns:
tmp_mat = (char_array == c).astype(float) * counts[:, np.newaxis]
counts_df.loc[:, c] = tmp_mat.sum(axis=0).T
# Convert counts matrix to matrix of requested type
out_df = transform_matrix(counts_df,
from_type='counts',
to_type=to_type,
pseudocount=pseudocount,
background=background)
# Center values only if center_weights is True and to_type is 'weight'
if center_weights and to_type == 'weight':
out_df = transform_matrix(out_df, center_values=True)
return out_df
[docs]
@handle_errors
def sequence_to_matrix(seq,
cols=None,
alphabet=None,
is_iupac=False,
to_type='probability',
center_weights=False):
"""
Generates a matrix from a sequence. With default keyword arguments,
this is a one-hot-encoded version of the sequence provided. Alternatively,
is_iupac=True allows users to get matrix models based in IUPAC motifs.
parameters
----------
seq: (str)
Sequence from which to construct matrix.
cols: (str or array-like or None)
The characters to use for the matrix columns. If None, cols is
constructed from the unqiue characters in seq. Overriden by alphabet
and is_iupac.
alphabet: (str or None)
The alphabet used to determine the columns of the matrix.
Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.
is_iupac: (bool)
If True, it is assumed that the sequence represents an IUPAC DNA
string. In this case, cols is overridden, and alphabet must be None.
to_type: (str)
The type of matrix to output. Must be 'probability', 'weight',
or 'information'
center_weights: (bool)
Whether to subtract the mean of each row, but only if to_type='weight'.
returns
-------
seq_df: (dataframe)
the matrix returned to the user.
"""
# Define valid types
valid_types = MATRIX_TYPES.copy()
valid_types.remove('counts')
# validate seq
check(isinstance(seq, str),
'type(seq) = %s must be of type str' % type(seq))
# validate center_weights
check(isinstance(center_weights, bool),
'type(center_weights) = %s; must be bool.' % type(center_weights))
# If cols is None, set to list of unique characters in sequence
if cols is None:
cols = list(set(seq))
cols.sort()
# Otherwise, validate cols
else:
cols_types = (str, list, set, np.ndarray)
check(isinstance(cols, cols_types),
'cols = %s must be None or a string, set, list, or np.ndarray')
# If alphabet is specified, override cols
if alphabet is not None:
# Validate alphabet
valid_alphabets = list(ALPHABET_DICT.keys())
check(alphabet in valid_alphabets,
'alphabet = %s; must be in %s.' % (alphabet, valid_alphabets))
# Set cols
cols = list(ALPHABET_DICT[alphabet])
# validate to_type
check(to_type in valid_types,
'invalid to_type=%s; to_type must be in %s' % (to_type, valid_types))
# validate is_iupac
check(isinstance(is_iupac, bool),
'type(is_iupac) = %s; must be bool.' % type(is_iupac))
# If is_iupac, override alphabet and cols
if is_iupac:
# Check that alphabet has not been specified
check(alphabet is None, 'must have alphabet=None if is_iupac=True')
cols = list(ALPHABET_DICT['dna'])
# Initialize counts dataframe
L = len(seq)
index = list(range(L))
counts_df = pd.DataFrame(data=0.0, columns=cols, index=index)
# If is_iupac, fill counts_df:
if is_iupac:
# Get list of valid IUPAC characters
iupac_characters = list(IUPAC_DICT.keys())
# Iterate over sequence positions
for i, c in enumerate(seq):
# Check that c is in the set of valid IUPAC characters
check(c in iupac_characters,
'character %s at position %d is not a valid IUPAC character;'
'must be one of %s' %
(c, i, iupac_characters))
# Fill in a count for each possible base
bs = IUPAC_DICT[c]
for b in bs:
counts_df.loc[i, b] = 1.0
# Otherwise, fill counts the normal way
else:
# Iterate over sequence positions
for i, c in enumerate(seq):
# Check that c is in columns
check(c in cols,
'character %s at position %d is not in cols=%s' %
(c, i, cols))
# Increment counts_df
counts_df.loc[i, c] = 1.0
# Convert to requested type
out_df = transform_matrix(counts_df,
pseudocount=0,
from_type='counts',
to_type=to_type)
# Center values only if center_weights is True and to_type is 'weight'
if center_weights and to_type == 'weight':
out_df = transform_matrix(out_df, center_values=True)
return out_df
[docs]
@handle_errors
def saliency_to_matrix(seq, values, cols=None, alphabet=None):
"""
Takes a sequence string and an array of values values and outputs a
values dataframe. The returned dataframe is a L by C matrix where C is
the number ofcharacters and L is sequence length. If matrix is denoted as
S, i indexes positions and c indexes characters, then S_ic will be non-zero
(equal to the value in the values array at position p) only if character c
occurs at position p in sequence. All other elements of S are zero.
example usage:
saliency_mat = logomaker.saliency_to_matrix(sequence,values)
logomaker.Logo(saliency_mat)
parameters
----------
seq: (str or array-like list of single characters)
sequence for which values matrix is constructed
values: (array-like list of numbers)
array of values values for each character in sequence
cols: (str or array-like or None)
The characters to use for the matrix columns. If None, cols is
constructed from the unqiue characters in seq. Overridden by alphabet
and is_iupac.
alphabet: (str or None)
The alphabet used to determine the columns of the matrix.
Options are: 'dna', 'rna', 'protein'. Ignored if None. Overrides cols.
returns
-------
saliency_df: (dataframe)
values matrix in the form of a dataframe
"""
# try to convert seq to str; throw exception if fail
if isinstance(seq, (list, np.ndarray, pd.Series)):
try:
seq = ''.join([str(x) for x in seq])
except:
check(False, 'could not convert %s to type str' % repr(str))
else:
try:
seq = str(seq)
except:
check(False, 'could not convert %s to type str' % repr(str))
# validate seq
check(isinstance(seq, str),
'type(seq) = %s must be of type str' % type(seq))
# validate values: check that it is a list or array
check(isinstance(values, (type([]), np.ndarray, pd.Series)),
'type(values) = %s must be of type list' % type(values))
# cast values as a list just to be sure what we're working with
values = list(values)
# check length of seq and values are equal
check(len(seq) == len(values),
'length of seq and values list must be equal.')
# If cols is None, set to list of unique characters in sequence
if cols is None:
cols = list(set(seq))
cols.sort()
# Otherwise, validate cols
else:
cols_types = (str, list, set, np.ndarray)
check(isinstance(cols, cols_types),
'cols = %s must be None or a string, set, list, or np.ndarray')
# perform additional checks to validate cols
check(len(set(cols))==len(set(seq)),
'length of set of unique characters must be equal for "cols " and "seq"')
check(set(cols) == set(seq),
'unique characters for "cols" and "seq" must be equal.')
# If alphabet is specified, override cols
if alphabet is not None:
# Validate alphabet
valid_alphabets = list(ALPHABET_DICT.keys())
check(alphabet in valid_alphabets,
'alphabet = %s; must be in %s.' % (alphabet, valid_alphabets))
# Set cols
cols = list(ALPHABET_DICT[alphabet])
# turn seq into binary one-hot encoded matrix.
ohe_sequence = sequence_to_matrix(seq, cols=cols)
# multiply values list with one-hot encoded seq to get
# values matrix or dataframe
saliency_df = ohe_sequence.copy()
saliency_df.loc[:, :] = ohe_sequence.values * \
np.array(values)[:, np.newaxis]
return saliency_df