Module guidemaker.doench_featurization
doench_featurization.py. Simplified feature extraction to run the model 'V3_model_nopos' from Doench et al. 2016 for on-target scoring.
For use in Guidemaker https://guidemaker.org. Adam Rivers, United States Department of Agriculture, Agricultural Research Service.
Core code https://github.com/MicrosoftResearch/Azimuth is in Python 2 and does not run well given changes to its dependencies. Miles Smith worked on porting it to Python 3 in this repo: https://github.com/milescsmith/Azimuth, including a branch that uses Poetry for builds. That work is not complete.
This work is derivative of that BSD 3-clause licensed work. The key changes are: 1. Much of the code needed for tasks other than prediction with the V3_model_nopos model was removed. 2. The calculation of NGGX features was rewritten, fixing a bug that prevented efficient scaling to thousands of guides. 3. The pickled scikit-learn model was replaced with an ONNX model and onnxruntime for better persistence, security, and performance.
Reference:
John G. Doench, Nicolo Fusi, Meagan Sullender, Mudra Hegde, Emma W. Vaimberg, Katherine F. Donovan, Ian Smith, Zuzana Tothova, Craig Wilen, Robert Orchard, Herbert W. Virgin, Jennifer Listgarten, David E. Root. Optimized sgRNA design to maximize activity and minimize off-target effects for genetic screens with CRISPR-Cas9. Nature Biotechnology, January 2016, doi:10.1038/nbt.3437.
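A minimal usage sketch (not from the original source; the guide sequence and learn_options values below are illustrative, and each 30-mer must carry GG at 0-based positions 25:27 to pass the PAM audit):

import pandas as pd
from guidemaker.doench_featurization import parallel_featurize_data

# one hypothetical 30-mer with the required GG PAM at positions 25:27
guides = pd.DataFrame({"30mer": ["ACGTACGTACGTACGTACGTACGTAGGTAC"], "Strand": [pd.NA]})
learn_options = {
    "nuc_features": True,
    "gc_features": True,
    "include_NGGX_interaction": True,
    "include_Tm": True,
}
feature_sets = parallel_featurize_data(guides, learn_options, num_threads=1)
print(sorted(feature_sets.keys()))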
Source code
"""doench_featurization.py. Simplified feature extraction to run the model 'V3_model_nopos' from Doench et al. 2016 for
on-target scoring.
For use in Guidemaker https://guidemaker.org.
Adam Rivers, Unites States Department of Agriculture, Agricultural Research Service
Core code https://github.com/MicrosoftResearch/Azimuth is in Python2 and does not run well given changes to packages.
Miles Smith worked on porting to Python3 in this repo: https://github.com/milescsmith/Azimuth. including a new branch
that used Poetry to build. The work is not complete.
This work is derivitive of that BSD 3-clause licensed work. The key changes are:
1. uch of the code needed for tasks other thant prediction of the V3_model_nopos was removed.
2. The Calculation of NGGX features was re-written. A bug that prevented scaling to thousands guides efficiently.
3. the Pickle model and scikit-learn were replaced with an Onnx model and onnxruntime for better persistance,
security, and performance.
Reference:
John G. Doench*, Nicolo Fusi*, Meagan Sullender*, Mudra Hegde*, Emma W. Vaimberg*, Katherine F. Donovan, Ian Smith,
Zuzana Tothova, Craig Wilen , Robert Orchard , Herbert W. Virgin, Jennifer Listgarten*, David E. Root.
Optimized sgRNA design to maximize activity and minimize off-target effects for genetic screens with CRISPR-Cas9.
Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437.
"""
from itertools import product, islice
import logging
from functools import partial
from multiprocessing import Pool
from collections import deque
import numpy as np
import pandas as pd
from Bio.SeqUtils import MeltingTemp as Tm
logger = logging.getLogger(__name__)

def featurize_data(data, learn_options, pam_audit=True, length_audit=True) -> dict:
    """Create a dictionary of feature data.
    Args:
        data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
        learn_options (dict): dict of model training parameters
        pam_audit (bool): should a check for GG at position 25:27 be performed?
        length_audit (bool): should sequence length be checked?
    Returns:
        (dict): a dict containing pandas DataFrames of features
    """
    logger.info("Creating features for Doench et al. 2016 score prediction")
    if np.any(data["30mer"].str.len() != 30):
        raise AssertionError("Sequences should be 30 nt long")
    feature_sets = {}
    if learn_options["nuc_features"]:
        # spectrum kernels (position-independent) and weighted degree kernels (position-dependent)
        logger.info("Creating nucleotide features")
        (feature_sets["_nuc_pd_Order1"], feature_sets["_nuc_pi_Order1"],
         feature_sets["_nuc_pd_Order2"], feature_sets["_nuc_pi_Order2"]) = get_nuc_features(data)
        logger.info("Verifying nucleotide features")
        check_feature_set(feature_sets)
    if learn_options["gc_features"]:
        logger.info("Creating GC features")
        gc_above_10, gc_below_10, gc_count = gc_features(data, length_audit)
        feature_sets["gc_above_10"] = pd.DataFrame(gc_above_10)
        feature_sets["gc_below_10"] = pd.DataFrame(gc_below_10)
        feature_sets["gc_count"] = pd.DataFrame(gc_count)
        logger.info("GC features complete")
    if learn_options["include_NGGX_interaction"]:
        logger.info("Creating NGGX features")
        feature_sets["NGGX"] = nggx_interaction_feature(data, pam_audit)
    if learn_options["include_Tm"]:
        logger.info("Creating Tm features")
        feature_sets["Tm"] = Tm_feature(data, pam_audit, learn_options=None)
    check_feature_set(feature_sets)
    logger.info("Final feature check complete")
    return feature_sets

def parallel_featurize_data(data, learn_options, pam_audit=True, length_audit=True, num_threads=1) -> dict:
    """Use multiprocessing to divide up the creation of ML features for Doench scoring.
    Creates a dictionary of feature data.
    Args:
        data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
        learn_options (dict): dict of model training parameters
        pam_audit (bool): should a check for GG at position 25:27 be performed?
        length_audit (bool): should sequence length be checked?
        num_threads (int): number of worker processes to use
    Returns:
        (dict): a dict containing pandas DataFrames of features
    """
    if num_threads > 1:
        dflist = np.array_split(data, num_threads)
        partial_fd = partial(featurize_data, learn_options=learn_options, pam_audit=pam_audit,
                             length_audit=length_audit)
        with Pool(processes=num_threads) as pool:
            result = pool.map(partial_fd, dflist)
        # reassemble each feature set by concatenating the per-chunk DataFrames
        featdict = dict.fromkeys(result[0].keys())
        for featkey in featdict.keys():
            featdict[featkey] = pd.concat([d1[featkey] for d1 in result])
        return featdict
    return featurize_data(data=data, learn_options=learn_options, pam_audit=pam_audit, length_audit=length_audit)

def get_nuc_features(data):
    """Create first- and second-order nucleotide features.
    Args:
        data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
    Returns:
        (tuple): a tuple with 4 pandas DataFrames
    """
    seqlen = 30
    # create first header
    nuc_pi_Order1_header = [x[0] for x in product('ATCG', repeat=1)]
    # create second header
    nuc_pd_Order1_header = []
    for i in range(seqlen):
        nuc_pd_Order1_header.extend([x + "_" + str(i) for x in nuc_pi_Order1_header])
    # create third header
    nuc_pi_Order2_header = ["".join(x) for x in product('ATCG', repeat=2)]
    # create fourth header
    nuc_pd_Order2_header = []
    for i in range(seqlen - 1):
        nuc_pd_Order2_header.extend([x + "_" + str(i) for x in nuc_pi_Order2_header])
    # create lists for holding features
    nuc_pd_Order1_list = []
    nuc_pi_Order1_list = []
    nuc_pd_Order2_list = []
    nuc_pi_Order2_list = []
    # create identity matrices and lookups for one-hot encoding
    o1_id = np.eye(4)
    o2_id = np.eye(16)
    o1_lookup = {x: i for i, x in enumerate(nuc_pi_Order1_header)}
    o2_lookup = {x: i for i, x in enumerate(nuc_pi_Order2_header)}

    # not feasible to use pandas.get_dummies for this
    def one_hot(seq, idmat, lookup):
        """One-hot encode an iterable, matching the items in a lookup.
        Args:
            seq (str): a 30-mer sequence string
            idmat (np.array): a NumPy identity matrix
            lookup (dict): a dictionary matching the substring to the row in idmat
        Returns:
            pd.Series: one-hot encoding
        """
        featurevect = []
        for let in seq:
            pos = lookup[let]
            featurevect.extend(list(idmat[pos, :]))
        return pd.Series(featurevect)

    def sliding_window(iterable, n):
        """Create a generator of substrings.
        Args:
            iterable (iterable): an iterable object like a string or list
            n (int): the size of the window or kmer
        Returns:
            a generator of substrings
        """
        # sliding_window('ABCDEFG', 4) -> ABCD BCDE CDEF DEFG
        it = iter(iterable)
        window = deque(islice(it, n), maxlen=n)
        if len(window) == n:
            yield tuple(window)
        for x in it:
            window.append(x)
            yield tuple(window)

    for seq in data["30mer"]:
        # add order 1 frequency features
        pi1dict = dict.fromkeys(nuc_pi_Order1_header, 0)
        for let in seq:
            pi1dict[let] += 1
        nuc_pi_Order1_list.append(pi1dict)
        # add order 1 position features
        nuc_pd_Order1_list.append(one_hot(seq, o1_id, o1_lookup))
        # create list of 2-mers
        seq_2mers = ["".join(x) for x in sliding_window(seq, 2)]
        # add order 2 frequency features
        pi2dict = dict.fromkeys(nuc_pi_Order2_header, 0)
        for let in seq_2mers:
            pi2dict[let] += 1
        nuc_pi_Order2_list.append(pi2dict)
        # add order 2 position features
        nuc_pd_Order2_list.append(one_hot(seq_2mers, o2_id, o2_lookup))
    # create DataFrames
    nuc_pd_Order1 = pd.DataFrame(data=nuc_pd_Order1_list)
    nuc_pd_Order1.columns = nuc_pd_Order1_header
    nuc_pi_Order1 = pd.DataFrame(data=nuc_pi_Order1_list, columns=nuc_pi_Order1_header)
    nuc_pd_Order2 = pd.DataFrame(data=nuc_pd_Order2_list)
    nuc_pd_Order2.columns = nuc_pd_Order2_header
    nuc_pi_Order2 = pd.DataFrame(data=nuc_pi_Order2_list, columns=nuc_pi_Order2_header)
    return nuc_pd_Order1, nuc_pi_Order1, nuc_pd_Order2, nuc_pi_Order2

def check_feature_set(feature_sets):
    """Ensure the number of individuals (rows) is the same in each feature set.
    Args:
        feature_sets (dict): the feature set dictionary
    Returns:
        None
    """
    if feature_sets == {}:
        raise AssertionError("no feature sets present")
    num = None
    for ft in feature_sets:
        num2 = feature_sets[ft].shape[0]
        if num is None:
            num = num2
        else:
            if num < 1:
                raise AssertionError("should be at least one individual")
            if num != num2:
                raise AssertionError(
                    "Number of individuals do not match up across feature sets"
                )
    for item in feature_sets:
        if np.any(np.isnan(feature_sets[item])):
            raise Exception(f"found NaN in set {item}")

def nggx_interaction_feature(data, pam_audit=True):
    """One-hot encode the NX sequence around the PAM site NGGX.
    Args:
        data (pandas.DataFrame): a DataFrame of 30-mer and strand (filled with NA)
        pam_audit (bool): should a check for GG at position 25:27 be performed?
    Returns:
        (pandas.DataFrame): a DataFrame with 16 columns containing a one-hot encoding of NX data
    """
    # function completely replaced from Doench et al. (2016); the old function had quadratic time complexity
    sequence = data["30mer"].values
    nxcombos = ["".join(i) for i in product('ACGT', repeat=2)]
    nxlist = []
    # check that GG is where we think it is
    for seq in sequence:
        if pam_audit and seq[25:27] != "GG":
            raise Exception(f"expected GG but found {seq[25:27]}")
        nxlist.append(seq[24] + seq[27])
    nxs = pd.Series(nxlist)
    feat_nx = pd.get_dummies(nxs)
    # add any missing columns so all 16 NX combinations are present
    for i, header in enumerate(nxcombos):
        if header not in feat_nx.columns:
            feat_nx.insert(i, header, 0)
    feat_nx.columns = ["NGGX" + i + "_0" for i in nxcombos]
    feat_nx = feat_nx.astype('float64')
    return feat_nx

def countGC(s, length_audit=True):
    """GC count for only the 20-mer protospacer, as per the Doench paper/code."""
    if length_audit:
        if len(s) != 30:
            raise AssertionError("seems to assume 30mer")
    return len(s[4:24].replace("A", "").replace("T", ""))

def organism_feature(data):
    """Human vs. mouse."""
    organism = np.array(data["Organism"].values)
    feat = pd.DataFrame(organism)
    return feat

def gc_cont(seq):
    """Return the fraction of G and C bases in a sequence."""
    return (seq.count("G") + seq.count("C")) / float(len(seq))

def Tm_feature(data, pam_audit=True, learn_options=None):
    """Calculate melting-temperature features, assuming '30mer' is a column.
    Melting temperatures are computed for:
    0. the 30-mer ("global Tm")
    1. the Tm (melting temperature) of the DNA:RNA hybrid from positions 16-20 of the sgRNA,
       i.e. the 5 nt immediately proximal of the NGG PAM
    2. the Tm of the DNA:RNA hybrid from positions 8-15 (i.e. 8 nt)
    3. the Tm of the DNA:RNA hybrid from positions 3-7 (i.e. 5 nt)
    """
    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11)]
    else:
        segments = learn_options["Tm segments"]
    sequence = data["30mer"].values
    featarray = np.ones((sequence.shape[0], 4))
    rna = False
    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            raise Exception(f"expected GG but found {seq[25:27]}")
        featarray[i, 0] = Tm.Tm_NN(seq, nn_table=Tm.RNA_NN2)  # 30mer Tm
        featarray[i, 1] = Tm.Tm_NN(
            seq[segments[0][0]: segments[0][1]], nn_table=Tm.RNA_NN2
        )  # 5 nt immediately proximal of the NGG PAM
        featarray[i, 2] = Tm.Tm_NN(
            seq[segments[1][0]: segments[1][1]], nn_table=Tm.RNA_NN2
        )  # 8-mer
        featarray[i, 3] = Tm.Tm_NN(
            seq[segments[2][0]: segments[2][1]], nn_table=Tm.RNA_NN2
        )  # 5-mer
    feat = pd.DataFrame(
        featarray,
        index=data.index,
        columns=[
            f"Tm global_{rna}",
            f"5mer_end_{rna}",
            f"8mer_middle_{rna}",
            f"5mer_start_{rna}",
        ],
    )
    return feat

def gc_features(data, audit=True):
    """Return GC>10 and GC<10 indicator features and the GC count for each 30-mer."""
    gc_count = data["30mer"].apply(lambda seq: countGC(seq, audit))
    gc_count.name = "GC count"
    gc_above_10 = (gc_count > 10) * 1
    gc_above_10.name = "GC > 10"
    gc_below_10 = (gc_count < 10) * 1
    gc_below_10.name = "GC < 10"
    return gc_above_10, gc_below_10, gc_count

def normalize_features(data, axis):
    """Mean-center and scale each feature to unit variance.
    Input: pd.DataFrame of dtype np.float64.
    """
    data -= data.mean(axis)
    data /= data.std(axis)
    # remove columns with NaNs (e.g. zero-variance features)
    data = data.dropna(axis=1)
    if np.any(np.isnan(data.values)):
        raise Exception("found NaN in normalized features")
    return data
Functions
def Tm_feature(data, pam_audit=True, learn_options=None)
Calculate melting-temperature features, assuming '30mer' is a column. Melting temperatures are computed for:
0. the 30-mer ("global Tm")
1. the Tm (melting temperature) of the DNA:RNA hybrid from positions 16-20 of the sgRNA, i.e. the 5 nt immediately proximal of the NGG PAM
2. the Tm of the DNA:RNA hybrid from positions 8-15 (i.e. 8 nt)
3. the Tm of the DNA:RNA hybrid from positions 3-7 (i.e. 5 nt)
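A short sketch of calling Tm_feature directly (the 30-mer below is hypothetical; it must carry GG at positions 25:27 unless pam_audit=False):

import pandas as pd
from guidemaker.doench_featurization import Tm_feature

guides = pd.DataFrame({"30mer": ["ACGTACGTACGTACGTACGTACGTAGGTAC"]})
tm = Tm_feature(guides, pam_audit=True)
# four columns: Tm global_False, 5mer_end_False, 8mer_middle_False, 5mer_start_False
print(tm.iloc[0])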
def check_feature_set(feature_sets)
Ensure the number of individuals (rows) is the same in each feature set.
Args
    feature_sets (dict): the feature set dictionary
Returns
    None
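A sketch of the row-count validation on hypothetical feature sets:

import pandas as pd
from guidemaker.doench_featurization import check_feature_set

same_rows = {"a": pd.DataFrame({"x": [1.0, 2.0]}), "b": pd.DataFrame({"y": [3.0, 4.0]})}
check_feature_set(same_rows)  # passes silently

mismatched = {"a": pd.DataFrame({"x": [1.0, 2.0]}), "b": pd.DataFrame({"y": [3.0]})}
check_feature_set(mismatched)  # raises AssertionError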
def countGC(s, length_audit=True)
GC count for only the 20-mer protospacer, as per the Doench paper/code.
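For example, only positions 4:24 (the 20-nt protospacer) contribute to the count; the 30-mer below is hypothetical:

from guidemaker.doench_featurization import countGC

# positions 4:24 are ACGT repeated five times, so 10 G/C bases
countGC("ACGTACGTACGTACGTACGTACGTAGGTAC")  # returns 10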
def featurize_data(data, learn_options, pam_audit=True, length_audit=True) -> dict
Create a dictionary of feature data.
Args
    data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
    learn_options (dict): dict of model training parameters
    pam_audit (bool): should a check for GG at position 25:27 be performed?
    length_audit (bool): should sequence length be checked?
Returns
    (dict): a dict containing pandas DataFrames of features
def gc_cont(seq)
Return the fraction of G and C bases in a sequence.
def gc_features(data, audit=True)
Return GC>10 and GC<10 indicator features and the GC count for each 30-mer.
def get_nuc_features(data)
Create first- and second-order nucleotide features.
Args
    data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
Returns
    (tuple): a tuple with 4 pandas DataFrames
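A sketch of the expected shapes for a single hypothetical guide (4 bases x 30 positions, 4 base counts, 16 dimers x 29 positions, 16 dimer counts):

import pandas as pd
from guidemaker.doench_featurization import get_nuc_features

guides = pd.DataFrame({"30mer": ["ACGTACGTACGTACGTACGTACGTAGGTAC"]})
pd1, pi1, pd2, pi2 = get_nuc_features(guides)
print(pd1.shape, pi1.shape, pd2.shape, pi2.shape)  # (1, 120) (1, 4) (1, 464) (1, 16)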
def nggx_interaction_feature(data, pam_audit=True)
One-hot encode the NX sequence around the PAM site NGGX.
Args
    data (pandas.DataFrame): a DataFrame of 30-mer and strand (filled with NA)
    pam_audit (bool): should a check for GG at position 25:27 be performed?
Returns
    (pandas.DataFrame): a DataFrame with 16 columns containing a one-hot encoding of NX data
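For the hypothetical guide below, N (position 24) is A and X (position 27) is T, so the NGGXAT_0 column is 1 and the other 15 columns are 0:

import pandas as pd
from guidemaker.doench_featurization import nggx_interaction_feature

guides = pd.DataFrame({"30mer": ["ACGTACGTACGTACGTACGTACGTAGGTAC"]})
feat = nggx_interaction_feature(guides)
print(feat.loc[0, "NGGXAT_0"])  # 1.0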
def normalize_features(data, axis)
Mean-center and scale each feature to unit variance. Input: pd.DataFrame of dtype np.float64.
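A sketch of the behavior on a toy frame; a zero-variance column becomes NaN after scaling and is dropped:

import pandas as pd
from guidemaker.doench_featurization import normalize_features

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 2.0, 2.0]})
out = normalize_features(df, axis=0)
print(out)  # column "a" is mean-centered with unit variance; column "b" is dropped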
def organism_feature(data)
Human vs. mouse.
def parallel_featurize_data(data, learn_options, pam_audit=True, length_audit=True, num_threads=1) -> dict
Use multiprocessing to divide up the creation of ML features for Doench scoring. Creates a dictionary of feature data.
Args
    data (pd.DataFrame): 30-mer sequences in column 1 and strand in column 2
    learn_options (dict): dict of model training parameters
    pam_audit (bool): should a check for GG at position 25:27 be performed?
    length_audit (bool): should sequence length be checked?
    num_threads (int): number of worker processes to use
Returns
    (dict): a dict containing pandas DataFrames of features
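A sketch with two worker processes (guide sequences hypothetical; on platforms that spawn workers, such as macOS and Windows, run this under an if __name__ == "__main__": guard so the pool can import the module):

import pandas as pd
from guidemaker.doench_featurization import parallel_featurize_data

guides = pd.DataFrame({"30mer": ["ACGTACGTACGTACGTACGTACGTAGGTAC",
                                 "TTTTACGTACGTACGTACGTACGTCGGAAA"]})
learn_options = {"nuc_features": True, "gc_features": True,
                 "include_NGGX_interaction": True, "include_Tm": True}
feats = parallel_featurize_data(guides, learn_options, num_threads=2)
print(feats["Tm"].shape)  # (2, 4)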