Module guidemaker.doench_predict
doench_predict.py. Simplified module to run the model 'V3_model_nopos' from Doench et al. 2016 for on-target scoring.
For use in Guidemaker https://guidemaker.org. Adam Rivers, United States Department of Agriculture, Agricultural Research Service
The core code, https://github.com/MicrosoftResearch/Azimuth, is written in Python 2 and no longer runs well given changes to its dependencies. Miles Smith worked on porting it to Python 3 in this repo: https://github.com/milescsmith/Azimuth, including a new branch that used Poetry to build. That work is not complete.
This work is a derivative of that BSD 3-Clause (Modified BSD) licensed work. The key changes are:
1. Much of the code needed for tasks other than prediction with the V3_model_nopos model was removed.
2. The calculation of NGGX features was rewritten to fix a bug that prevented efficient scaling to thousands of guides.
3. The pickled scikit-learn model was replaced with an ONNX model ('V3_model_nopos.onnx'), a metadata file ('V3_model_nopos_options.json'), and onnxruntime for better persistence, security, and performance.
Reference:
John G. Doench, Nicolo Fusi, Meagan Sullender, Mudra Hegde, Emma W. Vaimberg, Katherine F. Donovan, Ian Smith, Zuzana Tothova, Craig Wilen, Robert Orchard, Herbert W. Virgin, Jennifer Listgarten, David E. Root. Optimized sgRNA design to maximize activity and minimize off-target effects for genetic screens with CRISPR-Cas9. Nature Biotechnology, January 2016, doi:10.1038/nbt.3437.
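A minimal usage sketch (the 30-mer inputs below are hypothetical placeholders, not validated guides; predict() loads the bundled ONNX model by default):

import numpy as np
from guidemaker.doench_predict import predict

# Hypothetical 30-mers with the PAM's GG at positions 25:27,
# which is what the pam_audit=True check looks for.
seqs = np.array([
    "AAAACCCCGGGGTTTTAAAACCCCTGGGTT",
    "ACGTACGTACGTACGTACGTACGTAGGTTT",
])

scores = predict(seqs)  # one on-target regression score per sequence
print(scores.ravel())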
Expand source code
"""doench_predict.py. Simplified module to run the model 'V3_model_nopos' from Doench et al. 2016 for on-target
scoring.
For use in Guidemaker https://guidemaker.org.
Adam Rivers, Unites States Department of Agriculture, Agricultural Research Service
The core code, https://github.com/MicrosoftResearch/Azimuth, is in Python2 and does not run well given changes to packages.
Miles Smith worked on porting to Python3 in this repo: https://github.com/milescsmith/Azimuth, including a new branch
that used Poetry to build. The work is not complete.
This work is derivative of that BSD 3-clause, Modified licensed work. The key changes are:
1. Much of the code needed for tasks other thant prediction of the V3_model_nopos was removed.
2. The Calculation of NGGX features was re-written. A bug that prevented scaling to thousands guides efficiently.
3. the Pickle model and scikit-learn were replaced with an Onnx model ('V3_model_nopos.onnx"), and medadata file
("V3_model_nopos_options.json") and onnxruntime for better persistence, security, and performance.
Reference:
John G. Doench*, Nicolo Fusi*, Meagan Sullender*, Mudra Hegde*, Emma W. Vaimberg*, Katherine F. Donovan, Ian Smith,
Zuzana Tothova, Craig Wilen , Robert Orchard , Herbert W. Virgin, Jennifer Listgarten*, David E. Root.
Optimized sgRNA design to maximize activity and minimize off-target effects for genetic screens with CRISPR-Cas9.
Nature Biotechnology Jan 2016, doi:10.1038/nbt.3437.
"""
import json
import logging
import os
from pathlib import Path
from typing import List, Optional

import numpy as np
import onnxruntime as rt
import pandas as pd

from guidemaker.doench_featurization import featurize_data, parallel_featurize_data

logger = logging.getLogger(__name__)

DIR = os.path.dirname(os.path.abspath(__file__))  # package root directory
MODEL = os.path.join(DIR, "data/V3_model_nopos.onnx")
MODEL_META = os.path.join(DIR, "data/V3_model_nopos_options.json")
def concatenate_feature_sets(feature_sets, keys: Optional[List[str]] = None):
    """Combine feature sets.

    Given a dictionary of feature sets, each a pd.DataFrame with one row per guide,
    concatenate them column-wise into one large np.ndarray and record the width of each set.

    Args:
        feature_sets (dict): a dict of feature sets as pandas DataFrames
        keys (Optional[List[str]]): which feature sets to use, in order; defaults to all keys

    Returns:
        (tuple): inputs (numpy.ndarray), dim (dict of column counts per set),
            dimsum (int, total column count), feature_names (List[str])
    """
    if feature_sets == {}:
        raise AssertionError("no feature sets present")
    if keys is None:
        keys = list(feature_sets.keys())
    # Every feature set must describe the same number of guides (rows).
    num_items = feature_sets[keys[0]].shape[0]
    for assemblage in feature_sets:
        if feature_sets[assemblage].shape[0] != num_items:
            raise AssertionError(
                f"not same # items for features {keys[0]} and {assemblage}"
            )
    inputs = np.zeros((num_items, 0))
    feature_names = []
    dim = {}
    dimsum = 0
    for assemblage in keys:
        inputs_set = feature_sets[assemblage].values
        dim[assemblage] = inputs_set.shape[1]
        dimsum += dim[assemblage]
        inputs = np.hstack((inputs, inputs_set))
        feature_names.extend(feature_sets[assemblage].columns.tolist())
    return inputs, dim, dimsum, feature_names
def predict(
    seq: np.ndarray,
    model_file: Optional[Path] = MODEL,
    model_metadata: Optional[Path] = MODEL_META,
    pam_audit: bool = True,
    length_audit: bool = False,
    num_threads: int = 1,
) -> np.ndarray:
    """Predict regression scores from sequences.

    Args:
        seq (numpy.ndarray): array of 30 nt sequences, each with 25 nt of context and guide,
            a PAM whose GG falls at positions 25:27, and 3 trailing nt.
        model_file (str): file path of the ONNX gradient boosted regressor model file without position data.
        model_metadata (str): file path of the JSON model parameters metadata file.
        pam_audit (bool): check the PAM of each sequence.
        length_audit (bool): check the length of each sequence.
        num_threads (int): number of threads used for featurization.

    Returns:
        (numpy.ndarray): an array of regression values.
    """
    if not isinstance(seq, np.ndarray):
        raise AssertionError("Please ensure seq is a numpy array")
    if seq.size == 0 or len(seq[0]) == 0:
        raise AssertionError("Make sure that seq is not empty")
    if not isinstance(seq[0], str):
        raise AssertionError(
            "Please ensure input sequences are in string format, i.e. 'AGAG' "
            "rather than ['A' 'G' 'A' 'G'] or alternate representations"
        )
    # Open an ONNX runtime inference session for the model.
    sess = rt.InferenceSession(str(model_file))
    with open(model_metadata, "r") as f:
        learn_options = json.load(f)
    # Featurization expects a DataFrame with the 30mer and a strand column (unused here).
    x_df = pd.DataFrame(
        columns=["30mer", "Strand"],
        data=list(zip(seq, ["NA"] * len(seq))),
    )
    feature_sets = parallel_featurize_data(
        x_df,
        learn_options,
        pam_audit=pam_audit,
        length_audit=length_audit,
        num_threads=num_threads,
    )
    inputs, *_ = concatenate_feature_sets(feature_sets)
    # The ONNX graph takes a single float32 tensor named 'float_input'.
    preds = sess.run(None, {"float_input": inputs.astype(np.float32)})[0]
    return preds
Functions
def concatenate_feature_sets(feature_sets, keys: Optional[List[str]] = None)
-
Combine feature sets.
Given a dictionary of feature sets, each a pd.DataFrame with one row per guide, concatenate them column-wise into one large np.ndarray and record the width of each set.
Args
feature_sets
:dict
- a dict of feature sets as pandas DataFrames
keys
:Optional[List[str]]
- which feature sets to use, in order; defaults to all keys
Returns
(tuple): inputs (numpy.ndarray), dim (dict), dimsum (int), feature_names (List[str])
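To illustrate the combining behavior, a small sketch with two hypothetical feature sets ('gc' and 'nuc' are made-up names, not the real featurizer output):

import pandas as pd
from guidemaker.doench_predict import concatenate_feature_sets

# Two hypothetical feature sets describing the same three guides.
feature_sets = {
    "gc": pd.DataFrame({"gc_count": [10, 12, 9]}),
    "nuc": pd.DataFrame({"A": [0.2, 0.3, 0.1], "T": [0.3, 0.2, 0.4]}),
}

inputs, dim, dimsum, names = concatenate_feature_sets(feature_sets)
print(inputs.shape)  # (3, 3): three guides, 1 + 2 feature columns
print(dim)           # {'gc': 1, 'nuc': 2}
print(dimsum)        # 3
print(names)         # ['gc_count', 'A', 'T']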
def predict(seq: numpy.ndarray, model_file: Optional[pathlib.Path] = MODEL, model_metadata: Optional[pathlib.Path] = MODEL_META, pam_audit: bool = True, length_audit: bool = False, num_threads: int = 1) -> numpy.ndarray
-
Predict regression scores from sequences.
Args
seq
:numpy.ndarray
- array of 30 nt sequences, each with 25 nt of context and guide, a PAM whose GG falls at positions 25:27, and 3 trailing nt.
model_file
:str
- file path of the ONNX gradient boosted regressor model file without position data.
model_metadata
:str
- file path of the JSON model parameters metadata file.
pam_audit
:bool
- check the PAM of each sequence.
length_audit
:bool
- check the length of each sequence.
num_threads
:int
- number of threads used for featurization.
Returns
(numpy.ndarray): an array of regression values.
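This module only consumes the ONNX file; the export that produced 'V3_model_nopos.onnx' happened elsewhere. As a sketch of that kind of conversion, assuming the skl2onnx package (not a GuideMaker dependency) and a stand-in regressor trained on random data rather than the real Azimuth model:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt

# Fit a stand-in regressor on random data (the real V3_model_nopos
# training lives in the Azimuth project, not here).
n_features = 10
X = np.random.rand(100, n_features).astype(np.float32)
y = np.random.rand(100).astype(np.float32)
model = GradientBoostingRegressor().fit(X, y)

# Export to ONNX; the input name 'float_input' matches what predict()
# passes to sess.run in this module.
onx = convert_sklearn(
    model, initial_types=[("float_input", FloatTensorType([None, n_features]))]
)
with open("model.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# The exported model runs under onnxruntime with no scikit-learn or pickle.
sess = rt.InferenceSession("model.onnx")
preds = sess.run(None, {"float_input": X[:5]})[0]
print(preds.ravel())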