Module guidemaker.cfd_score_calculator

cfd_score_calculator.py: This is a modified version of the CFD score calculator in Doench et al. (2016) for use in Guidemaker (https://guidemaker.org). Adam Rivers, USDA Agricultural Research Service

We score the CFD only for off-targets with an NGG PAM site; Guidemaker does not collect off-targets with non-matching PAMs. For this reason we omit the PAM-scoring part of the Doench et al. (2016) script. Results are identical for all off-targets that are scored.

Very few off-targets with non-PAM-matching sites would interact with targets in a small genome (the highest-scoring non-matching PAM, NGT, has a score of 0.3). Additionally, we require all our guides to have a distance of at least 2 by default, so any off-targets would have a score below the 0.2 threshold most people use.

We also modified the script to score guide sequences longer than 20 by ignoring the 5' end past position 20, and shorter guide sequences by scoring only the positions present.

Expand source code
"""cfd_score_calculator.py This is a modified version of the CDF score calculator in Doench et al. (2016) for use in
Guidemaker (https://guidemaker.org)
Adam Rivers, USDA Agricultural Research Service

We score the CFD only for off-targets with an NGG PAM site; Guidemaker does not collect off-targets with non-matching
PAMs. For this reason we omit the PAM-scoring part of the Doench et al. (2016) script. Results are identical for all
off-targets that are scored.

Very few off-targets with non-PAM-matching sites would interact with targets in a small genome (the highest-scoring
non-matching PAM, NGT, has a score of 0.3). Additionally, we require all our guides to have a distance of at least 2 by
default, so any off-targets would have a score below the 0.2 threshold most people use.

We also modified the script to score guide sequences longer than 20 by ignoring the 5' end past position 20, and
shorter guide sequences by scoring only the positions present.
"""
import json
from typing import Tuple, Dict
import logging
import os

logger = logging.getLogger(__name__)

DIR = os.path.dirname(os.path.abspath(__file__))  # This is your Project Root
MODEL_META = os.path.join(DIR,"data/cfd_data.json")

def get_mm_pam_scores() -> Tuple[Dict, Dict]:
    """Load the mismatch scores and PAM scores from the JSON file at ``MODEL_META``.

    The JSON document is read as UTF-8 and must hold the two tables under the
    top-level keys ``'mm'`` and ``'pam'``.

    Returns:
        (tuple):dict of mismatch scores, dict of pam scores

    Raises:
        Exception: if the score file cannot be opened or read; the underlying
            ``OSError`` is attached as the cause.
        KeyError: if the JSON document lacks the ``'mm'`` or ``'pam'`` key.

    """
    try:
        # JSON is UTF-8 by specification, so do not rely on the locale's default encoding.
        with open(MODEL_META, encoding="utf-8") as dat:
            scores = json.load(dat)
    except OSError as err:
        # OSError covers both FileNotFoundError and IOError (an alias of OSError in Python 3).
        # Chain explicitly so the real reason (missing file, permissions, ...) stays visible.
        raise Exception("Could not find file with reference mismatch scores and PAM scores") from err
    return scores['mm'], scores['pam']


def check_len(wt: str, off: str) -> int:
    """Verify that the guide and off-target sequences have equal length and return it.

    Args:
        wt: the wild-type guide sequence
        off: the off target sequence

    Returns:
        (int): the shared length of the two sequences

    Raises:
        AssertionError: if the two sequences differ in length

    """
    wtl = len(wt)
    offl = len(off)
    if wtl != offl:
        # Raised explicitly instead of via ``assert`` so the check is still enforced when
        # Python runs with -O; the exception type and message are kept for existing callers.
        raise AssertionError("The lengths wt and off differ: wt = {}, off = {}".format(wtl, offl))
    return wtl


def calc_cfd(wt: str, off: str, mm_scores=None) -> float:
    """Compute the CFD score of a guide/off-target pair from precalculated mismatch weights.

    Both sequences are upper-cased and converted to RNA (T -> U) before they are compared.
    They are aligned at their 3' ends to positions 1-20 of the weight table: in sequences
    longer than 20 the bases 5' of position 1 are ignored, and shorter sequences are scored
    only at the positions they cover.

    Args:
        wt: wild-type gRNA sequence, excluding the PAM Cas9 site
        off: off target sequence, excluding the PAM Cas9 site
        mm_scores: dict of mismatch weights with keys of the form 'rU:dG,5' (guide RNA base,
            complement of the off-target base, 1-based position). When None, the weights
            are loaded with get_mm_pam_scores().

    Returns:
        (float): CFD score of the pair; 1.0 when no scored position mismatches

    """
    guidelen = check_len(wt, off)
    if mm_scores is None:
        mm_scores, _ = get_mm_pam_scores()
    target_rna = off.upper().replace('T', 'U')
    guide_rna = wt.upper().replace('T', 'U')
    basecomp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'A'}
    # Converts a 0-based index into the 1-based weight-table position, anchored at the 3' end.
    shift = 21 - guidelen
    score = 1.
    for idx, target_base in enumerate(target_rna):
        position = idx + shift
        if position < 1:
            # 5' overhang beyond the 20 scored positions
            continue
        guide_base = guide_rna[idx]
        if guide_base == target_base:
            continue
        score *= mm_scores['r' + guide_base + ':d' + basecomp[target_base] + ',' + str(position)]
    return score

Functions

def calc_cfd(wt: str, off: str, mm_scores=None) ‑> float

Calculate the CFD score using precalculated weights

Args

wt
wild-type gRNA sequence, excluding the PAM Cas9 site
off
off target sequence, excluding the PAM Cas9 site

Returns

(float): CFD score of the pair

Expand source code
def calc_cfd(wt: str, off: str, mm_scores=None) -> float:
    """Compute the CFD score of a guide/off-target pair from precalculated mismatch weights.

    Both sequences are upper-cased and converted to RNA (T -> U) before they are compared.
    They are aligned at their 3' ends to positions 1-20 of the weight table: in sequences
    longer than 20 the bases 5' of position 1 are ignored, and shorter sequences are scored
    only at the positions they cover.

    Args:
        wt: wild-type gRNA sequence, excluding the PAM Cas9 site
        off: off target sequence, excluding the PAM Cas9 site
        mm_scores: dict of mismatch weights with keys of the form 'rU:dG,5' (guide RNA base,
            complement of the off-target base, 1-based position). When None, the weights
            are loaded with get_mm_pam_scores().

    Returns:
        (float): CFD score of the pair; 1.0 when no scored position mismatches

    """
    guidelen = check_len(wt, off)
    if mm_scores is None:
        mm_scores, _ = get_mm_pam_scores()
    target_rna = off.upper().replace('T', 'U')
    guide_rna = wt.upper().replace('T', 'U')
    basecomp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'A'}
    # Converts a 0-based index into the 1-based weight-table position, anchored at the 3' end.
    shift = 21 - guidelen
    score = 1.
    for idx, target_base in enumerate(target_rna):
        position = idx + shift
        if position < 1:
            # 5' overhang beyond the 20 scored positions
            continue
        guide_base = guide_rna[idx]
        if guide_base == target_base:
            continue
        score *= mm_scores['r' + guide_base + ':d' + basecomp[target_base] + ',' + str(position)]
    return score
def check_len(wt: str, off: str) ‑> int

Verify the lengths of guide and off target match returning the length

Args

wt
the wild-type guide sequence
off
the off target sequence

Returns

(int): the length of the data

Expand source code
def check_len(wt: str, off: str) -> int:
    """Verify that the guide and off-target sequences have equal length and return it.

    Args:
        wt: the wild-type guide sequence
        off: the off target sequence

    Returns:
        (int): the shared length of the two sequences

    Raises:
        AssertionError: if the two sequences differ in length

    """
    wtl = len(wt)
    offl = len(off)
    if wtl != offl:
        # Raised explicitly instead of via ``assert`` so the check is still enforced when
        # Python runs with -O; the exception type and message are kept for existing callers.
        raise AssertionError("The lengths wt and off differ: wt = {}, off = {}".format(wtl, offl))
    return wtl
def get_mm_pam_scores() ‑> Tuple[Dict[~KT, ~VT], Dict[~KT, ~VT]]

load json file of mismatch scores and PAM scores

Returns

(tuple):dict of mismatch scores, dict of pam scores

Expand source code
def get_mm_pam_scores() -> Tuple[Dict, Dict]:
    """Load the mismatch scores and PAM scores from the JSON file at ``MODEL_META``.

    The JSON document is read as UTF-8 and must hold the two tables under the
    top-level keys ``'mm'`` and ``'pam'``.

    Returns:
        (tuple):dict of mismatch scores, dict of pam scores

    Raises:
        Exception: if the score file cannot be opened or read; the underlying
            ``OSError`` is attached as the cause.
        KeyError: if the JSON document lacks the ``'mm'`` or ``'pam'`` key.

    """
    try:
        # JSON is UTF-8 by specification, so do not rely on the locale's default encoding.
        with open(MODEL_META, encoding="utf-8") as dat:
            scores = json.load(dat)
    except OSError as err:
        # OSError covers both FileNotFoundError and IOError (an alias of OSError in Python 3).
        # Chain explicitly so the real reason (missing file, permissions, ...) stays visible.
        raise Exception("Could not find file with reference mismatch scores and PAM scores") from err
    return scores['mm'], scores['pam']