"""Small utility functions."""
import logging
import os
from typing import Dict, List, Tuple, Union
from numpy import isnan
__author__ = "Bernhard Reuter, Jules Kreuer"
__copyright__ = "Bernhard Reuter, Jules Kreuer"
__license__ = "LGPL-3.0-only"
_logger = logging.getLogger(__name__)
[docs]def check_fastq_filenames(fastq_dir: str, sample_id: str) -> List[str]:
"""
Checks if the fastq files in a folder are following the MTBSeq naming scheme.::
[SampleID]_[LibID]_[*]_[Direction].f(ast)q.gz
^- Optional values.
Direction must be one of R1, R2.
Parameters
----------
fastq_dir : str
Path to the directory containing the fastq files.
Returns
-------
None
Throws an error if the names do not follow the assumed scheme.
"""
_logger.info("Checking naming scheme of fastq files")
# Checks if at least one fastq file is present.
fastq_files = []
for file_name in os.listdir(fastq_dir):
if file_name.startswith(sample_id) and (
file_name.endswith(".fq.gz") or file_name.endswith(".fastq.gz")
):
_logger.debug(f"Checking: {file_name}")
segment_check = 2 <= file_name.count("_")
direction = file_name.split("_")[-1].split(".")[0]
direction_check = direction in ["R1", "R2"]
if segment_check and direction_check:
# All checks passed.
fastq_files.append(file_name)
continue
raise RuntimeError(
f"FASTQ file {file_name} does not follow naming scheme:\n"
"[SampleID]_[LibID]_[*]_[Direction].f(ast)q.gz"
)
# No fastq file in folder.
if len(fastq_files) == 0:
raise FileNotFoundError(f"No FASTQ file/s in folder {fastq_dir} starting with {sample_id}.")
_logger.debug("All fastq files passed the saming scheme test")
return fastq_files
[docs]def stripper(x) -> Union[str, float]:
"""Strips string / float input. Throws error if type does not match."""
if isinstance(x, str):
return x.strip().lstrip("'")
elif isinstance(x, float):
if isnan(x):
return x
else:
raise ValueError(f"{x} passed to stripper is float but not np.nan.")
else:
raise ValueError(f"{x} passed to stripper is neither str nor float.")
[docs]def get_drugs() -> List[str]:
"""Returns a list of two / three letter drug codes."""
return [
"AMK",
"CAP",
"DCS",
"EMB",
"ETH",
"FQ",
"INH",
"KAN",
"PAS",
"PZA",
"RIF",
"STR",
]
[docs]def get_aminos() -> str:
"""Returns regex of amino-acids."""
return "(Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|_)"
[docs]def get_amino_ann() -> str:
"""Returns regex of amino-acid annotations."""
return "(^Ala|^Arg|^Asn|^Asp|^Cys|^Gln|^Glu|^Gly|^His|^Ile|^Leu|^Lys|^Met|^Phe|^Pro|^Ser|^Thr|^Trp|^Tyr|^Val|^_)[0-9]+(Ala$|Arg$|Asn$|Asp$|Cys$|Gln$|Glu$|Gly$|His$|Ile$|Leu$|Lys$|Met$|Phe$|Pro$|Ser$|Thr$|Trp$|Tyr$|Val$|_$)" # noqa: E501
[docs]def get_key_genes() -> Dict[str, List[str]]:
"""Returns a dict of genes used to determine the genotype of an isolate."""
aminos = get_aminos()
return {
"BDQ": [
r"^Rv0678:[0-9]+_ins_[ACGT]+$",
r"^Rv0678:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv0678:" + aminos + "[0-9]+_$",
],
"CAP": [
r"^Rv1694:[0-9]+_ins_[ACGT]+$",
r"^Rv1694:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv1694:" + aminos + "[0-9]+_$",
],
"CFZ": [
r"^Rv0678:[0-9]+_ins_[ACGT]+$",
r"^Rv0678:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv0678:" + aminos + "[0-9]+_$",
],
"DCS": [
r"^Rv2780:[0-9]+_ins_[ACGT]+$",
r"^Rv2780:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv2780:" + aminos + "[0-9]+_$",
],
"DEL": [
r"^Rv3547:[0-9]+_ins_[ACGT]+$",
r"^Rv3547:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv3547:" + aminos + "[0-9]+_$",
],
"ETH": [
r"^Rv3854c:[0-9]+_ins_[ACGT]+$",
r"^Rv3854c:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv3854c:" + aminos + "[0-9]+_$",
],
"INH": [
r"^Rv1908c:[0-9]+_ins_[ACGT]+$",
r"^Rv1908c:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv1908c:" + aminos + "[0-9]+_$",
],
"PZA": [
r"^Rv2043c:[0-9]+_ins_[ACGT]+$",
r"^Rv2043c:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv2043c:" + aminos + "[0-9]+_$",
],
"RIF": [
r"^Rv0667:[0-9]+_ins_[ACGT]+$",
r"^Rv0667:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv0667:" + aminos + "[0-9]+_$",
],
"STR": [
r"^Rv3919c:[0-9]+_ins_[ACGT]+$",
r"^Rv3919c:([0-9]+_){0,1}[0-9]+_del$",
r"^Rv3919c:" + aminos + "[0-9]+_$",
],
}
[docs]def get_lineages() -> List[str]:
"""Returns a list of lineages."""
return [
"Lineage 1",
"Lineage 1.1",
"Lineage 1.1.1",
"Lineage 1.1.1.1",
"Lineage 1.1.2",
"Lineage 1.1.3",
"Lineage 1.2.1",
"Lineage 1.2.2",
"Lineage 2.1",
"Lineage 2.2.1",
"Lineage 2.2.1.1",
"Lineage 2.2.1.2",
"Lineage 2.2.2",
"Lineage 3",
"Lineage 3.1.1",
"Lineage 3.1.2",
"Lineage 3.1.2.1",
"Lineage 3.1.2.2",
"Lineage 4",
"Lineage 4.1",
"Lineage 4.1.1",
"Lineage 4.1.1.1",
"Lineage 4.1.1.2",
"Lineage 4.1.1.3",
"Lineage 4.1.2",
"Lineage 4.1.2.1",
"Lineage 4.2",
"Lineage 4.2.1",
"Lineage 4.2.2",
"Lineage 4.2.2.1",
"Lineage 4.3",
"Lineage 4.3.1",
"Lineage 4.3.2",
"Lineage 4.3.2.1",
"Lineage 4.3.3",
"Lineage 4.3.4",
"Lineage 4.3.4.1",
"Lineage 4.3.4.2",
"Lineage 4.3.4.2.1",
"Lineage 4.4",
"Lineage 4.4.1",
"Lineage 4.4.1.1",
"Lineage 4.4.1.2",
"Lineage 4.4.2",
"Lineage 4.5",
"Lineage 4.6",
"Lineage 4.6.1",
"Lineage 4.6.1.1",
"Lineage 4.6.1.2",
"Lineage 4.6.2",
"Lineage 4.6.2.1",
"Lineage 4.6.2.2",
"Lineage 4.7",
"Lineage 4.8",
"Lineage 4.9",
"Lineage 5",
"Lineage 6",
"Lineage 7",
"Lineage BOV",
]
[docs]def get_rules(drug: str) -> Tuple[List[int], bool]:
"""
Returns the rules learned by the Rule-Based Classifier.
Parameters
----------
drug : str
Drug for which the rule, obtained from a Rule-Based Classifier, shall be returned.
Returns
-------
rule : list[int]
List of integer indices to index the features that are resistance-causing.
geno_only : bool
If True, the returned rule contains only the index of the FZB genotype feature.
"""
if drug == "AMK":
return [807], False
elif drug == "FQ":
return [1552], True
elif drug == "RIF":
return [
150,
1212,
1326,
1327,
1328,
1330,
1332,
1371,
1372,
1373,
1385,
1386,
1387,
1417,
1419,
1420,
1421,
1422,
1423,
1439,
1455,
1472,
1476,
1505,
1508,
1511,
1512,
1515,
1516,
1539,
1540,
1643,
1663,
1756,
2518,
2651,
3003,
3145,
3150,
3165,
3683,
3702,
3922,
4379,
5055,
5518,
5550,
5955,
6236,
], False
else:
raise ValueError(f"Illegal input: {drug} is not modeled by a Rule-Based Classifier.")
[docs]def check_output(
output_dir: str, ground_truth_dir: str, sample_id: str, only_preprocess: bool
) -> bool:
"""
Checks the resistance prediction, extracted features and feature importance evaluation
against the ground truth.
Throws an assertion error if the files do not match up.
Parameters
----------
output_dir : str
Output directory of prediction.
ground_truth_dir : str
Directory of ground truth files.
sample_id : str
ID of sample.
preprocess : bool
If True, check only the preprocess output.
Returns
-------
bool, default=True
Throws an exception if the files do not match.
"""
# Check the extracted features.
_logger.debug("Checking: Extracted features")
true_feat_path = os.path.join(ground_truth_dir, f"{sample_id}_ground_truth_features.tsv")
out_feat_path = os.path.join(output_dir, f"{sample_id}_extracted_features.tsv")
with open(true_feat_path) as true_file, open(out_feat_path) as out_file:
true_features = set(true_file.readlines())
out_features = set(out_file.readlines())
assert true_features == out_features
_logger.debug("Check passed: Extracted features")
if only_preprocess:
return True
# Check feature importance evaluation
_logger.debug("Checking: Feature evaluation")
true_eval_path = os.path.join(
ground_truth_dir, f"{sample_id}_feature_importance_evaluation.tsv"
)
out_eval_path = os.path.join(output_dir, f"{sample_id}_feature_importance_evaluation.tsv")
with open(true_eval_path) as true_file, open(out_eval_path) as out_file:
true_features = set(true_file.readlines())
out_features = set(out_file.readlines())
assert true_features == out_features
_logger.debug("Check passed: Feature evaluation")
# Check resistance report predictions drug
for drug in get_drugs():
_logger.debug(f"Checking: Resistance report {drug}")
true_report_path = os.path.join(ground_truth_dir, f"{drug}_resistance_report.txt")
out_report_path = os.path.join(output_dir, f"{drug}_resistance_report.txt")
with open(true_report_path) as true_file, open(out_report_path) as out_file:
true_report = true_file.readlines()
out_report = out_file.readlines()
assert true_report == out_report
_logger.debug(f"Check passed: {drug}")
return True
[docs]def get_static_dir() -> str:
"""Returns the absolute path of the static folder."""
return os.path.join(os.path.dirname(os.path.realpath(__file__)), "static")