You're Invited:Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26.RSVP
Socket
Book a DemoSign in
Socket

gumpy

Package Overview
Dependencies
Maintainers
2
Versions
34
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

gumpy - pypi Package Compare versions

Comparing version
1.3.3
to
1.3.4
+10
-5
bin/to_piezo_catalogue.py

@@ -24,3 +24,6 @@ #!/usr/bin/python3

"""
header = "GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
header = (
"GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,"
"PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER\n"
)
common_all = f"{ref_genome},{catalogue},1.0,GARC1,RUS,NAN,"

@@ -32,3 +35,4 @@

f.write(header)
# Piezo requires all prediction values are in the catalogue, and all valid `PREDICTION_VALUES` include `R`
# Piezo requires all prediction values are in the catalogue,
# and all valid `PREDICTION_VALUES` include `R`
# So add a dummy record with R as S and U are used elsewhere

@@ -47,5 +51,6 @@ f.write(common_all + "NOTAGENE@A1!,R,{},{},{}\n")

if __name__ == "__main__":
assert (
len(sys.argv) == 4
), "Incorrect usage. Try: python to_piezo_catalogue.py <genome_path> <vcf_path> <output_filename>"
assert len(sys.argv) == 4, (
"Incorrect usage. Try: python to_piezo_catalogue.py "
"<genome_path> <vcf_path> <output_filename>"
)
genome_path = sys.argv[1]

@@ -52,0 +57,0 @@ vcf_path = sys.argv[2]

Metadata-Version: 2.1
Name: gumpy
Version: 1.3.3
Version: 1.3.4
Summary: Genetics with Numpy

@@ -5,0 +5,0 @@ Home-page: https://github.com/oxfordmmm/gumpy

@@ -1002,30 +1002,27 @@ """

for idx, type_ in tqdm(vcf.calls.keys(), disable=(not self.show_progress_bar)):
genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = vcf.calls[
(idx, type_)
]["original_vcf_row"]
for item in vcf.calls[(idx, type_)]:
genome.vcf_evidence[genome.nucleotide_index[idx - 1]] = item[
"original_vcf_row"
]
# deal with changes at a single nucleotide site
if type_ in ["snp", "null", "het"]:
# only set values if the idx is to a single nucleotide
genome.nucleotide_sequence[idx - 1] = vcf.calls[(idx, type_)]["call"]
# deal with changes at a single nucleotide site
if type_ in ["snp", "null", "het"]:
# only set values if the idx is to a single nucleotide
genome.nucleotide_sequence[idx - 1] = item["call"]
# deal with insertions and deletions
elif type_ == "indel":
genome.is_indel[idx - 1] = True
genome.indel_nucleotides[idx - 1] = vcf.calls[(idx, type_)]["call"][1]
# deal with insertions and deletions
elif type_ == "indel":
genome.is_indel[idx - 1] = True
genome.indel_nucleotides[idx - 1] = item["call"][1]
if vcf.calls[(idx, type_)]["call"][0] == "ins":
genome.indel_length[idx - 1] = len(
vcf.calls[(idx, type_)]["call"][1]
)
else:
genome.indel_length[idx - 1] = -1 * len(
vcf.calls[(idx, type_)]["call"][1]
)
if item["call"][0] == "ins":
genome.indel_length[idx - 1] = len(item["call"][1])
else:
genome.indel_length[idx - 1] = -1 * len(item["call"][1])
elif type_ == "ref":
# These only exist due to reference calls
# They only made it this far as they are required to pull out minors at
# these positions
pass
elif type_ == "ref":
# These only exist due to reference calls
# They only made it this far as they are required to pull
# out minors at these positions
pass

@@ -1032,0 +1029,0 @@ genome.minor_populations = []

@@ -405,107 +405,108 @@ """

for idx, type_ in self.calls.keys():
# Get the simple format of this call for comparison
if isinstance(self.calls[(idx, type_)]["call"], tuple):
# Indels
t = self.calls[(idx, type_)]["call"][0]
bases = self.calls[(idx, type_)]["call"][1]
pos = self.calls[(idx, type_)]["pos"]
else:
# Snps
if self.calls[(idx, type_)]["call"] == "x":
# Null calls shouldn't have minor populations
continue
t = "snp"
bases = (
self.calls[(idx, type_)]["ref"],
self.calls[(idx, type_)]["call"],
)
pos = self.calls[(idx, type_)]["pos"]
simple_calls.append((idx, pos, t, bases))
for item in self.calls[(idx, type_)]:
# Get the simple format of this call for comparison
if isinstance(item["call"], tuple):
# Indels
t = item["call"][0]
bases = item["call"][1]
pos = item["pos"]
else:
# Snps
if item["call"] == "x":
# Null calls shouldn't have minor populations
continue
t = "snp"
bases = (
item["ref"],
item["call"],
)
pos = item["pos"]
simple_calls.append((idx, pos, t, bases))
seen = set()
for idx, type_ in tqdm(self.calls.keys()):
# Check if we've delt with this vcf already
if str(self.calls[(idx, type_)]["original_vcf_row"]) in seen:
continue
else:
seen.add(str(self.calls[(idx, type_)]["original_vcf_row"]))
for item in self.calls[(idx, type_)]:
# Check if we've delt with this vcf already
if str(item["original_vcf_row"]) in seen:
continue
else:
seen.add(str(item["original_vcf_row"]))
# Pull out depth tag from the specific row's format fields
# as the file metadata isn't a guarantee of the actual fields of this row
allelic_depth_tag = (
"COV"
if self.calls[(idx, type_)]["original_vcf_row"].get("COV", None)
else "AD"
)
# Pull out depth tag from the specific row's format fields
# as the file metadata isn't a guarantee of the actual
# fields of this row
allelic_depth_tag = (
"COV" if item["original_vcf_row"].get("COV", None) else "AD"
)
# Checking for het calls
if self.calls[(idx, type_)]["call"] == "z":
if 0 not in self.calls[(idx, type_)]["original_vcf_row"]["GT"]:
# Het call without a wildtype call, so warn about weird behaviour
warnings.warn(
f"Minor population detected at position {idx}, which doesn't "
f"include a wildtype call. Call: "
f"{self.calls[(idx, type_)]['original_vcf_row']['GT']}. "
"Note that there may be multiple mutations given at this index",
UserWarning,
)
# Checking for het calls
if item["call"] == "z":
if 0 not in item["original_vcf_row"]["GT"]:
# Het call without a wildtype call, so warn about
# behaviour
warnings.warn(
f"Minor population detected at position {idx}, which "
f"doesn't include a wildtype call. Call: "
f"{item['original_vcf_row']['GT']}. Note that there "
"may be multiple mutations given at this index",
UserWarning,
)
# Reference base(s)
ref = self.calls[(idx, type_)]["original_vcf_row"]["REF"]
# Reference base(s)
ref = item["original_vcf_row"]["REF"]
if self.calls[(idx, type_)]["original_vcf_row"]["ALTS"] is None:
# Case arrises from gvcf ref calls not giving any alts
calls = [self.calls[(idx, type_)]["original_vcf_row"]["REF"]]
else:
# Get all of the calls
calls = [self.calls[(idx, type_)]["original_vcf_row"]["REF"]] + list(
self.calls[(idx, type_)]["original_vcf_row"]["ALTS"]
)
if item["original_vcf_row"]["ALTS"] is None:
# Case arrises from gvcf ref calls not giving any alts
calls = [item["original_vcf_row"]["REF"]]
else:
# Get all of the calls
calls = [item["original_vcf_row"]["REF"]] + list(
item["original_vcf_row"]["ALTS"]
)
# Break down the calls as appropriate
simple = [self._simplify_call(ref, alt) for alt in calls]
# Break down the calls as appropriate
simple = [self._simplify_call(ref, alt) for alt in calls]
# Map each call to the corresponding read depth
dps = list(self.calls[(idx, type_)]["original_vcf_row"][allelic_depth_tag])
# Map each call to the corresponding read depth
dps = list(item["original_vcf_row"][allelic_depth_tag])
# GVCF null calls get None for depth, so catch (and skip) this
if dps == [None]:
continue
else:
total_depth = sum(dps)
# GVCF null calls get None for depth, so catch (and skip) this
if dps == [None]:
continue
else:
total_depth = sum(dps)
# idx here refers to the position of this call, NOT this vcf row, so adjust
# to avoid shifting when building minor calls
original_idx = idx
idx = idx - self.calls[(idx, type_)]["pos"]
for calls, depth in zip(simple, dps):
# As we can have >1 call per simple, iter
for call in calls:
# Check that this call isn't one of the actual calls
if (idx + int(call[0]), *call) in simple_calls:
# Is an actual call so we skip
continue
# Check if there are >=2 reads to support this call
if depth >= 2:
# These are minor calls!!
pos = idx + int(call[0])
if pos not in self.minor_population_indices:
# We don't actually care though
# This has to be done here as simplifying calls can move
# the position
# idx here refers to the position of this call, NOT this vcf row,
# so adjust to avoid shifting when building minor calls
idx = idx - item["pos"]
for calls, depth in zip(simple, dps):
# As we can have >1 call per simple, iter
for call in calls:
# Check that this call isn't one of the actual calls
if (idx + int(call[0]), *call) in simple_calls:
# Is an actual call so we skip
continue
if call[1] == "snp" and call[2][0] == call[2][1]:
# Ref calls aren't interesting
continue
# Only tracking absolute number of reads
self.minor_populations.append(
(
pos,
call[1],
call[2],
int(depth),
round(depth / total_depth, 3),
self.calls[(original_idx, type_)]["original_vcf_row"],
# Check if there are >=2 reads to support this call
if depth >= 2:
# These are minor calls!!
pos = idx + int(call[0])
if pos not in self.minor_population_indices:
# We don't actually care though
# This has to be done here as simplifying calls can move
# the position
continue
if call[1] == "snp" and call[2][0] == call[2][1]:
# Ref calls aren't interesting
continue
# Only tracking absolute number of reads
self.minor_populations.append(
(
pos,
call[1],
call[2],
int(depth),
round(depth / total_depth, 3),
item["original_vcf_row"],
)
)
)

@@ -519,3 +520,3 @@ def __find_calls(self):

self.calls = {}
self.calls = defaultdict(list)

@@ -595,6 +596,9 @@ for record in self.records:

if self.calls.get((index + counter, variant_type)) is not None:
raise ValueError(
"Multiple calls at position " + str(index) + " in VCF file"
warnings.warn(
UserWarning(
f"Multiple calls at position {index}"
+ f" with type {variant_type} in VCF file"
)
)
self.calls[(index + counter, variant_type)] = metadata
self.calls[(index + counter, variant_type)].append(metadata)

@@ -619,8 +623,9 @@ # otherwise the REF, ALT pair are different lengths

if self.calls.get((index + p, "indel")) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
warnings.warn(
UserWarning(
f"Multiple calls at position {index}"
+ " with type indel in VCF file"
)
)
self.calls[(index + p, "indel")] = metadata
self.calls[(index + p, "indel")].append(metadata)

@@ -638,8 +643,9 @@ else:

if self.calls.get((index + p, variant_type)) is not None:
raise ValueError(
"Multiple calls at position "
+ str(index)
+ " in VCF file"
warnings.warn(
UserWarning(
f"Multiple calls at position {index}"
+ f" with type {variant_type} in VCF file"
)
)
self.calls[(index + p, variant_type)] = metadata
self.calls[(index + p, variant_type)].append(metadata)

@@ -816,66 +822,67 @@ def _simplify_call(self, ref: str, alt: str) -> List[Tuple[int, str, str]]:

for index, type_ in sorted(list(self.calls.keys())):
call = self.calls[(index, type_)]["call"]
alt = call
ref = self.calls[(index, type_)]["ref"]
if ref == alt and self.bypass_reference_calls:
# If we have a call at a position which the alt is a reference call
# we should only care if we haven't tried to bypass ref calls
# Have to check here too i.e CCC->ATC will have a ref call at pos 3
to_drop.append((index, type_))
continue
indices.append(index)
refs.append(ref)
pos = self.calls[(index, type_)]["pos"]
positions.append(pos)
# Update the masks with the appropriate types
if type_ == "indel":
# Convert to ins_x or del_x rather than tuple
variant = str(index) + "_" + call[0] + "_" + str(call[1])
alt = call[1]
is_indel.append(True)
if call[0] == "ins":
indel_length.append(len(call[1]))
else:
indel_length.append(-1 * len(call[1]))
is_snp.append(False)
is_het.append(False)
is_null.append(False)
elif type_ == "snp":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(True)
is_het.append(False)
is_null.append(False)
elif type_ == "het":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(True)
is_null.append(False)
elif type_ == "null":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(False)
is_null.append(True)
elif type_ == "ref":
variant = str(index) + ref + ">" + ref
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(False)
is_null.append(False)
alts.append(alt)
variants.append(variant)
for key in self.calls[(index, type_)]["original_vcf_row"]:
metadata[key].append(
self.calls[(index, type_)]["original_vcf_row"][key]
)
for idx, item in enumerate(self.calls[(index, type_)]):
call = item["call"]
alt = call
ref = item["ref"]
if ref == alt and self.bypass_reference_calls:
# If we have a call at a position which the alt is a reference call
# we should only care if we haven't tried to bypass ref calls
# Have to check here too i.e CCC->ATC will have a ref call at pos 3
to_drop.append((idx, (index, type_)))
continue
indices.append(index)
refs.append(ref)
pos = item["pos"]
positions.append(pos)
# Update the masks with the appropriate types
if type_ == "indel":
# Convert to ins_x or del_x rather than tuple
variant = str(index) + "_" + call[0] + "_" + str(call[1])
alt = call[1]
is_indel.append(True)
if call[0] == "ins":
indel_length.append(len(call[1]))
else:
indel_length.append(-1 * len(call[1]))
is_snp.append(False)
is_het.append(False)
is_null.append(False)
elif type_ == "snp":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(True)
is_het.append(False)
is_null.append(False)
elif type_ == "het":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(True)
is_null.append(False)
elif type_ == "null":
variant = str(index) + ref + ">" + call
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(False)
is_null.append(True)
elif type_ == "ref":
variant = str(index) + ref + ">" + ref
is_indel.append(False)
indel_length.append(0)
is_snp.append(False)
is_het.append(False)
is_null.append(False)
alts.append(alt)
variants.append(variant)
for key in item["original_vcf_row"]:
metadata[key].append(item["original_vcf_row"][key])
# Remove ref calls as required
for key in to_drop:
del self.calls[key]
for idx, key in to_drop:
del self.calls[key][idx]
if len(self.calls[key]) == 0:
del self.calls[key]

@@ -882,0 +889,0 @@ # Convert to numpy arrays for neat indexing

Metadata-Version: 2.1
Name: gumpy
Version: 1.3.3
Version: 1.3.4
Summary: Genetics with Numpy

@@ -5,0 +5,0 @@ Home-page: https://github.com/oxfordmmm/gumpy

@@ -1,2 +0,2 @@

1.3.3
1.3.4