Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6b36ba1
Initial revision of ReferenceLengthExpression in VCF annotator
theferrit32 Oct 15, 2025
76cd565
No longer need FieldName.default_value since we can rely on pysam
theferrit32 Oct 15, 2025
4f22083
Fix INFO field setting to allow 0 for Integers.
theferrit32 Oct 16, 2025
94cc277
Reworking the RLE flow
theferrit32 Oct 16, 2025
d048501
Move around code some more
theferrit32 Oct 16, 2025
6051b11
Adding some tests
theferrit32 Oct 22, 2025
c627c50
Add more RLE test cases
theferrit32 Nov 11, 2025
499002e
Fix test_allele_translator for RLE NC_000013.11:g.32936732=
theferrit32 Nov 11, 2025
4db6c3c
remove id/digest testing from allele test
theferrit32 Nov 12, 2025
3e121be
Fix test_annotate_vcf_grch38_noattrs with ref alleles
theferrit32 Nov 12, 2025
eac0dc5
Update incorrect ref allele test case. Refresh expected VCFs
theferrit32 Nov 12, 2025
491de1b
Refresh tests/cassettes/test_normalize_allele.yaml
theferrit32 Nov 12, 2025
b1a7b5f
Update comment
theferrit32 Nov 12, 2025
bce957a
Change test VCFs to uncompressed text so diffs can be seen going forw…
theferrit32 Nov 12, 2025
c0042c3
Include sequence value in output vcf if the length of that sequence i…
theferrit32 Nov 12, 2025
6df2ba2
Remove commented blocks
theferrit32 Nov 12, 2025
460d6bb
Update test name
theferrit32 Nov 12, 2025
b7fa124
Add comments to cases in test_vrs_normalize.py
theferrit32 Nov 12, 2025
576e4db
Add make target clean-cassettes
theferrit32 Nov 12, 2025
70bd6e2
Minor tweak to all_played checks in vcf annotator tests to make re-re…
theferrit32 Nov 12, 2025
adf5f73
Ignore VRS and Python version header values in VCF tests
theferrit32 Nov 12, 2025
2c5c1dd
Add missing cassette
theferrit32 Nov 13, 2025
9a65b82
Don't fail test when vcr is being rewritten. Refresh cassettes
theferrit32 Nov 13, 2025
94ad735
add decode_compressed_response=True to vcr config
theferrit32 Nov 13, 2025
7723d5c
Add vcr_config to conftest.py. Delete vcr_support.py. Refresh cassettes
theferrit32 Nov 13, 2025
6255779
Merge remote-tracking branch 'origin/main' into issue-577-VCF-RLE
theferrit32 Nov 17, 2025
26eeba3
Reset casettes for RLE tests after merge from main
theferrit32 Nov 18, 2025
c8282c2
Adding microsatellite RLE cases
theferrit32 Nov 20, 2025
97a583c
Make (incorrect) changes to normalize same-as-ref RLE variants
theferrit32 Nov 20, 2025
6a442c9
Do not normalize same-as-ref. Further refactoring and documentation i…
theferrit32 Nov 26, 2025
6982654
Update _normalize_allele to accept zero bp ref alleles. Expand test c…
theferrit32 Dec 2, 2025
c1bd885
Add several more RLE cases testing partial insertion/deletion in midd…
theferrit32 Dec 2, 2025
bce6830
Reset cassettes
theferrit32 Dec 2, 2025
5b7d1f7
Update VRS_RepeatSubunitLengths docstring
theferrit32 Dec 2, 2025
aed34f1
Fix VCF annotations to match source changes
theferrit32 Dec 3, 2025
44af087
Clean up error in cassette
theferrit32 Dec 3, 2025
77cb9c7
Remove unused vrs_data_key
theferrit32 Dec 12, 2025
f3ce393
Update vcf.py to replace MAX_LITERAL_STATE_LENGTH with a RLE_SEQ_LIMI…
theferrit32 Dec 12, 2025
83f9296
Apply suggestions from code review
theferrit32 Dec 12, 2025
a52ec24
Use enumerated sequence types in vcf.py
theferrit32 Dec 12, 2025
938e3e0
Update src/ga4gh/vrs/normalize.py
theferrit32 Dec 12, 2025
2234438
Fix formatting
theferrit32 Dec 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ cleaner: clean
cleanest: cleaner
rm -fr .eggs venv

#=> clean-cassettes: delete YAML VCR cassettes under tests/**/casette/
.PHONY: clean-cassettes
clean-cassettes:
find ./tests -type f -path '*/cassettes/*.yaml' -print0 | ${XRM}


## <LICENSE>
## Copyright 2016 Source Code Committers
Expand Down
97 changes: 76 additions & 21 deletions src/ga4gh/vrs/extras/annotator/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
VrsObjectIdentifierIs,
use_ga4gh_compute_identifier_when,
)
from ga4gh.vrs import VRS_VERSION, __version__
from ga4gh.vrs import VRS_VERSION, VrsType, __version__
from ga4gh.vrs.dataproxy import _DataProxy
from ga4gh.vrs.extras.translator import AlleleTranslator
from ga4gh.vrs.models import Allele
from ga4gh.vrs.models import Allele, Range

_logger = logging.getLogger(__name__)

Expand All @@ -36,17 +36,10 @@ class FieldName(str, Enum):
STARTS_FIELD = "VRS_Starts"
ENDS_FIELD = "VRS_Ends"
STATES_FIELD = "VRS_States"
LENGTHS_FIELD = "VRS_Lengths"
REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths"
ERROR_FIELD = "VRS_Error"

def default_value(self) -> Literal[".", -1]:
"""Provide value to use for default/null case in VCF INFO field

:return: either ``"."`` or ``-1``
"""
if self in (FieldName.IDS_FIELD, FieldName.STATES_FIELD, FieldName.ERROR_FIELD):
return "."
return -1


# VCF character escape map
VCF_ESCAPE_MAP = str.maketrans(
Expand All @@ -59,6 +52,11 @@ def default_value(self) -> Literal[".", -1]:
}
)

# ReferenceLengthExpression .sequence values will be included in output VCF if
# length <= this value. This field is optional for RLE since it can be derived
# from the reference sequence. Set to None to always include the sequence.
RLE_SEQ_LIMIT = 50


def dump_alleles_to_pkl(alleles: list[Allele], output_pkl_path: Path) -> None:
"""Create pkl file of dictionary mapping VRS IDs to ingested alleles.
Expand Down Expand Up @@ -187,6 +185,24 @@ def _update_vcf_header(
f"corresponding to the GT indexes of the {info_field_desc} alleles"
),
)
vcf.header.info.add(
FieldName.LENGTHS_FIELD.value,
info_field_num,
"Integer",
(
"The length values from ReferenceLengthExpression states for the GA4GH VRS "
f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
),
)
vcf.header.info.add(
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD.value,
info_field_num,
"Integer",
(
"The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS "
f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles"
),
)

@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
def annotate(
Expand Down Expand Up @@ -244,6 +260,8 @@ def annotate(
FieldName.STARTS_FIELD,
FieldName.ENDS_FIELD,
FieldName.STATES_FIELD,
FieldName.LENGTHS_FIELD,
FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD,
]
else:
# no INFO field names need to be designated if not producing an annotated VCF
Expand Down Expand Up @@ -275,8 +293,10 @@ def annotate(

if output_vcf_path and vcf_out:
for k in additional_info_fields:
# Convert "" and None values (but not 0) to None.
# Pysam outputs "." for missing values.
record.info[k.value] = [
value or k.default_value() for value in vrs_field_data[k.value]
None if v in ("", None) else v for v in vrs_field_data[k.value]
]
vcf_out.write(record)

Expand Down Expand Up @@ -369,20 +389,57 @@ def _get_vrs_object(
vrs_field_data[FieldName.IDS_FIELD].append(allele_id)

if vrs_attributes:
# Initialize fields with None for missing values
# pysam will convert None to "." in VCF output
start = end = None
alt = None
length = repeat_subunit_length = None

if vrs_obj:
# Common fields for all state types
start = vrs_obj.location.start
end = vrs_obj.location.end
alt = (
str(vrs_obj.state.sequence.root)
if vrs_obj.state.sequence
else ""
)
else:
start = end = alt = ""
state = vrs_obj.state
state_type = state.type

if state_type == VrsType.LIT_SEQ_EXPR:
# Sequence is required
alt = state.sequence.root
elif state_type == VrsType.REF_LEN_EXPR:
# Length is required, sequence is optional
length = state.length
if length is None:
err_msg = f"{state_type} requires a non-empty length: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
if isinstance(length, Range):
err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
repeat_subunit_length = state.repeatSubunitLength
# Only include sequence if within rle_seq_limit
if state.sequence is not None and (
RLE_SEQ_LIMIT is None or length <= RLE_SEQ_LIMIT
):
alt = state.sequence.root
elif state_type == VrsType.LEN_EXPR:
# Length is required, no sequence field
length = state.length
if length is None:
err_msg = f"{state_type} requires a non-empty length: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
if isinstance(length, Range):
err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}"
raise VcfAnnotatorError(err_msg)
else:
err_msg = f"Unsupported state type '{state_type}' for VCF annotation: {vcf_coords}"
raise VcfAnnotatorError(err_msg)

vrs_field_data[FieldName.STARTS_FIELD].append(start)
vrs_field_data[FieldName.ENDS_FIELD].append(end)
vrs_field_data[FieldName.STATES_FIELD].append(alt)
vrs_field_data[FieldName.LENGTHS_FIELD].append(length)
vrs_field_data[FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD].append(
repeat_subunit_length
)

def _get_vrs_data(
self,
Expand Down Expand Up @@ -432,7 +489,6 @@ def _get_vrs_data(
# Get VRS data for alts
alts = record.alts or []
alleles = [f"{gnomad_loc}-{record.ref}-{a}" for a in [*alts]]
data = f"{record.chrom}\t{record.pos}\t{record.ref}\t{record.alts}"
for allele in alleles:
if "*" in allele:
_logger.debug("Star allele found: %s", allele)
Expand All @@ -444,7 +500,6 @@ def _get_vrs_data(
allele_collection,
vrs_field_data,
assembly,
vrs_data_key=data,
vrs_attributes=vrs_attributes,
require_validation=require_validation,
)
Expand Down
Loading
Loading