diff --git a/Makefile b/Makefile index 58acefd2..fcf3502b 100644 --- a/Makefile +++ b/Makefile @@ -123,6 +123,11 @@ cleaner: clean cleanest: cleaner rm -fr .eggs venv +#=> clean-cassettes: delete YAML VCR cassettes under tests/**/cassettes/ +.PHONY: clean-cassettes +clean-cassettes: + find ./tests -type f -path '*/cassettes/*.yaml' -print0 | ${XRM} + ## ## Copyright 2016 Source Code Committers diff --git a/src/ga4gh/vrs/extras/annotator/vcf.py b/src/ga4gh/vrs/extras/annotator/vcf.py index a0e93cb1..37f0d475 100644 --- a/src/ga4gh/vrs/extras/annotator/vcf.py +++ b/src/ga4gh/vrs/extras/annotator/vcf.py @@ -13,10 +13,10 @@ VrsObjectIdentifierIs, use_ga4gh_compute_identifier_when, ) -from ga4gh.vrs import VRS_VERSION, __version__ +from ga4gh.vrs import VRS_VERSION, VrsType, __version__ from ga4gh.vrs.dataproxy import _DataProxy from ga4gh.vrs.extras.translator import AlleleTranslator -from ga4gh.vrs.models import Allele +from ga4gh.vrs.models import Allele, Range _logger = logging.getLogger(__name__) @@ -36,17 +36,10 @@ class FieldName(str, Enum): STARTS_FIELD = "VRS_Starts" ENDS_FIELD = "VRS_Ends" STATES_FIELD = "VRS_States" + LENGTHS_FIELD = "VRS_Lengths" + REPEAT_SUBUNIT_LENGTHS_FIELD = "VRS_RepeatSubunitLengths" ERROR_FIELD = "VRS_Error" - def default_value(self) -> Literal[".", -1]: - """Provide value to use for default/null case in VCF INFO field - - :return: either ``"."`` or ``-1`` - """ - if self in (FieldName.IDS_FIELD, FieldName.STATES_FIELD, FieldName.ERROR_FIELD): - return "." - return -1 - # VCF character escape map VCF_ESCAPE_MAP = str.maketrans( @@ -59,6 +52,11 @@ def default_value(self) -> Literal[".", -1]: } ) +# ReferenceLengthExpression .sequence values will be included in output VCF if +# length <= this value. This field is optional for RLE since it can be derived +# from the reference sequence. Set to None to always include the sequence. 
+RLE_SEQ_LIMIT = 50 + def dump_alleles_to_pkl(alleles: list[Allele], output_pkl_path: Path) -> None: """Create pkl file of dictionary mapping VRS IDs to ingested alleles. @@ -187,6 +185,24 @@ def _update_vcf_header( f"corresponding to the GT indexes of the {info_field_desc} alleles" ), ) + vcf.header.info.add( + FieldName.LENGTHS_FIELD.value, + info_field_num, + "Integer", + ( + "The length values from ReferenceLengthExpression states for the GA4GH VRS " + f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles" + ), + ) + vcf.header.info.add( + FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD.value, + info_field_num, + "Integer", + ( + "The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS " + f"Alleles corresponding to the GT indexes of the {info_field_desc} alleles" + ), + ) @use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING) def annotate( @@ -244,6 +260,8 @@ def annotate( FieldName.STARTS_FIELD, FieldName.ENDS_FIELD, FieldName.STATES_FIELD, + FieldName.LENGTHS_FIELD, + FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD, ] else: # no INFO field names need to be designated if not producing an annotated VCF @@ -275,8 +293,10 @@ def annotate( if output_vcf_path and vcf_out: for k in additional_info_fields: + # Convert "" and None values (but not 0) to None. + # Pysam outputs "." for missing values. record.info[k.value] = [ - value or k.default_value() for value in vrs_field_data[k.value] + None if v in ("", None) else v for v in vrs_field_data[k.value] ] vcf_out.write(record) @@ -369,20 +389,57 @@ def _get_vrs_object( vrs_field_data[FieldName.IDS_FIELD].append(allele_id) if vrs_attributes: + # Initialize fields with None for missing values + # pysam will convert None to "." 
in VCF output + start = end = None + alt = None + length = repeat_subunit_length = None + if vrs_obj: + # Common fields for all state types start = vrs_obj.location.start end = vrs_obj.location.end - alt = ( - str(vrs_obj.state.sequence.root) - if vrs_obj.state.sequence - else "" - ) - else: - start = end = alt = "" + state = vrs_obj.state + state_type = state.type + + if state_type == VrsType.LIT_SEQ_EXPR: + # Sequence is required + alt = state.sequence.root + elif state_type == VrsType.REF_LEN_EXPR: + # Length is required, sequence is optional + length = state.length + if length is None: + err_msg = f"{state_type} requires a non-empty length: {vcf_coords}" + raise VcfAnnotatorError(err_msg) + if isinstance(length, Range): + err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}" + raise VcfAnnotatorError(err_msg) + repeat_subunit_length = state.repeatSubunitLength + # Only include sequence if within rle_seq_limit + if state.sequence is not None and ( + RLE_SEQ_LIMIT is None or length <= RLE_SEQ_LIMIT + ): + alt = state.sequence.root + elif state_type == VrsType.LEN_EXPR: + # Length is required, no sequence field + length = state.length + if length is None: + err_msg = f"{state_type} requires a non-empty length: {vcf_coords}" + raise VcfAnnotatorError(err_msg) + if isinstance(length, Range): + err_msg = f"{state_type} with Range length not supported for VCF annotation: {vcf_coords}" + raise VcfAnnotatorError(err_msg) + else: + err_msg = f"Unsupported state type '{state_type}' for VCF annotation: {vcf_coords}" + raise VcfAnnotatorError(err_msg) vrs_field_data[FieldName.STARTS_FIELD].append(start) vrs_field_data[FieldName.ENDS_FIELD].append(end) vrs_field_data[FieldName.STATES_FIELD].append(alt) + vrs_field_data[FieldName.LENGTHS_FIELD].append(length) + vrs_field_data[FieldName.REPEAT_SUBUNIT_LENGTHS_FIELD].append( + repeat_subunit_length + ) def _get_vrs_data( self, @@ -432,7 +489,6 @@ def _get_vrs_data( # Get VRS data for alts 
alts = record.alts or [] alleles = [f"{gnomad_loc}-{record.ref}-{a}" for a in [*alts]] - data = f"{record.chrom}\t{record.pos}\t{record.ref}\t{record.alts}" for allele in alleles: if "*" in allele: _logger.debug("Star allele found: %s", allele) @@ -444,7 +500,6 @@ def _get_vrs_data( allele_collection, vrs_field_data, assembly, - vrs_data_key=data, vrs_attributes=vrs_attributes, require_validation=require_validation, ) diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 17b40c52..f899faf2 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -83,7 +83,7 @@ def _get_new_allele_location_pos( return val -def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): +def _normalize_allele(input_allele: models.Allele, data_proxy, rle_seq_limit=50): """Normalize Allele using "fully-justified" normalization adapted from NCBI's VOCA. Fully-justified normalization expands such ambiguous representation over the entire region of ambiguity, resulting in an unambiguous representation that may be @@ -93,16 +93,29 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): the `allele.location.sequenceReference`. If a `SequenceReference` is not provided, the allele will be returned as is with no normalization. + Does not attempt to normalize Alleles with definite ranges or non-LiteralSequenceExpression + states and will instead return the `input_allele`. + + If attempting to re-normalize a ReferenceLengthExpression allele, + use denormalize_reference_length_expression to construct a LiteralSequenceExpression + allele and then call normalize() on that LiteralSequenceExpression allele. + + See: https://vrs.ga4gh.org/en/2.0/conventions/normalization.html#literalsequenceexpression-alleles + :param input_allele: Input VRS Allele object :param data_proxy: SeqRepo dataproxy :param rle_seq_limit: If RLE is set as the new state, set the limit for the length of the `sequence`. To exclude `sequence` from the response, set to 0. 
For no limit, set to `None`. - - Does not attempt to normalize Alleles with definite ranges and will instead return the - `input_allele` """ + # Algorithm applies to LiteralSequenceExpression alleles only; other states are returned unchanged + if not isinstance(input_allele.state, models.LiteralSequenceExpression): + _logger.warning( + "`input_allele.state` was not a LiteralSequenceExpression, returning `input_allele` with no normalization." + ) + return input_allele + if isinstance(input_allele.location.sequenceReference, models.SequenceReference): alias = f"ga4gh:{input_allele.location.sequenceReference.refgetAccession}" else: @@ -112,7 +125,7 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): ) return input_allele - # Get reference sequence and interval + # 0: Get reference sequence and interval ref_seq = SequenceProxy(data_proxy, alias) start = _get_allele_location_pos(input_allele, use_start=True) if start is None: @@ -123,90 +136,81 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): return input_allele ival = (start.value, end.value) - alleles = ( - (None, input_allele.state.sequence.root) - if input_allele.state.sequence - else (None, "") + start_pos_type = start.pos_type + end_pos_type = end.pos_type + alt_seq = input_allele.state.sequence.root or "" + alleles = (None, alt_seq) + + # 1: trim shared flanking sequence + trim_ival, trim_alleles = _trim_for_normalization( + ref_seq, ival, alleles, start, end ) - # Trim common flanking sequence from Allele sequences. 
- try: - trim_ival, trim_alleles = _normalize( - ref_seq, ival, alleles, mode=None, trim=True - ) - except ValueError: - # Occurs for ref agree Alleles (when alt = ref) - len_trimmed_ref = len_trimmed_alt = 0 - # TODO: Return RLE for ref agree Alleles - else: - trim_ref_seq = ref_seq[trim_ival[0] : trim_ival[1]] - trim_alt_seq = trim_alleles[1] - len_trimmed_ref = len(trim_ref_seq) - len_trimmed_alt = len(trim_alt_seq) + trim_ref_seq = ref_seq[trim_ival[0] : trim_ival[1]] + trim_alt_seq = trim_alleles[1] + len_trimmed_ref = len(trim_ref_seq) + len_trimmed_alt = len(trim_alt_seq) + seed_length = len_trimmed_ref or len_trimmed_alt + identity_case = trim_ref_seq == trim_alt_seq - # Compare the two allele sequences - if not len_trimmed_ref and not len_trimmed_alt: - return input_allele + new_allele: models.Allele = pydantic_copy(input_allele) - new_allele = pydantic_copy(input_allele) + # 2.a: Reference allele (ref==alt after trim): use original span and return RLE + # length = repeatSubunitLength = seed_length (the input sequence length) + if identity_case: + _set_location_from_interval(new_allele, ival, start_pos_type, end_pos_type) + return _define_rle_allele( + new_allele, seed_length, seed_length, rle_seq_limit, alt_seq + ) + # 2.b: Substitution: both sides non-empty and different after trim. 
if len_trimmed_ref and len_trimmed_alt: - new_allele.location.start = _get_new_allele_location_pos( - trim_ival[0], start.pos_type - ) - new_allele.location.end = _get_new_allele_location_pos( - trim_ival[1], end.pos_type - ) - new_allele.state.sequence = models.sequenceString(trim_alleles[1]) + _set_location_from_interval(new_allele, trim_ival, start_pos_type, end_pos_type) + new_allele.state.sequence = models.sequenceString(trim_alt_seq) return new_allele - seed_length = len_trimmed_ref if len_trimmed_ref else len_trimmed_alt - # Determine bounds of ambiguity + # 3: Expand ambiguity by rolling left + right new_ival, new_alleles = _normalize( - ref_seq, trim_ival, (None, trim_alleles[1]), mode=NormalizationMode.EXPAND - ) - - new_allele.location.start = _get_new_allele_location_pos( - new_ival[0], start.pos_type + ref_seq, + trim_ival, + (None, trim_alt_seq), + mode=NormalizationMode.EXPAND, + trim=not identity_case, # bioutils will not trim identical alleles ) - new_allele.location.end = _get_new_allele_location_pos(new_ival[1], end.pos_type) + # 4: Get the extended sequences extended_ref_seq = ref_seq[new_ival[0] : new_ival[1]] extended_alt_seq = new_alleles[1] + len_extended_alt = len(extended_alt_seq) + len_extended_ref = len(extended_ref_seq) + # 5.a if not extended_ref_seq: - # If the reference sequence is empty this is an unambiguous insertion. - # Return a new Allele with the trimmed alternate sequence as a Literal - # Sequence Expression + _set_location_from_interval(new_allele, new_ival, start_pos_type, end_pos_type) new_allele.state = models.LiteralSequenceExpression( sequence=models.sequenceString(extended_alt_seq) ) return new_allele - # Otherwise, determine if this is reference-derived (an RLE allele). 
- len_extended_alt = len(extended_alt_seq) - len_extended_ref = len(extended_ref_seq) - + # 5.b if len_extended_alt < len_extended_ref: - # If this is a deletion, it is reference-derived + _set_location_from_interval(new_allele, new_ival, start_pos_type, end_pos_type) return _define_rle_allele( new_allele, len_extended_alt, seed_length, rle_seq_limit, extended_alt_seq ) + # 5.c if len_extended_alt > len_extended_ref: - # If this is an insertion, it may or may not be reference-derived. - # - # Determine the greatest factor `d` of the `seed length` such that `d` - # is less than or equal to the length of the modified `reference sequence`, - # and there exists a subsequence of length `d` derived from the modified - # `reference sequence` that can be circularly expanded to recreate - # the modified `alternate sequence`. factors = _factor_gen(seed_length) for cycle_length in factors: if cycle_length > len_extended_ref: continue cycle_start = len_extended_ref - cycle_length if _is_valid_cycle(cycle_start, extended_ref_seq, extended_alt_seq): + _set_location_from_interval( + new_allele, new_ival, start_pos_type, end_pos_type + ) + # 5.c.2 / 5.d: reference-derived ambiguous insertion return _define_rle_allele( new_allele, len_extended_alt, @@ -214,13 +218,66 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): rle_seq_limit, extended_alt_seq, ) + # 5.c.3 + _set_location_from_interval(new_allele, new_ival, start_pos_type, end_pos_type) + new_allele.state = models.LiteralSequenceExpression( + sequence=models.sequenceString(extended_alt_seq) + ) + return new_allele + # 5.e: Otherwise return literal Allele using expanded interval/state (spec step 5 final bullet) + _set_location_from_interval(new_allele, new_ival, start_pos_type, end_pos_type) new_allele.state = models.LiteralSequenceExpression( sequence=models.sequenceString(extended_alt_seq) ) return new_allele +def _trim_for_normalization( + ref_seq: SequenceProxy, + ival: tuple[int, int], + alleles: 
tuple[None, str], + start: LocationPos, + end: LocationPos, +): + """Trim common prefix and suffix from the intervals. + + Return the trimmed interval and trimmed alleles: + ((trim_start, trim_end), (trim_ref, trim_alt)) + + If the alleles are the same, return the original interval and alleles. + The first allele (ref) will be populated by bioutils. + """ + try: + trim_ival, trim_alleles = _normalize( + ref_seq, ival, alleles, mode=None, trim=True + ) + except ValueError as e: + ref_at_location = ref_seq[start.value : end.value] + alt_seq = alleles[1] + if ref_at_location == alt_seq: + # return (ival, (None, ref_at_location)) + return ival, alleles + msg = ( + "Unexpected bioutils trim error for non reference allele: " + f"ref='{ref_at_location}', alt='{alt_seq}'" + ) + raise ValueError(msg) from e + + return trim_ival, trim_alleles + + +def _set_location_from_interval( + allele: models.Allele, + ival: tuple[int, int], + start_pos_type: PosType, + end_pos_type: PosType, +) -> None: + """Update ``allele`` start and end location""" + allele.location.start = _get_new_allele_location_pos(ival[0], start_pos_type) + allele.location.end = _get_new_allele_location_pos(ival[1], end_pos_type) + + def denormalize_reference_length_expression( ref_seq: str, repeat_subunit_length: int, @@ -307,7 +364,8 @@ def normalize(vo, data_proxy: _DataProxy | None = None, **kwargs): :param data_proxy: GA4GH sequence dataproxy instance, if needed :keyword rle_seq_limit: If RLE is set as the new state, set the limit for the length of the `sequence`. To exclude `state.sequence`, set to 0. - :return: normalized object + :return: normalized object, or unmodified input object if the normalization algorithm + does not provide normalization steps for the given type. 
:raise TypeError: if given object isn't a pydantic.BaseModel """ if not is_pydantic_instance(vo): diff --git a/tests/cassettes/test_normalize_allele.yaml b/tests/cassettes/test_normalize_allele.yaml index a27c8c08..a953adce 100644 --- a/tests/cassettes/test_normalize_allele.yaml +++ b/tests/cassettes/test_normalize_allele.yaml @@ -273,4 +273,64 @@ interactions: status: code: 200 message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=939145&end=939147 + response: + body: + string: GA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=236900413&end=236900413 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=236900412&end=236900413 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=236900413&end=236900414 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK version: 1 diff --git a/tests/cassettes/test_normalize_clinvar_rle.yaml b/tests/cassettes/test_normalize_clinvar_rle.yaml new file mode 100644 index 00000000..f17cb16f --- /dev/null +++ b/tests/cassettes/test_normalize_clinvar_rle.yaml @@ -0,0 +1,674 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=66926&end=66927 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=66925&end=66926 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=66927&end=66928 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=66926&end=66926 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=66927&end=66927 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766400&end=766404 + response: + body: + string: ATAA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766399&end=766400 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766398&end=766399 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766404&end=766405 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766405&end=766406 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766406&end=766407 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766407&end=766408 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766404&end=766407 + response: + body: + string: ATA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=766399&end=766407 + response: + body: + string: AATAAATA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930137&end=930140 + response: + body: + string: TCC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930136&end=930137 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + 
uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930135&end=930136 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930140&end=930141 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930141&end=930142 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930142&end=930143 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930143&end=930144 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930144&end=930145 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930140&end=930144 + response: + body: + string: TCCT + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930136&end=930144 + response: + body: + string: CTCCTCCT + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET 
+ uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752908&end=1752908 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752907&end=1752908 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752908&end=1752909 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752909&end=1752910 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752910&end=1752911 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752911&end=1752912 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752912&end=1752913 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752913&end=1752914 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + 
method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752914&end=1752915 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752915&end=1752916 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752916&end=1752917 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752917&end=1752918 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752918&end=1752919 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752919&end=1752920 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752920&end=1752921 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752921&end=1752922 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + 
headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752922&end=1752923 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752923&end=1752924 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752924&end=1752925 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752925&end=1752926 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752926&end=1752927 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752927&end=1752928 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752928&end=1752929 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752929&end=1752930 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + 
body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752930&end=1752931 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752931&end=1752932 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752932&end=1752933 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752933&end=1752934 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752934&end=1752935 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752935&end=1752936 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752936&end=1752937 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752908&end=1752936 + response: + body: + string: CCTCCTCCTCCTCCTCCTCCTCCTCCTC + headers: {} + status: + 
code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_normalize_partial_rle_del_ins.yaml b/tests/cassettes/test_normalize_partial_rle_del_ins.yaml new file mode 100644 index 00000000..47da614c --- /dev/null +++ b/tests/cassettes/test_normalize_partial_rle_del_ins.yaml @@ -0,0 +1,170 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752915&end=1752915 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752915&end=1752918 + response: + body: + string: CTC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752911&end=1752915 + response: + body: + string: CCTC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752911&end=1752916 + response: + body: + string: CCTCC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752934&end=1752934 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752912&end=1752916 + response: + body: + string: CTCC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752916&end=1752916 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752913&end=1752915 + response: + body: + string: TC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752912&end=1752915 + response: + body: + string: CTC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752934&end=1752936 + response: + body: + string: TC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752936&end=1752936 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752933&end=1752936 + response: + body: + string: CTC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752932&end=1752936 + response: + body: + string: CCTC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=1752932&end=1752932 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git 
a/tests/extras/cassettes/test_annotate_vcf_rle.yaml b/tests/extras/cassettes/test_annotate_vcf_rle.yaml new file mode 100644 index 00000000..2ba6fd04 --- /dev/null +++ b/tests/extras/cassettes/test_annotate_vcf_rle.yaml @@ -0,0 +1,240 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:1 + response: + body: + string: "{\n \"added\": \"2016-08-27T21:17:00Z\",\n \"aliases\": [\n \"GRCh38:1\",\n + \ \"GRCh38:chr1\",\n \"GRCh38.p1:1\",\n \"GRCh38.p1:chr1\",\n \"GRCh38.p10:1\",\n + \ \"GRCh38.p10:chr1\",\n \"GRCh38.p11:1\",\n \"GRCh38.p11:chr1\",\n + \ \"GRCh38.p12:1\",\n \"GRCh38.p12:chr1\",\n \"GRCh38.p2:1\",\n \"GRCh38.p2:chr1\",\n + \ \"GRCh38.p3:1\",\n \"GRCh38.p3:chr1\",\n \"GRCh38.p4:1\",\n \"GRCh38.p4:chr1\",\n + \ \"GRCh38.p5:1\",\n \"GRCh38.p5:chr1\",\n \"GRCh38.p6:1\",\n \"GRCh38.p6:chr1\",\n + \ \"GRCh38.p7:1\",\n \"GRCh38.p7:chr1\",\n \"GRCh38.p8:1\",\n \"GRCh38.p8:chr1\",\n + \ \"GRCh38.p9:1\",\n \"GRCh38.p9:chr1\",\n \"MD5:6aef897c3d6ff0c78aff06ac189178dd\",\n + \ \"NCBI:NC_000001.11\",\n \"refseq:NC_000001.11\",\n \"SEGUID:FCUd6VJ6uikS/VWLbhGdVmj2rOA\",\n + \ \"SHA1:14251de9527aba2912fd558b6e119d5668f6ace0\",\n \"VMC:GS_Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n + \ \"sha512t24u:Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n \"ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\"\n + \ ],\n \"alphabet\": \"ACGMNRT\",\n \"length\": 248956422\n}\n" + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:1?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO + response: + body: + string: "{\n \"added\": \"2016-08-27T21:17:00Z\",\n \"aliases\": [\n \"GRCh38:1\",\n + \ \"GRCh38:chr1\",\n \"GRCh38.p1:1\",\n 
\"GRCh38.p1:chr1\",\n \"GRCh38.p10:1\",\n + \ \"GRCh38.p10:chr1\",\n \"GRCh38.p11:1\",\n \"GRCh38.p11:chr1\",\n + \ \"GRCh38.p12:1\",\n \"GRCh38.p12:chr1\",\n \"GRCh38.p2:1\",\n \"GRCh38.p2:chr1\",\n + \ \"GRCh38.p3:1\",\n \"GRCh38.p3:chr1\",\n \"GRCh38.p4:1\",\n \"GRCh38.p4:chr1\",\n + \ \"GRCh38.p5:1\",\n \"GRCh38.p5:chr1\",\n \"GRCh38.p6:1\",\n \"GRCh38.p6:chr1\",\n + \ \"GRCh38.p7:1\",\n \"GRCh38.p7:chr1\",\n \"GRCh38.p8:1\",\n \"GRCh38.p8:chr1\",\n + \ \"GRCh38.p9:1\",\n \"GRCh38.p9:chr1\",\n \"MD5:6aef897c3d6ff0c78aff06ac189178dd\",\n + \ \"NCBI:NC_000001.11\",\n \"refseq:NC_000001.11\",\n \"SEGUID:FCUd6VJ6uikS/VWLbhGdVmj2rOA\",\n + \ \"SHA1:14251de9527aba2912fd558b6e119d5668f6ace0\",\n \"VMC:GS_Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n + \ \"sha512t24u:Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n \"ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\"\n + \ ],\n \"alphabet\": \"ACGMNRT\",\n \"length\": 248956422\n}\n" + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210778&end=100210779 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210777&end=100210778 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210776&end=100210777 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- 
request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210779&end=100210780 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210779&end=100210779 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:1?start=102995988&end=102995992 + response: + body: + string: CTTT + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995988&end=102995992 + response: + body: + string: CTTT + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995992&end=102995992 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995991&end=102995992 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995990&end=102995991 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995989&end=102995990 + response: + body: + string: T + headers: {} + status: + code: 200 + 
message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995988&end=102995989 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995987&end=102995988 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=102995992&end=102995993 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_normalize_microsatellite_counts[deletion].yaml b/tests/extras/cassettes/test_normalize_microsatellite_counts[deletion].yaml new file mode 100644 index 00000000..29ac449f --- /dev/null +++ b/tests/extras/cassettes/test_normalize_microsatellite_counts[deletion].yaml @@ -0,0 +1,674 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930131&end=930152 + response: + body: + string: TTCCTCTCCTCCTGCCCCACC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930130&end=930131 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930129&end=930130 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930128&end=930129 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930127&end=930128 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930126&end=930127 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930125&end=930126 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930124&end=930125 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930123&end=930124 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930122&end=930123 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930121&end=930122 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930120&end=930121 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930119&end=930120 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930118&end=930119 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930117&end=930118 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930116&end=930117 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930115&end=930116 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930114&end=930115 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930113&end=930114 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930112&end=930113 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930111&end=930112 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930110&end=930111 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930109&end=930110 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930108&end=930109 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930107&end=930108 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930106&end=930107 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930105&end=930106 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930104&end=930105 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930103&end=930104 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930102&end=930103 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930101&end=930102 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930100&end=930101 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930099&end=930100 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930098&end=930099 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930097&end=930098 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930096&end=930097 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930095&end=930096 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930094&end=930095 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930093&end=930094 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930092&end=930093 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930091&end=930092 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930090&end=930091 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930089&end=930090 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930088&end=930089 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930087&end=930088 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930086&end=930087 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930085&end=930086 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930084&end=930085 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930083&end=930084 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930082&end=930083 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930081&end=930082 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930080&end=930081 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930152&end=930153 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930081&end=930131 + response: + body: + string: GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930152&end=930152 + response: + body: + string: '' + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930081&end=930152 + response: + body: + string: GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_normalize_microsatellite_counts[identity].yaml b/tests/extras/cassettes/test_normalize_microsatellite_counts[identity].yaml new file mode 100644 index 00000000..664527a6 --- /dev/null +++ b/tests/extras/cassettes/test_normalize_microsatellite_counts[identity].yaml @@ -0,0 +1,26 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/NC_000001.11?start=930089&end=930152 + response: + body: + string: TTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930089&end=930152 + response: + body: + string: TTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_normalize_microsatellite_counts[insertion].yaml b/tests/extras/cassettes/test_normalize_microsatellite_counts[insertion].yaml new file mode 100644 index 00000000..4fa2be2f --- /dev/null +++ b/tests/extras/cassettes/test_normalize_microsatellite_counts[insertion].yaml @@ -0,0 +1,254 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930151&end=930152 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930150&end=930151 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930149&end=930150 + response: + body: + string: A + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930148&end=930149 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930147&end=930148 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930146&end=930147 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930145&end=930146 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930144&end=930145 + response: + body: + string: G + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930143&end=930144 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930142&end=930143 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930141&end=930142 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930140&end=930141 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930139&end=930140 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: 
http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930138&end=930139 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930137&end=930138 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930136&end=930137 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930135&end=930136 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930134&end=930135 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930133&end=930134 + response: + body: + string: C + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930132&end=930133 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=930131&end=930132 + response: + body: + string: T + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git 
a/tests/extras/cassettes/test_reference_allele_rle.yaml b/tests/extras/cassettes/test_reference_allele_rle.yaml new file mode 100644 index 00000000..fa7d4e56 --- /dev/null +++ b/tests/extras/cassettes/test_reference_allele_rle.yaml @@ -0,0 +1,38 @@ +interactions: +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:1?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +- request: + body: null + headers: {} + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/NC_000001.11?start=100210777&end=100210779 + response: + body: + string: AA + headers: {} + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/data/test_rle.vcf b/tests/extras/data/test_rle.vcf new file mode 100644 index 00000000..b14eb45b --- /dev/null +++ b/tests/extras/data/test_rle.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##reference=GRCh38 +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 100210778 . AA A . . VARID=1663061;VARNAME=NM_001918.5(DBT):c.940-7del;VARTYPE=Deletion +1 102995989 . CTTT CTTTCTTT . . 
VARID=3727769;VARNAME=NM_001854.4(COL11A1):c.2292_2295dup;VARTYPE=Duplication diff --git a/tests/extras/data/test_vcf_expected_altsonly_output.vcf b/tests/extras/data/test_vcf_expected_altsonly_output.vcf new file mode 100644 index 00000000..8d74d7ca --- /dev/null +++ b/tests/extras/data/test_vcf_expected_altsonly_output.vcf @@ -0,0 +1,246 @@ +##fileformat=VCFv4.2 +##FILTER= +##fileDate=20160824 +##CL=vcffilter -i - -o - --javascript "function record() {HG001.PS=\".\";}" +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER=0.8"> +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +chr19 82664 . C T 50 PASS platforms=2;platformnames=10X,PacBio;datasets=2;datasetnames=10XChromiumLR,CCS15kb_20kb;callsets=2;callsetnames=10XLRGATK,CCS15kb_20kbDV;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt;arbitrated=TRUE;difficultregion=hg38.segdups_sorted_merged,lowmappabilityall;VRS_Allele_IDs=ga4gh:VA.b9AIUegs1SOYO7nKY8l01yllkyUns-u4;VRS_Starts=82663;VRS_Ends=82664;VRS_States=T;VRS_Lengths=.;VRS_RepeatSubunitLengths=. GT:PS:DP:ADALL:AD:GQ 0/1:.:154:0,0:0,0:120 +chr19 284350 . 
CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;datasets=4;datasetnames=HiSeqPE300x,10XChromiumLR,CCS15kb_20kb,CGnormal;callsets=5;callsetnames=HiSeqPE300xGATK,10XLRGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=CCS15kb_20kb,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;difficultregion=GRCh38_AllHomopolymers_gt6bp_imperfectgt10bp_slop5,GRCh38_SimpleRepeat_imperfecthomopolgt10_slop5;VRS_Allele_IDs=ga4gh:VA.a04jFsNg0bS0RMIWjKWSbwJS4_vp7S6x;VRS_Starts=284350;VRS_Ends=284366;VRS_States=AAAAAAAAAAAAAAA;VRS_Lengths=15;VRS_RepeatSubunitLengths=1 GT:PS:DP:ADALL:AD:GQ 0/1:.:422:117,101:81,75:356 +chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7;VRS_Starts=289464;VRS_Ends=289466;VRS_States=CACGCCTGTAATCCCA;VRS_Lengths=.;VRS_RepeatSubunitLengths=. GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 +chr19 28946400 . 
T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf;VRS_Starts=28946399;VRS_Ends=28946400;VRS_States=C;VRS_Lengths=.;VRS_RepeatSubunitLengths=. GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 +chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490414;VRS_Ends=490416;VRS_States;VRS_Lengths=0;VRS_RepeatSubunitLengths=2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 +chr19 54220024 . 
G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=.,54220023;VRS_Ends=.,54220024;VRS_States=,A;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,;VRS_Starts=54221653,.;VRS_Ends=54221654,.;VRS_States=A,;VRS_Lengths=.,.;VRS_RepeatSubunitLengths=.,. 
GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz b/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz deleted file mode 100644 index 173adfa1..00000000 Binary files a/tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz and /dev/null differ diff --git a/tests/extras/data/test_vcf_expected_output.vcf b/tests/extras/data/test_vcf_expected_output.vcf new file mode 100644 index 00000000..fd781c5a --- /dev/null +++ b/tests/extras/data/test_vcf_expected_output.vcf @@ -0,0 +1,246 @@ +##fileformat=VCFv4.2 +##FILTER= +##fileDate=20160824 +##CL=vcffilter -i - -o - --javascript "function record() {HG001.PS=\".\";}" +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER=0.8"> +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +chr19 82664 . C T 50 PASS platforms=2;platformnames=10X,PacBio;datasets=2;datasetnames=10XChromiumLR,CCS15kb_20kb;callsets=2;callsetnames=10XLRGATK,CCS15kb_20kbDV;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt;arbitrated=TRUE;difficultregion=hg38.segdups_sorted_merged,lowmappabilityall;VRS_Allele_IDs=ga4gh:VA.esmv8nARRRdSysDFuIErJxRAdUSVsWNE,ga4gh:VA.b9AIUegs1SOYO7nKY8l01yllkyUns-u4;VRS_Starts=82663,82663;VRS_Ends=82664,82664;VRS_States=C,T;VRS_Lengths=1,.;VRS_RepeatSubunitLengths=1,. GT:PS:DP:ADALL:AD:GQ 0/1:.:154:0,0:0,0:120 +chr19 284350 . 
CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;datasets=4;datasetnames=HiSeqPE300x,10XChromiumLR,CCS15kb_20kb,CGnormal;callsets=5;callsetnames=HiSeqPE300xGATK,10XLRGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=CCS15kb_20kb,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;difficultregion=GRCh38_AllHomopolymers_gt6bp_imperfectgt10bp_slop5,GRCh38_SimpleRepeat_imperfecthomopolgt10_slop5;VRS_Allele_IDs=ga4gh:VA.xgtXGA3ZkV1WgMc6eD9l64fX27S_TScW,ga4gh:VA.a04jFsNg0bS0RMIWjKWSbwJS4_vp7S6x;VRS_Starts=284349,284350;VRS_Ends=284351,284366;VRS_States=CA,AAAAAAAAAAAAAAA;VRS_Lengths=2,15;VRS_RepeatSubunitLengths=2,1 GT:PS:DP:ADALL:AD:GQ 0/1:.:422:117,101:81,75:356 +chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.nqqTUy-a2gssemOmJb4CJv-HNuFAmGrO,ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7;VRS_Starts=289463,289464;VRS_Ends=289464,289466;VRS_States=T,CACGCCTGTAATCCCA;VRS_Lengths=1,.;VRS_RepeatSubunitLengths=1,. GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 +chr19 28946400 . 
T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.yPr2pVvJeWHDHarhzAvOCb5Cn9UMF6a5,ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf;VRS_Starts=28946399,28946399;VRS_Ends=28946400,28946400;VRS_States=T,C;VRS_Lengths=1,.;VRS_RepeatSubunitLengths=1,. GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 +chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.aje4-hx7eihWndAwfhzNq_7CZV3bRMXf,ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v;VRS_Starts=490413,490414;VRS_Ends=490416,490416;VRS_States=ACT,;VRS_Lengths=3,0;VRS_RepeatSubunitLengths=3,2 GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 +chr19 54220024 . 
G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1;VRS_Starts=54220023,.,54220023;VRS_Ends=54220024,.,54220024;VRS_States=G,,A;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 54220999 . A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu,;VRS_Starts=54221653,54221653,.;VRS_Ends=54221654,54221654,.;VRS_States=T,A,;VRS_Lengths=1,.,.;VRS_RepeatSubunitLengths=1,.,. 
GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/data/test_vcf_expected_output.vcf.gz b/tests/extras/data/test_vcf_expected_output.vcf.gz deleted file mode 100644 index 728fc91c..00000000 Binary files a/tests/extras/data/test_vcf_expected_output.vcf.gz and /dev/null differ diff --git a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf new file mode 100644 index 00000000..c81bb519 --- /dev/null +++ b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf @@ -0,0 +1,241 @@ +##fileformat=VCFv4.2 +##FILTER= +##fileDate=20160824 +##CL=vcffilter -i - -o - --javascript "function record() {HG001.PS=\".\";}" +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER=0.8"> +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +chr19 82664 . C T 50 PASS platforms=2;platformnames=10X,PacBio;datasets=2;datasetnames=10XChromiumLR,CCS15kb_20kb;callsets=2;callsetnames=10XLRGATK,CCS15kb_20kbDV;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt;arbitrated=TRUE;difficultregion=hg38.segdups_sorted_merged,lowmappabilityall;VRS_Allele_IDs=ga4gh:VA.esmv8nARRRdSysDFuIErJxRAdUSVsWNE,ga4gh:VA.b9AIUegs1SOYO7nKY8l01yllkyUns-u4 GT:PS:DP:ADALL:AD:GQ 0/1:.:154:0,0:0,0:120 +chr19 284350 . 
CA C 50 PASS platforms=4;platformnames=Illumina,10X,PacBio,CG;datasets=4;datasetnames=HiSeqPE300x,10XChromiumLR,CCS15kb_20kb,CGnormal;callsets=5;callsetnames=HiSeqPE300xGATK,10XLRGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes;datasetsmissingcall=CCS15kb_20kb,IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;difficultregion=GRCh38_AllHomopolymers_gt6bp_imperfectgt10bp_slop5,GRCh38_SimpleRepeat_imperfecthomopolgt10_slop5;VRS_Allele_IDs=ga4gh:VA.xgtXGA3ZkV1WgMc6eD9l64fX27S_TScW,ga4gh:VA.a04jFsNg0bS0RMIWjKWSbwJS4_vp7S6x GT:PS:DP:ADALL:AD:GQ 0/1:.:422:117,101:81,75:356 +chr19 289464 . T TCACGCCTGTAATCC 50 PASS platforms=4;platformnames=Illumina,PacBio,CG,10X;datasets=4;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR;callsets=6;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,CCS15kb_20kbDV,10XLRGATK;datasetsmissingcall=IonExome,SolidSE75bp;callable=CS_HiSeqPE300xGATK_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.nqqTUy-a2gssemOmJb4CJv-HNuFAmGrO,ga4gh:VA.ySvDptXfHB_9WEfu78v32DzBXJfwGgO7 GT:PS:DP:ADALL:AD:GQ 0/1:.:518:94,98:116,137:785 +chr19 28946400 . 
T C 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_10XLRGATK_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_CCS15kb_20kbDV_filt,CS_CCS15kb_20kbGATK4_filt;VRS_Allele_IDs=ga4gh:VA.yPr2pVvJeWHDHarhzAvOCb5Cn9UMF6a5,ga4gh:VA.uV5O4M9zpiwk6sftOd-EDvtw_pkSAvdf GT:PS:DP:ADALL:AD:GQ 1/1:.:874:0,275:115,378:502 +chr19 490414 . ACT A 50 PASS platforms=5;platformnames=Illumina,PacBio,CG,10X,Solid;datasets=5;datasetnames=HiSeqPE300x,CCS15kb_20kb,CGnormal,10XChromiumLR,SolidSE75bp;callsets=7;callsetnames=HiSeqPE300xGATK,CCS15kb_20kbDV,CCS15kb_20kbGATK4,CGnormal,HiSeqPE300xfreebayes,10XLRGATK,SolidSE75GATKHC;datasetsmissingcall=IonExome;callable=CS_HiSeqPE300xGATK_callable,CS_CCS15kb_20kbDV_callable,CS_CCS15kb_20kbGATK4_callable,CS_CGnormal_callable,CS_HiSeqPE300xfreebayes_callable;filt=CS_10XLRGATK_filt;VRS_Allele_IDs=ga4gh:VA.aje4-hx7eihWndAwfhzNq_7CZV3bRMXf,ga4gh:VA.lok7a3lot_cvUyw626otpJi4yxk0X07v GT:PS:DP:ADALL:AD:GQ 0/1:.:821:163,158:239,220:1004 +chr19 54220024 . G *,A 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.LlmfhAC3gQlVQUwXWYiYjrn5V_K8vBz1,,ga4gh:VA.I7J3i1B36BACEUINcTwEh7uMv3I-PXT1 GT:PS:DP:ADALL:AD:GQ 1/2:.:45:0,20,25:0,20,25:99 +chr19 54220999 . 
A T 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Error=Reference mismatch at GRCh38:chr19 position 54220998-54220999 (input gave 'A' but correct ref is 'T') GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 +chr19 54221654 . T A,P 50 PASS platforms=1;platformnames=PacBio;datasets=1;datasetnames=CCS15kb_20kb;callsets=1;callsetnames=CCS15kb_20kbGATK4;datasetsmissingcall=HiSeqPE300x,CCS15kb_20kb,10XChromiumLR,CGnormal,IonExome,SolidSE75bp;callable=CS_CCS15kb_20kbGATK4_callable;filt=CS_CCS15kb_20kbDV_filt,CS_10XLRGATK_filt,CS_HiSeqPE300xfreebayes_filt;difficultregion=HG001.hg38.300x.bam.bilkentuniv.010920.dups,hg38.segdups_sorted_merged;VRS_Allele_IDs=ga4gh:VA.kea5G-J1teg0iHMbgUELy-4L9lbJkgoj,ga4gh:VA.Zzlc24htmBV1HZZzWYgPD2_GfMInkrZu, GT:PS:DP:ADALL:AD:GQ 0/1:.:45:0,20,25:0,20,25:99 diff --git a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz b/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz deleted file mode 100644 index 6eae43ca..00000000 Binary files a/tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz and /dev/null differ diff --git a/tests/extras/test_allele_translator.py b/tests/extras/test_allele_translator.py index 6742c78b..24c461e8 100644 --- a/tests/extras/test_allele_translator.py +++ b/tests/extras/test_allele_translator.py @@ -538,12 +538,8 @@ def test_to_spdi_with_ref(tlr): ( "NC_000013.11:g.32936732=", { - "digest": "GJ2JySBMXePcV2yItyvCfbGBUoawOBON", - "id": "ga4gh:VA.GJ2JySBMXePcV2yItyvCfbGBUoawOBON", "location": { - "digest": "28YsnRvD40gKu1x3nev0gRzRz-5OTlpS", "end": 32936732, - "id": "ga4gh:SL.28YsnRvD40gKu1x3nev0gRzRz-5OTlpS", 
"sequenceReference": { "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", "type": "SequenceReference", @@ -551,19 +547,20 @@ def test_to_spdi_with_ref(tlr): "start": 32936731, "type": "SequenceLocation", }, - "state": {"sequence": "C", "type": "LiteralSequenceExpression"}, + "state": { + "length": 1, + "repeatSubunitLength": 1, + "sequence": "C", + "type": "ReferenceLengthExpression", + }, "type": "Allele", }, ), ( "NC_000007.14:g.55181320A>T", { - "digest": "Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE", - "id": "ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE", "location": { - "digest": "_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd", "end": 55181320, - "id": "ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd", "sequenceReference": { "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceReference", @@ -578,12 +575,8 @@ def test_to_spdi_with_ref(tlr): ( "NC_000007.14:g.55181220del", { - "digest": "klRMVChjvV73ZxS9Ajq1Rb8WU-p_HbLu", - "id": "ga4gh:VA.klRMVChjvV73ZxS9Ajq1Rb8WU-p_HbLu", "location": { - "digest": "ljan7F0ePe9uiD6f2u80ZG5gDtx9Mr0V", "end": 55181220, - "id": "ga4gh:SL.ljan7F0ePe9uiD6f2u80ZG5gDtx9Mr0V", "sequenceReference": { "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceReference", @@ -603,12 +596,8 @@ def test_to_spdi_with_ref(tlr): ( "NC_000007.14:g.55181230_55181231insGGCT", { - "digest": "CLOvnFRJXGNRB9aTuNbvsLqc7syRYb55", - "id": "ga4gh:VA.CLOvnFRJXGNRB9aTuNbvsLqc7syRYb55", "location": { - "digest": "lh4dRt_xWPi3wrubcfomi5DkD7fu6wd2", "end": 55181230, - "id": "ga4gh:SL.lh4dRt_xWPi3wrubcfomi5DkD7fu6wd2", "sequenceReference": { "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", "type": "SequenceReference", @@ -623,12 +612,8 @@ def test_to_spdi_with_ref(tlr): ( "NC_000013.11:g.32331093_32331094dup", { - "digest": "swY2caCgv1kP6YqKyPlcEzJqTvou15vC", - "id": "ga4gh:VA.swY2caCgv1kP6YqKyPlcEzJqTvou15vC", "location": { - "digest": "ikECYncPpE1xh6f_LiComrFGevocjDHQ", "end": 32331094, - "id": 
"ga4gh:SL.ikECYncPpE1xh6f_LiComrFGevocjDHQ", "sequenceReference": { "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", "type": "SequenceReference", @@ -648,12 +633,8 @@ def test_to_spdi_with_ref(tlr): ( "NC_000013.11:g.32316467dup", { - "digest": "96ak7XdY3DNbp71aHEXw-NHSfeHGW-KT", - "id": "ga4gh:VA.96ak7XdY3DNbp71aHEXw-NHSfeHGW-KT", "location": { - "digest": "fwfHu8VaD2-6Qvay9MJSINXPS767RYSw", "end": 32316467, - "id": "ga4gh:SL.fwfHu8VaD2-6Qvay9MJSINXPS767RYSw", "sequenceReference": { "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", "type": "SequenceReference", @@ -673,12 +654,8 @@ def test_to_spdi_with_ref(tlr): ( "NM_001331029.1:c.722A>G", { - "digest": "DPe4AO-S0Yu4wzSCmys7eGn4p4sO0zaC", - "id": "ga4gh:VA.DPe4AO-S0Yu4wzSCmys7eGn4p4sO0zaC", "location": { - "digest": "7hcVmPnIspQNDfZKBzRJFc8K9GaJuAlY", "end": 872, - "id": "ga4gh:SL.7hcVmPnIspQNDfZKBzRJFc8K9GaJuAlY", "sequenceReference": { "refgetAccession": "SQ.MBIgVnoHFw34aFqNUVGM0zgjC3d-v8dK", "type": "SequenceReference", @@ -693,12 +670,8 @@ def test_to_spdi_with_ref(tlr): ( "NM_181798.1:c.1007G>T", { - "digest": "vSL4aV7mPQKQLX7Jk-PmXN0APs0cBIr9", - "id": "ga4gh:VA.vSL4aV7mPQKQLX7Jk-PmXN0APs0cBIr9", "location": { - "digest": "EtvHvoj1Lsq-RruzIzWbKOIAW-bt193w", "end": 1263, - "id": "ga4gh:SL.EtvHvoj1Lsq-RruzIzWbKOIAW-bt193w", "sequenceReference": { "refgetAccession": "SQ.KN07u-RFqd1dTyOWOG98HnOq87Nq-ZIg", "type": "SequenceReference", @@ -713,13 +686,9 @@ def test_to_spdi_with_ref(tlr): ( "NC_000019.10:g.289464_289465insCACA", { - "digest": "YFUR4oR_84b-rRFf0UzOjfI4eE5FTKAP", - "id": "ga4gh:VA.YFUR4oR_84b-rRFf0UzOjfI4eE5FTKAP", "type": "Allele", "location": { - "digest": "L145KFLJeJ334YnOVm59pPlbdqfHhgXZ", "end": 289466, - "id": "ga4gh:SL.L145KFLJeJ334YnOVm59pPlbdqfHhgXZ", "sequenceReference": { "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", "type": "SequenceReference", @@ -738,13 +707,9 @@ def test_to_spdi_with_ref(tlr): ( "NC_000019.10:g.289485_289500del", { - "digest": 
"Djc_SwVDFunsArqwUM00PciVaF70VTcU", - "id": "ga4gh:VA.Djc_SwVDFunsArqwUM00PciVaF70VTcU", "type": "Allele", "location": { - "digest": "WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp", "end": 289501, - "id": "ga4gh:SL.WTE7jyihK4qvRRzEqM7u5nSD4iS2k3xp", "sequenceReference": { "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", "type": "SequenceReference", @@ -772,7 +737,6 @@ def test_to_spdi_with_ref(tlr): @pytest.mark.vcr def test_hgvs(tlr, hgvsexpr, expected): # do_normalize defaults to true - tlr.identify = True allele = tlr.translate_from(hgvsexpr, "hgvs") assert allele.model_dump(exclude_none=True) == expected @@ -789,15 +753,9 @@ def test_rle_seq_limit(tlr): rle_seq_limit is set to None. """ # do_normalize defaults to true - tlr.identify = True - a_dict = { - "digest": "j7qUzb1uvmdxLAbtdCPiay4kIRQmyZNv", - "id": "ga4gh:VA.j7qUzb1uvmdxLAbtdCPiay4kIRQmyZNv", "location": { - "digest": "88oOqkUgALP7fnN8P8lbvCosFhG8YpY0", "end": 32331094, - "id": "ga4gh:SL.88oOqkUgALP7fnN8P8lbvCosFhG8YpY0", "sequenceReference": { "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT", "type": "SequenceReference", @@ -859,6 +817,162 @@ def test_to_hgvs_iri_ref_keyerror(tlr): assert str(e.value) == "'ga4gh:seqrefs.jsonc#/NM_181798.1'" +@pytest.mark.vcr +def test_reference_allele_rle(tlr): + """Test that reference alleles (REF==ALT) are normalized to ReferenceLengthExpression. 
+ + Added to address https://github.com/ga4gh/vrs-python/issues/587 + """ + # Test with gnomad format + gnomad_ref_allele = "1-100210778-AA-AA" + allele = tlr._from_gnomad(gnomad_ref_allele) + + expected = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 100210777, + "end": 100210779, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 2, + "repeatSubunitLength": 2, + "sequence": "AA", + }, + } + + assert allele.model_dump(exclude_none=True) == expected + + # Test with SPDI format (REF==ALT) + spdi_ref_allele = "NC_000001.11:100210777:AA:AA" + allele_spdi = tlr._from_spdi(spdi_ref_allele) + + assert allele_spdi.model_dump(exclude_none=True) == expected + + # Test round-trip to SPDI + to_spdi = tlr.translate_to(allele_spdi, "spdi", ref_seq_limit=None) + assert len(to_spdi) == 1 + assert to_spdi[0] == spdi_ref_allele + + +# Microsatellite test cases for 21bp repeat unit +# https://github.com/ga4gh/vrs-python/discussions/592 +# Tests deletion, insertion, and identity (no-change) variations +# Deletion/insertion: VOCA normalize to 930081-930152 with repeatSubunitLength=21 +# Identity: Keep input coordinates 930089-930152 with repeatSubunitLength=63 (no VOCA normalization) +microsatellite_21bp_cases = [ + { + "id": "deletion", + "description": "Delete 1 copy from 3 copies (3->2 copies)", + "hgvs": "NC_000001.11:g.930132_930152del", + "spdi": "NC_000001.11:930081:GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC:GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "expected": { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 930081, + "end": 930152, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 50, + "sequence": 
"GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "repeatSubunitLength": 21, + }, + }, + }, + { + "id": "insertion", + "description": "Insert 1 copy to 3 copies (3->4 copies)", + "hgvs": "NC_000001.11:g.930152_930153insTTCCTCTCCTCCTGCCCCACC", + "spdi": "NC_000001.11:930081:GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC:GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "expected": { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 930081, + "end": 930152, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 92, + "sequence": "GCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "repeatSubunitLength": 21, + }, + }, + }, + { + "id": "identity", + "description": "No change, 3 copies (same-as-ref does NOT do VOCA normalization)", + "hgvs": "NC_000001.11:g.930090_930152=", + "spdi": "NC_000001.11:930089:TTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC:TTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "expected": { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 930089, + "end": 930152, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 63, + "sequence": "TTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACCTTCCTCTCCTCCTGCCCCACC", + "repeatSubunitLength": 63, + }, + }, + }, +] + + +@pytest.mark.parametrize("case", microsatellite_21bp_cases, ids=lambda c: c["id"]) +@pytest.mark.vcr +def test_normalize_microsatellite_counts(tlr, case): + """Test microsatellite deletion, insertion, and identity normalization behavior + + Tests three variations of a 21bp microsatellite: + - Deletion and insertion: Apply VOCA 
normalization (roll left, find repeat unit) + - Identity (same-as-ref): Do NOT apply VOCA normalization, use input coordinates + + https://github.com/ga4gh/vrs-python/discussions/592 + + The microsatellite has a 21bp repeat unit when fully normalized. + For deletion/insertion: normalize to 930081-930152 with repeatSubunitLength=21 + For identity: keep input coordinates (930089-930152) with repeatSubunitLength=63 + """ + # Test HGVS format + allele_hgvs = tlr.translate_from( + case["hgvs"], "hgvs", normalize=True, rle_seq_limit=100 + ) + assert allele_hgvs.model_dump(exclude_none=True) == case["expected"], ( + f"HGVS failed: {case['description']}" + ) + + # Test SPDI format + allele_spdi = tlr.translate_from( + case["spdi"], "spdi", normalize=True, rle_seq_limit=100 + ) + assert allele_spdi.model_dump(exclude_none=True) == case["expected"], ( + f"SPDI failed: {case['description']}" + ) + + # TODO: Readd these tests # @pytest.mark.vcr # def test_errors(tlr): diff --git a/tests/extras/test_annotate_vcf.py b/tests/extras/test_annotate_vcf.py index af5b21a1..7e085f2d 100644 --- a/tests/extras/test_annotate_vcf.py +++ b/tests/extras/test_annotate_vcf.py @@ -6,9 +6,9 @@ import re from pathlib import Path +import pysam import pytest -from ga4gh.vrs import VRS_VERSION, __version__ from ga4gh.vrs.dataproxy import DataProxyValidationError, SeqRepoRESTDataProxy from ga4gh.vrs.extras.annotator.vcf import VcfAnnotator, VcfAnnotatorError @@ -35,20 +35,39 @@ def input_vcf(): def compare_vcfs(actual_vcf_path: Path, expected_vcf_path: Path): - """VRS-Python version annotation would be annoying to manually update. This helper - method replaces a placeholder string with the real version, and otherwise performs - a pairwise check for all lines in each VCF. + """Normalize version fields that change per build and compare the remaining content + of two VCF files line-by-line. 
""" - with gzip.open(actual_vcf_path, "rt") as out_vcf: + version_section_pattern = re.compile( + r"\[VRS version=[^;\]]+;VRS-Python version=[^\]]+\]" + ) + + def _mask_version_fields(line: str) -> str: + """Replace the bracketed version section with a stable placeholder.""" + if not line.startswith("##INFO=;VRS-Python version=]", line + ) + + # Handle both gzipped and uncompressed VCF files + def _open(path: Path): + if path.suffix == ".gz": + return gzip.open(path, "rt") + return path.open("r") + + with _open(actual_vcf_path) as out_vcf, _open(expected_vcf_path) as expected_output: out_vcf_lines = out_vcf.readlines() - with gzip.open(expected_vcf_path, "rt") as expected_output: expected_output_lines = expected_output.readlines() + for actual_line, expected_line in zip( out_vcf_lines, expected_output_lines, strict=False ): - if actual_line.startswith("##INFO=A): Expected RLE with length=1, repeatSubunitLength=1 + 2. Duplication (1:102995989 CTTT>CTTTCTTT): Expected RLE with length=8, repeatSubunitLength=4 + """ + vcr_cassette.allow_playback_repeats = False + input_vcf = TEST_DATA_DIR / "test_rle.vcf" + output_vcf = TEST_DATA_DIR / "test_rle_output.vcf" + output_vrs_pkl = TEST_DATA_DIR / "test_rle_output.pkl" + + # Annotate the VCF with VRS attributes enabled + vcf_annotator.annotate( + input_vcf, output_vcf, vrs_attributes=True, output_pkl_path=output_vrs_pkl + ) + + # Read the output VCF and verify RLE fields are present + with pysam.VariantFile(str(output_vcf)) as vcf: + # Verify the RLE-specific header fields were added + assert "VRS_Lengths" in vcf.header.info + assert "VRS_RepeatSubunitLengths" in vcf.header.info + + variants = list(vcf) + assert len(variants) == 2 + + # Test variant 1: Deletion (AA>A) + # Expected: length=1, repeatSubunitLength=1 + deletion_variant = variants[0] + assert deletion_variant.chrom == "1" + assert deletion_variant.pos == 100210778 + assert deletion_variant.ref == "AA" + assert deletion_variant.alts == ("A",) + + # Check VRS 
attributes for deletion + assert "VRS_Allele_IDs" in deletion_variant.info + assert "VRS_Starts" in deletion_variant.info + assert "VRS_Ends" in deletion_variant.info + assert "VRS_States" in deletion_variant.info + assert "VRS_Lengths" in deletion_variant.info + assert "VRS_RepeatSubunitLengths" in deletion_variant.info + + # Expected values for deletion RLE + # REF: AA uses RLE with length=2, repeatSubunitLength=2 + # ALT: A uses RLE with length=1, repeatSubunitLength=1 + vrs_lengths = deletion_variant.info["VRS_Lengths"] + vrs_repeat_lengths = deletion_variant.info["VRS_RepeatSubunitLengths"] + assert len(vrs_lengths) == 2 # REF and ALT + assert len(vrs_repeat_lengths) == 2 + assert vrs_lengths == (2, 1) # REF: AA (length 2), ALT: A (length 1) + assert vrs_repeat_lengths == ( + 2, + 1, + ) # REF: AA as repeat unit of length 2, ALT: A as repeat unit of length 1 + + # Test variant 2: Duplication (CTTT>CTTTCTTT) + # Expected: length=8, repeatSubunitLength=4 + duplication_variant = variants[1] + assert duplication_variant.chrom == "1" + assert duplication_variant.pos == 102995989 + assert duplication_variant.ref == "CTTT" + assert duplication_variant.alts == ("CTTTCTTT",) + + # Check VRS attributes for duplication + assert "VRS_Allele_IDs" in duplication_variant.info + assert "VRS_Lengths" in duplication_variant.info + assert "VRS_RepeatSubunitLengths" in duplication_variant.info + + # Expected values for duplication RLE + # REF: CTTT uses RLE with length=4, repeatSubunitLength=4 + # ALT: CTTTCTTT uses RLE with length=8, repeatSubunitLength=4 + vrs_lengths = duplication_variant.info["VRS_Lengths"] + vrs_repeat_lengths = duplication_variant.info["VRS_RepeatSubunitLengths"] + assert len(vrs_lengths) == 2 # REF and ALT + assert len(vrs_repeat_lengths) == 2 + assert vrs_lengths == (4, 8) # REF: CTTT (length 4), ALT: CTTTCTTT (length 8) + assert vrs_repeat_lengths == (4, 4) # Both are 4-base repeats + + assert output_vrs_pkl.exists() + if vcr_cassette.write_protected: + 
assert vcr_cassette.all_played diff --git a/tests/test_vrs_normalize.py b/tests/test_vrs_normalize.py index 9bf2006f..83c27cdd 100644 --- a/tests/test_vrs_normalize.py +++ b/tests/test_vrs_normalize.py @@ -2,13 +2,8 @@ from ga4gh.vrs import models, normalize -# >>> dp.get_sequence("refseq:NC_000019.10", 44908820, 44908830) -# |820 |825 | 830 -# ' G C G C C T G G C A ' -# |A| a1 -# - -allele_dict = { +# Single nucleotide same-as-reference allele. +allele_dict1 = { "location": { "end": 26090951, "start": 26090950, @@ -22,7 +17,26 @@ "type": "Allele", } +allele_dict1_normalized = { + "location": { + "end": 26090951, + "start": 26090950, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV", + }, + "type": "SequenceLocation", + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 1, + "sequence": "C", + "repeatSubunitLength": 1, + }, + "type": "Allele", +} +# Ambiguous indefinite-outer 2 bp deletion. Should become RLE. allele_dict2 = { "type": "Allele", "location": { @@ -37,7 +51,6 @@ "state": {"sequence": "", "type": "LiteralSequenceExpression"}, } - allele_dict2_normalized = { "type": "Allele", "location": { @@ -56,7 +69,7 @@ }, } - +# Ambiguous definite 2-4bp deletion. Cannot be converted to RLE. (as opposed to allele_dict2 which can) allele_dict3 = { "type": "Allele", "location": { @@ -71,7 +84,7 @@ "state": {"sequence": "", "type": "LiteralSequenceExpression"}, } - +# Tandem duplication of GT, normalizes to RLE. allele_dict4 = { "type": "Allele", "location": { @@ -105,6 +118,7 @@ }, } +# Insertion of multiple repeat subunits ("CAG") into an existing repeating region. 
allele_dict5 = { "location": { "end": 289464, @@ -138,19 +152,133 @@ }, } +# Another simple same-as-reference allele +allele_dict6 = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 100210777, + "end": 100210779, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "AA", + }, +} + +allele_dict6_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 100210777, + "end": 100210779, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 2, + "repeatSubunitLength": 2, + "sequence": "AA", + }, +} + +# Multi-base substitution (step 2.b). ClinVar 1530016 +# HGVS: NC_000001.11:g.939146_939147delinsTT +# VCF: 1-939146-GA-TT +# Substitutions remain as LiteralSequenceExpression (both ref and alt non-empty after trim) +clinvar_substitution_2bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 939145, + "end": 939147, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "TT", + }, +} + +clinvar_substitution_normalized_2bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 939145, + "end": 939147, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "TT", + }, +} + +# Unambiguous insertion in a repeat region (step 5.a). +# Insert "CGT" into a poly-A run at chr1:236900409-236900417. +# Terminal bases (C, T) don't match surrounding A's, so no rolling occurs. 
+# Should remain as LiteralSequenceExpression at the original position. +unambiguous_insertion_in_repeat = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 236900413, + "end": 236900413, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CGT", + }, +} + +unambiguous_insertion_in_repeat_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 236900413, + "end": 236900413, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CGT", + }, +} + @pytest.mark.vcr def test_normalize_allele(rest_dataproxy): - allele1 = models.Allele(**allele_dict) + # Test that a same-as-reference LSE is normalized to an RLE + allele1 = models.Allele(**allele_dict1) allele2 = normalize(allele1, rest_dataproxy) - assert allele1 == allele2 + assert allele2 == models.Allele(**allele_dict1_normalized) allele1 = models.Allele(**allele_dict2) allele2 = normalize(allele1, rest_dataproxy, rle_seq_limit=0) assert allele1 != allele2 assert allele2 == models.Allele(**allele_dict2_normalized) - # Definite ranges are not normalized + # Definite ambiguous ranges are not normalized allele3 = models.Allele(**allele_dict3) allele3_after_norm = normalize(allele3, rest_dataproxy) assert allele3_after_norm == allele3 @@ -164,3 +292,640 @@ def test_normalize_allele(rest_dataproxy): allele5 = models.Allele(**allele_dict5) allele5_after_norm = normalize(allele5, rest_dataproxy) assert allele5_after_norm == models.Allele(**allele_dict5_normalized) + + # Same-as-reference allele (REF==ALT) + # Added to address https://github.com/ga4gh/vrs-python/issues/587 + allele6 = models.Allele(**allele_dict6) + allele6_after_norm = normalize(allele6, rest_dataproxy) + assert 
allele6_after_norm == models.Allele(**allele_dict6_normalized) + + # Multi-base substitution (step 2.b): both ref and alt non-empty after trim + # Should remain as LiteralSequenceExpression, not converted to RLE + substitution = models.Allele(**clinvar_substitution_2bp) + substitution_norm = normalize(substitution, rest_dataproxy) + assert substitution_norm == models.Allele(**clinvar_substitution_normalized_2bp) + + # Unambiguous insertion in a repeat region (step 5.a) + # Terminal bases don't match context, so no rolling - stays at original position + unambig_ins = models.Allele(**unambiguous_insertion_in_repeat) + unambig_ins_norm = normalize(unambig_ins, rest_dataproxy) + assert unambig_ins_norm == models.Allele( + **unambiguous_insertion_in_repeat_normalized + ) + + +# Simple deletion from non-repeating region (no trim/rolling involved). ClinVar 3385321 +# SPDI: NC_000001.11:66926:G: +clinvar_deletion = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 66926, + "end": 66927, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +clinvar_deletion_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 66926, + "end": 66927, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 0, + "repeatSubunitLength": 1, + }, +} + + +# Microsatellite deletion: ClinVar 4286633 +# SPDI: NC_000001.11:766399:AATAAATA:AATA +clinvar_microsatellite = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 766400, + "end": 766404, + }, + "state": { + "type": "LiteralSequenceExpression", + 
"sequence": "", + }, +} + +clinvar_microsatellite_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 766399, + "end": 766407, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 4, + "repeatSubunitLength": 4, + }, +} + +# Tandem repeat deletion. ClinVar 1658573 +# SPDI: NC_000001.11:930136:CTCCTCCT:CTCCT +clinvar_tandem_repeat = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 930137, + "end": 930140, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +clinvar_tandem_repeat_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 930136, + "end": 930144, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 5, + "repeatSubunitLength": 3, + }, +} + +# Repeat subunit insertion in microsatellite region. 
ClinVar 2672290 +# SPDI: NC_000001.11:1752908:CCTCCTCCTCCTCCTCCTCCTCCTCCTC:CCTCCTCCTCCTCCTCCTCCTCCTCCTCCTC +# This tests the circular expansion logic for insertions in repeat regions +clinvar_microsatellite_insertion = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752908, + "end": 1752908, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CCT", + }, +} + +clinvar_microsatellite_insertion_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752908, + "end": 1752936, + }, + "state": { + "type": "ReferenceLengthExpression", + "length": 31, + "repeatSubunitLength": 3, + }, +} + + +@pytest.mark.vcr +def test_normalize_clinvar_rle(rest_dataproxy): + """Test normalization of ClinVar variants that should produce RLE. + + These test cases are pulled from ClinVar GRCh38 VCF. 
+ """ + # Simple deletion: AG>A (deletes G) + deletion = models.Allele(**clinvar_deletion) + deletion_norm = normalize(deletion, rest_dataproxy, rle_seq_limit=0) + assert deletion_norm == models.Allele(**clinvar_deletion_normalized) + + # Microsatellite: GAATA>G (deletes AATA repeat) + microsatellite = models.Allele(**clinvar_microsatellite) + microsatellite_norm = normalize(microsatellite, rest_dataproxy, rle_seq_limit=0) + assert microsatellite_norm == models.Allele(**clinvar_microsatellite_normalized) + + # Tandem repeat: TCTC>T (deletes CTC repeat) + tandem_repeat = models.Allele(**clinvar_tandem_repeat) + tandem_repeat_norm = normalize(tandem_repeat, rest_dataproxy, rle_seq_limit=0) + assert tandem_repeat_norm == models.Allele(**clinvar_tandem_repeat_normalized) + + # Repeat subunit insertion in microsatellite region: C>CCCT (inserts CCT in CTC[10] region) + microsatellite_insertion = models.Allele(**clinvar_microsatellite_insertion) + microsatellite_insertion_norm = normalize( + microsatellite_insertion, rest_dataproxy, rle_seq_limit=0 + ) + assert microsatellite_insertion_norm == models.Allele( + **clinvar_microsatellite_insertion_normalized + ) + + +############################################################################### +# Partial repeat insertion/deletion edge cases in CCT repeat region +# Region: chr1:1752908-1752936 = CCTCCTCCTCCTCCTCCTCCTCCTCCTC (9 full CCT units + trailing C) +# These tests verify behavior when insertions/deletions don't align with repeat boundaries +############################################################################### +#### MIDDLE INSERTIONS (around position 1752915, in unit 3) #### + +# Insert "CT" (2 bases, < repeat unit) in middle of CCT region +# SPDI: NC_000001.11:1752915::CT +# Reference: CCT CCT C CT CCT ... (positions 1752908-...) +# ^-- insert "CT" at interbase position 1752915 +# Variant: CCT CCT C[CT]CT CCT ... 
+# The ref "CTC" (3bp) becomes "CTCTC" (5bp), which cycles with period 2 (CT CT C) +partial_repeat_insertion = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752915, + "end": 1752915, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CT", + }, +} + +partial_repeat_insertion_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Expands to 3 ref bases (1752915-1752918: "CTC"), not the full CCT repeat region, + # because the inserted "CT" creates a 2-base cycle pattern locally + "start": 1752915, + "end": 1752918, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CTC" (3 bases) + ins "CT" (2 bases) = "CTCTC" (5 bases), cycles with period 2 + "length": 5, + "repeatSubunitLength": 2, + }, +} + + +# Insert "CCTC" (4 bases, > repeat unit) in middle of CCT region +# SPDI: NC_000001.11:1752915::CCTC +# Reference: CCT CCT C CT CCT CCT ... (positions 1752908-...) +# ^-- insert "CCTC" at interbase position 1752915 (between C at char 1752914 and C at 1752915) +# Variant: CCT CCT C[CCTC]CT CCT ... 
+# In the variant sequence, the identified subunit is now CCTC (CCTC CCTC C) +# And the ref "CCTCC" (5bp) becomes "CCTCCCTCC" (9bp, period 4) +middle_ins_4bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752915, + "end": 1752915, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CCTC", + }, +} + +middle_ins_4bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Rolls left to 1752911, expands to 5 ref bases (CCTCC) + "start": 1752911, + "end": 1752916, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CCTCC" (5 bases) + ins "CCTC" (4 bases) = 9 bases, cycles with period 4 + "length": 9, + "repeatSubunitLength": 4, + }, +} + +#### TAIL INSERTIONS (at position 1752934, before the last base of unit 9) #### + +# Insert "CT" (2 bases, < repeat unit) at tail of CCT region +# SPDI: NC_000001.11:1752934::CT +# Reference: ...CCT CCT CCT C G A (positions 1752926-1752938) +# ^-- insert "CT" at interbase position 1752934 (after the second C of unit 9, before its T) +# Variant: ...CCT CCT CC[CT]T C G A +# Result: Stays as LSE - the insertion cannot roll: the preceding base (C) differs from its last base (T) and the following base (T) differs from its first base (C) +# No expansion occurs; output is the unchanged insertion +tail_ins_2bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752934, + "end": 1752934, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CT", + }, +} + +tail_ins_2bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession":
"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # No expansion - stays at original position + "start": 1752934, + "end": 1752934, + }, + "state": { + # Stays as LSE because insertion at boundary doesn't form repeating pattern + "type": "LiteralSequenceExpression", + "sequence": "CT", + }, +} + +# Insert "CCTC" (4 bases, > repeat unit) at tail of CCT region +# SPDI: NC_000001.11:1752934::CCTC +# Reference: ...CCT CCT CCT C G A (positions 1752926-1752938) +# ^-- insert "CCTC" at interbase position 1752934 (after the second C of unit 9, before its T) +# Variant: ...CCT CCT CC[CCTC]T C G A → ...CCT CCT CCCCTCTC G A +# Left-aligns: ref "C" (1752933-1752934) + ins "CCTC" = "CCCTC" after normalization +# Result: Stays as LSE with sequence "CCCTC" - doesn't form repeating pattern at boundary +tail_ins_4bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752934, + "end": 1752934, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "CCTC", + }, +} + +tail_ins_4bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Rolls left by 1 to include the C at 1752933 + "start": 1752933, + "end": 1752934, + }, + "state": { + # Stays as LSE - ref "C" + ins "CCTC" = "CCCTC" after left-alignment + "type": "LiteralSequenceExpression", + "sequence": "CCCTC", + }, +} + +#### MIDDLE DELETIONS (around positions 1752912-1752916) #### + +# Delete "CTCC" (4 bases, > repeat unit) in middle of CCT region +# SPDI: NC_000001.11:1752912:CTCC: +# Reference: CCT CCT CCT CCT CCT ... (positions 1752908-...) +# C[CTCC]T CCT ... <-- delete positions 1752912-1752916 +# Variant: CCT C T CCT ... → CCTCTCCT... 
+# Left-aligns to 1752911: ref span becomes "CCTCC" (5 bases) +# Ref "CCTCC" - del "CTCC" = 1 base remaining; CTCC has period 4, so repeatSubunitLength=4 +deletion_spanning_boundary = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752912, + "end": 1752916, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +deletion_spanning_boundary_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Rolls left by 1 to 1752911; ref span becomes "CCTCC" (5 bases) + # Doesn't expand to full CCT repeat region because deleted "CTCC" has period 4, not 3 + "start": 1752911, + "end": 1752916, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CCTCC" (5 bases) - del "CTCC" (4 bases) = 1 base remaining + # repeatSubunitLength=4 reflects the deletion size, not the original CCT repeat + "length": 1, + "repeatSubunitLength": 4, + }, +} + +# Delete "TC" (2 bases, < repeat unit) in middle of CCT region +# SPDI: NC_000001.11:1752913:TC: +# Reference: CCT CCT CCT CCT CCT ... (positions 1752908-...) +# T[TC]CT CCT ... <-- delete positions 1752913-1752915 +# Variant: CCT T CT CCT ... → CCTTCTCCT... 
+# Left-aligns to 1752912: ref span becomes "CTC" (3 bases) +# Ref "CTC" - del "TC" = 1 base remaining; TC has period 2, so repeatSubunitLength=2 +middle_del_2bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752913, + "end": 1752915, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +middle_del_2bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Rolls left by 1 to 1752912; ref span becomes "CTC" (3 bases) + "start": 1752912, + "end": 1752915, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CTC" (3 bases) - del "TC" (2 bases) = 1 base remaining + "length": 1, + "repeatSubunitLength": 2, + }, +} + +#### TAIL DELETIONS (at positions 1752932-1752936) #### + +# Delete "TC" (2 bases, < repeat unit) at tail of CCT region +# SPDI: NC_000001.11:1752934:TC: +# Reference: ...CCT CCT CCT C G A (positions 1752926-1752938) +# C[TC]G A <-- delete positions 1752934-1752936 (T from CCT + trailing C) +# Variant: ...CCT CCT CC G A → ...CCTCCTCCGA +# Left-aligns to 1752933: ref span becomes "CTC" (3 bases, positions 1752933-1752936) +# Ref "CTC" - del "TC" = 1 base remaining; TC has period 2, so repeatSubunitLength=2 +tail_del_2bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752934, + "end": 1752936, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +tail_del_2bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession":
"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # Rolls left by 1 to 1752933; ref span becomes "CTC" (3 bases) + "start": 1752933, + "end": 1752936, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CTC" (3 bases) - del "TC" (2 bases) = 1 base remaining + "length": 1, + "repeatSubunitLength": 2, + }, +} + +# Delete "CCTC" (4 bases, > repeat unit) at tail of CCT region +# SPDI: NC_000001.11:1752932:CCTC: +# Reference: ...CCT CCT [CCTC] G A (positions 1752926-1752938) +# ^-- delete positions 1752932-1752936 (CCTC = unit 9 CCT + trailing C) +# Variant: ...CCT CCT G A → ...CCTCCTGA +# Left-aligns: ref span stays at 1752932-1752936 = "CCTC" (4 bases) +# Ref "CCTC" - del "CCTC" = 0 bases remaining; CCTC has period 4, so repeatSubunitLength=4 +tail_del_4bp = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 1752932, + "end": 1752936, + }, + "state": { + "type": "LiteralSequenceExpression", + "sequence": "", + }, +} + +tail_del_4bp_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + # No rolling - stays at original position + "start": 1752932, + "end": 1752936, + }, + "state": { + "type": "ReferenceLengthExpression", + # ref "CCTC" (4 bases) - del "CCTC" (4 bases) = 0 bases remaining + "length": 0, + "repeatSubunitLength": 4, + }, +} + + +@pytest.mark.vcr +def test_normalize_partial_rle_del_ins(rest_dataproxy): + """Test normalization of partial repeat insertions/deletions in CCT repeat region. + + Tests insertions and deletions that don't align with the 3-base CCT repeat boundary. 
+ Region: chr1:1752908-1752936 = CCTCCTCCTCCTCCTCCTCCTCCTCCTC + + Cases tested: + - Middle insertions: 2bp and 4bp insertions within the repeat region + - Tail insertions: 2bp and 4bp insertions at the end of the repeat region + - Middle deletions: 2bp and 4bp deletions within the repeat region + - Tail deletions: 2bp and 4bp deletions at the end of the repeat region + """ + # === MIDDLE INSERTIONS === + + # Middle ins 2bp: Insert "CT" at 1752915 (already tested in partial_repeat_insertion) + partial_ins = models.Allele(**partial_repeat_insertion) + partial_ins_norm = normalize(partial_ins, rest_dataproxy, rle_seq_limit=0) + assert partial_ins_norm == models.Allele(**partial_repeat_insertion_normalized) + + # Middle ins 4bp: Insert "CCTC" at 1752915 + # 4-base insertion into 3-base repeat creates period-4 RLE + mid_ins_4 = models.Allele(**middle_ins_4bp) + mid_ins_4_norm = normalize(mid_ins_4, rest_dataproxy, rle_seq_limit=0) + assert mid_ins_4_norm == models.Allele(**middle_ins_4bp_normalized) + + # === TAIL INSERTIONS === + + # Tail ins 2bp: Insert "CT" at 1752934 + # At boundary, stays as LSE (doesn't form repeating pattern) + tail_ins_2 = models.Allele(**tail_ins_2bp) + tail_ins_2_norm = normalize(tail_ins_2, rest_dataproxy, rle_seq_limit=0) + assert tail_ins_2_norm == models.Allele(**tail_ins_2bp_normalized) + + # Tail ins 4bp: Insert "CCTC" at 1752934 + # At boundary, stays as LSE with left-aligned sequence + tail_ins_4 = models.Allele(**tail_ins_4bp) + tail_ins_4_norm = normalize(tail_ins_4, rest_dataproxy, rle_seq_limit=0) + assert tail_ins_4_norm == models.Allele(**tail_ins_4bp_normalized) + + # === MIDDLE DELETIONS === + + # Middle del 4bp: Delete "CTCC" at 1752912-1752916 (already tested in deletion_spanning_boundary) + span_del = models.Allele(**deletion_spanning_boundary) + span_del_norm = normalize(span_del, rest_dataproxy, rle_seq_limit=0) + assert span_del_norm == models.Allele(**deletion_spanning_boundary_normalized) + + # Middle del 2bp: Delete 
"TC" at 1752913-1752915 + # 2-base deletion creates period-2 RLE + mid_del_2 = models.Allele(**middle_del_2bp) + mid_del_2_norm = normalize(mid_del_2, rest_dataproxy, rle_seq_limit=0) + assert mid_del_2_norm == models.Allele(**middle_del_2bp_normalized) + + # === TAIL DELETIONS === + + # Tail del 2bp: Delete "TC" at 1752934-1752936 + # At boundary, still becomes RLE with period 2 + tail_del_2 = models.Allele(**tail_del_2bp) + tail_del_2_norm = normalize(tail_del_2, rest_dataproxy, rle_seq_limit=0) + assert tail_del_2_norm == models.Allele(**tail_del_2bp_normalized) + + # Tail del 4bp: Delete "CCTC" at 1752932-1752936 + # Complete 4-base deletion, RLE with length=0 + tail_del_4 = models.Allele(**tail_del_4bp) + tail_del_4_norm = normalize(tail_del_4, rest_dataproxy, rle_seq_limit=0) + assert tail_del_4_norm == models.Allele(**tail_del_4bp_normalized)