diff --git a/VERSIONS b/VERSIONS index 861af6fca..6167d92c2 100644 --- a/VERSIONS +++ b/VERSIONS @@ -183,6 +183,13 @@ multitrait branch: whenever a mutationEffect() callback is added, removed, rescheduled, or activated/deactivated in a way that affects the current tick (invalidating all affected trait values) whenever the tick counter advances such that a previously active mutationEffect() callback becomes inactive, or vice versa (invalidating all affected trait values) whenever the tick counter is changed in script to a new arbitrary value (invalidating all trait values in all individuals in all species) + tree-sequence recording work for multitrait: + individual metadata now has a variable-length per-trait array of phenotype and offset information + mutation metadata is now moved from the mutation table's metadata column to a table placed in top-level metadata with the json+struct codec + mutation metadata now has a variable-length per-trait array of effect size and dominance information + top-level JSON metadata now has a new `traits` key (required), which lists the traits used in the tree sequence; duplicated in provenance + the top-level JSON, mutation, and individual metadata schemas are updated accordingly + pushed the .trees file version from 0.9 to 1.0 version 5.1 (Eidos version 4.1): diff --git a/core/haplosome.cpp b/core/haplosome.cpp index b0030eb73..9787384b8 100644 --- a/core/haplosome.cpp +++ b/core/haplosome.cpp @@ -4320,6 +4320,10 @@ EidosValue_SP Haplosome_Class::ExecuteMethod_removeMutations(EidosGlobalStringID slim_position_t pos = mut->position_; species->RecordNewDerivedState(target_haplosome, pos, empty_mut_vector); + + // BCH 2/14/2026: All mutations removed need to be retained by the tree sequence until + // simplified away, so we can persist their metadata + species->NotifyMutationRemoved(mut); } } } @@ -4496,8 +4500,15 @@ EidosValue_SP Haplosome_Class::ExecuteMethod_removeMutations(EidosGlobalStringID // TREE SEQUENCE RECORDING // When doing tree recording, we additionally keep all fixed mutations (their ids) in a multimap indexed by their position // This allows us to find all the fixed mutations at a given position quickly and easily, for calculating derived states + // BCH 2/14/2026: We now also need to remove the original mutation from our retained mutations list if it is there if (species->RecordingTreeSequence()) + { pop.treeseq_substitutions_map_.emplace(mut->position_, sub); + + // We just clear the flag here, and it is as if the mutation was actually removed from muts_retained_by_treeseq_; + // it will be garbage-collected the next time simplification occurs, so we don't have to search for it here + mut->retained_by_treeseq_ = false; + } pop.substitutions_.emplace_back(sub); } diff --git a/core/individual.cpp b/core/individual.cpp index 9ce6d7e13..58002040f 100644 --- a/core/individual.cpp +++ b/core/individual.cpp @@ -6395,10 +6395,10 @@ EidosValue_SP Individual_Class::ExecuteMethod_zygosityOfMutations(EidosGlobalStr int64_t *integer_result_data = integer_result->data_mutable(); // tabulate the results for each individuals - const int8_t PRESENT_HETEROZYGOUS = 1; - const int8_t PRESENT_HOMOZYGOUS = 2; - const int8_t PRESENT_HEMIZYGOUS = 3; - const int8_t PRESENT_HAPLOID = 4; + const uint8_t PRESENT_HETEROZYGOUS = 1; + const uint8_t PRESENT_HOMOZYGOUS = 2; + const uint8_t PRESENT_HEMIZYGOUS = 3; + const uint8_t PRESENT_HAPLOID = 4; for (int target_index = 0; target_index < target_size; ++target_index) { diff --git a/core/mutation.cpp b/core/mutation.cpp index 67a2514bd..891b27f28 100644 --- a/core/mutation.cpp +++ b/core/mutation.cpp @@ -42,7 +42,7 @@ slim_mutationid_t gSLiM_next_mutation_id = 0; // This constructor is used when making a new mutation with effects and dominances provided by the caller; FIXME MULTITRAIT: needs to take a whole vector of each, per trait! Mutation::Mutation(MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_effect_t p_selection_coeff, slim_effect_t p_dominance_coeff, slim_objectid_t p_subpop_index, slim_tick_t p_tick, int8_t p_nucleotide) : -mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), nucleotide_(p_nucleotide), mutation_id_(gSLiM_next_mutation_id++) +mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), retained_by_treeseq_(false), nucleotide_(p_nucleotide), mutation_id_(gSLiM_next_mutation_id++) { #ifdef DEBUG_LOCKS_ENABLED mutation_block_LOCK.start_critical(2); @@ -152,7 +152,7 @@ mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_ // This constructor is used when making a new mutation with effects drawn from each trait's DES, and dominance taken from each trait's default dominance coefficient, both from the given mutation type Mutation::Mutation(MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_objectid_t p_subpop_index, slim_tick_t p_tick, int8_t p_nucleotide) : -mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), nucleotide_(p_nucleotide), mutation_id_(gSLiM_next_mutation_id++) +mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), retained_by_treeseq_(false), nucleotide_(p_nucleotide), mutation_id_(gSLiM_next_mutation_id++) { #ifdef DEBUG_LOCKS_ENABLED mutation_block_LOCK.start_critical(2); @@ -295,9 +295,119 @@ mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_ #endif } +// This constructor is used when making a new mutation with effects and dominances provided by the caller, *and* a mutation id provided by the caller +Mutation::Mutation(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, MutationTableMetadataRec *p_metadata_ptr) : +mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_metadata_ptr->subpop_index_), origin_tick_(p_metadata_ptr->origin_tick_), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), retained_by_treeseq_(false), nucleotide_(p_metadata_ptr->nucleotide_), mutation_id_(p_mutation_id) +{ + Species &species = mutation_type_ptr_->species_; + const std::vector &traits = species.Traits(); + MutationBlock *mutation_block = species.SpeciesMutationBlock(); + + // initialize the tag to the "unset" value + tag_value_ = SLIM_TAG_UNSET_VALUE; + + // zero out our refcount and per-trait information, which is now kept in a separate buffer + MutationIndex mutation_index = mutation_block->IndexInBlock(this); + mutation_block->refcount_buffer_[mutation_index] = 0; + + slim_trait_index_t trait_count = mutation_block->trait_count_; + MutationTraitInfo *mut_trait_info = mutation_block->TraitInfoForIndex(mutation_index); + + // Below basically does the work of calling SetEffectSize() and SetDominance(), more efficiently since + // this is critical path. See those methods for more comments on what is happening here. + + is_neutral_for_all_traits_ = true; // will be set to false below as needed + is_neutral_for_direct_fitness_traits_ = true; + + // a dominance coefficient of NAN indicates independent dominance + independent_dominance_for_all_traits_ = true; + independent_dominance_for_any_traits_ = false; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + MutationTraitInfo *traitInfoRec = mut_trait_info + trait_index; + Trait *trait = traits[trait_index]; + TraitType traitType = trait->Type(); + + // FIXME MULTITRAIT: This constructor needs to change to have a whole vector of trait information passed in, for effect and dominance + // for now we use the values passed in for trait 0, and make other traits neutral + slim_effect_t effect_size = p_metadata_ptr->per_trait_[trait_index].effect_size_; + slim_effect_t dominance = p_metadata_ptr->per_trait_[trait_index].dominance_coeff_; // can be NAN + slim_effect_t hemizygous_dominance = p_metadata_ptr->per_trait_[trait_index].hemizygous_dominance_coeff_; + + traitInfoRec->effect_size_ = effect_size; + traitInfoRec->dominance_coeff_UNSAFE_ = dominance; // can be NAN + traitInfoRec->hemizygous_dominance_coeff_ = hemizygous_dominance; + + if (effect_size != (slim_effect_t)0.0) + { + is_neutral_for_all_traits_ = false; + + if (trait->HasDirectFitnessEffect()) + is_neutral_for_direct_fitness_traits_ = false; + + if (std::isnan(dominance)) + independent_dominance_for_any_traits_ = true; + else + independent_dominance_for_all_traits_ = false; + + // get the realized dominance to handle the possibility of independent dominance + slim_effect_t realized_dominance = RealizedDominanceForTrait(trait); + + if (traitType == TraitType::kMultiplicative) + { + traitInfoRec->homozygous_effect_ = std::max((slim_effect_t)0.0, (slim_effect_t)1.0 + effect_size); + traitInfoRec->heterozygous_effect_ = std::max((slim_effect_t)0.0, (slim_effect_t)1.0 + realized_dominance * effect_size); + traitInfoRec->hemizygous_effect_ = std::max((slim_effect_t)0.0, (slim_effect_t)1.0 + hemizygous_dominance * effect_size); + } + else // (traitType == TraitType::kAdditive) + { + traitInfoRec->homozygous_effect_ = ((slim_effect_t)2.0 * effect_size); + traitInfoRec->heterozygous_effect_ = ((slim_effect_t)2.0 * realized_dominance * effect_size); + traitInfoRec->hemizygous_effect_ = ((slim_effect_t)2.0 * hemizygous_dominance * effect_size); + } + } + else // (effect == 0.0) + { + if (traitType == TraitType::kMultiplicative) + { + traitInfoRec->homozygous_effect_ = (slim_effect_t)1.0; + traitInfoRec->heterozygous_effect_ = (slim_effect_t)1.0; + traitInfoRec->hemizygous_effect_ = (slim_effect_t)1.0; + } + else // (traitType == TraitType::kAdditive) + { + traitInfoRec->homozygous_effect_ = (slim_effect_t)0.0; + traitInfoRec->heterozygous_effect_ = (slim_effect_t)0.0; + traitInfoRec->hemizygous_effect_ = (slim_effect_t)0.0; + } + } + } + + // this mutation will be added to the simulation somewhere, so tell the species about it + // (OK, it might not get added due to stacking policy or mutation() callbacks, but we assume it will be) + species.NoteChangedMutation(this); + +#if DEBUG + SelfConsistencyCheck(" in Mutation::Mutation()"); +#endif + +#if DEBUG_MUTATIONS() + std::cout << "Mutation constructed: " << this << std::endl; +#endif + + // Since a mutation id was supplied by the caller, we need to ensure that subsequent mutation ids generated do not collide + // This constructor (unlike the other Mutation() constructor above) is presently never called multithreaded, + // so we just enforce that here. If that changes, it should start using the debug lock to detect races, as above. + THREAD_SAFETY_IN_ACTIVE_PARALLEL("Mutation::Mutation(): gSLiM_next_mutation_id change"); + + if (gSLiM_next_mutation_id <= mutation_id_) + gSLiM_next_mutation_id = mutation_id_ + 1; +} + // This constructor is used when making a new mutation with effects and dominances provided by the caller, *and* a mutation id provided by the caller Mutation::Mutation(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_effect_t p_selection_coeff, slim_effect_t p_dominance_coeff, slim_objectid_t p_subpop_index, slim_tick_t p_tick, int8_t p_nucleotide) : -mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), nucleotide_(p_nucleotide), mutation_id_(p_mutation_id) +mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_subpop_index), origin_tick_(p_tick), chromosome_index_(p_chromosome_index), state_(MutationState::kNewMutation), retained_by_treeseq_(false), nucleotide_(p_nucleotide), mutation_id_(p_mutation_id) { Species &species = mutation_type_ptr_->species_; const std::vector &traits = species.Traits(); diff --git a/core/mutation.h b/core/mutation.h index c09165d89..800c50c9d 100644 --- a/core/mutation.h +++ b/core/mutation.h @@ -35,6 +35,7 @@ class MutationType; class Trait; +struct MutationTableMetadataRec; class Mutation_Class; @@ -116,6 +117,18 @@ class Mutation : public EidosDictionaryRetained const slim_tick_t origin_tick_; // tick in which the mutation arose slim_chromosome_index_t chromosome_index_; // the (uint8_t) index of this mutation's chromosome int state_ : 4; // see MutationState above; 4 bits so we can represent -1 + unsigned int scratch_ : 4; // temporary scratch space for use by algorithms; regard as volatile outside your own code block + + // This flag indicates that the mutation has been retained by the species because it belonged to a haplosome + // that was remembered by treeSeqRememberIndividuals(), or because it was removed in script or by stacking + // policy (and might therefore still be referenced by the tree sequence). This flag is used by Species for + // bookkeeping. It is used only when tree-sequence recording is enabled. + unsigned int retained_by_treeseq_ : 1; + + // NOTE: there are three bits free here + + + // OPTIMIZATION FLAGS // is_neutral_for_all_traits_ is true if all mutation effects are 0.0 (note a callback might override this). // The state of is_neutral_for_all_traits_ is updated to reflect the state of the mutation when it changes. @@ -147,8 +160,8 @@ class Mutation : public EidosDictionaryRetained // if all effects for a mutation are neutral, this flag will be false. unsigned int independent_dominance_for_any_traits_ : 1; + int8_t nucleotide_; // the nucleotide being kept: A=0, C=1, G=2, T=3. -1 is used to indicate non-nucleotide-based. - int8_t scratch_; // temporary scratch space for use by algorithms; regard as volatile outside your own code block const slim_mutationid_t mutation_id_; // a unique id for each mutation, used to track mutations slim_usertag_t tag_value_; // a user-defined tag value @@ -158,7 +171,7 @@ class Mutation : public EidosDictionaryRetained #endif #if DEBUG - mutable slim_refcount_t refcount_CHECK_; // scratch space for checking of parallel refcounting + mutable slim_refcount_t refcount_CHECK_; // scratch space for checking of parallel refcounting #endif Mutation(const Mutation&) = delete; // no copying @@ -172,6 +185,9 @@ class Mutation : public EidosDictionaryRetained // FIXME MULTITRAIT: needs to take a whole vector of each, per trait! Mutation(MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_effect_t p_selection_coeff, slim_effect_t p_dominance_coeff, slim_objectid_t p_subpop_index, slim_tick_t p_tick, int8_t p_nucleotide); + // This constructor is used by tree-sequence reading in Species::__CreateMutationsFromTabulation() + Mutation(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, MutationTableMetadataRec *p_metadata_ptr); + // This constructor is used when making a new mutation with effects and dominances PROVIDED by the caller, AND a mutation id provided by the caller // FIXME MULTITRAIT: needs to take a whole vector of each, per trait! Mutation(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_effect_t p_selection_coeff, slim_effect_t p_dominance_coeff, slim_objectid_t p_subpop_index, slim_tick_t p_tick, int8_t p_nucleotide); diff --git a/core/mutation_block.h b/core/mutation_block.h index eb1f0491c..2a469e5ae 100644 --- a/core/mutation_block.h +++ b/core/mutation_block.h @@ -35,6 +35,7 @@ #include "mutation.h" +class Species; class Mutation; class MutationRun; diff --git a/core/mutation_run.cpp b/core/mutation_run.cpp index 032a39a8f..95e3a12e0 100644 --- a/core/mutation_run.cpp +++ b/core/mutation_run.cpp @@ -440,6 +440,8 @@ bool MutationRun::_EnforceStackPolicyForAddition(Mutation *p_mut_block_ptr, slim if ((mut_position == p_position) && (mut->mutation_type_ptr_->stack_group_ == p_stack_group)) { // The current scan position is a mutation that needs to be removed, so scan forward to skip copying it backward + // BCH 2/14/2026: this mutation would still be present in the tree sequence, so it needs to be retained by the species + mut->mutation_type_ptr_->species_.NotifyMutationRemoved(mut); continue; } else diff --git a/core/population.cpp b/core/population.cpp index ca0cd29c1..3e8255c23 100644 --- a/core/population.cpp +++ b/core/population.cpp @@ -7677,6 +7677,7 @@ void Population::RemoveAllFixedMutations(void) { // When doing tree recording, we additionally keep all fixed mutations (their ids) in a multimap indexed by their position // This allows us to find all the fixed mutations at a given position quickly and easily, for calculating derived states + // BCH 2/14/2026: We now also need to remove the original mutation from our retained mutations list if it is there for (int i = 0; i < fixed_mutation_accumulator_size; i++) { Mutation *mut_to_remove = mut_block_ptr + fixed_mutation_accumulator[i]; @@ -7685,6 +7686,10 @@ void Population::RemoveAllFixedMutations(void) species_.DoBaselineAccumulationForSubstitution(sub); treeseq_substitutions_map_.emplace(mut_to_remove->position_, sub); substitutions_.emplace_back(sub); + + // We just clear the flag here, and it is as if the mutation was actually removed from muts_retained_by_treeseq_; + // it will be garbage-collected the next time simplification occurs, so we don't have to search for it here + mut_to_remove->retained_by_treeseq_ = false; } } else diff --git a/core/slim_functions.cpp b/core/slim_functions.cpp index 53b10eb57..a228152c1 100644 --- a/core/slim_functions.cpp +++ b/core/slim_functions.cpp @@ -2595,21 +2595,10 @@ EidosValue_SP SLiM_ExecuteFunction_treeSeqMetadata(const std::vector(0) - EidosDictionaryRetained *objectElement = new EidosDictionaryRetained(); - EidosValue_SP result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Object(objectElement, gEidosDictionaryRetained_Class)); - - objectElement->Release(); // retained by result_SP - return result_SP; + EIDOS_TERMINATION << "ERROR (SLiM_ExecuteFunction_treeSeqMetadata): no metadata schema present in file " << file_path << "; a `json+struct` schema is required." << EidosTerminate(); } + // BCH 2/14/2026: As of SLiM 5.2, we read top-level metadata using the `json+struct` codec std::string metadata_schema_string(temp_tables.metadata_schema, temp_tables.metadata_schema_length); nlohmann::json metadata_schema; @@ -2621,10 +2610,33 @@ EidosValue_SP SLiM_ExecuteFunction_treeSeqMetadata(const std::vector(0) + EidosDictionaryRetained *objectElement = new EidosDictionaryRetained(); + EidosValue_SP result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Object(objectElement, gEidosDictionaryRetained_Class)); + + objectElement->Release(); // retained by result_SP + return result_SP; + } - std::string metadata_string(temp_tables.metadata, temp_tables.metadata_length); + std::string metadata_string(top_level_json_buffer, top_level_json_length); nlohmann::json metadata; tsk_table_collection_free(&temp_tables); diff --git a/core/slim_globals.cpp b/core/slim_globals.cpp index 45c8cf48f..5f6b68b6c 100644 --- a/core/slim_globals.cpp +++ b/core/slim_globals.cpp @@ -123,12 +123,17 @@ void SLiM_WarmUp(void) //std::cout << "sizeof(size_t) == " << sizeof(size_t) << std::endl; // Test that our tskit metadata schemas are valid JSON, and print them out formatted for debugging purposes if desired - nlohmann::json top_level_schema, edge_schema, site_schema, mutation_schema, node_schema, individual_schema, population_schema; + nlohmann::json top_level_JSON_schema, top_level_binary_schema, edge_schema, site_schema, mutation_schema, node_schema, individual_schema, population_schema; try { - top_level_schema = nlohmann::json::parse(gSLiM_tsk_metadata_schema); + top_level_JSON_schema = nlohmann::json::parse(gSLiM_tsk_metadata_JSON_schema); } catch (...) { - EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_schema must be a JSON string." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_JSON_schema must be a JSON string." << EidosTerminate(); + } + try { + top_level_binary_schema = nlohmann::json::parse(gSLiM_tsk_metadata_binary_schema_FORMAT); + } catch (...) { + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_binary_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { if (gSLiM_tsk_edge_metadata_schema.length()) @@ -143,7 +148,8 @@ void SLiM_WarmUp(void) EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_site_metadata_schema must be a JSON string." << EidosTerminate(); } try { - mutation_schema = nlohmann::json::parse(gSLiM_tsk_mutation_metadata_schema); + if (gSLiM_tsk_mutation_metadata_schema.length()) + mutation_schema = nlohmann::json::parse(gSLiM_tsk_mutation_metadata_schema); } catch (...) { EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_mutation_metadata_schema must be a JSON string." << EidosTerminate(); } @@ -153,9 +159,9 @@ void SLiM_WarmUp(void) EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_node_metadata_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { - individual_schema = nlohmann::json::parse(gSLiM_tsk_individual_metadata_schema); + individual_schema = nlohmann::json::parse(gSLiM_tsk_individual_metadata_schema_FORMAT); } catch (...) { - EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_individual_metadata_schema must be a JSON string." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_individual_metadata_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { population_schema = nlohmann::json::parse(gSLiM_tsk_population_metadata_schema); @@ -165,12 +171,13 @@ void SLiM_WarmUp(void) #if 0 #warning printing of JSON schemas should be disabled in a production build - std::cout << "gSLiM_tsk_metadata_schema == " << std::endl << top_level_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_metadata_JSON_schema == " << std::endl << top_level_JSON_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_metadata_binary_schema_FORMAT == " << std::endl << top_level_binary_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_edge_metadata_schema == " << std::endl << edge_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_site_metadata_schema == " << std::endl << site_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_mutation_metadata_schema == " << std::endl << mutation_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_node_metadata_schema_FORMAT == " << std::endl << node_schema.dump(4) << std::endl << std::endl; - std::cout << "gSLiM_tsk_individual_metadata_schema == " << std::endl << individual_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_individual_metadata_schema_FORMAT == " << std::endl << individual_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_population_metadata_schema == " << std::endl << population_schema.dump(4) << std::endl << std::endl; #endif } @@ -1775,59 +1782,67 @@ void SLiM_ConfigureContext(void) // See the pyslim code for readable versions of these. // For more info on schemas in tskit, see: https://tskit.dev/tskit/docs/stable/metadata.html#sec-metadata - -// BCH 11/7/2021: Since I have been needing to modify these here by hand, I have changed them into C++ raw string literals. -// I have also added some code in SLiM_WarmUp() that checks that the metadata schemas are all valid JSON, and optionally prints them. -// see https://stackoverflow.com/a/5460235/2752221 + +// Note that several schemas now require substitution of a string before they are used, because there are bits +// of the schemas that need to be customized depending on the context (number of chromosomes, number of traits). +// The spots where this substitutuon occurs are marked with things like "%d", "%d1", or "%d2", INCLUDING the +// quotes because the metadata strings given here need to be valid JSON to pass SLiM's internal checks. Those +// are then replaced, in some cases by a quoted string, in other cases by an integer. Schemas that contain +// substitution strings like this have a variable name here ending in "_FORMAT" to indicate that they cannot +// be used directly. + +// BCH 11/7/2021: Since I have been needing to modify these here by hand, I have changed them into C++ raw string +// literals. I have also added some code in SLiM_WarmUp() that checks that the metadata schemas are all valid +// JSON, and optionally prints them. See https://stackoverflow.com/a/5460235/2752221 + +// BCH 2/13/2026: Validation of schemas, beyond them simply being valid JSON, is also needed. In tskit this can +// only be done in python, which is inconvenient. The website https://jsonschema.dev validates schemas. Put +// in {"$schema":"http://json-schema.org/schema#"} in the left ("JSON Schema") panel, and copy/paste a schema +// from below into the right-hand panel to validate it. // This is a useful JSON editor: https://jsoneditoronline.org/ // BCH 12/9/2024: Added the new optional key `chromosomes`, providing an array of information about chromosomes (to support // multiple chromosomes), and the new required key `this_chromosome` specifying information about this file's chromosome. -const std::string gSLiM_tsk_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","codec":"json","examples":[{"SLiM":{"file_version":"0.9","name":"fox","description":"foxes on Catalina island","cycle":123,"tick":123,"model_type":"WF","this_chromosome":{"id":1,"index":0,"symbol":"1","name":"autosome_1","type":"A"},"chromosomes":[{"id":1,"symbol":"1","name":"autosome_1","type":"A"},{"id":35,"symbol":"MT","name":"mtDNA","type":"HF"}],"nucleotide_based":false,"separate_sexes":true,"spatial_dimensionality":"xy","spatial_periodicity":"x"}}],"properties":{"SLiM":{"description":"Top-level metadata for a SLiM tree sequence, file format version 0.9","properties":{"file_version":{"description":"The SLiM 'file format version' of this tree sequence.","type":"string"},"name":{"description":"The SLiM species name represented by this tree sequence.","type":"string"},"description":{"description":"A user-configurable description of the species represented by this tree sequence.","type":"string"},"cycle":{"description":"The 'SLiM cycle' counter when this tree sequence was recorded.","type":"integer"},"tick":{"description":"The 'SLiM tick' counter when this tree sequence was recorded.","type":"integer"},"model_type":{"description":"The model type used for the last part of this simulation (WF or nonWF).","enum":["WF","nonWF"],"type":"string"},"this_chromosome":{"description":"The chromosome represented by the tree sequence in this file.","properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"index":{"description":"The (zero-based) index of this chromosome in the chromosomes metadata array (if present), which should match the information given here.","type":"integer"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","index","symbol","type"],"type":"object"},"chromosomes":{"description":"The chromosomes represented by the collection of tree sequences, of which this tree sequence is one member.","items":{"properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","symbol","type"],"type":"object"},"type":"array"},"nucleotide_based":{"description":"Whether the simulation was nucleotide-based.","type":"boolean"},"separate_sexes":{"description":"Whether the simulation had separate sexes.","type":"boolean"},"spatial_dimensionality":{"description":"The spatial dimensionality of the simulation.","enum":["","x","xy","xyz"],"type":"string"},"spatial_periodicity":{"description":"The spatial periodicity of the simulation.","enum":["","x","y","z","xy","xz","yz","xyz"],"type":"string"},"stage":{"description":"The stage of the SLiM life cycle when this tree sequence was recorded.","type":"string"}},"required":["model_type","tick","file_version","spatial_dimensionality","spatial_periodicity","this_chromosome","separate_sexes","nucleotide_based"],"type":"object"}},"required":["SLiM"],"type":"object"})V0G0N"; +const std::string gSLiM_tsk_metadata_JSON_schema = +R"V0G0N({"codec":"json","examples":[{"SLiM":{"chromosomes":[{"id":1,"name":"autosome_1","symbol":"1","type":"A"},{"id":35,"name":"mtDNA","symbol":"MT","type":"HF"}],"cycle":123,"description":"foxes on Catalina island","file_version":"1.0","model_type":"WF","name":"fox","nucleotide_based":false,"separate_sexes":true,"spatial_dimensionality":"xy","spatial_periodicity":"x","this_chromosome":{"id":1,"index":0,"name":"autosome_1","symbol":"1","type":"A"},"tick":123,"traits":[{"index":0,"name":"simT","type":"multiplicative"}]}}],"properties":{"SLiM":{"description":"Top-level metadata for a SLiM tree sequence, file format version 1.0","properties":{"chromosomes":{"description":"The chromosomes represented by the collection of tree sequences, of which this tree sequence is one member.","items":{"properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","symbol","type"],"type":"object"},"type":"array"},"cycle":{"description":"The 'SLiM cycle' counter when this tree sequence was recorded.","type":"integer"},"description":{"description":"A user-configurable description of the species represented by this tree sequence.","type":"string"},"file_version":{"description":"The SLiM 'file format version' of this tree sequence.","type":"string"},"model_type":{"description":"The model type used for the last part of this simulation (WF or nonWF).","enum":["WF","nonWF"],"type":"string"},"name":{"description":"The SLiM species name represented by this tree sequence.","type":"string"},"nucleotide_based":{"description":"Whether the simulation was nucleotide-based.","type":"boolean"},"separate_sexes":{"description":"Whether the simulation had separate sexes.","type":"boolean"},"spatial_dimensionality":{"description":"The spatial dimensionality of the simulation.","enum":["","x","xy","xyz"],"type":"string"},"spatial_periodicity":{"description":"The spatial periodicity of the simulation.","enum":["","x","y","z","xy","xz","yz","xyz"],"type":"string"},"stage":{"description":"The stage of the SLiM life cycle when this tree sequence was recorded.","type":"string"},"this_chromosome":{"description":"The chromosome represented by the tree sequence in this file.","properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"index":{"description":"The (zero-based) index of this chromosome in the chromosomes metadata array (if present), which should match the information given here.","type":"integer"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","index","symbol","type"],"type":"object"},"tick":{"description":"The 'SLiM tick' counter when this tree sequence was recorded.","type":"integer"},"traits":{"description":"The traits defined for this tree sequence; each mutation and individual will have per-trait metadata.","items":{"properties":{"baselineAccumulation":{"description":"Whether the baseline offset includes accumulated effects from fixed (substituted) mutations.","type":"boolean"},"baselineOffset":{"description":"The baseline offset of the trait.","type":"number"},"directFitnessEffect":{"description":"Whether the trait's effects are used directly as fitness effects.","type":"boolean"},"index":{"description":"The integer index for the trait; indices must be sequential starting from zero.","type":"integer"},"individualOffsetMean":{"description":"The mean of the trait's individual offset distribution (which might or might not be used).","type":"number"},"individualOffsetSD":{"description":"The standard deviation of the trait's individual offset distribution (which might or might not be used).","type":"number"},"name":{"description":"The string name for the trait.","type":"string"},"type":{"description":"The type of the trait; this must be 'additive', 'multiplicative', or 'logistic'.","enum":["additive","multiplicative","logistic"],"type":"string"}},"required":["index","name","type"],"type":"object"},"type":"array"}},"required":["model_type","tick","file_version","spatial_dimensionality","spatial_periodicity","this_chromosome","separate_sexes","nucleotide_based","traits"],"type":"object"}},"required":["SLiM"],"type":"object"})V0G0N"; + +// Note the substituted strings "%d1" (the number of padding bytes used to produce 8-byte alignment of this +// binary metadata in the json_binary codec) and "%d2" (the number of traits in the focal species). +const std::string gSLiM_tsk_metadata_binary_schema_FORMAT = +R"V0G0N({"codec":"struct","description":"SLiM schema for binary top-level metadata.","properties":{"aligner":{"binaryFormat":"%d1","index":1,"type":"null"},"mutation_list":{"index":2,"items":{"additionalProperties":false,"properties":{"mutation_id":{"binaryFormat":"q","description":"The SLiM mutation ID for this mutation.","index":1,"type":"integer"},"mutation_type":{"binaryFormat":"i","description":"The id of this mutation's mutationType.","index":2,"type":"integer"},"nucleotide":{"binaryFormat":"b","description":"The nucleotide for this mutation (0=A , 1=C , 2=G, 3=T, or -1 for none)","index":5,"type":"integer"},"padding":{"binaryFormat":"3x","description":"Padding bytes for alignment","index":6,"type":"null"},"per_trait":{"index":7,"items":{"additionalProperties":false,"properties":{"dominance":{"binaryFormat":"f","description":"The dominance coefficient for this trait.","index":2,"type":"number"},"effect_size":{"binaryFormat":"f","description":"The effect size for this trait.","index":1,"type":"number"},"hemizygous_dominance":{"binaryFormat":"f","description":"The hemizygous dominance coefficient for this trait.","index":3,"type":"number"}},"required":["dominance","effect_size","hemizygous_dominance"],"type":"object"},"length":"%d2","type":"array"},"slim_time":{"binaryFormat":"i","description":"The SLiM tick counter when this mutation occurred.","index":4,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation this mutation occurred in.","index":3,"type":"integer"}},"required":["mutation_id","mutation_type","slim_time","subpopulation","nucleotide","per_trait"],"type":"object"},"noLengthEncodingExhaustBuffer":true,"type":"array"}},"required":["aligner","mutation_list"],"type":"object"})V0G0N"; const std::string gSLiM_tsk_edge_metadata_schema = ""; const std::string gSLiM_tsk_site_metadata_schema = ""; - -const std::string gSLiM_tsk_mutation_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for mutation metadata.","examples":[{"mutation_list":[{"mutation_type":1,"nucleotide":3,"selection_coeff":-0.2,"slim_time":243,"subpopulation":0}]}],"properties":{"mutation_list":{"items":{"additionalProperties":false,"properties":{"mutation_type":{"binaryFormat":"i","description":"The index of this mutation's mutationType.","index":1,"type":"integer"},"nucleotide":{"binaryFormat":"b","description":"The nucleotide for this mutation (0=A , 1=C , 2=G, 3=T, or -1 for none)","index":5,"type":"integer"},"selection_coeff":{"binaryFormat":"f","description":"This mutation's selection coefficient.","index":2,"type":"number"},"slim_time":{"binaryFormat":"i","description":"The SLiM tick counter when this mutation occurred.","index":4,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation this mutation occurred in.","index":3,"type":"integer"}},"required":["mutation_type","selection_coeff","subpopulation","slim_time","nucleotide"],"type":"object"},"noLengthEncodingExhaustBuffer":true,"type":"array"}},"required":["mutation_list"],"type":"object"})V0G0N"; - -// BCH 12/10/2024: Removed the type field, and changed the treatment of is_vacant. We have a -// tricky problem here, which is that is_vacant is now variable-length and there is no count. -// The number of byte (uint8_t) entries in is_vacant depends on the number of chromosomes in -// the full set of tree sequences, because the node metadata has to contain flags (bits) for -// every chromosome, not just for the chromosome represented by this file. So we deduce the -// length of is_vacant from that, but it is variable-length and has no count associated with -// it in the metadata. I think this is actually not allowed in JSON Schema, understandably. -// To make this work, we have to write out a DIFFERENT VERSION OF THIS METADATA SCHEMA -// depending on the number of bytes used. In other words, if 7 bytes of is_vacant data are -// needed (for 49-56 chromosomes), we'd write out a version of the schema that specifies -// 7 bytes of is_vacant data using binaryFormat:7B. This effectively puts the count into the -// schema itself. The number of bytes present can thus be inferred from the schema present -// in the file, but also from the 'chromosomes' top-level metadata key; one bit is taken -// for each chromosome, in order, regardless of their type, providing flags for one node -// table entry for one haplosome of each chromosome. (Remember, there are two node table -// entries per individual; the first corresponds to haplosome 1, so its is_vacant data only -// records is_vacant flags for haplosome 1 of each chromosome, and similarly for the second -// node table entry corresponding to haplosome 2 of each chromosome.) The variable name here -// ends in "_FORMAT" because it is a format string containing `%d`, which must be replaced -// by the correct byte count when it is used for output. See SetCurrentNewIndividual() and -// RecordNewHaplosome() for how this dynamic metadata structure is used in practice, and -// WriteTreeSequenceMetadata() for where this schema format string is used. +const std::string gSLiM_tsk_mutation_metadata_schema = ""; // this is now managed by gSLiM_tsk_metadata_binary_schema in top-level metadata + +// BCH 12/10/2024: Removed the type field, and changed the treatment of is_vacant. We have a tricky problem +// here, which is that is_vacant is now variable-length and there is no count. The number of byte (uint8_t) +// entries in is_vacant depends on the number of chromosomes in the full set of tree sequences, because the node +// metadata has to contain flags (bits) for every chromosome, not just for the chromosome represented by this +// file. So we deduce the length of is_vacant from that, but it is variable-length and has no count associated +// with it in the metadata. This is not allowed in JSON Schema, understandably. To make this work, we have to +// write out a DIFFERENT VERSION OF THIS METADATA SCHEMA depending on the number of bytes used. In other words, +// if 7 bytes of is_vacant data are needed (for 49-56 chromosomes), we'd write out a version of the schema that\ +// specifies 7 bytes of is_vacant data using binaryFormat:7B. This effectively puts the count into the schema +// itself. The number of bytes present can thus be inferred from the schema present in the file, but also from +// the 'chromosomes' top-level metadata key; one bit is taken for each chromosome, in order, regardless of their +// type, providing flags for one node table entry for one haplosome of each chromosome. (Remember, there are two +// node table entries per individual; the first corresponds to haplosome 1, so its is_vacant data only records +// is_vacant flags for haplosome 1 of each chromosome, and similarly for the second node table entry corresponding +// to haplosome 2 of each chromosome.) See SetCurrentNewIndividual() and RecordNewHaplosome() for how this +// dynamic metadata structure is used in practice, and WriteTreeSequenceMetadata() for where this schema is used. // -// BCH 4/10/2025: Changing from binary format 's' to 'B', using the new support for fixed- -// length arrays in tskit: https://github.com/tskit-dev/tskit/issues/3088. This has been -// released in tskit version Python 0.6.1. The new syntax for declaring a fixed-length -// array is: {"type": "array", "length": 3, "items": {"type":"number", "binaryFormat":"B"}}. -// The length, 3 here, is encoded with "%d" and replaced at runtime with the correct count. -// (The string replaced is "%d" *including* the quotes, because the format string needs to -// itself be a legal JSON string in order to pass SLiM's own internal checks, so beware.) +// BCH 4/10/2025: Changing from binary format 's' to 'B', using the new support for fixed-length arrays in tskit: +// https://github.com/tskit-dev/tskit/issues/3088 (released in tskit version Python 0.6.1). The new syntax for +// a fixed-length array is: {"type": "array", "length": 3, "items": {"type":"number", "binaryFormat":"B"}}. The +// length, 3 here, is encoded with "%d" and replaced at runtime with the correct count. const std::string gSLiM_tsk_node_metadata_schema_FORMAT = -R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for node metadata.","examples":[{"is_vacant":0,"slim_id":123}],"properties":{"is_vacant":{"description":"A vector of byte (uint8_t) values, with each bit representing whether the node represents a vacant position, either unused or a null haplosome (1), or a non-null haplosome (0), in the corresponding chromosome. This field encodes vacancy for all of the chromosomes in the model, not just the chromosome represented in this file (so that the node table is identical across all chromosomes for a multi-chromosome model). Each chromosome receives one bit here; there are two node table entries per individual, used for the two haplosomes of every chromosome, so only one bit is needed in each entry (making two bits total per chromosome, across the two node table entries). The least significant bit of the first byte is used first (for one haplosome of the first chromosome); the most significant bit of the last byte is used last. The number of bytes present in this field is indicated by this schema's 'binaryFormat' field, which is variable (!), and can also be deduced from the number of chromosomes in the model as given in the top-level 'chromosomes' metadata key, which should always be present if this metadata is present.","index":1,"items":{"binaryFormat":"B","type":"number"},"length":"%d","type":"array"},"slim_id":{"binaryFormat":"q","description":"The 'pedigree ID' of the haplosomes associated with this node in SLiM.","index":0,"type":"integer"}},"required":["slim_id","is_vacant"],"type":["object","null"]})V0G0N"; +R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for node metadata.","examples":[{"is_vacant":0,"slim_id":123}],"properties":{"is_vacant":{"description":"A vector of byte (uint8_t) values, with each bit representing whether the node represents a vacant position, either unused or a null haplosome (1), or a non-null haplosome (0), in the corresponding chromosome. This field encodes vacancy for all of the chromosomes in the model, not just the chromosome represented in this file (so that the node table is identical across all chromosomes for a multi-chromosome model). Each chromosome receives one bit here; there are two node table entries per individual, used for the two haplosomes of every chromosome, so only one bit is needed in each entry (making two bits total per chromosome, across the two node table entries). The least significant bit of the first byte is used first (for one haplosome of the first chromosome); the most significant bit of the last byte is used last. The number of bytes present in this field is indicated by this schema's 'binaryFormat' field, which is variable (!), and can also be deduced from the number of chromosomes in the model as given in the top-level 'chromosomes' metadata key, which should always be present if this metadata is present.","index":2,"items":{"binaryFormat":"B","type":"number"},"length":"%d","type":"array"},"slim_id":{"binaryFormat":"q","description":"The 'pedigree ID' of the haplosomes associated with this node in SLiM.","index":1,"type":"integer"}},"required":["slim_id","is_vacant"],"type":["object","null"]})V0G0N"; -const std::string gSLiM_tsk_individual_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for individual metadata.","examples":[{"age":-1,"flags":0,"pedigree_id":123,"pedigree_p1":12,"pedigree_p2":23,"sex":0,"subpopulation":0}],"flags":{"SLIM_INDIVIDUAL_METADATA_MIGRATED":{"description":"Whether this individual was a migrant, either in the tick when the tree sequence was written out (if the individual was alive then), or in the tick of the last time they were Remembered (if not).","value":1}},"properties":{"age":{"binaryFormat":"i","description":"The age of this individual, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":4,"type":"integer"},"flags":{"binaryFormat":"I","description":"Other information about the individual: see 'flags'.","index":7,"type":"integer"},"pedigree_id":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual in SLiM.","index":1,"type":"integer"},"pedigree_p1":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's first parent in SLiM.","index":2,"type":"integer"},"pedigree_p2":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's second parent in SLiM.","index":3,"type":"integer"},"sex":{"binaryFormat":"i","description":"The sex of the individual (0 for female, 1 for male, -1 for hermaphrodite).","index":6,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation the individual was part of, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":5,"type":"integer"}},"required":["pedigree_id","pedigree_p1","pedigree_p2","age","subpopulation","sex","flags"],"type":"object"})V0G0N"; +// Note the substitute string "%d" (the number of traits in the focal species). +const std::string gSLiM_tsk_individual_metadata_schema_FORMAT = +R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for individual metadata.","examples":[{"age":-1,"flags":0,"pedigree_id":123,"pedigree_p1":12,"pedigree_p2":23,"sex":0,"subpopulation":0,"per_trait":[{"offset":1.0,"phenotype":1.1}]}],"flags":{"SLIM_INDIVIDUAL_METADATA_MIGRATED":{"description":"Whether this individual was a migrant, either in the tick when the tree sequence was written out (if the individual was alive then), or in the tick of the last time they were Remembered (if not).","value":1}},"properties":{"age":{"binaryFormat":"i","description":"The age of this individual, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":4,"type":"integer"},"flags":{"binaryFormat":"I","description":"Other information about the individual: see 'flags'.","index":7,"type":"integer"},"pedigree_id":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual in SLiM.","index":1,"type":"integer"},"pedigree_p1":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's first parent in SLiM.","index":2,"type":"integer"},"pedigree_p2":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's second parent in SLiM.","index":3,"type":"integer"},"per_trait":{"index":8,"items":{"additionalProperties":false,"properties":{"offset":{"binaryFormat":"d","description":"The individual offset for this trait.","index":2,"type":"number"},"phenotype":{"binaryFormat":"d","description":"The phenotype for this trait.","index":1,"type":"number"}},"required":["offset","phenotype"],"type":"object"},"length":"%d","type":"array"},"sex":{"binaryFormat":"i","description":"The sex of the individual (0 for female, 1 for male, -1 for hermaphrodite).","index":6,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation the individual was part of, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":5,"type":"integer"}},"required":["pedigree_id","pedigree_p1","pedigree_p2","age","subpopulation","sex","flags","per_trait"],"type":"object"})V0G0N"; // This schema was obsoleted in SLiM 3.7; we now use a JSON schema for the population metadata (see below) const std::string gSLiM_tsk_population_metadata_schema_PREJSON = diff --git a/core/slim_globals.h b/core/slim_globals.h index ff875fc64..694cff71d 100644 --- a/core/slim_globals.h +++ b/core/slim_globals.h @@ -683,12 +683,14 @@ enum class IndDomCacheIndex : slim_trait_index_t {}; #define SLIM_TSK_INDIVIDUAL_REMEMBERED ((tsk_flags_t)(1 << 17)) #define SLIM_TSK_INDIVIDUAL_RETAINED ((tsk_flags_t)(1 << 18)) -extern const std::string gSLiM_tsk_metadata_schema; +extern const std::string gSLiM_tsk_metadata_JSON_schema; +extern const std::string gSLiM_tsk_metadata_binary_schema_FORMAT; + extern const std::string gSLiM_tsk_edge_metadata_schema; extern const std::string gSLiM_tsk_site_metadata_schema; extern const std::string gSLiM_tsk_mutation_metadata_schema; extern const std::string gSLiM_tsk_node_metadata_schema_FORMAT; -extern const std::string gSLiM_tsk_individual_metadata_schema; +extern const std::string gSLiM_tsk_individual_metadata_schema_FORMAT; extern const std::string gSLiM_tsk_population_metadata_schema_PREJSON; // before SLiM 3.7 extern const std::string gSLiM_tsk_population_metadata_schema; diff --git a/core/species.cpp b/core/species.cpp index 0598823e9..0fa83428c 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -50,6 +50,7 @@ #include #include #include +#include #include "eidos_globals.h" #if EIDOS_ROBIN_HOOD_HASHING() @@ -89,7 +90,8 @@ static const char *SLIM_TREES_FILE_VERSION_META = "0.5"; // SLiM 3.5.x onward, static const char *SLIM_TREES_FILE_VERSION_PREPARENT = "0.6"; // SLiM 3.6.x onward, with SLIM_TSK_INDIVIDUAL_RETAINED instead of SLIM_TSK_INDIVIDUAL_FIRST_GEN static const char *SLIM_TREES_FILE_VERSION_PRESPECIES = "0.7"; // SLiM 3.7.x onward, with parent pedigree IDs in the individuals table metadata static const char *SLIM_TREES_FILE_VERSION_SPECIES = "0.8"; // SLiM 4.0.x onward, with species `name`/`description`, and `tick` in addition to `cycle` -static const char *SLIM_TREES_FILE_VERSION = "0.9"; // SLiM 5.0 onward, for multichrom (haplosomes not genomes, and `chromosomes` key) +static const char *SLIM_TREES_FILE_VERSION_MULTICHROM = "0.9"; // SLiM 5.0 onward, for multichrom (haplosomes not genomes, and `chromosomes` key) +static const char *SLIM_TREES_FILE_VERSION = "1.0"; // SLiM 5.2 onward, for multitrait (per-trait metadata, `traits` key) #pragma mark - #pragma mark Species @@ -127,6 +129,17 @@ Species::~Species(void) DeleteAllMutationRuns(); + // release mutations retained by the tree sequence + if (muts_retained_by_treeseq_.size()) + { + Mutation *mut_block_ptr = mutation_block_->mutation_buffer_; + + for (MutationIndex mut_index : muts_retained_by_treeseq_) + (mut_block_ptr + mut_index)->Release(); + + muts_retained_by_treeseq_.clear(); + } + for (auto mutation_type : mutation_types_) delete mutation_type.second; for (auto &element : mutation_types_) @@ -6644,6 +6657,8 @@ void Species::AddParentsColumnForOutput(tsk_table_collection_t *p_tables, INDIVI for (tsk_size_t individual_index = 0; individual_index < num_rows; individual_index++) { tsk_id_t tsk_individual = (tsk_id_t)individual_index; + + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(p_tables->individuals.metadata + p_tables->individuals.metadata_offset[tsk_individual]); slim_pedigreeid_t pedigree_p1 = metadata_rec->pedigree_p1_; slim_pedigreeid_t pedigree_p2 = metadata_rec->pedigree_p2_; @@ -6700,6 +6715,7 @@ void Species::BuildTabledIndividualsHash(tsk_table_collection_t *p_tables, INDIV for (tsk_size_t individual_index = 0; individual_index < num_rows; individual_index++) { + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(metadata_base + metadata_offset[individual_index]); slim_pedigreeid_t pedigree_id = metadata_rec->pedigree_id_; tsk_id_t tsk_individual = (tsk_id_t)individual_index; @@ -7344,6 +7360,88 @@ void Species::SimplifyAllTreeSequences(void) // and reset our elapsed time since last simplification, for auto-simplification simplify_elapsed_ = 0; + // after simplification, we check for any retained mutations that are no longer refenced by the tree sequence + // and remove their retains; this can happen for mutations that were removed by script or stacking policy, if + // all branches they were on were simplified away, and can also happen for mutations that were in remembered + // individuals if they were remembered with permanent=F; this shouldn't take too long and will save memory, + // but there is no real harm to not doing it besides wasted memory/disk, so to save effort we skip it if + // there are fewer than 10,000 retained mutations -- about 360K of table space for a one-trait model. + if (any_muts_retained_impermanently_ && (muts_retained_by_treeseq_.size() > 10000)) + { + Mutation *mut_block_ptr = mutation_block_->mutation_buffer_; + std::unordered_map mutid_to_mutation; + + // first mark all retained mutations 0, and build a hash table to look them up by mutation ID + for (MutationIndex mut_index : muts_retained_by_treeseq_) + { + Mutation *mut = mut_block_ptr + mut_index; + + mut->scratch_ = 0; + + // if retained_by_treeseq_ is false, the mutation is not considered retained even though it + // is in the list; it has been marked for removal, probably because it was substituted + // in that case, we don't add it to the hash table, so it will be swept at the end + if (mut->retained_by_treeseq_) + mutid_to_mutation.insert(std::pair(mut->mutation_id_, mut)); + } + + // then scan through derived states, look up referenced mutations by mutation ID, and mark them 1 + for (Chromosome *chromosome : chromosomes_) + { + slim_chromosome_index_t chromosome_index = chromosome->Index(); + TreeSeqInfo &chromosome_tsinfo = treeseq_[chromosome_index]; + tsk_table_collection_t &chromosome_tables = chromosome_tsinfo.tables_; + slim_mutationid_t *derived_states = (slim_mutationid_t *)chromosome_tables.mutations.derived_state; + tsk_size_t derived_state_length = chromosome_tables.mutations.derived_state_length; + +#if DEBUG + if (derived_state_length % sizeof(slim_mutationid_t) != 0) + EIDOS_TERMINATION << "ERROR (Species::SimplifyAllTreeSequences): (internal error) derived state length is not a multiple of the mutation id size." << EidosTerminate(); +#endif + + size_t derived_state_count = derived_state_length / sizeof(slim_mutationid_t); + + for (size_t derived_state_index = 0; derived_state_index < derived_state_count; ++derived_state_index) + { + slim_mutationid_t mut_id = derived_states[derived_state_index]; + auto iter = mutid_to_mutation.find(mut_id); + + // the mutation referenced by the derived state is not retained; unsurprising + if (iter == mutid_to_mutation.end()) + continue; + + // mark the retained mutation 1, since it is referenced by a derived state + Mutation *retained_mutation = iter->second; + + retained_mutation->scratch_ = 1; + } + } + + // unmarked mutations can be released and compacted out of muts_retained_by_treeseq_ + size_t retained_count = muts_retained_by_treeseq_.size(); + size_t last_valid_index = retained_count - 1; + + for (size_t retained_index = 0; retained_index < retained_count; ) + { + MutationIndex mut_index = muts_retained_by_treeseq_[retained_index]; + Mutation *mut = mut_block_ptr + mut_index; + + if (mut->scratch_ == 0) + { + // this element is no longer needed, so we backfill from the end and repeat this index + muts_retained_by_treeseq_[retained_index] = muts_retained_by_treeseq_[last_valid_index]; + last_valid_index--; + retained_count--; + mut->retained_by_treeseq_ = false; + mut->Release(); + continue; + } + + // this element is needed, so we don't need to touch it, we just move on + retained_index++; + } + } + // as a side effect of simplification, update a "model has coalesced" flag that the user can consult, if requested // this could potentially be parallelized, but it's kind of a fringe feature, and not that slow... if (running_coalescence_checks_) @@ -7919,17 +8017,10 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ THREAD_SAFETY_IN_ACTIVE_PARALLEL("Species::RecordNewDerivedState(): usage of statics"); static std::vector derived_mutation_ids; - static std::vector mutation_metadata; - MutationMetadataRec metadata_rec; derived_mutation_ids.resize(0); - mutation_metadata.resize(0); for (Mutation *mutation : p_derived_mutations) - { derived_mutation_ids.emplace_back(mutation->mutation_id_); - MetadataForMutation(mutation, &metadata_rec); - mutation_metadata.emplace_back(metadata_rec); - } // find and incorporate any fixed mutations at this position, which exist in all new derived states but are not included by SLiM // BCH 5/14/2019: Note that this means that derived states will be recorded that look "stacked" even when those mutations would @@ -7945,8 +8036,6 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ Substitution *substitution = position_iter->second; derived_mutation_ids.emplace_back(substitution->mutation_id_); - MetadataForSubstitution(substitution, &metadata_rec); - mutation_metadata.emplace_back(metadata_rec); } // check for time consistency, using the shared node table in treeseq_[0]; this used to be a DEBUG check, but @@ -7962,13 +8051,11 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ // add the mutation table row with the final derived state and metadata char *derived_muts_bytes = (char *)(derived_mutation_ids.data()); size_t derived_state_length = derived_mutation_ids.size() * sizeof(slim_mutationid_t); - char *mutation_metadata_bytes = (char *)(mutation_metadata.data()); - size_t mutation_metadata_length = mutation_metadata.size() * sizeof(MutationMetadataRec); int ret = tsk_mutation_table_add_row(&tsinfo.tables_.mutations, site_id, haplosomeTSKID, TSK_NULL, time, derived_muts_bytes, (tsk_size_t)derived_state_length, - mutation_metadata_bytes, (tsk_size_t)mutation_metadata_length); + NULL, (tsk_size_t)0); if (ret < 0) handle_error("tsk_mutation_table_add_row", ret); } @@ -8204,8 +8291,13 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n location.emplace_back(ind->spatial_y_); location.emplace_back(ind->spatial_z_); - IndividualMetadataRec metadata_rec; - MetadataForIndividual(ind, &metadata_rec); + // the IndividualMetadataRec struct is now variable-size; we want to make a struct of the appropriate + // size for the amount of per-trait metadata that will be present for individuals of this species + slim_trait_index_t trait_count = TraitCount(); + size_t total_metadata_size = sizeof(IndividualMetadataRec) + sizeof(_IndividualPerTraitMetadata) * (trait_count - 1); + uint8_t metadata_buffer[total_metadata_size]; // assumes compiler extension for variable-size stack allocation + IndividualMetadataRec &metadata_rec = *(IndividualMetadataRec *)metadata_buffer; + MetadataForIndividual(ind, &metadata_rec); // requires a metadata record of the appropriate size // do a fast lookup to see whether this individual is already in the individuals table auto ind_pos = p_individuals_hash->find(ped_id); @@ -8215,7 +8307,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n tsk_id_t tsk_individual = tsk_individual_table_add_row(&p_tables->individuals, p_flags, location.data(), (uint32_t)location.size(), NULL, 0, // individual parents - (char *)&metadata_rec, (uint32_t)sizeof(IndividualMetadataRec)); + (char *)&metadata_rec, (uint32_t)total_metadata_size); if (tsk_individual < 0) handle_error("tsk_individual_table_add_row", tsk_individual); // Add the new individual to our hash table, for fast lookup as done above @@ -8242,7 +8334,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n && (location.size() == (p_tables->individuals.location_offset[tsk_individual + 1] - p_tables->individuals.location_offset[tsk_individual])) - && (sizeof(IndividualMetadataRec) + && (total_metadata_size == (p_tables->individuals.metadata_offset[tsk_individual + 1] - p_tables->individuals.metadata_offset[tsk_individual]))); @@ -8262,7 +8354,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n location.data(), location.size() * sizeof(double)); memcpy(p_tables->individuals.metadata + p_tables->individuals.metadata_offset[tsk_individual], - &metadata_rec, sizeof(IndividualMetadataRec)); + &metadata_rec, total_metadata_size); p_tables->individuals.flags[tsk_individual] |= p_flags; // Check node table @@ -8558,13 +8650,18 @@ void Species::WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosD ////// // Top-level (tree sequence) metadata: - // In the future, we might need to *add* to the metadata *and also* the schema, - // leaving other keys that might already be there. - // But that's being a headache, so we're skipping it. + // + // SLiM now writes top-level metadata with the new "json+struct" codec. So we assemble a JSON string and + // a binary struct, and put them together with a magic byte string, a version number, a header, and schemas. + // See https://github.com/tskit-dev/tskit/pull/3306 for the PR for the json+struct codec and discussion. + // + // NOTE: In the future, we might need to *add* to the metadata *and also* the schema, leaving other keys + // that might already be there. But that's being a headache, so we're skipping it. + // First generate our JSON string, as we did in SLiM 5.1 and earlier: // BCH 3/9/2025: This is now wrapped in try/catch because the JSON library might raise, especially if it dislikes // the model string we try to put in metadata for p_include_model; see https://github.com/MesserLab/SLiM/issues/488 - std::string new_metadata_str; + std::string metadata_JSON_str; try { nlohmann::json metadata; @@ -8666,59 +8763,346 @@ void Species::WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosD } } - new_metadata_str = metadata.dump(); + metadata["SLiM"]["traits"] = nlohmann::json::array(); + for (Trait *trait : traits_) + { + nlohmann::json trait_info; + + trait_info["index"] = trait->Index(); + trait_info["name"] = trait->Name(); + + if (trait->HasLogisticPostTransform()) + trait_info["type"] = "logistic"; + else if (trait->Type() == TraitType::kAdditive) + trait_info["type"] = "additive"; + else + trait_info["type"] = "multiplicative"; + + trait_info["baselineOffset"] = trait->BaselineOffset(); + trait_info["baselineAccumulation"] = trait->HasBaselineAccumulation(); + + trait_info["individualOffsetMean"] = trait->IndividualOffsetDistributionMean(); + trait_info["individualOffsetSD"] = trait->IndividualOffsetDistributionSD(); + + trait_info["directFitnessEffect"] = trait->HasDirectFitnessEffect(); + + metadata["SLiM"]["traits"].push_back(trait_info); + } + + metadata_JSON_str = metadata.dump(); } catch (const std::exception &e) { EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): a JSON string could not be generated for tree-sequence metadata due to an error: '" << e.what() << "'." << EidosTerminate(); } catch (...) { EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): a JSON string could not be generated for tree-sequence metadata due to an unknown error." << EidosTerminate(); } - ret = tsk_table_collection_set_metadata( - p_tables, new_metadata_str.c_str(), (tsk_size_t)new_metadata_str.length()); + // Then figure out how large our binary data will be (but don't generate the data yet; we will write it + // directly into the buffer that we allocate below, to avoid copying and increased memory overhead) + int registry_size; + const MutationIndex *register_iter = population_.MutationRegistry(®istry_size); + + const std::vector substitutions = population_.substitutions_; + size_t substitution_count = substitutions.size(); + + size_t retained_muts_count = muts_retained_by_treeseq_.size(); + + size_t estimated_row_count = registry_size + substitution_count + retained_muts_count; + slim_trait_index_t trait_count = TraitCount(); + size_t size_for_additional_trait_info = sizeof(_MutationPerTraitMetadata) * (trait_count - 1); + size_t size_for_one_muttable_row = sizeof(MutationTableMetadataRec) + size_for_additional_trait_info; + size_t estimated_mutation_table_size = estimated_row_count * size_for_one_muttable_row; + + // Then assemble those two components into a json+struct metadata chunk: + const uint8_t TSK_JSON_BINARY_MAGIC[4] = { 'J', 'B', 'L', 'B' }; + size_t header_length = 4 + 1 + 8 + 8; + size_t json_length = metadata_JSON_str.length(); + size_t estimated_binary_length = estimated_mutation_table_size; + + // Insert null bytes as needed to align the binary chunk to an 8-byte boundary + size_t aligner_length = (8 - ((header_length + json_length) & 0x07)) % 8; + + size_t estimated_total_length = header_length + json_length + aligner_length + estimated_binary_length; + uint8_t *metadata_buffer = (uint8_t *)malloc(estimated_total_length); + + if (!metadata_buffer) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): allocation failed; you may need to raise the memory limit for SLiM." << EidosTerminate(); + + metadata_buffer[0] = TSK_JSON_BINARY_MAGIC[0]; + metadata_buffer[1] = TSK_JSON_BINARY_MAGIC[1]; + metadata_buffer[2] = TSK_JSON_BINARY_MAGIC[2]; + metadata_buffer[3] = TSK_JSON_BINARY_MAGIC[3]; + + metadata_buffer[4] = 1; // version number for the json_blob codec + + memcpy(metadata_buffer + header_length, metadata_JSON_str.data(), json_length); + + for (size_t aligner_index = 0; aligner_index < aligner_length; ++aligner_index) + metadata_buffer[header_length + json_length + aligner_index] = 0; + + // Write mutation metadata into the binary section + uint8_t *base_row_pointer = metadata_buffer + header_length + json_length + aligner_length; + uint8_t *row_pointer = base_row_pointer; + + // First we write substitutions; these are guaranteed not to be in the muts_retained_by_treeseq_ vector + for (Substitution *sub : substitutions) + { + MutationTableMetadataRec *metadata_row = (MutationTableMetadataRec *)row_pointer; + + metadata_row->mutation_id_ = sub->mutation_id_; + metadata_row->mutation_type_id_ = sub->mutation_type_ptr_->mutation_type_id_; + metadata_row->subpop_index_ = sub->subpop_index_; + metadata_row->origin_tick_ = sub->origin_tick_; + metadata_row->nucleotide_ = sub->nucleotide_; + + metadata_row->unused_[0] = 0; + metadata_row->unused_[1] = 0; + metadata_row->unused_[2] = 0; + + SubstitutionTraitInfo *sub_trait_info_array = sub->trait_info_; + _MutationPerTraitMetadata *metadata_trait_info_array = metadata_row->per_trait_; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + SubstitutionTraitInfo *sub_trait_info = sub_trait_info_array + trait_index; + _MutationPerTraitMetadata *metadata_trait_info = metadata_trait_info_array + trait_index; + + metadata_trait_info->effect_size_ = sub_trait_info->effect_size_; + metadata_trait_info->dominance_coeff_ = sub_trait_info->dominance_coeff_UNSAFE_; + metadata_trait_info->hemizygous_dominance_coeff_ = sub_trait_info->hemizygous_dominance_coeff_; + } + + row_pointer += size_for_one_muttable_row; + } + + // Then we write mutations that were retained for us, because they were associated with individuals that + // were remembered, or because they were removed in script or by stacking policy; these mutations might + // no longer be in SLiM's mutation registry, but they need to be in the mutation metadata table. + int retained_count = (int)muts_retained_by_treeseq_.size(); + + for (int retained_index = 0; retained_index < retained_count; ++retained_index) + { + MutationTableMetadataRec *metadata_row = (MutationTableMetadataRec *)row_pointer; + MutationIndex mut_index = muts_retained_by_treeseq_[retained_index]; + Mutation *mut = mutation_block_->MutationForIndex(mut_index); + + // Mutations can be in muts_retained_by_treeseq_ and yet no longer be retained; we skip them. + // This occurs if a mutation is turned into a substitution; a retain is then no longer needed, + // and here we treat the mutation as if it was not in muts_retained_by_treeseq_ to begin with. + if (!mut->retained_by_treeseq_) + continue; + + metadata_row->mutation_id_ = mut->mutation_id_; + metadata_row->mutation_type_id_ = mut->mutation_type_ptr_->mutation_type_id_; + metadata_row->subpop_index_ = mut->subpop_index_; + metadata_row->origin_tick_ = mut->origin_tick_; + metadata_row->nucleotide_ = mut->nucleotide_; + + metadata_row->unused_[0] = 0; + metadata_row->unused_[1] = 0; + metadata_row->unused_[2] = 0; + + MutationTraitInfo *mut_trait_info_array = mutation_block_->TraitInfoForIndex(mut_index); + _MutationPerTraitMetadata *metadata_trait_info_array = metadata_row->per_trait_; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + MutationTraitInfo *mut_trait_info = mut_trait_info_array + trait_index; + _MutationPerTraitMetadata *metadata_trait_info = metadata_trait_info_array + trait_index; + + metadata_trait_info->effect_size_ = mut_trait_info->effect_size_; + metadata_trait_info->dominance_coeff_ = mut_trait_info->dominance_coeff_UNSAFE_; + metadata_trait_info->hemizygous_dominance_coeff_ = mut_trait_info->hemizygous_dominance_coeff_; + } + + row_pointer += size_for_one_muttable_row; + } + + // Finally we write mutations in the registry, as long as they were not already written + for (int registry_index = 0; registry_index < registry_size; ++registry_index) + { + MutationTableMetadataRec *metadata_row = (MutationTableMetadataRec *)row_pointer; + MutationIndex mut_index = register_iter[registry_index]; + Mutation *mut = mutation_block_->MutationForIndex(mut_index); + + // mutations in the retained list were already handled + if (mut->retained_by_treeseq_) + continue; + + // mutations that are not segregating do not get persisted + MutationState mut_state = (MutationState)mut->state_; + + if ((mut_state == MutationState::kLostAndRemoved) || (mut_state == MutationState::kFixedAndSubstituted) || (mut_state == MutationState::kRemovedWithSubstitution)) + continue; + + metadata_row->mutation_id_ = mut->mutation_id_; + metadata_row->mutation_type_id_ = mut->mutation_type_ptr_->mutation_type_id_; + metadata_row->subpop_index_ = mut->subpop_index_; + metadata_row->origin_tick_ = mut->origin_tick_; + metadata_row->nucleotide_ = mut->nucleotide_; + + metadata_row->unused_[0] = 0; + metadata_row->unused_[1] = 0; + metadata_row->unused_[2] = 0; + + MutationTraitInfo *mut_trait_info_array = mutation_block_->TraitInfoForIndex(mut_index); + _MutationPerTraitMetadata *metadata_trait_info_array = metadata_row->per_trait_; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + MutationTraitInfo *mut_trait_info = mut_trait_info_array + trait_index; + _MutationPerTraitMetadata *metadata_trait_info = metadata_trait_info_array + trait_index; + + metadata_trait_info->effect_size_ = mut_trait_info->effect_size_; + metadata_trait_info->dominance_coeff_ = mut_trait_info->dominance_coeff_UNSAFE_; + metadata_trait_info->hemizygous_dominance_coeff_ = mut_trait_info->hemizygous_dominance_coeff_; + } + + row_pointer += size_for_one_muttable_row; + } + + // we set size information at the end because we might not use all of the mutation table space + // if that is the case, we realloc here to free up the unused space + if ((row_pointer - base_row_pointer) % size_for_one_muttable_row != 0) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) count of rows filled is not an integer." << EidosTerminate(); + + size_t actual_row_count = (row_pointer - base_row_pointer) / size_for_one_muttable_row; + size_t actual_mutation_table_size = actual_row_count * size_for_one_muttable_row; + size_t actual_binary_length = actual_mutation_table_size; + size_t actual_total_length = header_length + json_length + aligner_length + actual_binary_length; + + if (actual_total_length != estimated_total_length) + { + metadata_buffer = (uint8_t *)realloc(metadata_buffer, actual_total_length); + if (!metadata_buffer) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): allocation failed; you may need to raise the memory limit for SLiM." << EidosTerminate(); + } + + Eidos_set_u64_le(metadata_buffer + 5, (uint64_t)json_length); + Eidos_set_u64_le(metadata_buffer + 13, (uint64_t)(actual_mutation_table_size + aligner_length)); + +#if DEBUG + //std::cout << "WriteTreeSequenceMetadata(): wrote binary mutation table with row size " << size_for_one_muttable_row << " and " << row_count << " rows." << std::endl; +#endif + + // Write out the metadata chunk to the table collection: + ret = tsk_table_collection_set_metadata(p_tables, (const char *)metadata_buffer, (tsk_size_t)actual_total_length); if (ret != 0) handle_error("tsk_table_collection_set_metadata", ret); - - // As above, we maybe ought to edit the metadata schema adding our keys, + + // And finally, generate the bipartite schema for the json+struct metadata chunk: + // + // NOTE: As above, we maybe ought to edit the metadata schema adding our keys, // but then comparing tables is a headache; see tskit#763 + std::string jps_metadata_schema; + + jps_metadata_schema = R"V0G0N({"$schema":"http://json-schema.org/schema#","codec":"json+struct","json":)V0G0N"; + jps_metadata_schema += gSLiM_tsk_metadata_JSON_schema; + jps_metadata_schema += R"V0G0N(,"struct":)V0G0N"; + jps_metadata_schema += gSLiM_tsk_metadata_binary_schema_FORMAT; + jps_metadata_schema += R"V0G0N(})V0G0N"; + + { + // fix "%d1" to have the count of padding bytes needed for 8-byte alignment + size_t d1_pos = jps_metadata_schema.find("\"%d1\""); + + if (d1_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) substring \"%d1\" for replacement in schema not found." << EidosTerminate(); + else + jps_metadata_schema.replace(d1_pos, 5, "\"" + std::to_string(aligner_length) + "x\""); // e.g., "%d1" -> "5x" + } + { + // fix "%d2" to have the number of traits in this species + size_t d2_pos = jps_metadata_schema.find("\"%d2\""); + + if (d2_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) substring \"%d2\" for replacement in schema not found." << EidosTerminate(); + else + jps_metadata_schema.replace(d2_pos, 5, std::to_string(trait_count)); // e.g., "%d2" -> 3 + } + +#if 0 + // DEBUG: dump the json+struct metadata schema to std::cout + std::cout << std::endl << "jps_metadata_schema == " << std::endl << jps_metadata_schema << std::endl << std::endl; + + nlohmann::json jps_metadata_schema_JSON; + + try { + jps_metadata_schema_JSON = nlohmann::json::parse(jps_metadata_schema); + } catch (...) { + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) jps_metadata_schema must be a JSON string." << EidosTerminate(); + } + + std::cout << "jps_metadata_schema == " << std::endl << jps_metadata_schema_JSON.dump(4) << std::endl << std::endl; +#endif + ret = tsk_table_collection_set_metadata_schema( - p_tables, gSLiM_tsk_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_metadata_schema.length()); + p_tables, jps_metadata_schema.data(), (tsk_size_t)jps_metadata_schema.length()); if (ret != 0) handle_error("tsk_table_collection_set_metadata_schema", ret); - + //////////// // Set metadata schema on each table + ret = tsk_edge_table_set_metadata_schema(&p_tables->edges, gSLiM_tsk_edge_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_edge_metadata_schema.length()); if (ret != 0) handle_error("tsk_edge_table_set_metadata_schema", ret); + ret = tsk_site_table_set_metadata_schema(&p_tables->sites, gSLiM_tsk_site_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_site_metadata_schema.length()); if (ret != 0) handle_error("tsk_site_table_set_metadata_schema", ret); + ret = tsk_mutation_table_set_metadata_schema(&p_tables->mutations, gSLiM_tsk_mutation_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_mutation_metadata_schema.length()); if (ret != 0) handle_error("tsk_mutation_table_set_metadata_schema", ret); - ret = tsk_individual_table_set_metadata_schema(&p_tables->individuals, - gSLiM_tsk_individual_metadata_schema.c_str(), - (tsk_size_t)gSLiM_tsk_individual_metadata_schema.length()); - if (ret != 0) - handle_error("tsk_individual_table_set_metadata_schema", ret); + + { + // fix "%d" in the individual metadata to have the number of traits in the species + std::string tsk_individual_metadata_schema = gSLiM_tsk_individual_metadata_schema_FORMAT; + size_t d_pos = tsk_individual_metadata_schema.find("\"%d\""); + + if (d_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) \"%d\" substring missing from gSLiM_tsk_individual_metadata_schema_FORMAT." << EidosTerminate(); + else + tsk_individual_metadata_schema.replace(d_pos, 4, std::to_string(TraitCount())); // e.g., "%d" -> 4 + +#if 0 + // DEBUG: dump the metadata schema to std::cout + std::cout << std::endl << "tsk_individual_metadata_schema == " << std::endl << tsk_individual_metadata_schema << std::endl << std::endl; + + nlohmann::json tsk_individual_metadata_schema_JSON; + + try { + tsk_individual_metadata_schema_JSON = nlohmann::json::parse(tsk_individual_metadata_schema); + } catch (...) { + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) tsk_individual_metadata_schema must be a JSON string." << EidosTerminate(); + } + + std::cout << "tsk_individual_metadata_schema == " << std::endl << tsk_individual_metadata_schema_JSON.dump(4) << std::endl << std::endl; +#endif + + ret = tsk_individual_table_set_metadata_schema(&p_tables->individuals, + tsk_individual_metadata_schema.c_str(), + (tsk_size_t)tsk_individual_metadata_schema.length()); + if (ret != 0) + handle_error("tsk_individual_table_set_metadata_schema", ret); + } + ret = tsk_population_table_set_metadata_schema(&p_tables->populations, gSLiM_tsk_population_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_population_metadata_schema.length()); if (ret != 0) handle_error("tsk_population_table_set_metadata_schema", ret); - // For the node table the schema we save out depends upon the number of - // bits needed to represent the null haplosome structure of the model. - // We allocate one bit per chromosome, in each node table entry (note - // there are two entries per individual, so it ends up being two bits - // of information per chromosome, across the two node table entries.) - // See the big comment on gSLiM_tsk_node_metadata_schema_FORMAT. + // For the node table the schema we save out depends upon the number of bits needed to represent the null + // haplosome structure of the model. We allocate one bit per chromosome, in each node table entry (note + // there are two entries per individual, so it ends up being two bits of information per chromosome, across + // the two node table entries). See the big comment on gSLiM_tsk_node_metadata_schema_FORMAT. std::string tsk_node_metadata_schema = gSLiM_tsk_node_metadata_schema_FORMAT; size_t pos = tsk_node_metadata_schema.find("\"%d\""); std::string count_string = std::to_string(haplosome_metadata_is_vacant_bytes_); @@ -8728,6 +9112,21 @@ void Species::WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosD tsk_node_metadata_schema.replace(pos, 4, count_string); // replace %d in the format string with the byte count +#if 0 + // DEBUG: dump the metadata schema to std::cout + std::cout << std::endl << "tsk_node_metadata_schema == " << std::endl << tsk_node_metadata_schema << std::endl << std::endl; + + nlohmann::json tsk_node_metadata_schema_JSON; + + try { + tsk_node_metadata_schema_JSON = nlohmann::json::parse(tsk_node_metadata_schema); + } catch (...) { + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) tsk_node_metadata_schema must be a JSON string." << EidosTerminate(); + } + + std::cout << "tsk_node_metadata_schema == " << std::endl << tsk_node_metadata_schema_JSON.dump(4) << std::endl << std::endl; +#endif + ret = tsk_node_table_set_metadata_schema(&p_tables->nodes, tsk_node_metadata_schema.c_str(), (tsk_size_t)tsk_node_metadata_schema.length()); @@ -8873,6 +9272,32 @@ void Species::WriteProvenanceTable(tsk_table_collection_t *p_tables, bool p_use_ } } + j["parameters"]["traits"] = nlohmann::json::array(); + for (Trait *trait : traits_) + { + nlohmann::json trait_info; + + trait_info["index"] = trait->Index(); + trait_info["name"] = trait->Name(); + + if (trait->HasLogisticPostTransform()) + trait_info["type"] = "logistic"; + else if (trait->Type() == TraitType::kAdditive) + trait_info["type"] = "additive"; + else + trait_info["type"] = "multiplicative"; + + trait_info["baselineOffset"] = trait->BaselineOffset(); + trait_info["baselineAccumulation"] = trait->HasBaselineAccumulation(); + + trait_info["individualOffsetMean"] = trait->IndividualOffsetDistributionMean(); + trait_info["individualOffsetSD"] = trait->IndividualOffsetDistributionSD(); + + trait_info["directFitnessEffect"] = trait->HasDirectFitnessEffect(); + + j["parameters"]["traits"].push_back(trait_info); + } + if (p_include_model) j["parameters"]["model"] = scriptString; // made model optional in file_version 0.4 j["parameters"]["model_hash"] = scriptHashString; // added model_hash in file_version 0.4 @@ -8946,7 +9371,7 @@ void Species::_MungeIsNullNodeMetadataToIndex0(TreeSeqInfo &p_treeseq, int origi // check that the length is sufficient for the bits of original_index if (node_metadata_length < expected_min_metadata_length) - EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): unexpected node metadata length; this file cannot be read." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (Species::_MungeIsNullNodeMetadataToIndex0): unexpected node metadata length; this file cannot be read." << EidosTerminate(); HaplosomeMetadataRec *node_metadata = (HaplosomeMetadataRec *)(node_table.metadata + node_table.metadata_offset[row]); HaplosomeMetadataRec *new_metadata = new_metadata_buffer + row; @@ -8985,11 +9410,8 @@ void Species::_MungeIsNullNodeMetadataToIndex0(TreeSeqInfo &p_treeseq, int origi handle_error("tsk_node_table_set_metadata_schema", ret); } -void Species::ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_tick, slim_tick_t *p_cycle, SLiMModelType *p_model_type, int *p_file_version) +void Species::ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_tick, slim_tick_t *p_cycle, SLiMModelType *p_model_type, int *p_file_version, MutationMetadataTable &p_mut_metadata_table) { - // New provenance reading code, using the JSON for Modern C++ library (json.hpp); this - // applies to file versions > 0.1. The version 0.1 code was removed 24 Feb. 2025. - tsk_table_collection_t &p_tables = p_treeseq.tables_; std::string model_type_str; std::string cycle_stage_str; @@ -9000,147 +9422,268 @@ void Species::ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_ti std::string this_chromosome_type; bool chomosomes_key_present = false; - try { - //////////// - // Format 0.5 and later: using top-level metadata - - // Note: we *could* parse the metadata schema, but instead we'll just try parsing the metadata. - // std::string metadata_schema_str(p_tables->metadata_schema, p_tables->metadata_schema_length); - // nlohmann::json metadata_schema = nlohmann::json::parse(metadata_schema_str); - - std::string metadata_str(p_tables.metadata, p_tables.metadata_length); - auto metadata = nlohmann::json::parse(metadata_str); + // BCH 2/13/2026: This code was inside a big try/catch block that didn't make sense to me, since it replaced + // the useful error messages here with an uninformative catch-all error message; so I removed it. + + //////////// + // Format 1.0 and later: reading top-level metadata using tskit's `json+struct` codec + const char *top_level_json_buffer; + tsk_size_t top_level_json_length; + uint8_t *top_level_binary_buffer; + tsk_size_t top_level_binary_length; + + int ret = tsk_json_struct_metadata_get_blob(p_tables.metadata, p_tables.metadata_length, &top_level_json_buffer, &top_level_json_length, &top_level_binary_buffer, &top_level_binary_length); + if ((ret == TSK_ERR_FILE_FORMAT) || (ret == TSK_ERR_FILE_VERSION_TOO_NEW)) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the version of this file appears to be too old to be read, or the file is corrupted; you can try using pyslim to bring an old file version forward to the current version, or generate a new file with the current version of SLiM." << EidosTerminate(); + if (ret != 0) + handle_error("tsk_json_struct_metadata_get_blob", ret); + + // adjust the binary pointer to account for alignment + size_t aligner_length = (8 - (reinterpret_cast(top_level_binary_buffer) & 0x07)) % 8; + + std::string metadata_schema_str(p_tables.metadata_schema, p_tables.metadata_schema_length); + std::string schema_aligner_prefix = "aligner\":{\"binaryFormat\":\""; + size_t schema_aligner_position = metadata_schema_str.find(schema_aligner_prefix); + + if (schema_aligner_position == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the top-level metadata schema is non-compliant; this file cannot be read." << EidosTerminate(); + + char schema_aligner_char = metadata_schema_str.at(schema_aligner_position + schema_aligner_prefix.length()); + + if ((schema_aligner_char < '0') || (schema_aligner_char > '7')) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the top-level metadata schema is non-compliant; this file cannot be read." << EidosTerminate(); + + size_t schema_aligner_length = (schema_aligner_char - '0'); + + if (aligner_length != schema_aligner_length) + { + std::cout << "#WARNING (Species::ReadTreeSequenceMetadata): the alignment byte count for the top-level binary metadata is unexpected (the schema says " << schema_aligner_length << " but the reality is " << aligner_length << ")." << std::endl; + aligner_length = schema_aligner_length; + } + + top_level_binary_buffer = top_level_binary_buffer + aligner_length; + top_level_binary_length -= aligner_length; + + size_t trait_count = Traits().size(); + size_t binary_row_length = sizeof(MutationTableMetadataRec) + sizeof(_MutationPerTraitMetadata) * (trait_count - 1); + + if (top_level_binary_length % binary_row_length != 0) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the top-level binary metadata does not comprise an integral number of rows (binary_row_length == " << binary_row_length << ", top_level_binary_length == " << top_level_binary_length << "); this file cannot be read." << EidosTerminate(); + + // pass information on the binary mutation metadata table back to the caller + p_mut_metadata_table.table_buffer = top_level_binary_buffer; + p_mut_metadata_table.row_size = binary_row_length; + p_mut_metadata_table.row_count = top_level_binary_length / binary_row_length; + +#if DEBUG + //std::cout << "ReadTreeSequenceMetadata(): read binary mutation table with row size " << p_mut_metadata_table.row_size << " and " << p_mut_metadata_table.row_count << " rows." << std::endl; +#endif + + // Note: we *could* parse the metadata schema, but instead we'll just try parsing the metadata. + // std::string metadata_schema_str(p_tables->metadata_schema, p_tables->metadata_schema_length); + // nlohmann::json metadata_schema = nlohmann::json::parse(metadata_schema_str); + + std::string top_level_json_str(top_level_json_buffer, top_level_json_length); + auto top_level_json = nlohmann::json::parse(top_level_json_str); + + //std::cout << top_level_json.dump(4) << std::endl; + + if (!top_level_json["SLiM"].contains("file_version")) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'file_version' is missing; this file cannot be read." << EidosTerminate(); + + auto file_version = top_level_json["SLiM"]["file_version"]; + + if ((file_version == SLIM_TREES_FILE_VERSION_PRENUC) || + (file_version == SLIM_TREES_FILE_VERSION_POSTNUC) || + (file_version == SLIM_TREES_FILE_VERSION_HASH) || + (file_version == SLIM_TREES_FILE_VERSION_META) || + (file_version == SLIM_TREES_FILE_VERSION_PREPARENT) || + (file_version == SLIM_TREES_FILE_VERSION_PRESPECIES) || + (file_version == SLIM_TREES_FILE_VERSION_SPECIES) || + (file_version == SLIM_TREES_FILE_VERSION_MULTICHROM)) + { + // SLiM 5.2 breaks backward compatibility with earlier file versions + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the version of this file appears to be too old to be read, or the file is corrupted; you can try using pyslim to bring an old file version forward to the current version, or generate a new file with the current version of SLiM or pyslim." << EidosTerminate(); + } + else if (file_version == SLIM_TREES_FILE_VERSION) + { + *p_file_version = 10; + } + else + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): this .trees file was generated by an unrecognized version of SLiM or pyslim (internal file version " << file_version << "); this file cannot be read." << EidosTerminate(); + + // We test for some keys if they are new or optional, but assume that others must be there, such as "model_type". + // If we fetch a key and it is missing, nhlohmann::json raises and we end up in the provenance fallback code below. + // That indicates that we're reading an old file version, which we no longer support in SLiM 5. + // BCH 2/24/2025: I'm shifting towards testing for every key before fetching, in order to give better error messages. + if (!top_level_json["SLiM"].contains("model_type")) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'model_type' is missing; this file cannot be read." << EidosTerminate(); + + model_type_str = top_level_json["SLiM"]["model_type"]; + + if (!top_level_json["SLiM"].contains("tick")) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'tick' is missing; this file cannot be read." << EidosTerminate(); + + tick_ll = top_level_json["SLiM"]["tick"]; + + // "cycle" is optional and now defaults to the tick (it used to fall back to the old "generation" key) + if (top_level_json["SLiM"].contains("cycle")) + gen_ll = top_level_json["SLiM"]["cycle"]; + else + gen_ll = tick_ll; + + // "stage" is optional, and is used below only for validation; it provides an extra layer of safety + if (top_level_json["SLiM"].contains("stage")) + cycle_stage_str = top_level_json["SLiM"]["stage"]; + + /*if (metadata["SLiM"].contains("name")) + { + // If a species name is present, it must match our own name; can't load data across species, as a safety measure + // If users find this annoying, it can be relaxed; nothing really depends on it + // BCH 5/12/2022: OK, it is already annoying; disabling this check for now + std::string metadata_name = metadata["SLiM"]["name"]; + + if (metadata_name != name_) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): this .trees file represents a species named " << metadata_name << ", which does not match the name of the target species, " << name_ << "; species names must match." << EidosTerminate(); + }*/ + + if (top_level_json["SLiM"].contains("description")) + { + // If a species description is present and non-empty, it replaces our own description + std::string metadata_description = top_level_json["SLiM"]["description"]; - //std::cout << metadata.dump(4) << std::endl; + if (metadata_description.length()) + description_ = metadata_description; + } + + // The "this_chromosome" key is required, as are the keys within it + auto &this_chromosome_metadata = top_level_json["SLiM"]["this_chromosome"]; + + if (!this_chromosome_metadata.is_object()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the 'this_chromosome' metadata key must be a JSON object." << std::endl; + if (!this_chromosome_metadata.contains("id")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'id' is missing from the 'this_chromosome' metadata entry." << std::endl; + if (!this_chromosome_metadata.contains("index")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'index' is missing from the 'this_chromosome' metadata entry." << std::endl; + if (!this_chromosome_metadata.contains("symbol")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'symbol' is missing from the 'this_chromosome' metadata entry." << std::endl; + if (!this_chromosome_metadata.contains("type")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'type' is missing from the 'this_chromosome' metadata entry." << std::endl; + + this_chromosome_id = this_chromosome_metadata["id"]; + this_chromosome_index = this_chromosome_metadata["index"]; + this_chromosome_symbol = this_chromosome_metadata["symbol"]; + this_chromosome_type = this_chromosome_metadata["type"]; + + // The "chromosomes" key is optional, but if provided, it has to make sense + if (top_level_json["SLiM"].contains("chromosomes")) + { + chomosomes_key_present = true; - if (!metadata["SLiM"].contains("file_version")) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'file_version' is missing; this file cannot be read." << EidosTerminate(); + // We validate the whole "chromosomes" key against the whole model, to make sure everything is as expected + auto &chromosomes_metadata = top_level_json["SLiM"]["chromosomes"]; - auto file_version = metadata["SLiM"]["file_version"]; + if (!chromosomes_metadata.is_array()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the 'chromosomes' metadata key must be an array." << std::endl; + if (chromosomes_metadata.size() != Chromosomes().size()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the number of entries in the 'chromosomes' metadata key does not match the number of chromosomes in the model." << std::endl; - if ((file_version == SLIM_TREES_FILE_VERSION_PRENUC) || - (file_version == SLIM_TREES_FILE_VERSION_POSTNUC) || - (file_version == SLIM_TREES_FILE_VERSION_HASH) || - (file_version == SLIM_TREES_FILE_VERSION_META) || - (file_version == SLIM_TREES_FILE_VERSION_PREPARENT) || - (file_version == SLIM_TREES_FILE_VERSION_PRESPECIES) || - (file_version == SLIM_TREES_FILE_VERSION_SPECIES)) + for (std::size_t chromosomes_index = 0; chromosomes_index < Chromosomes().size(); ++chromosomes_index) { - // SLiM 5.0 breaks backward compatibility with earlier file versions - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the version of this file appears to be too old to be read, or the file is corrupted; you can try using pyslim to bring an old file version forward to the current version, or generate a new file with the current version of SLiM or pyslim." << EidosTerminate(); - } - else if (file_version == SLIM_TREES_FILE_VERSION) - { - *p_file_version = 9; + Chromosome *chromosome = Chromosomes()[chromosomes_index]; + auto &one_chromosome_metadata = chromosomes_metadata[chromosomes_index]; + + if (!one_chromosome_metadata.contains("id")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'id' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; + if (!one_chromosome_metadata.contains("symbol")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'symbol' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; + if (!one_chromosome_metadata.contains("type")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'type' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; + + int one_chromosome_id = one_chromosome_metadata["id"]; + std::string one_chromosome_symbol = one_chromosome_metadata["symbol"]; + std::string one_chromosome_type = one_chromosome_metadata["type"]; + + if (one_chromosome_id != chromosome->ID()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the id for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; + if (one_chromosome_symbol != chromosome->Symbol()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the symbol for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; + if (one_chromosome_type != chromosome->TypeString()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the type for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; } - else - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): this .trees file was generated by an unrecognized version of SLiM or pyslim (internal file version " << file_version << "); this file cannot be read." << EidosTerminate(); - - // We test for some keys if they are new or optional, but assume that others must be there, such as "model_type". - // If we fetch a key and it is missing, nhlohmann::json raises and we end up in the provenance fallback code below. - // That indicates that we're reading an old file version, which we no longer support in SLiM 5. - // BCH 2/24/2025: I'm shifting towards testing for every key before fetching, in order to give better error messages. - if (!metadata["SLiM"].contains("model_type")) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'model_type' is missing; this file cannot be read." << EidosTerminate(); - - model_type_str = metadata["SLiM"]["model_type"]; - - if (!metadata["SLiM"].contains("tick")) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the required metadata key 'tick' is missing; this file cannot be read." << EidosTerminate(); + } + + // The new "traits" key is required, and we need to check its contents + auto &traits_metadata = top_level_json["SLiM"]["traits"]; + + if (!traits_metadata.is_array()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the 'traits' metadata key must be an array." << std::endl; + if (traits_metadata.size() != Traits().size()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the number of entries in the 'traits' metadata key does not match the number of traits in the model." << std::endl; + + for (size_t traits_index = 0; traits_index < Traits().size(); ++traits_index) + { + Trait *trait = Traits()[traits_index]; + auto &one_trait_metadata = traits_metadata[traits_index]; - tick_ll = metadata["SLiM"]["tick"]; + // check required keys; they must match the simulation + if (!one_trait_metadata.contains("index")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'index' is missing from a 'traits' metadata entry." << std::endl; + if (!one_trait_metadata.contains("name")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'name' is missing from a 'traits' metadata entry." << std::endl; + if (!one_trait_metadata.contains("type")) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'type' is missing from a 'traits' metadata entry." << std::endl; - // "cycle" is optional and now defaults to the tick (it used to fall back to the old "generation" key) - if (metadata["SLiM"].contains("cycle")) - gen_ll = metadata["SLiM"]["cycle"]; - else - gen_ll = tick_ll; + int one_trait_index = one_trait_metadata["index"]; + std::string one_trait_name = one_trait_metadata["name"]; + std::string one_trait_type = one_trait_metadata["type"]; - // "stage" is optional, and is used below only for validation; it provides an extra layer of safety - if (metadata["SLiM"].contains("stage")) - cycle_stage_str = metadata["SLiM"]["stage"]; + if (one_trait_index != trait->Index()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the index for the entry at index " << traits_index << " in the 'traits' metadata key does not match the corresponding trait in the model." << std::endl; + if (one_trait_name != trait->Name()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the name for the entry at index " << traits_index << " in the 'traits' metadata key does not match the corresponding trait in the model." << std::endl; + if (one_trait_type != trait->UserVisibleType()) + SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the type for the entry at index " << traits_index << " in the 'traits' metadata key does not match the corresponding trait in the model." << std::endl; - /*if (metadata["SLiM"].contains("name")) + // check optional keys for read-write properties; if present, we will bounds-check them and adopt their values + if (one_trait_metadata.contains("baselineOffset")) { - // If a species name is present, it must match our own name; can't load data across species, as a safety measure - // If users find this annoying, it can be relaxed; nothing really depends on it - // BCH 5/12/2022: OK, it is already annoying; disabling this check for now - std::string metadata_name = metadata["SLiM"]["name"]; + slim_trait_offset_t new_baseline_offset = one_trait_metadata["baselineOffset"]; + + if (!std::isfinite(new_baseline_offset)) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the trait baselineOffset provided in the 'traits' metadata key for trait index " << traits_index << " (trait name " << one_trait_name << ") must be finite (" << new_baseline_offset << " provided)." << EidosTerminate(); - if (metadata_name != name_) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): this .trees file represents a species named " << metadata_name << ", which does not match the name of the target species, " << name_ << "; species names must match." << EidosTerminate(); - }*/ + trait->SetBaselineOffset(new_baseline_offset); + } - if (metadata["SLiM"].contains("description")) + if (one_trait_metadata.contains("individualOffsetMean")) { - // If a species description is present and non-empty, it replaces our own description - std::string metadata_description = metadata["SLiM"]["description"]; + slim_trait_offset_t new_offset_mean = one_trait_metadata["individualOffsetMean"]; + + if (!std::isfinite(new_offset_mean)) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the trait individualOffsetMean provided in the 'traits' metadata key for trait index " << traits_index << " (trait name " << one_trait_name << ") must be finite (" << new_offset_mean << " provided)." << EidosTerminate(); - if (metadata_description.length()) - description_ = metadata_description; + trait->SetIndividualOffsetDistributionMean(new_offset_mean); } - // The "this_chromosome" key is required, as are the keys within it - auto &this_chromosome_metadata = metadata["SLiM"]["this_chromosome"]; - - if (!this_chromosome_metadata.is_object()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the 'this_chromosome' metadata key must be a JSON object." << std::endl; - if (!this_chromosome_metadata.contains("id")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'id' is missing from the 'this_chromosome' metadata entry." << std::endl; - if (!this_chromosome_metadata.contains("index")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'index' is missing from the 'this_chromosome' metadata entry." << std::endl; - if (!this_chromosome_metadata.contains("symbol")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'symbol' is missing from the 'this_chromosome' metadata entry." << std::endl; - if (!this_chromosome_metadata.contains("type")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'type' is missing from the 'this_chromosome' metadata entry." << std::endl; - - this_chromosome_id = this_chromosome_metadata["id"]; - this_chromosome_index = this_chromosome_metadata["index"]; - this_chromosome_symbol = this_chromosome_metadata["symbol"]; - this_chromosome_type = this_chromosome_metadata["type"]; - - // The "chromosomes" key is optional, but if provided, it has to make sense - if (metadata["SLiM"].contains("chromosomes")) + if (one_trait_metadata.contains("individualOffsetSD")) { - chomosomes_key_present = true; - - // We validate the whole "chromosomes" key against the whole model, to make sure everything is as expected - auto &chromosomes_metadata = metadata["SLiM"]["chromosomes"]; + slim_trait_offset_t new_offset_SD = one_trait_metadata["individualOffsetSD"]; - if (!chromosomes_metadata.is_array()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the 'chromosomes' metadata key must be an array." << std::endl; - if (chromosomes_metadata.size() != Chromosomes().size()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the number of entries in the 'chromosomes' metadata key does not match the number of chromosomes in the model." << std::endl; + if (!std::isfinite(new_offset_SD) || (new_offset_SD < 0.0)) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the trait individualOffsetSD provided in the 'traits' metadata key for trait index " << traits_index << " (trait name " << one_trait_name << ") must be finite and nonnegative (" << new_offset_SD << " provided)." << EidosTerminate(); - for (std::size_t chromosomes_index = 0; chromosomes_index < Chromosomes().size(); ++chromosomes_index) - { - Chromosome *chromosome = Chromosomes()[chromosomes_index]; - auto &one_chromosome_metadata = chromosomes_metadata[chromosomes_index]; - - if (!one_chromosome_metadata.contains("id")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'id' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; - if (!one_chromosome_metadata.contains("symbol")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'symbol' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; - if (!one_chromosome_metadata.contains("type")) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the required metadata key 'type' is missing from a 'chromosomes' metadata entry; if 'chromosomes' is provided at all, it must be complete." << std::endl; - - int one_chromosome_id = one_chromosome_metadata["id"]; - std::string one_chromosome_symbol = one_chromosome_metadata["symbol"]; - std::string one_chromosome_type = one_chromosome_metadata["type"]; - - if (one_chromosome_id != chromosome->ID()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the id for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; - if (one_chromosome_symbol != chromosome->Symbol()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the symbol for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; - if (one_chromosome_type != chromosome->TypeString()) - SLIM_ERRSTREAM << "#WARNING (Species::ReadTreeSequenceMetadata): the type for the entry at index " << chromosomes_index << " in the 'chromosomes' metadata key does not match the corresponding chromosome in the model." << std::endl; - } + trait->SetIndividualOffsetDistributionSD(new_offset_SD); } - } catch (...) { - /////////////////////// - // Previous formats: everything is in provenance - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the version of this file appears to be too old to be read, or the file is corrupted; you can try using pyslim to bring an old file version forward to the current version, or generate a new file with the current version of SLiM." << EidosTerminate(); + // check optional keys for read-only propoerties; if present, they must match the simulation + if (one_trait_metadata.contains("baselineAccumulation")) + if (one_trait_metadata["baselineAccumulation"] != trait->HasBaselineAccumulation()) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the trait baselineAccumulation provided in the 'traits' metadata key (" << one_trait_metadata["baselineAccumulation"] << ") for trait index " << traits_index << " (trait name " << one_trait_name << ") does not match the configuration (" << trait->HasBaselineAccumulation() << ") of the corresponding trait in the model." << EidosTerminate(); + + if (one_trait_metadata.contains("directFitnessEffect")) + if (one_trait_metadata["directFitnessEffect"] != trait->HasDirectFitnessEffect()) + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the trait directFitnessEffect provided in the 'traits' metadata key (" << one_trait_metadata["directFitnessEffect"] << ") for trait index " << traits_index << " (trait name " << one_trait_name << ") does not match the configuration (" << trait->HasDirectFitnessEffect() << ") of the corresponding trait in the model." << EidosTerminate(); } // check the model type; at the moment we do not require the model type to match what we are running, but we issue a warning on a mismatch @@ -9197,11 +9740,11 @@ void Species::ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_ti Chromosome *chromosome = Chromosomes()[chromosome_index]; if (this_chromosome_id != chromosome->ID()) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome id provided in the 'this_chromosome' key (" << this_chromosome_id << ") does not match the id (" << chromosome->ID() << ") of the corresponding chromosome in the model." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome id provided in the 'this_chromosome' metadata key (" << this_chromosome_id << ") does not match the id (" << chromosome->ID() << ") of the corresponding chromosome in the model." << EidosTerminate(); if (this_chromosome_type != chromosome->TypeString()) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome type provided in the 'this_chromosome' key (" << this_chromosome_type << ") does not match the type (" << chromosome->TypeString() << ") of the corresponding chromosome in the model." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome type provided in the 'this_chromosome' metadata key (" << this_chromosome_type << ") does not match the type (" << chromosome->TypeString() << ") of the corresponding chromosome in the model." << EidosTerminate(); if (this_chromosome_symbol != chromosome->Symbol()) - EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome symbol provided in the 'this_chromosome' key (" << this_chromosome_symbol << ") does not match the symbol (" << chromosome->Symbol() << ") of the corresponding chromosome in the model." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the chromosome symbol provided in the 'this_chromosome' metadata key (" << this_chromosome_symbol << ") does not match the symbol (" << chromosome->Symbol() << ") of the corresponding chromosome in the model." << EidosTerminate(); // Check the chromosome index; when loading a multi-chromosome set, we normally require indices to match // - one exception is that you can load a chromosome from any index into a single-chromosome model @@ -9785,55 +10328,17 @@ void Species::_MakeHaplosomeMetadataRecords(void) // printf("hap_metadata_2M_ == %.2X\n", hap_metadata_2M_->is_vacant_[0]); } -void Species::MetadataForMutation(Mutation *p_mutation, MutationMetadataRec *p_metadata) -{ - static_assert(sizeof(MutationMetadataRec) == 17, "MutationMetadataRec has changed size; this code probably needs to be updated"); - - if (!p_mutation || !p_metadata) - EIDOS_TERMINATION << "ERROR (Species::MetadataForMutation): (internal error) bad parameters to MetadataForMutation()." << EidosTerminate(); - - p_metadata->mutation_type_id_ = p_mutation->mutation_type_ptr_->mutation_type_id_; - - // FIXME MULTITRAIT: We need to figure out where we're going to put multitrait information in .trees - // For now we just write out the effect for trait 0, but we need the dominance coeff too, and we need - // it for all traits in the model not just trait 0; this design is not going to work. See - // https://github.com/MesserLab/SLiM/issues/569 - MutationBlock *mutation_block = p_mutation->mutation_type_ptr_->mutation_block_; - MutationTraitInfo *mut_trait_info = mutation_block->TraitInfoForMutation(p_mutation); - - p_metadata->selection_coeff_ = mut_trait_info[0].effect_size_; - - p_metadata->subpop_index_ = p_mutation->subpop_index_; - p_metadata->origin_tick_ = p_mutation->origin_tick_; - p_metadata->nucleotide_ = p_mutation->nucleotide_; -} - -void Species::MetadataForSubstitution(Substitution *p_substitution, MutationMetadataRec *p_metadata) -{ - static_assert(sizeof(MutationMetadataRec) == 17, "MutationMetadataRec has changed size; this code probably needs to be updated"); - - if (!p_substitution || !p_metadata) - EIDOS_TERMINATION << "ERROR (Species::MetadataForSubstitution): (internal error) bad parameters to MetadataForSubstitution()." << EidosTerminate(); - - p_metadata->mutation_type_id_ = p_substitution->mutation_type_ptr_->mutation_type_id_; - - // FIXME MULTITRAIT: We need to figure out where we're going to put multitrait information in .trees - // For now we just write out the effect for trait 0, but we need the dominance coeff too, and we need - // it for all traits in the model not just trait 0; this design is not going to work. See - // https://github.com/MesserLab/SLiM/issues/569 - p_metadata->selection_coeff_ = p_substitution->trait_info_[0].effect_size_; - - p_metadata->subpop_index_ = p_substitution->subpop_index_; - p_metadata->origin_tick_ = p_substitution->origin_tick_; - p_metadata->nucleotide_ = p_substitution->nucleotide_; -} - void Species::MetadataForIndividual(Individual *p_individual, IndividualMetadataRec *p_metadata) { - static_assert(sizeof(IndividualMetadataRec) == 40, "IndividualMetadataRec has changed size; this code probably needs to be updated"); + // We check the struct size here to detect changes that would need to be responded to here; but it is + // very important to note that the caller guarantees that the actual size of p_metadata is large enough + // to accommodate all of the per-trait metadata, which is variable-length! + static_assert(sizeof(IndividualMetadataRec) == 56, "IndividualMetadataRec has changed size; this code probably needs to be updated"); +#if DEBUG if (!p_individual || !p_metadata) EIDOS_TERMINATION << "ERROR (Species::MetadataForIndividual): (internal error) bad parameters to MetadataForIndividual()." << EidosTerminate(); +#endif p_metadata->pedigree_id_ = p_individual->PedigreeID(); p_metadata->pedigree_p1_ = p_individual->Parent1PedigreeID(); @@ -9845,6 +10350,20 @@ void Species::MetadataForIndividual(Individual *p_individual, IndividualMetadata p_metadata->flags_ = 0; if (p_individual->migrant_) p_metadata->flags_ |= SLIM_INDIVIDUAL_METADATA_MIGRATED; + + // write per-trait metadata + int trait_count = TraitCount(); + _IndividualPerTraitMetadata *per_trait_metadata_array = p_metadata->per_trait_; + IndividualTraitInfo *per_trait_info_array = p_individual->trait_info_; + + for (int trait_index = 0; trait_index < trait_count; ++trait_index) + { + _IndividualPerTraitMetadata &per_trait_metadata = per_trait_metadata_array[trait_index]; + IndividualTraitInfo &per_trait_info = per_trait_info_array[trait_index]; + + per_trait_metadata.phenotype_ = per_trait_info.phenotype_; + per_trait_metadata.offset_ = per_trait_info.offset_; + } } void Species::CheckTreeSeqIntegrity(void) @@ -10216,6 +10735,8 @@ void Species::CrosscheckTreeSeqIntegrity(void) for (tsk_size_t individual_index = 0; individual_index < shared_individuals_table.num_rows; individual_index++) { tsk_id_t tsk_individual = (tsk_id_t)individual_index; + + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(shared_individuals_table.metadata + shared_individuals_table.metadata_offset[tsk_individual]); slim_pedigreeid_t pedigree_id = metadata_rec->pedigree_id_; auto lookup = tabled_individuals_hash_.find(pedigree_id); @@ -10270,7 +10791,7 @@ typedef robin_hood::unordered_flat_map SUBPOP_REMAP_RE typedef std::unordered_map SUBPOP_REMAP_REVERSE_HASH; #endif -void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, __attribute__ ((unused)) int p_file_version) +void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table, __attribute__ ((unused)) int p_file_version) { // If we have been given a remapping table, this method munges all of the data // and metadata in the treeseq tables to accomplish that remapping. It is gross @@ -10545,32 +11066,21 @@ void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqIn // Remap subpop_index_ in the mutation metadata, in place { - std::size_t metadata_rec_size = sizeof(MutationMetadataRec); - tsk_mutation_table_t &mut_table = tables.mutations; - tsk_size_t num_rows = mut_table.num_rows; + // BCH 2/13/2026: mutation metadata is now in a binary table stored in the top-level `json+struct` codec and passed in to us + uint8_t *table_buffer = p_mut_metadata_table.table_buffer; + size_t row_size = p_mut_metadata_table.row_size; + tsk_size_t row_count = p_mut_metadata_table.row_count; - for (tsk_size_t mut_index = 0; mut_index < num_rows; ++mut_index) + for (tsk_size_t mut_index = 0; mut_index < row_count; ++mut_index) { - char *metadata_bytes = mut_table.metadata + mut_table.metadata_offset[mut_index]; - tsk_size_t metadata_length = mut_table.metadata_offset[mut_index + 1] - mut_table.metadata_offset[mut_index]; - - if (metadata_length % metadata_rec_size != 0) - EIDOS_TERMINATION << "ERROR (Species::__RemapSubpopulationIDs): unexpected mutation metadata length; this file cannot be read." << EidosTerminate(); + MutationTableMetadataRec *metadata = (MutationTableMetadataRec *)(table_buffer + mut_index * row_size); + slim_objectid_t old_subpop = metadata->subpop_index_; + auto remap_iter = subpop_map.find(old_subpop); - int stack_count = (int)(metadata_length / metadata_rec_size); + if (remap_iter == subpop_map.end()) + EIDOS_TERMINATION << "ERROR (Species::__RemapSubpopulationIDs): a subpopulation index (" << old_subpop << ") used by the tree sequence data (mutation metadata) was not remapped." << EidosTerminate(); - for (int stack_index = 0; stack_index < stack_count; ++stack_index) - { - // Here we have to deal with the metadata format - MutationMetadataRec *metadata = (MutationMetadataRec *)metadata_bytes + stack_index; - slim_objectid_t old_subpop = metadata->subpop_index_; - auto remap_iter = subpop_map.find(old_subpop); - - if (remap_iter == subpop_map.end()) - EIDOS_TERMINATION << "ERROR (Species::__RemapSubpopulationIDs): a subpopulation index (" << old_subpop << ") used by the tree sequence data (mutation metadata) was not remapped." << EidosTerminate(); - - metadata->subpop_index_ = remap_iter->second; - } + metadata->subpop_index_ = remap_iter->second; } } @@ -10584,7 +11094,9 @@ void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqIn char *metadata_bytes = ind_table.metadata + ind_table.metadata_offset[ind_index]; tsk_size_t metadata_length = ind_table.metadata_offset[ind_index + 1] - ind_table.metadata_offset[ind_index]; - if (metadata_length != sizeof(IndividualMetadataRec)) + // BCH 2/11/2026: the IndividualMetadataRec struct is now variable-length, but we only need to + // work with the first part of it; so now we just require the size to be >= the base size + if (metadata_length >= sizeof(IndividualMetadataRec)) EIDOS_TERMINATION << "ERROR (Species::__RemapSubpopulationIDs): unexpected individual metadata length; this file cannot be read." << EidosTerminate(); IndividualMetadataRec *metadata = (IndividualMetadataRec *)metadata_bytes; @@ -10651,16 +11163,9 @@ void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqIn typedef struct ts_subpop_info { slim_popsize_t countMH_ = 0, countF_ = 0; - std::vector sex_; std::vector nodes_; - std::vector pedigreeID_; - std::vector pedigreeP1_; - std::vector pedigreeP2_; - std::vector age_; - std::vector spatial_x_; - std::vector spatial_y_; - std::vector spatial_z_; - std::vector flags_; + std::vector spatial_positions_; // points into the locations column of the individual table + std::vector metadata_; // points into the metadata column of the individual table } ts_subpop_info; void Species::__PrepareSubpopulationsFromTables(std::unordered_map &p_subpopInfoMap, TreeSeqInfo &p_treeseq) @@ -10711,10 +11216,14 @@ void Species::__TabulateSubpopulationsFromTreeSequence(std::unordered_mapsecond; + // remember our metadata pointer, we will fetch information from it later; below we check metadata for + // correctness, and even edit it, but we do not copy its values, we just keep the metadata pointer + subpop_info.metadata_.push_back(metadata); + // check and tabulate sex within each subpop IndividualSex sex = (IndividualSex)metadata->sex_; // IndividualSex, but int32_t in the record @@ -10761,56 +11274,51 @@ void Species::__TabulateSubpopulationsFromTreeSequence(std::unordered_mappedigree_id_ < 0) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): individuals loaded must have pedigree IDs >= 0." << EidosTerminate(); - subpop_info.pedigreeID_.push_back(metadata->pedigree_id_); if ((metadata->pedigree_p1_ < -1) || (metadata->pedigree_p2_ < -1)) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): individuals loaded must have parent pedigree IDs >= -1." << EidosTerminate(); - subpop_info.pedigreeP1_.push_back(metadata->pedigree_p1_); - subpop_info.pedigreeP2_.push_back(metadata->pedigree_p2_); - // save off the flags for later use - subpop_info.flags_.push_back(metadata->flags_); - // bounds-check ages; we cross-translate ages of 0 and -1 if the model type has been switched slim_age_t age = metadata->age_; if ((p_file_model_type == SLiMModelType::kModelTypeNonWF) && (model_type_ == SLiMModelType::kModelTypeWF) && (age == 0)) + { age = -1; + metadata->age_ = -1; + } if ((p_file_model_type == SLiMModelType::kModelTypeWF) && (model_type_ == SLiMModelType::kModelTypeNonWF) && (age == -1)) + { age = 0; + metadata->age_ = 0; + } if (((age < 0) || (age > SLIM_MAX_ID_VALUE)) && (model_type_ == SLiMModelType::kModelTypeNonWF)) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): individuals loaded into a nonWF model must have age values >= 0 and <= " << SLIM_MAX_ID_VALUE << "." << EidosTerminate(); if ((age != -1) && (model_type_ == SLiMModelType::kModelTypeWF)) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): individuals loaded into a WF model must have age values == -1." << EidosTerminate(); - subpop_info.age_.emplace_back(age); - // no bounds-checks for spatial position if (individual.location_length != 3) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): unexpected individual location length; this file cannot be read." << EidosTerminate(); - subpop_info.spatial_x_.emplace_back(individual.location[0]); - subpop_info.spatial_y_.emplace_back(individual.location[1]); - subpop_info.spatial_z_.emplace_back(individual.location[2]); + subpop_info.spatial_positions_.emplace_back(individual.location); + + // check that the individual has exactly two nodes; we are always diploid in terms of nodes, regardless of the chromosome type + if (individual.nodes_length != 2) + EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): unexpected node count; this file cannot be read." << EidosTerminate(); + + const tsk_id_t node0 = individual.nodes[0]; + const tsk_id_t node1 = individual.nodes[1]; + + subpop_info.nodes_.push_back(node0); + subpop_info.nodes_.push_back(node1); // check the referenced nodes; right now this is not essential for re-creating the saved state, but is just a crosscheck // here we crosscheck the node information against expected values from other places in the tables or the model tsk_node_table_t &node_table = tables.nodes; - tsk_id_t node0 = individual.nodes[0]; - tsk_id_t node1 = individual.nodes[1]; if (((node_table.flags[node0] & TSK_NODE_IS_SAMPLE) == 0) || ((node_table.flags[node1] & TSK_NODE_IS_SAMPLE) == 0)) EIDOS_TERMINATION << "ERROR (Species::__TabulateSubpopulationsFromTreeSequence): nodes for individual are not in-sample; this file cannot be read." << EidosTerminate(); @@ -10980,7 +11488,7 @@ void Species::__CreateSubpopulationsFromTabulation(std::unordered_map= subpop_size) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation): (internal error) ran out of tabulated individuals." << EidosTerminate(); - } while (subpop_info.sex_[tabulation_index] != generating_sex); + } while ((IndividualSex)subpop_info.metadata_[tabulation_index]->sex_ != generating_sex); Individual *individual = new_subpop->parent_individuals_[ind_index]; @@ -10995,21 +11503,21 @@ void Species::__CreateSubpopulationsFromTabulation(std::unordered_mapSetTskitNodeIdBase(node_id_0); - slim_pedigreeid_t pedigree_id = subpop_info.pedigreeID_[tabulation_index]; + slim_pedigreeid_t pedigree_id = subpop_info.metadata_[tabulation_index]->pedigree_id_; individual->SetPedigreeID(pedigree_id); pedigree_id_check.emplace_back(pedigree_id); // we will test for collisions below gSLiM_next_pedigree_id = std::max(gSLiM_next_pedigree_id, pedigree_id + 1); - individual->SetParentPedigreeID(subpop_info.pedigreeP1_[tabulation_index], subpop_info.pedigreeP2_[tabulation_index]); + individual->SetParentPedigreeID(subpop_info.metadata_[tabulation_index]->pedigree_p1_, subpop_info.metadata_[tabulation_index]->pedigree_p2_); - uint32_t flags = subpop_info.flags_[tabulation_index]; + uint32_t flags = subpop_info.metadata_[tabulation_index]->flags_; if (flags & SLIM_INDIVIDUAL_METADATA_MIGRATED) individual->migrant_ = true; - individual->age_ = subpop_info.age_[tabulation_index]; - individual->spatial_x_ = subpop_info.spatial_x_[tabulation_index]; - individual->spatial_y_ = subpop_info.spatial_y_[tabulation_index]; - individual->spatial_z_ = subpop_info.spatial_z_[tabulation_index]; + individual->age_ = subpop_info.metadata_[tabulation_index]->age_; + individual->spatial_x_ = subpop_info.spatial_positions_[tabulation_index][0]; + individual->spatial_y_ = subpop_info.spatial_positions_[tabulation_index][1]; + individual->spatial_z_ = subpop_info.spatial_positions_[tabulation_index][2]; p_nodeToHaplosomeMap.emplace(node_id_0, individual->haplosomes_[first_haplosome_index]); slim_haplosomeid_t haplosome_id = pedigree_id * 2; @@ -11149,7 +11657,7 @@ void Species::__CreateSubpopulationsFromTabulation_SECONDARY(std::unordered_map< if (tabulation_index >= subpop_size) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): (internal error) ran out of tabulated individuals." << EidosTerminate(); - } while (subpop_info.sex_[tabulation_index] != generating_sex); + } while ((IndividualSex)subpop_info.metadata_[tabulation_index]->sex_ != generating_sex); Individual *individual = new_subpop->parent_individuals_[ind_index]; @@ -11165,23 +11673,23 @@ void Species::__CreateSubpopulationsFromTabulation_SECONDARY(std::unordered_map< if (individual->TskitNodeIdBase() != node_id_0) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): tskit node id mismatch between chromosomes read." << EidosTerminate(); - slim_pedigreeid_t pedigree_id = subpop_info.pedigreeID_[tabulation_index]; + slim_pedigreeid_t pedigree_id = subpop_info.metadata_[tabulation_index]->pedigree_id_; if (individual->PedigreeID() != pedigree_id) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): pedigree id mismatch between chromosomes read." << EidosTerminate(); - if ((individual->Parent1PedigreeID() != subpop_info.pedigreeP1_[tabulation_index]) || - (individual->Parent2PedigreeID() != subpop_info.pedigreeP2_[tabulation_index])) + if ((individual->Parent1PedigreeID() != subpop_info.metadata_[tabulation_index]->pedigree_p1_) || + (individual->Parent2PedigreeID() != subpop_info.metadata_[tabulation_index]->pedigree_p2_)) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): parent pedigree id mismatch between chromosomes read." << EidosTerminate(); - uint32_t flags = subpop_info.flags_[tabulation_index]; + uint32_t flags = subpop_info.metadata_[tabulation_index]->flags_; if ((flags & SLIM_INDIVIDUAL_METADATA_MIGRATED) && !individual->migrant_) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): individual migrant flag mismatch between chromosomes read." << EidosTerminate(); - if (individual->age_ != subpop_info.age_[tabulation_index]) + if (individual->age_ != subpop_info.metadata_[tabulation_index]->age_) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): individual age mismatch between chromosomes read." << EidosTerminate(); - if ((individual->spatial_x_ != subpop_info.spatial_x_[tabulation_index]) || - (individual->spatial_y_ != subpop_info.spatial_y_[tabulation_index]) || - (individual->spatial_z_ != subpop_info.spatial_z_[tabulation_index])) + if ((individual->spatial_x_ != subpop_info.spatial_positions_[tabulation_index][0]) || + (individual->spatial_y_ != subpop_info.spatial_positions_[tabulation_index][1]) || + (individual->spatial_z_ != subpop_info.spatial_positions_[tabulation_index][2])) EIDOS_TERMINATION << "ERROR (Species::__CreateSubpopulationsFromTabulation_SECONDARY): individual spatial position mismatch between chromosomes read." << EidosTerminate(); // the haplosomes we're setting up are different from the haplosomes previously set up, @@ -11639,37 +12147,47 @@ void Species::__ConfigureSubpopulationsFromTables_SECONDARY(__attribute__((unuse typedef struct ts_mut_info { slim_position_t position; - MutationMetadataRec metadata; + MutationTableMetadataRec *metadata; slim_refcount_t ref_count; } ts_mut_info; -void Species::__TabulateMutationsFromTables(std::unordered_map &p_mutMap, TreeSeqInfo &p_treeseq, __attribute__ ((unused)) int p_file_version) +void Species::__TabulateMutationsFromTables(std::unordered_map &p_mutMap, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table, __attribute__ ((unused)) int p_file_version) { tsk_table_collection_t &tables = p_treeseq.tables_; - std::size_t metadata_rec_size = sizeof(MutationMetadataRec); tsk_mutation_table_t &mut_table = tables.mutations; tsk_size_t mut_count = mut_table.num_rows; if ((mut_count > 0) && !recording_mutations_) EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): cannot load mutations when mutation recording is disabled." << EidosTerminate(); + // we need a hash table into the mutation metadata table, to look up metadata records by mutation ID + std::unordered_map metadata_map; + uint8_t *mutation_table_buffer = p_mut_metadata_table.table_buffer; + size_t row_size = p_mut_metadata_table.row_size; + size_t row_count = p_mut_metadata_table.row_count; + + for (size_t row_index = 0; row_index < row_count; ++row_index) + { + MutationTableMetadataRec *metadata_rec = (MutationTableMetadataRec *)(mutation_table_buffer + row_index * row_size); + slim_mutationid_t mut_id = metadata_rec->mutation_id_; + + if (metadata_map.find(mut_id) != metadata_map.end()) + EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): mutation id " << mut_id << " was duplicated in the mutation metadata table." << EidosTerminate(); + + metadata_map.insert(std::pair(mut_id, metadata_rec)); + } + + // now we go through the mutation table's derived state column and make ts_mut_info entries for each referenced mutation for (tsk_size_t mut_index = 0; mut_index < mut_count; ++mut_index) { const char *derived_state_bytes = mut_table.derived_state + mut_table.derived_state_offset[mut_index]; tsk_size_t derived_state_length = mut_table.derived_state_offset[mut_index + 1] - mut_table.derived_state_offset[mut_index]; - const char *metadata_bytes = mut_table.metadata + mut_table.metadata_offset[mut_index]; - tsk_size_t metadata_length = mut_table.metadata_offset[mut_index + 1] - mut_table.metadata_offset[mut_index]; if (derived_state_length % sizeof(slim_mutationid_t) != 0) EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): unexpected mutation derived state length; this file cannot be read." << EidosTerminate(); - if (metadata_length % metadata_rec_size != 0) - EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): unexpected mutation metadata length; this file cannot be read." << EidosTerminate(); - if (derived_state_length / sizeof(slim_mutationid_t) != metadata_length / metadata_rec_size) - EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): (internal error) mutation metadata length does not match derived state length." << EidosTerminate(); int stack_count = (int)(derived_state_length / sizeof(slim_mutationid_t)); slim_mutationid_t *derived_state_vec = (slim_mutationid_t *)derived_state_bytes; - const void *metadata_vec = metadata_bytes; // either const MutationMetadataRec* or const MutationMetadataRec_PRENUC* tsk_id_t site_id = mut_table.site[mut_index]; double position_double = tables.sites.position[site_id]; double position_double_round = round(position_double); @@ -11694,19 +12212,25 @@ void Species::__TabulateMutationsFromTables(std::unordered_mapsecond); mut_info->position = position; + + // get a pointer into the mutation metadata table, and keep that pointer in our ts_mut_info + // note that we don't want to copy the data here because it is variable-size and can be large + auto metadata_iter = metadata_map.find(mut_id); + + if (metadata_iter == metadata_map.end()) + EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): mutation id " << mut_id << " referenced by a derived state is not in the mutation metadata table." << EidosTerminate(); + + mut_info->metadata = metadata_iter->second; } else { // entry already present; check that it refers to the same mutation, using its position (see https://github.com/MesserLab/SLiM/issues/179) + // we used to update the metadata here, to use the last recorded metadata record, but now we have one canonical metadata entry mut_info = &(mut_info_find->second); if (mut_info->position != position) EIDOS_TERMINATION << "ERROR (Species::__TabulateMutationsFromTables): inconsistent mutation position observed reading tree sequence data; this may indicate that mutation IDs are not unique." << EidosTerminate(); } - - MutationMetadataRec *metadata = (MutationMetadataRec *)metadata_vec + stack_index; - - mut_info->metadata = *metadata; } } } @@ -11833,8 +12357,7 @@ void Species::__CreateMutationsFromTabulation(std::unordered_map(metadata_ptr) & 0x03) != 0) + EIDOS_TERMINATION << "ERROR (Species::__CreateMutationsFromTabulation): (internal error) misaligned pointer to mutation metadata (misalignment == " << (int)(reinterpret_cast(metadata_ptr) & 0x07) << ")." << EidosTerminate(); +#endif // look up the mutation type from its index - MutationType *mutation_type_ptr = MutationTypeWithID(metadata.mutation_type_id_); + MutationType *mutation_type_ptr = MutationTypeWithID(metadata_ptr->mutation_type_id_); if (!mutation_type_ptr) - EIDOS_TERMINATION << "ERROR (Species::__CreateMutationsFromTabulation): mutation type m" << metadata.mutation_type_id_ << " has not been defined for this species." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (Species::__CreateMutationsFromTabulation): mutation type m" << metadata_ptr->mutation_type_id_ << " has not been defined for this species." << EidosTerminate(); if ((mut_info.ref_count == fixation_count) && (mutation_type_ptr->convert_to_substitution_)) { // this mutation is fixed, and the muttype wants substitutions, so make a substitution - // BCH 1/26/2026: note that this does NOT do baseline accumulation, because it is assumed that the - // baseline offset recorded in the file already contains such effects as needed; FIXME MULTITRAIT - // FIXME MULTITRAIT for now I assume the dominance coeff from the mutation type; needs to be added to MutationMetadataRec; likewise hemizygous dominance - // FIXME MULTITRAIT this code will also now need to handle the independent dominance case, for which NaN should be in the metadata - Substitution *sub = new Substitution(mutation_id, mutation_type_ptr, chromosome_index, position, metadata.selection_coeff_, mutation_type_ptr->DefaultDominanceForTrait(0) /* metadata.dominance_coeff_ */, metadata.subpop_index_, metadata.origin_tick_, community_.Tick(), metadata.nucleotide_); // FIXME MULTITRAIT + // BCH 1/26/2026: note that this code path does NOT do baseline accumulation, because it is assumed + // that the baseline offset recorded in the file already contains such effects as needed + Substitution *sub = new Substitution(mutation_id, mutation_type_ptr, chromosome_index, position, metadata_ptr, community_.Tick()); population_.treeseq_substitutions_map_.emplace(position, sub); population_.substitutions_.emplace_back(sub); @@ -11885,9 +12408,7 @@ void Species::__CreateMutationsFromTabulation(std::unordered_mapNewMutationFromBlock(); - // FIXME MULTITRAIT for now I assume the dominance coeff from the mutation type; needs to be added to MutationMetadataRec; likewise hemizygous dominance - // FIXME MULTITRAIT this code will also now need to handle the independent dominance case, for which NaN should be in the metadata - Mutation *new_mut = new (mut_block_ptr + new_mut_index) Mutation(mutation_id, mutation_type_ptr, chromosome_index, position, metadata.selection_coeff_, mutation_type_ptr->DefaultDominanceForTrait(0) /* metadata.dominance_coeff_ */, metadata.subpop_index_, metadata.origin_tick_, metadata.nucleotide_); // FIXME MULTITRAIT + Mutation *new_mut = new (mut_block_ptr + new_mut_index) Mutation(mutation_id, mutation_type_ptr, chromosome_index, position, metadata_ptr); // add it to our local map, so we can find it when making haplosomes, and to the population's mutation registry p_mutIndexMap[mutation_id] = new_mut_index; @@ -12233,7 +12754,7 @@ void _ComparePopulationTables(tsk_population_table_t &population0, tsk_populatio EIDOS_TERMINATION << "ERROR (_ComparePopulationTables): population table mismatch between loaded chromosomes (metadata_schema column differs)." << EidosTerminate(); } -void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq) +void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table) { // NOTE: This method handles the first (or only) chromosome being read in. A parallel method, // _InstantiateSLiMObjectsFromTables_SECONDARY(), handles the second and onward. The code is @@ -12263,7 +12784,7 @@ void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, // check/rewrite the incoming tree-seq information in various ways __CheckPopulationMetadata(p_treeseq); - __RemapSubpopulationIDs(p_subpop_map, p_treeseq, p_file_version); + __RemapSubpopulationIDs(p_subpop_map, p_treeseq, p_mut_metadata_table, p_file_version); // allocate and set up the tree_sequence object // note that this tree sequence is based upon whatever sample the file was saved with, and may contain in-sample individuals @@ -12295,7 +12816,7 @@ void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, { std::unordered_map mutInfoMap; - __TabulateMutationsFromTables(mutInfoMap, p_treeseq, p_file_version); + __TabulateMutationsFromTables(mutInfoMap, p_treeseq, p_mut_metadata_table, p_file_version); __TallyMutationReferencesWithTreeSequence(mutInfoMap, nodeToHaplosomeMap, ts); __CreateMutationsFromTabulation(mutInfoMap, mutIndexMap, p_treeseq); } @@ -12310,7 +12831,7 @@ void Species::_InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, p_treeseq.last_coalescence_state_ = false; } -void Species::_InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq) +void Species::_InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table) { // NOTE: _InstantiateSLiMObjectsFromTables() handles the first (or only) chromosome being read in. This // method handles the second and onward. The code is quite similar, and should be maintained in parallel! @@ -12341,7 +12862,7 @@ void Species::_InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_in // check/rewrite the incoming tree-seq information in various ways __CheckPopulationMetadata(p_treeseq); - __RemapSubpopulationIDs(p_subpop_map, p_treeseq, p_file_version); + __RemapSubpopulationIDs(p_subpop_map, p_treeseq, p_mut_metadata_table, p_file_version); // allocate and set up the tree_sequence object // note that this tree sequence is based upon whatever sample the file was saved with, and may contain in-sample individuals @@ -12373,7 +12894,7 @@ void Species::_InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_in { std::unordered_map mutInfoMap; - __TabulateMutationsFromTables(mutInfoMap, p_treeseq, p_file_version); + __TabulateMutationsFromTables(mutInfoMap, p_treeseq, p_mut_metadata_table, p_file_version); __TallyMutationReferencesWithTreeSequence(mutInfoMap, nodeToHaplosomeMap, ts); __CreateMutationsFromTabulation(mutInfoMap, mutIndexMap, p_treeseq); } @@ -12536,8 +13057,9 @@ slim_tick_t Species::_InitializePopulationFromTskitBinaryFile(const char *p_file slim_tick_t metadata_cycle; SLiMModelType file_model_type; int file_version; + MutationMetadataTable mut_metadata_table; - ReadTreeSequenceMetadata(treeSeqInfo, &metadata_tick, &metadata_cycle, &file_model_type, &file_version); + ReadTreeSequenceMetadata(treeSeqInfo, &metadata_tick, &metadata_cycle, &file_model_type, &file_version, mut_metadata_table); // convert ASCII derived-state data, which is the required format on disk, back to our in-memory binary format DerivedStatesFromAscii(&treeSeqInfo.tables_); @@ -12547,7 +13069,7 @@ slim_tick_t Species::_InitializePopulationFromTskitBinaryFile(const char *p_file _ReadAncestralSequence(p_file, p_chromosome); // make the corresponding SLiM objects - _InstantiateSLiMObjectsFromTables(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_map, treeSeqInfo); + _InstantiateSLiMObjectsFromTables(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_map, treeSeqInfo, mut_metadata_table); // cleanup such as handling remembered haplosomes and the individuals table, mutation tallying, and integrity checks // this incorporates all of the post-load work that spans the whole set of chromosomes in the model @@ -12676,8 +13198,9 @@ slim_tick_t Species::_InitializePopulationFromTskitDirectory(std::string p_direc slim_tick_t this_metadata_cycle; SLiMModelType this_file_model_type; int this_file_version; + MutationMetadataTable mut_metadata_table; - ReadTreeSequenceMetadata(treeSeqInfo, &this_metadata_tick, &this_metadata_cycle, &this_file_model_type, &this_file_version); + ReadTreeSequenceMetadata(treeSeqInfo, &this_metadata_tick, &this_metadata_cycle, &this_file_model_type, &this_file_version, mut_metadata_table); if (is_first_chromosome) { @@ -12705,9 +13228,9 @@ slim_tick_t Species::_InitializePopulationFromTskitDirectory(std::string p_direc // as needed. The remaining chromosomes use _InstantiateSLiMObjectsFromTables_SECONDARY(), which // checks against the population structure that was created; it should always match exactly. if (is_first_chromosome) - _InstantiateSLiMObjectsFromTables(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_remap, treeSeqInfo); + _InstantiateSLiMObjectsFromTables(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_remap, treeSeqInfo, mut_metadata_table); else - _InstantiateSLiMObjectsFromTables_SECONDARY(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_remap, treeSeqInfo); + _InstantiateSLiMObjectsFromTables_SECONDARY(p_interpreter, metadata_tick, metadata_cycle, file_model_type, file_version, p_subpop_remap, treeSeqInfo, mut_metadata_table); } // cleanup such as handling remembered haplosomes and the individuals table, mutation tallying, and integrity checks diff --git a/core/species.h b/core/species.h index a16d5b9ab..0af428475 100644 --- a/core/species.h +++ b/core/species.h @@ -38,6 +38,7 @@ #include "trait.h" #include "eidos_value.h" #include "mutation_run.h" +#include "mutation_block.h" //TREE SEQUENCE //INCLUDE JEROME's TABLES API @@ -97,14 +98,21 @@ enum class SLiMFileFormat // Note that these structs are packed, and so accesses to them and within them may be unaligned; we assume // that is OK on the platforms we run on, so as to keep file sizes down. -typedef struct __attribute__((__packed__)) { - slim_objectid_t mutation_type_id_; // 4 bytes (int32_t): the id of the mutation type the mutation belongs to - slim_effect_t selection_coeff_; // 4 bytes (float): the mutation effect (e.g., selection coefficient) - // FIXME MULTITRAIT need to add a dominance_coeff_ property here! and hemizygous_dominance_coeff_! - slim_objectid_t subpop_index_; // 4 bytes (int32_t): the id of the subpopulation in which the mutation arose - slim_tick_t origin_tick_; // 4 bytes (int32_t): the tick in which the mutation arose - int8_t nucleotide_; // 1 byte (int8_t): the nucleotide for the mutation (0='A', 1='C', 2='G', 3='T'), or -1 -} MutationMetadataRec; +typedef struct __attribute__((__packed__)) _MutationPerTraitMetadata { + slim_effect_t effect_size_; // 4 bytes (float): the mutation effect size (e.g., selection coefficient) + slim_effect_t dominance_coeff_; // 4 bytes (float): the dominance coefficient; note that NAN indicates independent dominance + slim_effect_t hemizygous_dominance_coeff_; // 4 bytes (float): the hemizygous dominance coefficient +} _MutationPerTraitMetadata; + +typedef struct __attribute__((__packed__)) MutationTableMetadataRec { + slim_mutationid_t mutation_id_; // 8 bytes (int64_t): the SLiM id of the mutation + slim_objectid_t mutation_type_id_; // 4 bytes (int32_t): the id of the mutation type the mutation belongs to + slim_objectid_t subpop_index_; // 4 bytes (int32_t): the id of the subpopulation in which the mutation arose + slim_tick_t origin_tick_; // 4 bytes (int32_t): the tick in which the mutation arose + int8_t nucleotide_; // 1 byte (int8_t): the nucleotide for the mutation (0='A', 1='C', 2='G', 3='T'), or -1 + int8_t unused_[3]; // 3 bytes (int8_t): UNUSED SPACE, PRESENTLY FOR PADDING + _MutationPerTraitMetadata per_trait_[1]; // 12 bytes per entry: 1 or more per-trait entries (count determined by the schema!) +} MutationTableMetadataRec; typedef struct __attribute__((__packed__)) { // BCH 12/10/2024: This metadata record is becoming a bit complicated, for multichromosome SLiM, and is now actually variable-length. @@ -129,6 +137,11 @@ typedef struct __attribute__((__packed__)) { // BCH 12/6/2024: type_, the chromosome type for the haplosome, has moved to top-level metadata; it is constant across a tree sequence } HaplosomeMetadataRec; +typedef struct __attribute__((__packed__)) { + slim_phenotype_t phenotype_; // 8 bytes (double): the phenotypic value for a trait + slim_trait_offset_t offset_; // 8 bytes (double): the individual offset combined in to produce a trait value +} _IndividualPerTraitMetadata; + typedef struct __attribute__((__packed__)) { slim_pedigreeid_t pedigree_id_; // 8 bytes (int64_t): the SLiM pedigree ID for this individual, assigned by pedigree rec slim_pedigreeid_t pedigree_p1_; // 8 bytes (int64_t): the SLiM pedigree ID for this individual's parent 1 @@ -137,14 +150,18 @@ typedef struct __attribute__((__packed__)) { slim_objectid_t subpopulation_id_; // 4 bytes (int32_t): the subpopulation the individual belongs to int32_t sex_; // 4 bytes (int32_t): the sex of the individual, as defined by the IndividualSex enum uint32_t flags_; // 4 bytes (uint32_t): assorted flags, see below + _IndividualPerTraitMetadata per_trait_[1]; // 16 bytes per entry: 1 or more per-trait entries (count determined by the schema!) } IndividualMetadataRec; #define SLIM_INDIVIDUAL_METADATA_MIGRATED 0x01 // set if the individual has migrated in this cycle // We double-check the size of these records to make sure we understand what they contain and how they're packed -static_assert(sizeof(MutationMetadataRec) == 17, "MutationMetadataRec is not 17 bytes!"); -static_assert(sizeof(HaplosomeMetadataRec) == 9, "HaplosomeMetadataRec is not 9 bytes!"); // but its size is dynamic at runtime -static_assert(sizeof(IndividualMetadataRec) == 40, "IndividualMetadataRec is not 40 bytes!"); +// BCH 2/11/2026: Note that all of these metadata structs are now actually variable-length; this is just a base. +static_assert(sizeof(_MutationPerTraitMetadata) == 12, "_MutationPerTraitMetadata is not 12 bytes!"); +static_assert(sizeof(MutationTableMetadataRec) == 36, "MutationTableMetadataRec is not 36 bytes!"); +static_assert(sizeof(HaplosomeMetadataRec) == 9, "HaplosomeMetadataRec is not 9 bytes!"); +static_assert(sizeof(_IndividualPerTraitMetadata) == 16, "_IndividualPerTraitMetadata is not 16 bytes!"); +static_assert(sizeof(IndividualMetadataRec) == 56, "IndividualMetadataRec is not 56 bytes!"); // We check endianness on the platform we're building on; we assume little-endianness in our read/write code, I think. #if defined(__BYTE_ORDER__) @@ -153,6 +170,13 @@ static_assert(sizeof(IndividualMetadataRec) == 40, "IndividualMetadataRec is not #endif #endif +// This struct is used when reading .trees metadata, to encapsulate the structure of the binary mutation metadata table +typedef struct { + uint8_t *table_buffer; // the base address of the table; this is not MutationTableMetadataRec* because that is variable-length + size_t row_size; // the number of bytes per row (the realized size of one MutationTableMetadataRec) + size_t row_count; // the number of rows (the number of mutations) +} MutationMetadataTable; + #pragma mark - #pragma mark Species @@ -349,6 +373,9 @@ class Species : public EidosDictionaryUnretained // actually be shared by multiple haplosomes in different chromosomes //Individual *current_new_individual_; + std::vector muts_retained_by_treeseq_; // mutations that we have retained for the tree sequence; see WriteTreeSequenceMetadata() + bool any_muts_retained_impermanently_ = false; // if false, all retained mutations are from permanently remembered individuals + #if EIDOS_ROBIN_HOOD_HASHING() typedef robin_hood::unordered_flat_map INDIVIDUALS_HASH; #elif STD_UNORDERED_MAP_HASHING() @@ -664,9 +691,6 @@ class Species : public EidosDictionaryUnretained void DisconnectCopiedSharedTables(tsk_table_collection_t &p_tables); // zeroes out the shared table copies in p_tables static void handle_error(const std::string &msg, int error) __attribute__((__noreturn__)) __attribute__((cold)) __attribute__((analyzer_noreturn)); - static void MetadataForMutation(Mutation *p_mutation, MutationMetadataRec *p_metadata); - static void MetadataForSubstitution(Substitution *p_substitution, MutationMetadataRec *p_metadata); - static void MetadataForIndividual(Individual *p_individual, IndividualMetadataRec *p_metadata); static void DerivedStatesFromAscii(tsk_table_collection_t *p_tables); static void DerivedStatesToAscii(tsk_table_collection_t *p_tables); @@ -678,6 +702,7 @@ class Species : public EidosDictionaryUnretained void RecordNewHaplosome_NULL(Haplosome *p_new_haplosome); void RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_t p_position, const std::vector &p_derived_mutations); void RetractNewIndividual(void); + void MetadataForIndividual(Individual *p_individual, IndividualMetadataRec *p_metadata); void AddIndividualsToTable(Individual * const *p_individual, size_t p_num_individuals, tsk_table_collection_t *p_tables, INDIVIDUALS_HASH *p_individuals_hash, tsk_flags_t p_flags); void AddLiveIndividualsToIndividualsTable(tsk_table_collection_t *p_tables, INDIVIDUALS_HASH *p_individuals_hash); void FixAliveIndividuals(tsk_table_collection_t *p_tables); @@ -685,7 +710,7 @@ class Species : public EidosDictionaryUnretained void WriteProvenanceTable(tsk_table_collection_t *p_tables, bool p_use_newlines, bool p_include_model, slim_chromosome_index_t p_chromosome_index); void WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosDictionaryUnretained *p_metadata_dict, slim_chromosome_index_t p_chromosome_index); void _MungeIsNullNodeMetadataToIndex0(TreeSeqInfo &p_treeseq, int original_index); - void ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_tick, slim_tick_t *p_cycle, SLiMModelType *p_model_type, int *p_file_version); + void ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_tick, slim_tick_t *p_cycle, SLiMModelType *p_model_type, int *p_file_version, MutationMetadataTable &p_mut_metadata_table); void _CreateDirectoryForMultichromArchive(std::string resolved_user_path, bool p_overwrite_directory); void WriteTreeSequence(std::string &p_recording_tree_path, bool p_simplify, bool p_include_model, EidosDictionaryUnretained *p_metadata_dict, bool p_overwrite_directory); void ReorderIndividualTable(tsk_table_collection_t *p_tables, std::vector p_individual_map, bool p_keep_unmapped); @@ -700,22 +725,41 @@ class Species : public EidosDictionaryUnretained void CheckTreeSeqIntegrity(void); // checks the tree sequence tables themselves void CrosscheckTreeSeqIntegrity(void); // checks the tree sequence tables against SLiM's data structures + inline __attribute__((always_inline)) void NotifyMutationRemoved(Mutation *p_mut) + { + // Called when a mutation is removed by script or by stacking policy; such mutations need to be retained + // since they are still present in the tree sequence, so we can persist their metadata. Note that this + // needs to happen even when just one of many copies of a mutation is removed in this way, because the + // other copies of the mutation might subsequently be lost; in that case, the retained copy might still + // be needed to represent the mutation in the tree where it used to exist, unless/until simplified away. + // See also ExecuteMethod_treeSeqRememberIndividuals(), the other place where mutations are retained. + if (RecordingTreeSequenceMutations() && !p_mut->retained_by_treeseq_) + { + MutationIndex mut_index = mutation_block_->IndexInBlock(p_mut); + muts_retained_by_treeseq_.push_back(mut_index); + any_muts_retained_impermanently_ = true; + + p_mut->Retain(); + p_mut->retained_by_treeseq_ = true; + } + } + void __CheckPopulationMetadata(TreeSeqInfo &p_treeseq); - void __RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, int p_file_version); + void __RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table, int p_file_version); void __PrepareSubpopulationsFromTables(std::unordered_map &p_subpopInfoMap, TreeSeqInfo &p_treeseq); void __TabulateSubpopulationsFromTreeSequence(std::unordered_map &p_subpopInfoMap, tsk_treeseq_t *p_ts, TreeSeqInfo &p_treeseq, SLiMModelType p_file_model_type); void __CreateSubpopulationsFromTabulation(std::unordered_map &p_subpopInfoMap, EidosInterpreter *p_interpreter, std::unordered_map &p_nodeToHaplosomeMap, TreeSeqInfo &p_treeseq); void __CreateSubpopulationsFromTabulation_SECONDARY(std::unordered_map &p_subpopInfoMap, EidosInterpreter *p_interpreter, std::unordered_map &p_nodeToHaplosomeMap, TreeSeqInfo &p_treeseq); void __ConfigureSubpopulationsFromTables(EidosInterpreter *p_interpreter, TreeSeqInfo &p_treeseq); void __ConfigureSubpopulationsFromTables_SECONDARY(EidosInterpreter *p_interpreter, TreeSeqInfo &p_treeseq); - void __TabulateMutationsFromTables(std::unordered_map &p_mutMap, TreeSeqInfo &p_treeseq, int p_file_version); + void __TabulateMutationsFromTables(std::unordered_map &p_mutMap, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table, int p_file_version); void __TallyMutationReferencesWithTreeSequence(std::unordered_map &p_mutMap, std::unordered_map p_nodeToHaplosomeMap, tsk_treeseq_t *p_ts); void __CreateMutationsFromTabulation(std::unordered_map &p_mutInfoMap, std::unordered_map &p_mutIndexMap, TreeSeqInfo &p_treeseq); void __AddMutationsFromTreeSequenceToHaplosomes(std::unordered_map &p_mutIndexMap, std::unordered_map p_nodeToHaplosomeMap, tsk_treeseq_t *p_ts, TreeSeqInfo &p_treeseq); void __CheckNodePedigreeIDs(EidosInterpreter *p_interpreter, TreeSeqInfo &p_treeseq); void _ReadAncestralSequence(const char *p_file, Chromosome &p_chromosome); - void _InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq); // given tree-seq tables, makes individuals, haplosomes, and mutations - void _InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq); // given tree-seq tables, makes individuals, haplosomes, and mutations + void _InstantiateSLiMObjectsFromTables(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table); // given tree-seq tables, makes individuals, haplosomes, and mutations + void _InstantiateSLiMObjectsFromTables_SECONDARY(EidosInterpreter *p_interpreter, slim_tick_t p_metadata_tick, slim_tick_t p_metadata_cycle, SLiMModelType p_file_model_type, int p_file_version, SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqInfo &p_treeseq, MutationMetadataTable &p_mut_metadata_table); // given tree-seq tables, makes individuals, haplosomes, and mutations void _PostInstantiationCleanup(EidosInterpreter *p_interpreter); slim_tick_t _InitializePopulationFromTskitBinaryFile(const char *p_file, EidosInterpreter *p_interpreter, SUBPOP_REMAP_HASH &p_subpop_remap, Chromosome &p_chromosome); // initialize the population from an tskit binary file slim_tick_t _InitializePopulationFromTskitDirectory(std::string p_directory, EidosInterpreter *p_interpreter, SUBPOP_REMAP_HASH &p_subpop_remap); // initialize the population from a multi-chromosome directory diff --git a/core/species_eidos.cpp b/core/species_eidos.cpp index d489c1c3b..44943643f 100644 --- a/core/species_eidos.cpp +++ b/core/species_eidos.cpp @@ -4884,6 +4884,54 @@ EidosValue_SP Species::ExecuteMethod_treeSeqRememberIndividuals(EidosGlobalStrin AddIndividualsToTable(ind_buffer, ind_count, &treeseq_[0].tables_, &tabled_individuals_hash_, flag); + // BCH 2/14/2026: With the new way we handle mutation metadata with the multitrait redesign, we need to + // retain all mutations present in the remembered individuals here. This ensures that the mutation state + // we need to write out mutation metadata at save time is still available; we get it from the mutations. + // FIXME: This could be optimized with a bulk operation on the mutation runs if it is a bottleneck. + if (RecordingTreeSequenceMutations()) + { + Mutation *mut_block_ptr = mutation_block_->mutation_buffer_; + int haplosome_count_per_individual = HaplosomeCountPerIndividual(); + + for (int ind_index = 0; ind_index < ind_count; ++ind_index) + { + Individual *ind = ind_buffer[ind_index]; + + for (int haplosome_index = 0; haplosome_index < haplosome_count_per_individual; haplosome_index++) + { + Haplosome *haplosome = ind->haplosomes_[haplosome_index]; + int mutrun_count = haplosome->mutrun_count_; + + for (int run_index = 0; run_index < mutrun_count; ++run_index) + { + const MutationRun *mutrun = haplosome->mutruns_[run_index]; + int mut_count = mutrun->size(); + const MutationIndex *mut_ptr = mutrun->begin_pointer_const(); + + for (int mut_index = 0; mut_index < mut_count; ++mut_index) + { + MutationIndex scan_mutation_index = mut_ptr[mut_index]; + Mutation *scan_mutation = mut_block_ptr + scan_mutation_index; + + if (!scan_mutation->retained_by_treeseq_) + { + // This mutation is not already retained by this mechanism; we use retained_by_treeseq_ + // to ensure that a given mutation is only retained once by this mechanism. See also + // NotifyMutationRemoved(), the other place where mutations are retained. + muts_retained_by_treeseq_.push_back(scan_mutation_index); + + if (!permanent) + any_muts_retained_impermanently_ = true; + + scan_mutation->Retain(); + scan_mutation->retained_by_treeseq_ = true; + } + } + } + } + } + } + return gStaticEidosValueVOID; } diff --git a/core/substitution.cpp b/core/substitution.cpp index 2cd4cac94..455e1b4c8 100644 --- a/core/substitution.cpp +++ b/core/substitution.cpp @@ -91,6 +91,30 @@ mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_ #endif } +Substitution::Substitution(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, MutationTableMetadataRec *p_metadata_ptr, slim_tick_t p_fixation_tick) : +mutation_type_ptr_(p_mutation_type_ptr), position_(p_position), subpop_index_(p_metadata_ptr->subpop_index_), origin_tick_(p_metadata_ptr->origin_tick_), fixation_tick_(p_fixation_tick), chromosome_index_(p_chromosome_index), nucleotide_(p_metadata_ptr->nucleotide_), mutation_id_(p_mutation_id), tag_value_(SLIM_TAG_UNSET_VALUE) +{ + // This constructor is called by Species::__CreateMutationsFromTabulation() to construct a mutation directly + // from a MutationTableMetadataRec pointer into the tree sequence's mutation metadata table + Species &species = mutation_type_ptr_->species_; + slim_trait_index_t trait_count = species.TraitCount(); + + trait_info_ = (SubstitutionTraitInfo *)malloc(trait_count * sizeof(SubstitutionTraitInfo)); + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; trait_index++) + { + trait_info_[trait_index].effect_size_ = p_metadata_ptr->per_trait_[trait_index].effect_size_; + trait_info_[trait_index].dominance_coeff_UNSAFE_ = p_metadata_ptr->per_trait_[trait_index].dominance_coeff_; + trait_info_[trait_index].hemizygous_dominance_coeff_ = p_metadata_ptr->per_trait_[trait_index].hemizygous_dominance_coeff_; + } + + EvaluateFlags(); + +#if DEBUG + SelfConsistencyCheck(" in Substitution::Substitution()"); +#endif +} + void Substitution::EvaluateFlags(void) { // This mirrors Substitution::SelfConsistencyCheck(); essentially it sets up flags as expected by that method. diff --git a/core/substitution.h b/core/substitution.h index b8112e3cb..50aed8018 100644 --- a/core/substitution.h +++ b/core/substitution.h @@ -35,6 +35,7 @@ #include "eidos_value.h" class Trait; +struct MutationTableMetadataRec; class Substitution_Class; @@ -90,6 +91,7 @@ class Substitution : public EidosDictionaryRetained Substitution(void) = delete; // no null construction Substitution(Mutation &p_mutation, slim_tick_t p_fixation_tick); // construct from the mutation that has fixed, and the tick in which it fixed Substitution(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, slim_effect_t p_selection_coeff, slim_effect_t p_dominance_coeff, slim_objectid_t p_subpop_index, slim_tick_t p_tick, slim_tick_t p_fixation_tick, int8_t p_nucleotide); + Substitution(slim_mutationid_t p_mutation_id, MutationType *p_mutation_type_ptr, slim_chromosome_index_t p_chromosome_index, slim_position_t p_position, MutationTableMetadataRec *p_metadata_ptr, slim_tick_t p_fixation_tick); inline virtual ~Substitution(void) override { free(trait_info_); trait_info_ = nullptr; } diff --git a/core/trait.cpp b/core/trait.cpp index 93fb01e5d..30997c325 100644 --- a/core/trait.cpp +++ b/core/trait.cpp @@ -242,9 +242,7 @@ void Trait::SetProperty(EidosGlobalStringID p_property_id, const EidosValue &p_v if (!std::isfinite(value)) EIDOS_TERMINATION << "ERROR (Trait::SetProperty): property individualOffsetMean requires a finite value (not NAN or INF)." << EidosTerminate(); - individualOffsetDistributionMean_ = value; - _RecacheIndividualOffsetDistribution(); - IndividualOffsetChanged(); + SetIndividualOffsetDistributionMean(value); return; } case gID_individualOffsetSD: @@ -254,9 +252,7 @@ void Trait::SetProperty(EidosGlobalStringID p_property_id, const EidosValue &p_v if (!std::isfinite(value) || (value < 0.0)) EIDOS_TERMINATION << "ERROR (Trait::SetProperty): property individualOffsetSD requires a nonnegative finite value (not NAN or INF)." << EidosTerminate(); - individualOffsetDistributionSD_ = value; - _RecacheIndividualOffsetDistribution(); - IndividualOffsetChanged(); + SetIndividualOffsetDistributionSD(value); return; } case gID_tag: diff --git a/core/trait.h b/core/trait.h index 3a3db1861..34894682b 100644 --- a/core/trait.h +++ b/core/trait.h @@ -154,10 +154,26 @@ class Trait : public EidosDictionaryRetained void _RecacheIndividualOffsetDistribution(void); // caches individualOffsetDistributionFixed_ and individualOffsetDistributionFixedValue_ slim_trait_offset_t _DrawIndividualOffset(void) const; // draws from the distribution defined by individualOffsetDistributionMean_ and individualOffsetDistributionSD_ inline __attribute__((always_inline)) slim_trait_offset_t DrawIndividualOffset(void) const { return (individualOffsetDistributionFixed_) ? individualOffsetDistributionFixedValue_ : _DrawIndividualOffset(); } - //inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionMean(void) const { return individualOffsetDistributionMean_; } // a bit dangerous because of the exp() post-transform; probably nobody should use this - inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionSD(void) const { return individualOffsetDistributionSD_; } + inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionMean(void) const { return individualOffsetDistributionMean_; } // NOTE: this is the mean before the exp() post-transform + inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionSD(void) const { return individualOffsetDistributionSD_; } // NOTE: this is the SD before the exp() post-transform inline __attribute__((always_inline)) void IndividualOffsetChanged(void) { individualOffsetEverOverridden_ = true; } inline __attribute__((always_inline)) bool IndividualOffsetEverChanged(void) { return individualOffsetEverOverridden_; } + inline void SetIndividualOffsetDistributionMean(slim_trait_offset_t p_mean) { + if (p_mean != individualOffsetDistributionMean_) + { + individualOffsetDistributionMean_ = p_mean; + _RecacheIndividualOffsetDistribution(); + IndividualOffsetChanged(); + } + } + inline void SetIndividualOffsetDistributionSD(slim_trait_offset_t p_SD) { + if (p_SD != individualOffsetDistributionSD_) + { + individualOffsetDistributionSD_ = p_SD; + _RecacheIndividualOffsetDistribution(); + IndividualOffsetChanged(); + } + } inline __attribute__((always_inline)) bool HasDirectFitnessEffect(void) const { return directFitnessEffect_; } inline __attribute__((always_inline)) bool HasBaselineAccumulation(void) const { return baselineAccumulation_; } diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 324896d55..d60be558f 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -894,6 +894,19 @@ bool Eidos_RegexWorks(void); // Checks that symbol_name does not contain any illegal Unicode characters; used to check identifiers, in particular bool Eidos_ContainsIllegalUnicode(const std::string &symbol_name); +// little-endian write of a uint64_t to an address; taken from tskit/test_core.c in PR https://github.com/tskit-dev/tskit/pull/3306 +inline void Eidos_set_u64_le(uint8_t *dest, uint64_t value) +{ + dest[0] = (uint8_t)(value & 0xFF); + dest[1] = (uint8_t)((value >> 8) & 0xFF); + dest[2] = (uint8_t)((value >> 16) & 0xFF); + dest[3] = (uint8_t)((value >> 24) & 0xFF); + dest[4] = (uint8_t)((value >> 32) & 0xFF); + dest[5] = (uint8_t)((value >> 40) & 0xFF); + dest[6] = (uint8_t)((value >> 48) & 0xFF); + dest[7] = (uint8_t)((value >> 56) & 0xFF); +} + // ******************************************************************************************************************* // diff --git a/treerec/implementation.md b/treerec/implementation.md index e67e8cf98..0588b25c8 100644 --- a/treerec/implementation.md +++ b/treerec/implementation.md @@ -207,10 +207,10 @@ for instance, individual->node->population.metadata["slim_id"] should always equal individual.metadata["subpopulation"] . We would also need to ensure that the slim_id's were always unique. -## Metadata schemas +## Metadata and schemas tskit provides methods for structured metadata decoding using [JSON schemas](https://json-schema.org/understanding-json-schema/index.html), -to document what the metadata means. +to document what the metadata means (https://tskit.dev/tskit/docs/stable/metadata.html#). We don't make use of these, but write them to the tree sequence for tskit use. There's both top-level metadata (ie for the whole tree sequence) and metadata for every row in every table. @@ -229,6 +229,10 @@ Nonetheless, we only actually do JSON parsing and writing in SLiM's code for top for all the schemas (including the top-level metadata schema) we just write out the string representation, as output by tskit, saved in `slim_globals.h`. +It is worth noting that several of SLiM'm metadata schemas in SLiM 5.0 and later are actually (slightly) dynamically generated. In SLiM's code these schemas are denoted by the suffix `_schema_FORMAT` in their variable name, because the schema string has substrings named things like `"%d"`, `"%d1"`, and `"%d2"` that get replaced at runtime. This is used to control aspects of the schema that are fixed within one tree sequence but vary across different tree sequences, such as the number of chromosomes or the number of traits. Different tree sequences from SLiM will therefore not have exactly the same schemas, and indeed, the size of their metadata records may vary. + +In SLiM 5.2 and later, with the addition of support for multiple traits, keeping the mutation metadata in tskit's mutation table got rather unwieldy because the size of the metadata got larger (due to information being kept about each mutation's effect size, dominance, etc., for each trait). The pre-5.2 way or storing mutation metadata was to put a new copy of it into the mutation table each time that a new derived state was recorded. In some models, that resulting in a big pile of duplicated data in the form of snapshots of the metadata for the same mutation at multiple time points. Now, instead, SLiM keeps references to all of the mutations that are still active in the tree sequence, and it outputs a single copy of each one's metadata. This is not well-suited to the mutation table, so this metadata now goes in a separate table of mutation metadata that SLiM puts in the top-level metadata, in binary form, using tskit's new `json+struct` codec. Any mutations (or substitutions) that are currently segregating in the population are placed in this table. In addition there is a special facility to ensure the preservation of mutations that (a) are in the haplosomes of remembered individuals, or (b) were removed from a haplosome by script or by the stacking policy. Such mutations might no longer be segregating in SLiM, but they might still be in the tree sequence even after simplification. There is a vector of such mutations in Species, called `muts_retained_by_treeseq_`, that gets added to the top-level mutation metadata table at save. For efficiency, each mutation also has a flag, `retained_by_treeseq_`, that tracks whether that mutation has been retained in that vector; this allows this scheme to operate quite efficiently – preventing duplicate entries from ever being added, efficiently filtering out entries that are lost during simplification, and making it very easy to merge the different sets of mutations at save time. + In the future we may want to *keep* whatever top-level metadata there is in the tree sequence already (and the associated keys in the schema). We've not done that yet because making things exactly match seems like a pain, diff --git a/treerec/tskit/core.c b/treerec/tskit/core.c index 0f31550a7..e1aa24626 100644 --- a/treerec/tskit/core.c +++ b/treerec/tskit/core.c @@ -33,6 +33,9 @@ #include #define UUID_NUM_BYTES 16 +#define TSK_JSON_BINARY_HEADER_SIZE 21 + +static const uint8_t TSK_JSON_BINARY_MAGIC[4] = { 'J', 'B', 'L', 'B' }; #if defined(_WIN32) @@ -95,6 +98,22 @@ get_random_bytes(uint8_t *buf) #endif +static uint64_t +tsk_load_u64_le(const uint8_t *p) +{ + uint64_t value; + + value = (uint64_t) p[0]; + value |= (uint64_t) p[1] << 8; + value |= (uint64_t) p[2] << 16; + value |= (uint64_t) p[3] << 24; + value |= (uint64_t) p[4] << 32; + value |= (uint64_t) p[5] << 40; + value |= (uint64_t) p[6] << 48; + value |= (uint64_t) p[7] << 56; + return value; +} + /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. @@ -121,6 +140,69 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) out: return ret; } + +// BCH 2/14/2026: MODIFIED THIS! I've removed const from this API because I want to modify the metadata that +// it returns. This change has been proposed in https://github.com/tskit-dev/tskit/pull/3306. +int +tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, uint8_t **blob, + tsk_size_t *blob_length) +{ + int ret; + uint8_t version; + uint64_t json_length_u64; + uint64_t binary_length_u64; + uint64_t header_and_json_length; + uint64_t total_length; + uint8_t *bytes; + uint8_t *blob_start; + const char *json_start; + + if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL + || blob_length == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + bytes = (uint8_t *) metadata; + if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + if (memcmp(bytes, TSK_JSON_BINARY_MAGIC, sizeof(TSK_JSON_BINARY_MAGIC)) != 0) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + version = bytes[4]; + if (version != 1) { + ret = tsk_trace_error(TSK_ERR_FILE_VERSION_TOO_NEW); + goto out; + } + json_length_u64 = tsk_load_u64_le(bytes + 5); + binary_length_u64 = tsk_load_u64_le(bytes + 13); + if (json_length_u64 > UINT64_MAX - (uint64_t) TSK_JSON_BINARY_HEADER_SIZE) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + header_and_json_length = (uint64_t) TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + if (binary_length_u64 > UINT64_MAX - header_and_json_length) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + total_length = header_and_json_length + binary_length_u64; + if ((uint64_t) metadata_length < total_length) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + json_start = (const char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; + blob_start = bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + *json = json_start; + *json_length = (tsk_size_t) json_length_u64; + *blob = blob_start; + *blob_length = (tsk_size_t) binary_length_u64; + ret = 0; +out: + return ret; +} static const char * tsk_strerror_internal(int err) { diff --git a/treerec/tskit/core.h b/treerec/tskit/core.h index 481905b7a..0e12f663c 100644 --- a/treerec/tskit/core.h +++ b/treerec/tskit/core.h @@ -1096,6 +1096,31 @@ bool tsk_isfinite(double val); #define TSK_UUID_SIZE 36 int tsk_generate_uuid(char *dest, int flags); +/** +@brief Extract the binary payload from ``json+struct`` encoded metadata. + +@rst +Metadata produced by :py:class:`tskit.metadata.JSONStructCodec` consists of a fixed-size +header followed by canonical JSON bytes and an optional binary payload. This helper +validates the framing, returning pointers to the embedded JSON and binary sections +without copying. + +The output pointers reference memory owned by the caller and remain valid only while +the original metadata buffer is alive. +@endrst + +@param[in] metadata Pointer to the encoded metadata bytes. +@param[in] metadata_length Number of bytes available at ``metadata``. +@param[out] json On success, set to the start of the JSON bytes. +@param[out] json_length On success, set to the JSON length in bytes. +@param[out] blob On success, set to the start of the binary payload. +@param[out] blob_length On success, set to the payload length in bytes. +@return 0 on success, or a :ref:`TSK_ERR ` code on failure. +*/ +int tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, uint8_t **blob, + tsk_size_t *blob_length); + /* TODO most of these can probably be macros so they compile out as no-ops. * Lets do the 64 bit tsk_size_t switch first though. */ void *tsk_malloc(tsk_size_t size);