2 changes: 1 addition & 1 deletion .github/workflows/integration-tests-mssql.yml
@@ -9,7 +9,7 @@ on:

jobs:
integration-tests:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
services:
mssql:
image: mcr.microsoft.com/mssql/server:2019-latest
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests-mysql.yml
@@ -9,7 +9,7 @@ on:

jobs:
integration-tests:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
services:
mysql:
image: mysql:8.0
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests-postgres.yml
@@ -9,7 +9,7 @@ on:

jobs:
integration-tests:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
container: python:3.12-bookworm
services:
postgres:
6 changes: 4 additions & 2 deletions docs/how_it_works.md
@@ -123,8 +123,10 @@ original one.

### Recursive XSD

Recursive XML schemas are not supported, because most of the time they will result in cycles in foreign key constraints
dependencies, which we cannot handle easily.
Recursive XML schemas are not fully supported, because they result in cycles in table dependencies, which would make
the process much more complex. Whenever a field that would introduce a dependency cycle is detected in the XSD, it is
discarded with a warning, which means that the corresponding data in XML files will not be imported. The rest of the
data is processed normally.
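
For illustration, here is a minimal sketch of what this looks like from the calling side. The schema file name `orders.xsd`, its contents and the recursive type name are hypothetical, and the exact log text may differ; the `DataModel(xsd_file=..., short_name=...)` signature and the recursion warning come from `src/xml2db/model.py` in this changeset.

```python
import logging

# Assuming DataModel is exported at the package root
from xml2db import DataModel

logging.basicConfig(level=logging.WARNING)

# "orders.xsd" is a hypothetical schema in which an element's complex type
# references itself (e.g. an <item> element that may contain nested <item> children).
model = DataModel(
    xsd_file="orders.xsd",
    short_name="orders",  # optional; defaults to "DocumentRoot"
)

# While parsing the XSD, the self-referencing field is skipped and a warning
# similar to "type 'ItemType' contains a recursive definition" is logged.
# The rest of the model is still built, so non-recursive data can be imported.
print(model.root_table)
```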

### Mixed content elements

34 changes: 25 additions & 9 deletions src/xml2db/document.py
@@ -171,17 +171,24 @@ def _extract_node(
record["xml2db_row_number"] = row_number

# build record from fields for columns and n-1 relations
for field_type, key, _ in model_table.fields:
for field_type, key, field in model_table.fields:
if field_type == "col":
if key in content:
content_key = (
(f"{key[:-5]}__attr" if field.has_suffix else f"{key}__attr")
if field.is_attr
else key
)
if content_key in content:
if model_table.columns[key].data_type in ["decimal", "float"]:
val = [float(v) for v in content[key]]
val = [float(v) for v in content[content_key]]
elif model_table.columns[key].data_type == "integer":
val = [int(v) for v in content[key]]
val = [int(v) for v in content[content_key]]
elif model_table.columns[key].data_type == "boolean":
val = [v == "true" or v == "1" for v in content[key]]
val = [
v == "true" or v == "1" for v in content[content_key]
]
else:
val = content[key]
val = content[content_key]

if len(val) == 1:
record[key] = val[0]
@@ -320,20 +327,29 @@ def _build_node(node_type: str, node_pk: int) -> tuple:
record = data_index[node_type]["records"][node_pk]
for field_type, rel_name, rel in tb.fields:
if field_type == "col" and record[rel_name] is not None:
content_key = (
(
f"{rel_name[:-5]}__attr"
if rel.has_suffix
else f"{rel_name}__attr"
)
if rel.is_attr
else rel_name
)
if rel.data_type in [
"decimal",
"float",
]: # remove trailing ".0" for decimal and float
content[rel_name] = [
content[content_key] = [
value.rstrip("0").rstrip(".") if "." in value else value
for value in str(record[rel_name]).split(",")
]
elif isinstance(record[rel_name], datetime.datetime):
content[rel_name] = [
content[content_key] = [
record[rel_name].isoformat(timespec="milliseconds")
]
else:
content[rel_name] = (
content[content_key] = (
list(csv.reader([str(record[rel_name])], escapechar="\\"))[
0
]
106 changes: 61 additions & 45 deletions src/xml2db/model.py
@@ -70,7 +70,7 @@ class DataModel:
def __init__(
self,
xsd_file: str,
short_name: str = None,
short_name: str = "DocumentRoot",
long_name: str = None,
base_url: str = None,
model_config: dict = None,
@@ -226,8 +226,7 @@ def _build_model(self):
"""
# parse the XML schema recursively and hold a reference to the head table
root_table = self._parse_tree(
self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema,
is_root_table=True,
self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema
)
self.root_table = root_table.type_name
# compute a text representation of the original data model and store it
@@ -273,9 +272,7 @@ def _build_model(self):
for tb in self.fk_ordered_tables:
tb.build_sqlalchemy_tables()

def _parse_tree(
self, parent_node: xmlschema.XsdElement, is_root_table: bool = False
):
def _parse_tree(self, parent_node: xmlschema.XsdElement, nodes_path: list = None):
"""Parse a node of an XML schema recursively and create a target data model without any simplification

We parse the XSD tree recursively to create for each node (basically a complex type in the XSD) an equivalent \
@@ -289,7 +286,7 @@ def _parse_tree(

Args:
parent_node: the current XSD node being parsed
is_root_table: True if this is the root table
nodes_path: a list of nodes types from the root node
"""

# find current node type and name and returns corresponding table if it already exists
@@ -301,12 +298,16 @@ def _parse_tree(
if parent_type is None:
parent_type = parent_node.local_name

nodes_path = (nodes_path if nodes_path else []) + [parent_type]

# if this type has already been encountered, stop here and return existing table
if parent_type in self.tables:
parent_table = self.tables[parent_type]
return parent_table

# elements names and types should be bijective. If an element name is used for different types,
# For database tables we use element names rather than XSD types, under the assumption that they are often
# more meaningful given that they are the ones that appear in XML documents. However, the same name can be used
# for different XSD types, so if an element name is used for different types,
# we add a suffix to the name to make it unique again (using a dict to keep the name/type association)
parent_name = (
parent_node.local_name
@@ -324,7 +325,7 @@ def _parse_tree(
parent_table = self._create_table_model(
parent_name,
parent_type,
is_root_table,
len(nodes_path) == 1,
isinstance(parent_node, xmlschema.XMLSchema),
)
self.tables[parent_type] = parent_table
@@ -363,6 +364,13 @@ def recurse_parse_simple_type(elem_type):
if elem_type.base_type
else recurse_parse_simple_type(elem_type.member_types)
)
if elem_type.is_list():
return (
"string",
0,
None,
elem_type.allow_empty,
)
if elem_type.is_restriction():
dt = elem_type.base_type.local_name
mil = elem_type.min_length
Expand All @@ -384,7 +392,12 @@ def recurse_parse_simple_type(elem_type):
else None
)
ae = ae and bt_ae if ae is not None and bt_ae is not None else None
if elem_type.enumeration is not None and dt in ["string", "NMTOKEN", "duration", "token"]:
if elem_type.enumeration is not None and dt in [
"string",
"NMTOKEN",
"duration",
"token",
]:
mil = min([len(val) for val in elem_type.enumeration])
mal = max([len(val) for val in elem_type.enumeration])
return dt, mil, mal, ae
@@ -410,25 +423,31 @@ def get_occurs(particle):
),
]

# go through item attributes and add them as columns
# go through item attributes and add them as columns, adding a suffix if an element with the same name exists
children_names = None
for attrib_name, attrib in parent_node.attributes.items():
if children_names is None:
children_names = [child.local_name for child in parent_node]
(
data_type,
min_length,
max_length,
allow_empty,
) = recurse_parse_simple_type([attrib.type])
suffix = attrib_name in children_names
parent_table.add_column(
f"{attrib_name}",
f"{attrib_name}{'_attr' if suffix else ''}",
data_type,
[0, 1],
min_length,
max_length,
True,
suffix,
False,
allow_empty,
None,
)

nested_containers = []
# go through the children to add either columns or relations to the current element
for child in parent_node:
@@ -454,6 +473,7 @@ def get_occurs(particle):
if child.parent
and child.parent.max_occurs != 1
and child.parent.model != "choice"
and child.max_occurs == 1
else None
),
)
@@ -482,32 +502,39 @@ def get_occurs(particle):
max_length,
False,
False,
False,
allow_empty,
nested_containers[-1][1],
)

elif ct.is_complex():
child_table = self._parse_tree(child)
child_table.model_group = (
"choice"
if ct.model_group and ct.model_group.model == "choice"
else "sequence"
)
occurs = get_occurs(child)
if child.is_single():
parent_table.add_relation_1(
child.local_name,
child_table,
occurs,
nested_containers[-1][1],
# ignoring recursive definitions by skipping these fields
if child.type.local_name in nodes_path:
logger.warning(
f"type '{child.type.local_name}' contains a recursive definition"
)
else:
parent_table.add_relation_n(
child.local_name,
child_table,
occurs,
nested_containers[-1][1],
child_table = self._parse_tree(child, nodes_path)
child_table.model_group = (
"choice"
if ct.model_group and ct.model_group.model == "choice"
else "sequence"
)
occurs = get_occurs(child)
if occurs[1] == 1:
parent_table.add_relation_1(
child.local_name,
child_table,
occurs,
nested_containers[-1][1],
)
else:
parent_table.add_relation_n(
child.local_name,
child_table,
occurs,
nested_containers[-1][1],
)
else:
raise ValueError("unknown case; please check")
else:
@@ -534,6 +561,7 @@ def get_occurs(particle):
min_length,
max_length,
False,
False,
True,
allow_empty,
None,
@@ -544,31 +572,19 @@ def get_occurs(particle):
def _repr_tree(
self,
parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
visited_nodes: Union[set, None] = None,
):
"""Build a text representation of the data model tree

Args:
parent_table: the current data model table object
"""
if visited_nodes is None:
visited_nodes = set()
else:
visited_nodes = {item for item in visited_nodes}
visited_nodes.add(parent_table.name)
for field_type, name, field in parent_table.fields:
if field_type == "col":
yield f"{field.name}{field.occurs}: {field.data_type}"
elif field_type == "rel1":
mg = " (choice)" if field.other_table.model_group == "choice" else ""
yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
if field.other_table.name not in visited_nodes:
for line in self._repr_tree(field.other_table, visited_nodes):
yield f" {line}"
elif field_type == "reln":
else:
mg = " (choice)" if field.other_table.model_group == "choice" else ""
yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
for line in self._repr_tree(field.other_table, visited_nodes):
yield f"{field.name}{field.occurs}{mg}:"
for line in self._repr_tree(field.other_table):
yield f" {line}"

def get_entity_rel_diagram(self, text_context: bool = True) -> str: