diff --git a/README.md b/README.md index 2e55ae9..e5a2b52 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,11 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD - `--compression` - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format. - `--format` - - Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format. + - Enables on-the-fly RDF and tabular format conversion during download (Layer 2 and Layer 3). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Within the same equivalence class (e.g. turtle to ntriples) conversion is lossless. Across classes, some conversions require one of the flags below: RDF triples to quads needs `--graph-name`, and CSV/TSV to RDF triples needs `--base-uri`. +- `--graph-name` + - Required when converting RDF triples to a quad format (e.g. turtle to nquads). Assigns all triples to the specified named graph URI. Example: `--format nquads --graph-name https://example.org/mygraph`. +- `--base-uri` + - Required when converting CSV/TSV to RDF triples. Used as the base for constructing subject URIs from CSV row identifiers. Example: `--format ntriples --base-uri https://example.org/data/`. 
- `--validate-checksum` - Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted. @@ -296,6 +300,24 @@ databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz ``` +**Download with Mapping Conversion (Layer 3)**: convert across format classes — between RDF triples, RDF quads, and tabular data. +```bash +# RDF Triples -> RDF Quads (requires --graph-name) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format nquads --graph-name https://example.org/mygraph + +# RDF Quads -> RDF Triples (splits into one file per named graph, in a subdirectory) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format turtle + +# RDF Triples -> CSV (produces a companion .meta.json preserving datatypes/language tags) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv + +# CSV -> RDF Triples (requires --base-uri; lossless if companion .meta.json is present) +databusclient download https://databus.dbpedia.org/dbpedia/some-tabular-dataset/2022.12.01/data.csv --format ntriples --base-uri https://example.org/data/ + +# RDF Quads -> CSV (adds a 'graph' column) +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format csv +``` + ### Deploy diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 4d96a9b..bd71555 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -18,7 +18,14 @@ 
get_databus_id_parts_from_file_url, compute_sha256_and_length, ) -from databusclient.filehandling.format import convert_file, get_converted_filename +from databusclient.filehandling.format import ( + convert_file, + get_converted_filename, + normalize_format, + get_format_class, + detect_format_from_filename, + FORMAT_TO_EXTENSION, +) # Compression format mappings COMPRESSION_EXTENSIONS = { @@ -316,6 +323,8 @@ def _download_file( client_id=None, compression=None, convert_format=None, + graph_name=None, + base_uri=None, validate_checksum: bool = False, expected_checksum: str | None = None, ) -> None: @@ -331,6 +340,8 @@ def _download_file( compression: Target compression format for on-the-fly conversion. Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. expected_checksum: The expected checksum of the file. """ @@ -555,10 +566,6 @@ def _download_file( # already matches target format, skip decompression and conversion # entirely — no work needed for the format part. if needs_format_conversion and source_compression is not None: - from databusclient.filehandling.format import ( - detect_format_from_filename, - normalize_format, - ) detected_input_format = detect_format_from_filename(file) normalized_target = normalize_format(convert_format) if detected_input_format == normalized_target: @@ -599,10 +606,57 @@ def _download_file( conversion_input_path = temp_decompressed_path - # Convert format on uncompressed input. + # Determine whether this is a Quad -> Triple (Layer 3) conversion. 
+ # This direction produces multiple output files (one per named + # graph) written into a subdirectory, rather than a single file — + # so it is handled separately from the standard single-file path + # below (no recompression, no single-file delete-and-replace). + normalized_convert_format = normalize_format(convert_format) + target_class = get_format_class(normalized_convert_format) + source_format_for_mapping = detect_format_from_filename(conversion_input_path) + source_class_for_mapping = ( + get_format_class(source_format_for_mapping) + if source_format_for_mapping else None + ) + is_quad_to_triple = ( + source_class_for_mapping == "quads" and target_class == "triples" + ) + + if is_quad_to_triple: + # Output directory name = original filename with compression and + # format extensions stripped (e.g. "data.nq.gz" -> "data"). + output_stem = get_converted_filename(file, convert_format) + target_ext = FORMAT_TO_EXTENSION.get(normalized_convert_format, "") + if target_ext and output_stem.lower().endswith(target_ext): + output_stem = output_stem[: -len(target_ext)] + output_dir = os.path.join(localDir, output_stem) + + convert_file( + conversion_input_path, + output_dir, + convert_format, + graph_name=graph_name, + base_uri=base_uri, + ) + + # Delete the original downloaded (possibly compressed) file — + # the split output directory replaces it. + if os.path.exists(filename): + os.remove(filename) + print(f"Removed original file: {os.path.basename(filename)}") + return + + # Standard single-output-file path (Layer 2, and the remaining + # Layer 3 directions: Triple->Quad, Triple<->TSD, Quad->TSD). 
converted_basename = get_converted_filename(file, convert_format) converted_uncompressed_path = os.path.join(localDir, converted_basename) - convert_file(conversion_input_path, converted_uncompressed_path, convert_format) + convert_file( + conversion_input_path, + converted_uncompressed_path, + convert_format, + graph_name=graph_name, + base_uri=base_uri, + ) # Delete the original downloaded file after successful format conversion, # unless the converted output is the same file (same format, same path). @@ -612,12 +666,18 @@ def _download_file( print(f"Removed original file: {os.path.basename(filename)}") # Recompress converted output when needed. + # Four cases: + # 1. Source was compressed + --compression given -> use target compression + # 2. Source was compressed, no --compression given -> recompress with original + # 3. Source was NOT compressed + --compression given -> compress the output + # 4. Source was NOT compressed, no --compression given -> no compression if source_compression is not None: if should_convert_compression and compression: final_compression = compression else: final_compression = source_compression - elif should_convert_compression and compression: + elif compression: + # Source was uncompressed but user explicitly requested --compression final_compression = compression else: final_compression = None @@ -651,6 +711,8 @@ def _download_files( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -665,6 +727,8 @@ def _download_files( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). 
validate_checksum: Whether to validate checksums after downloading. checksums: Dictionary mapping URLs to their expected checksums. """ @@ -681,11 +745,12 @@ def _download_files( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, expected_checksum=expected, ) - def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str: """Get SPARQL query of collection members from databus collection URI. @@ -829,6 +894,8 @@ def _download_collection( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -843,6 +910,8 @@ def _download_collection( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ query = _get_sparql_query_of_collection(uri, databus_key=databus_key) @@ -864,6 +933,8 @@ def _download_collection( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -878,6 +949,8 @@ def _download_version( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -891,6 +964,8 @@ def _download_version( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. 
convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -911,6 +986,8 @@ def _download_version( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums, ) @@ -926,6 +1003,8 @@ def _download_artifact( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -940,6 +1019,8 @@ def _download_artifact( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -966,6 +1047,8 @@ def _download_artifact( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums, ) @@ -1042,6 +1125,8 @@ def _download_group( client_id: str = None, compression: str = None, convert_format: str = None, + graph_name: str = None, + base_uri: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -1056,6 +1141,8 @@ def _download_group( client_id: Client ID for token exchange. compression: Target compression format for on-the-fly conversion. 
convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) @@ -1072,6 +1159,8 @@ def _download_group( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) @@ -1121,6 +1210,8 @@ def download( client_id="vault-token-exchange", compression=None, convert_format=None, + graph_name=None, + base_uri=None, validate_checksum: bool = False, ) -> None: """Download datasets from databus. @@ -1136,8 +1227,10 @@ def download( auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). - Source compression is auto-detected from the file extension. + Source compression is auto-detected from the file extension. convert_format: Target RDF/tabular format for on-the-fly conversion. + graph_name: Named graph URI for Triple -> Quad conversion (Layer 3). + base_uri: Base URI for CSV -> Triple conversion (Layer 3). validate_checksum: Whether to validate checksums after downloading. 
""" for databusURI in databusURIs: @@ -1167,6 +1260,8 @@ def download( client_id, compression, convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif file is not None: @@ -1188,6 +1283,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1202,6 +1299,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1218,6 +1317,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1234,6 +1335,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) elif account is not None: @@ -1272,6 +1375,8 @@ def download( client_id=client_id, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index 50f0766..277d0d6 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -208,6 +208,20 @@ def deploy( "Accepts full names (ntriples, turtle, rdf-xml, nquads, trig, trix, json-ld, csv, tsv) " "or short aliases (nt, ttl, rdf, xml, nq, jsonld).", ) +@click.option( + "--graph-name", + "graph_name", + default=None, + help="Named graph URI for Triple -> Quad conversion (Layer 3). 
" + "Required when converting RDF triple formats to quad formats.", +) +@click.option( + "--base-uri", + "base_uri", + default=None, + help="Base URI for CSV -> RDF Triple conversion (Layer 3). " + "Required when converting CSV/TSV to RDF triple formats.", +) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" ) @@ -222,6 +236,8 @@ def download( clientid, compression, convert_format, + graph_name, + base_uri, validate_checksum, ): """ @@ -240,10 +256,15 @@ def download( client_id=clientid, compression=compression, convert_format=convert_format, + graph_name=graph_name, + base_uri=base_uri, validate_checksum=validate_checksum, ) except DownloadAuthError as e: raise click.ClickException(str(e)) + except ValueError as e: + raise click.ClickException(str(e)) + @app.command() diff --git a/databusclient/filehandling/format.py b/databusclient/filehandling/format.py index 7c40109..262f61e 100644 --- a/databusclient/filehandling/format.py +++ b/databusclient/filehandling/format.py @@ -479,6 +479,8 @@ def convert_file( input_file: str, output_file: str, convert_format: str, + graph_name: str = None, + base_uri: str = None, ) -> None: """Main conversion dispatcher called from the download pipeline. @@ -489,10 +491,19 @@ def convert_file( Accepts both canonical format names and short aliases (e.g. 'nt' for 'ntriples', 'ttl' for 'turtle'). See normalize_format() for full list. + For Layer 3 cross-class conversions: + - Triple -> Quad requires graph_name (--graph-name ). + - CSV -> Triple requires base_uri (--base-uri ). + - Quad -> Triple produces multiple files in a subdirectory; output_file + is used as the subdirectory path. + Args: input_file: Path to the input file (must be decompressed). output_file: Path to write the converted output file. + For Quad -> Triple, this is the output subdirectory path. convert_format: Target format name or alias (CLI format string). + graph_name: Named graph URI for Triple -> Quad conversion. 
+ base_uri: Base URI for CSV -> Triple conversion. Raises: ValueError: If input format cannot be detected or conversion @@ -513,8 +524,6 @@ def convert_file( if input_format == convert_format: # Input and target format are identical. # Copy input to output path so the caller always receives an output file. - # This is important for the download pipeline which expects an output - # file to exist after convert_file() returns — e.g. for recompression. if input_file != output_file: shutil.copy2(input_file, output_file) print( @@ -542,15 +551,45 @@ def convert_file( ) return - # --- Layer 3: cross-class (prototype only) --- + # --- Layer 3: cross-class --- + from databusclient.filehandling import mapping as _mapping + + # Triple -> Quad + if input_class == "triples" and output_class == "quads": + _mapping.convert_triples_to_quads( + input_file, output_file, input_format, convert_format, graph_name + ) + return + + # Quad -> Triple (output_file used as output subdirectory) + if input_class == "quads" and output_class == "triples": + _mapping.convert_quads_to_triples( + input_file, output_file, input_format, convert_format + ) + return + + # Triple -> TSD if input_class == "triples" and output_class == "tabular": - from databusclient.filehandling.mapping import convert_rdf_to_csv + _mapping.convert_rdf_to_csv( + input_file, output_file, input_format, convert_format + ) + return - convert_rdf_to_csv(input_file, output_file, input_format) + # TSD -> Triple + if input_class == "tabular" and output_class == "triples": + _mapping.convert_csv_to_rdf( + input_file, output_file, input_format, convert_format, base_uri + ) + return + + # Quad -> TSD + if input_class == "quads" and output_class == "tabular": + _mapping.convert_quads_to_csv( + input_file, output_file, input_format, convert_format + ) return raise ValueError( f"Conversion from '{input_format}' ({input_class}) to " - f"'{convert_format}' ({output_class}) is not yet implemented. 
" - f"Supported Layer 3 conversions: RDF Triples -> CSV/TSV." - ) + f"'{convert_format}' ({output_class}) is not supported." + ) \ No newline at end of file diff --git a/databusclient/filehandling/mapping.py b/databusclient/filehandling/mapping.py index 93b5a00..c89ddef 100644 --- a/databusclient/filehandling/mapping.py +++ b/databusclient/filehandling/mapping.py @@ -1,55 +1,285 @@ -"""Layer 3 prototype mapping handlers.""" +"""Layer 3 Mapping Conversion — cross-class conversions between RDF and tabular formats. + +Supported mapping directions: + Triple -> Quad : Assigns a named graph to all triples (requires graph_name). + Quad -> Triple : Splits quads into one file per named graph (in a subdirectory), + written in the triple format specified by output_format. + Triple -> TSD : Maps RDF triples to wide CSV table (quasi-equal, companion .meta.json). + TSD -> Triple : Reconstructs RDF triples from wide CSV (lossless with companion file). + Quad -> TSD : Maps RDF quads to wide CSV table with extra graph column. + +Data loss and quasi-equality: + RDF -> CSV conversion is quasi-equal. RDF datatypes (xsd:integer etc.) and language + tags (@en) cannot be represented in plain CSV. A companion .meta.json file is generated + alongside the CSV to preserve this information. When converting back (CSV -> RDF), if the + companion file is present, datatypes and language tags are restored for full lossless + round trips. Without the companion file, all values are restored as plain xsd:string. + + Note: a string literal whose lexical value itself looks like a URI (e.g. a literal + "http://example.com/text") cannot be distinguished from an actual URI reference in + the CSV representation. This is an inherent limitation of the wide-table CSV format + and matches the level of fidelity of the Java client's TSD mapping. + +Blank node handling: + Blank node subjects and objects are serialized to CSV cells as '_:label' (matching + N-Triples notation). 
This is essential for round trips: without the '_:' marker, + convert_csv_to_rdf() cannot distinguish a blank node reference from a URI or string + literal. On CSV -> RDF, any cell value starting with '_:' is reconstructed as a BNode + with the same label, preserving links between blank nodes and their properties. + +Per-predicate metadata granularity: + The companion .meta.json stores one datatype/language entry per predicate (the last + value seen during conversion). This assumes a predicate's values share a consistent + type, which holds for typical RDF datasets (e.g. DBpedia mappings) where a given + predicate has a consistent range. +""" import json import os -from databusclient.filehandling.format import TSDHandler, TripleHandler +from rdflib import BNode, Dataset, Graph, Literal, URIRef +from rdflib.namespace import XSD + +from databusclient.filehandling.format import ( + QuadHandler, + TSDHandler, + TripleHandler, + FORMAT_TO_EXTENSION, +) + +# --------------------------------------------------------------------------- +# Module-level handler instances — reuse across calls +# --------------------------------------------------------------------------- + +_triple_handler = TripleHandler() +_quad_handler = QuadHandler() +_tsd_handler = TSDHandler() + + +# --------------------------------------------------------------------------- +# Shared helper — RDF term to CSV cell string +# --------------------------------------------------------------------------- + +def _term_to_str(term) -> str: + """Convert an RDF term (URIRef, BNode, or Literal) to its CSV cell string. + + Blank nodes are prefixed with '_:' (matching N-Triples notation) so they + can be correctly distinguished from URIs and literals when reconstructing + RDF from CSV in convert_csv_to_rdf(). Without this prefix, a blank node + label like 'address1' would be indistinguishable from a relative resource + identifier, breaking the link between a blank node and its properties. 
+ + Literals are represented by their lexical form (the string value as + written), regardless of datatype. This avoids conversion-related + discrepancies (e.g. datetime formatting via .toPython()) and matches + what is restored via Literal(value, datatype=...) on the reverse direction. + + Args: + term: An rdflib term (URIRef, BNode, or Literal). + + Returns: + String representation suitable for a CSV cell. + """ + if isinstance(term, BNode): + return f"_:{term}" + return str(term) + + +# --------------------------------------------------------------------------- +# Direction 1 — Triple -> Quad +# --------------------------------------------------------------------------- + +def convert_triples_to_quads( + input_file: str, + output_file: str, + input_format: str, + output_format: str, + graph_name: str, +) -> None: + """Promote RDF triples to named graph quads (Layer 3, lossless). + + All triples are assigned to the named graph specified by graph_name. + Requires --graph-name to be provided. + + Args: + input_file: Path to input RDF triples file. + output_file: Path to write output quads file. + input_format: Source triple format name (e.g. 'turtle', 'ntriples'). + output_format: Target quad format name (e.g. 'nquads', 'trig'). + graph_name: URI string for the named graph to assign all triples to. + + Raises: + ValueError: If graph_name is empty or None. + """ + if not graph_name: + raise ValueError( + "graph_name is required for Triple -> Quad conversion. " + "Use --graph-name to specify the target named graph." 
+ ) + + g = _triple_handler.read(input_file, input_format) + d = Dataset() + graph_uri = URIRef(graph_name) + named_graph = d.graph(graph_uri) + + for triple in g: + named_graph.add(triple) + + _quad_handler.write(d, output_file, output_format) + print( + f"Converted {input_format} -> {output_format} " + f"(graph: {graph_name}): {os.path.basename(output_file)}" + ) +# --------------------------------------------------------------------------- +# Direction 2 — Quad -> Triple +# --------------------------------------------------------------------------- + +def convert_quads_to_triples( + input_file: str, + output_dir: str, + input_format: str, + output_format: str, +) -> list: + """Split RDF quads into per-graph triple files (Layer 3, lossless). + + Each named graph in the quads file becomes a separate file, written in + output_format (e.g. 'ntriples', 'turtle', 'rdf-xml' — whatever was + specified via --format). Output files are written to output_dir, named + after the last segment of the graph URI (e.g. 'people.ttl' for graph + 'https://example.org/graph/people' when output_format='turtle'). + + Default graph triples (no named graph) are written to + 'default_graph' followed by the target format's file extension (e.g. 'default_graph.ttl'). + + Args: + input_file: Path to input quads file. + output_dir: Directory to write one file per named graph. + input_format: Source quad format name (e.g. 'nquads', 'trig'). + output_format: Target triple format name (e.g. 'ntriples', 'turtle', + 'rdf-xml'). Required — no default, matches whatever the user + specified via --format. + + Returns: + List of output file paths created. + + Raises: + ValueError: If no named graphs with triples are found in input. + """ + os.makedirs(output_dir, exist_ok=True) + + d = _quad_handler.read(input_file, input_format) + output_files = [] + + file_ext = FORMAT_TO_EXTENSION.get(output_format, f".{output_format}") + + for named_graph in d.graphs(): + graph_id = str(named_graph.identifier) + + # Skip empty graphs (e.g. 
an unused default graph) + if len(named_graph) == 0: + continue + + # Determine output filename from graph URI last segment + if graph_id in ("urn:x-rdflib:default", ""): + file_stem = "default_graph" + else: + file_stem = graph_id.rstrip("/").split("/")[-1] + # Sanitize: replace characters invalid in filenames + file_stem = "".join( + c if c.isalnum() or c in "-_." else "_" for c in file_stem + ) + if not file_stem: + file_stem = "graph" + + out_path = os.path.join(output_dir, file_stem + file_ext) + + # Handle duplicate filenames by appending a counter + counter = 1 + original_out_path = out_path + while os.path.exists(out_path): + out_path = original_out_path[: -len(file_ext)] + f"_{counter}{file_ext}" + counter += 1 + + _triple_handler.write(named_graph, out_path, output_format) + output_files.append(out_path) + print(f"Written graph '{graph_id}' -> {os.path.basename(out_path)}") + + if not output_files: + raise ValueError( + f"No named graphs with triples found in '{os.path.basename(input_file)}'. " + "Nothing to split." + ) + + print( + f"Quad -> Triple split complete: {len(output_files)} file(s) " + f"({output_format}) in '{os.path.basename(output_dir)}/'" + ) + return output_files + + +# --------------------------------------------------------------------------- +# Direction 3 — Triple -> TSD (CSV/TSV) +# --------------------------------------------------------------------------- + def convert_rdf_to_csv( input_file: str, output_file: str, input_format: str, + output_format: str, ) -> None: - """Map RDF triples to a wide CSV table (Layer 3 prototype). + """Map RDF triples to a wide tabular table (Layer 3, quasi-equal). + + Each unique RDF subject becomes one row. Each unique predicate becomes + a column header (full predicate URI). Object values fill the cells. + Multi-valued predicates are pipe-separated (|) to enable unambiguous + splitting on round trip. - Each unique subject becomes a row. Each unique predicate becomes a column. 
- Multi-valued predicates are pipe-separated. - A companion .meta.json file is generated to preserve RDF datatype and - language tag information for lossless round trips. + A companion .meta.json file is generated alongside the output file + to preserve RDF datatype and language tag information, enabling + lossless round trips when convert_csv_to_rdf() is called with the + same companion file present. - NOTE: This is a Layer 3 prototype. It is not yet tested and will be - properly implemented in the Layer 3 issue. + Blank node subjects and objects are serialized as '_:label' (see + _term_to_str). This is essential for correct round trips. Args: input_file: Path to input RDF triples file. - output_file: Path to write output CSV file. + output_file: Path to write output CSV or TSV file. input_format: Source triple format name (must be in RDF_TRIPLE_FORMATS). + output_format: Target tabular format ('csv' or 'tsv'). """ - handler = TripleHandler() - g = handler.read(input_file, input_format) + g = _triple_handler.read(input_file, input_format) + # Collect all unique predicates (sorted for deterministic column order) predicates = sorted(set(str(p) for s, p, o in g)) + # Group objects by (subject, predicate) subjects: dict = {} + # column_metadata: predicate URI -> {datatype: ...} or {language: ...} + # Only the LAST seen value's metadata is stored per predicate (see + # module docstring on per-predicate metadata granularity). 
def convert_csv_to_rdf(
    input_file: str,
    output_file: str,
    input_format: str,
    output_format: str,
    base_uri: str,
) -> None:
    """Reconstruct RDF triples from a wide tabular file (Layer 3).

    Column headers (except 'resource') become predicate URIs directly.
    Each row becomes one RDF subject. Cell values become object literals,
    URIs, or blank nodes depending on their content (see _build_object).
    Header cells that are blank (for example, produced by a trailing
    delimiter) are ignored, because an empty string is not a usable
    predicate IRI.

    If a companion .meta.json file exists alongside the input file
    (same path + '.meta.json'), datatypes and language tags are restored
    from it, enabling lossless round trips. Without the companion file,
    all literal values are created as plain literals.

    Note: companion file lookup uses input_file + '.meta.json' — the
    companion must be co-located with the exact input file path passed
    here. If the input was downloaded compressed and decompressed to a
    temporary file, no companion will typically be found (this is an
    inherent, documented limitation, not a bug).

    Multi-valued cells (pipe-separated '|') are split back into multiple
    triples per subject-predicate pair.

    Subjects: a value starting with '_:' is reconstructed as a BNode with
    the same label; a value starting with 'http://' or 'https://' is used
    as an absolute URIRef; anything else is appended to base_uri. A
    base_uri ending in '#' is used as-is (hash namespace); otherwise the
    identifier is joined with exactly one '/'.

    Args:
        input_file: Path to input CSV or TSV file.
        output_file: Path to write output RDF triples file.
        input_format: Source tabular format ('csv' or 'tsv').
        output_format: Target triple format name (e.g. 'ntriples', 'turtle').
        base_uri: Base URI for constructing subject URIs from relative identifiers.

    Raises:
        ValueError: If base_uri is empty or None.
        ValueError: If input file is empty or missing the 'resource' column.
    """
    if not base_uri:
        raise ValueError(
            "base_uri is required for CSV -> RDF conversion. "
            "Use --base-uri to specify the base URI for subject construction."
        )

    rows = _tsd_handler.read(input_file, input_format)

    if not rows:
        raise ValueError(f"Input file '{os.path.basename(input_file)}' is empty.")

    header = rows[0]
    if "resource" not in header:
        raise ValueError(
            f"Input CSV missing 'resource' column. "
            f"Found columns: {header}. "
            "The 'resource' column is required and must contain subject identifiers."
        )

    resource_idx = header.index("resource")

    # Load companion metadata if present
    companion_path = input_file + ".meta.json"
    column_metadata: dict = {}
    if os.path.exists(companion_path):
        with open(companion_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        column_metadata = meta.get("columns", {})
        print(f"Loaded companion metadata: {os.path.basename(companion_path)}")
    else:
        print(
            "No companion metadata file found. "
            "All literal values will be created as plain strings."
        )

    # Resolve every predicate column once instead of once per cell.
    # Blank header cells are dropped: they would yield an empty predicate IRI.
    predicate_columns = [
        (i, URIRef(col), column_metadata.get(col, {}))
        for i, col in enumerate(header)
        if i != resource_idx and col.strip()
    ]

    # A hash namespace ('...#') must not get a '/' inserted after the '#'.
    if base_uri.endswith("#"):
        subject_prefix = base_uri
    else:
        subject_prefix = base_uri.rstrip("/") + "/"

    g = Graph()
    width = len(header)

    for row in rows[1:]:  # skip header
        # Pad short rows so every header index is addressable.
        if len(row) < width:
            row = row + [""] * (width - len(row))

        resource_val = row[resource_idx].strip()
        if not resource_val:
            continue  # skip empty rows

        # Build subject node
        if resource_val.startswith("_:"):
            subject = BNode(resource_val[2:])
        elif resource_val.startswith(("http://", "https://")):
            subject = URIRef(resource_val)
        else:
            subject = URIRef(subject_prefix + resource_val)

        # Build triples for each predicate column
        for col_idx, predicate, meta in predicate_columns:
            cell = row[col_idx].strip()
            if not cell:
                continue

            # Split multi-valued cells
            for val in cell.split("|"):
                val = val.strip()
                if not val:
                    continue
                g.add((subject, predicate, _build_object(val, meta)))

    _triple_handler.write(g, output_file, output_format)
    print(
        f"Converted {input_format.upper()} -> {output_format}: "
        f"{os.path.basename(output_file)}"
    )


def _build_object(value: str, meta: dict):
    """Build an RDF object term from a CSV cell string and metadata.

    The checks are ordered: blank-node prefix first, then absolute
    http(s) URI, then companion metadata ('datatype' wins over
    'language'), and finally a plain literal.

    Args:
        value: String value from CSV cell.
        meta: Metadata dict with optional 'datatype' or 'language' keys.

    Returns:
        rdflib term: URIRef, BNode, or Literal.
    """
    # Blank node
    if value.startswith("_:"):
        return BNode(value[2:])

    # URI
    if value.startswith(("http://", "https://")):
        return URIRef(value)

    # Literal with datatype from companion file
    if "datatype" in meta:
        return Literal(value, datatype=URIRef(meta["datatype"]))

    # Literal with language tag from companion file
    if "language" in meta:
        return Literal(value, lang=meta["language"])

    # Plain string literal (no companion metadata)
    return Literal(value)
def convert_quads_to_csv(
    input_file: str,
    output_file: str,
    input_format: str,
    output_format: str,
) -> None:
    """Map RDF quads to a wide tabular table with a graph column (Layer 3, quasi-equal).

    Extends the Triple -> TSD mapping by adding a 'graph' column containing
    the named graph URI. Each row represents one (subject, graph) pair, with
    one column per predicate (pipe-separated for multi-valued predicates).

    A companion .meta.json file is written next to the output
    (output_file + '.meta.json') to preserve datatype and language tag
    information. Metadata is kept per predicate, so the last literal seen
    for a predicate decides its entry.

    The default graph (if present) is skipped — only triples within named
    graphs are represented, since the 'graph' column requires a graph URI.

    Args:
        input_file: Path to input quads file.
        output_file: Path to write output CSV or TSV file.
        input_format: Source quad format name (e.g. 'nquads', 'trig').
        output_format: Target tabular format ('csv' or 'tsv').
    """
    d = _quad_handler.read(input_file, input_format)

    # Identifiers under which the default (unnamed) graph is reported.
    default_graph_ids = ("urn:x-rdflib:default", "")

    predicates: set = set()
    column_metadata: dict = {}
    # rows_map key: (subject_str, graph_uri_str) -> {predicate_uri: [values]}
    rows_map: dict = {}

    # Single traversal: predicates, metadata and cell values are gathered
    # together, and the default graph is rejected before its triples are read.
    for named_graph in d.graphs():
        graph_id = str(named_graph.identifier)

        # Skip the default graph — no meaningful graph URI for the column
        if graph_id in default_graph_ids:
            continue

        for s, p, o in named_graph:
            pred = str(p)
            predicates.add(pred)

            if isinstance(o, Literal):
                # xsd:string is the implicit default and is not recorded.
                if o.datatype and str(o.datatype) != str(XSD.string):
                    column_metadata[pred] = {"datatype": str(o.datatype)}
                elif o.language:
                    column_metadata[pred] = {"language": str(o.language)}

            key = (_term_to_str(s), graph_id)
            rows_map.setdefault(key, {}).setdefault(pred, []).append(_term_to_str(o))

    # Sorted for a deterministic column order.
    all_predicates = sorted(predicates)

    # Build rows: header = resource + graph + all predicates
    rows = [["resource", "graph"] + all_predicates]
    for (subj, graph_id), pred_map in rows_map.items():
        cells = ["|".join(pred_map.get(pred, [])) for pred in all_predicates]
        rows.append([subj, graph_id] + cells)

    _tsd_handler.write(rows, output_file, output_format)

    companion_file = output_file + ".meta.json"
    with open(companion_file, "w", encoding="utf-8") as f:
        json.dump({"columns": column_metadata}, f, indent=2)

    print(
        f"Converted {input_format} -> {output_format.upper()} "
        f"(with graph column): {os.path.basename(output_file)}"
    )
    print(f"Companion metadata: {os.path.basename(companion_file)}")
--- a/tests/test_format_round_trips.py +++ b/tests/test_format_round_trips.py @@ -25,12 +25,20 @@ import os import tempfile +from rdflib import BNode, URIRef from databusclient.api.convert import ( QuadHandler, TSDHandler, TripleHandler, ) +from databusclient.filehandling.mapping import ( + convert_triples_to_quads, + convert_quads_to_triples, + convert_rdf_to_csv, + convert_csv_to_rdf, + convert_quads_to_csv, +) # --------------------------------------------------------------------------- # Path to shared test resources @@ -250,4 +258,219 @@ def test_round_trip_tsv(): ) finally: if os.path.exists(output): - os.remove(output) \ No newline at end of file + os.remove(output) + +# --------------------------------------------------------------------------- +# Mapping round trip tests (Layer 3) — 5 tests total +# --------------------------------------------------------------------------- +# These tests validate cross-class conversions following the quasi-equal +# strategy from Frey et al. Where information loss is expected (e.g. RDF +# datatypes in CSV), the comparison accounts for that predictable loss. 
def test_mapping_quads_to_triples_and_back():
    """Quad -> Triple split: each per-graph file is isomorphic to its source graph.

    The quads file is split into one triples file per named graph. Every
    output file is mapped back to an original named graph by comparing the
    file stem with the last path segment of the graph URI, and the two
    graphs must be isomorphic. A file that cannot be mapped to any original
    graph fails the test; it is not skipped, because skipping would let the
    test pass without comparing anything.
    """
    source = resource("sample.nq")
    d_original = quad_handler.read(source, "nquads")

    with tempfile.TemporaryDirectory() as tmpdir:
        # Split quads into per-graph triple files
        output_dir = os.path.join(tmpdir, "split")
        files = convert_quads_to_triples(source, output_dir, "nquads", "ntriples")

        assert len(files) >= 1, "Expected at least one output file"

        # Non-empty named graphs of the original dataset, keyed by graph URI.
        original_graphs = {
            str(g.identifier): g
            for g in d_original.graphs()
            if len(g) > 0 and str(g.identifier) not in ("urn:x-rdflib:default", "")
        }

        for out_file in files:
            # File stem without its extension (e.g. 'people' for 'people.nt').
            stem, _ext = os.path.splitext(os.path.basename(out_file))
            # Find the matching original graph by last URI segment
            matching_graph_uri = next(
                (uri for uri in original_graphs if uri.rstrip("/").split("/")[-1] == stem),
                None
            )
            assert matching_graph_uri is not None, (
                f"Output file '{os.path.basename(out_file)}' does not correspond "
                "to any named graph of the input"
            )

            g_split = triple_handler.read(out_file, "ntriples")
            g_original_named = original_graphs[matching_graph_uri]
            assert g_split.isomorphic(g_original_named), (
                f"Quad -> Triple round trip failed for graph '{matching_graph_uri}': "
                "graphs are not isomorphic"
            )
+ """ + source = resource("sample.ttl") + g_original = triple_handler.read(source, "turtle") + + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(source, csv_path, "turtle", "csv") + + # Remove companion file to simulate no-metadata scenario + companion_path = csv_path + ".meta.json" + if os.path.exists(companion_path): + os.remove(companion_path) + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + + g_roundtrip = triple_handler.read(nt_path, "ntriples") + + # Quasi-equal check: named (URI) subjects must match exactly. + # Blank node subjects are expected to get NEW labels on round trip + # (blank node identity is never expected to survive serialization — + # only structure matters, same principle as isomorphic() checks + # for Layer 2). So we compare URI subjects by value, and blank + # node subjects only by count. + original_uri_subjects = set( + str(s) for s, p, o in g_original if isinstance(s, URIRef) + ) + roundtrip_uri_subjects = set( + str(s) for s, p, o in g_roundtrip if isinstance(s, URIRef) + ) + assert original_uri_subjects == roundtrip_uri_subjects, ( + "Quasi-equal check failed: named (URI) subject sets differ" + ) + + original_bnode_subjects = set( + s for s, p, o in g_original if isinstance(s, BNode) + ) + roundtrip_bnode_subjects = set( + s for s, p, o in g_roundtrip if isinstance(s, BNode) + ) + assert len(original_bnode_subjects) == len(roundtrip_bnode_subjects), ( + "Quasi-equal check failed: number of distinct blank node subjects " + "differs. Blank node labels are expected to change on round trip, " + "but their count should be preserved." 
def test_mapping_quads_to_csv_and_back():
    """Quad -> CSV conversion keeps the named graph of every row (quasi-equal).

    Checks that the CSV and its companion metadata file are written, that
    the header carries both the 'resource' and 'graph' columns, and that
    the set of graph URIs found in the 'graph' column equals the set of
    non-empty named graphs in the input.
    """
    source = resource("sample.nq")
    d_original = quad_handler.read(source, "nquads")

    # Named graphs that actually hold triples in the input dataset.
    expected_graphs = {
        str(g.identifier)
        for g in d_original.graphs()
        if len(g) > 0 and str(g.identifier) not in ("urn:x-rdflib:default", "")
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        csv_path = os.path.join(tmpdir, "quads_output.csv")
        convert_quads_to_csv(source, csv_path, "nquads", "csv")

        assert os.path.exists(csv_path), "CSV output was not created"
        assert os.path.exists(csv_path + ".meta.json"), (
            "Companion .meta.json was not created"
        )

        table = tsd_handler.read(csv_path, "csv")
        assert len(table) > 1, "CSV has no data rows"

        header, data_rows = table[0], table[1:]
        assert "graph" in header, (
            "CSV output missing 'graph' column for Quad -> CSV conversion"
        )
        assert "resource" in header, "CSV output missing 'resource' column"

        # Graph URIs actually present in the 'graph' column.
        graph_pos = header.index("graph")
        found_graphs = {
            record[graph_pos] for record in data_rows if len(record) > graph_pos
        }

        assert found_graphs == expected_graphs, (
            f"Graph URIs in CSV do not match original. "
            f"Expected: {expected_graphs}, got: {found_graphs}"
        )
+ +These tests cover all 5 mapping directions with edge cases: + - Triple -> Quad (with graph_name) + - Quad -> Triple (split by graph, subdirectory output) + - Triple -> TSD (CSV/TSV, companion metadata) + - TSD -> Triple (with and without companion file) + - Quad -> TSD (CSV with graph column) + +Edge cases covered: + - Blank node subjects and objects + - Typed literals (xsd:integer) + - Multi-valued predicates (pipe-separated) + - Missing companion file (graceful degradation) + - Empty cells in CSV + - Graph name sanitization in filenames +""" + +import json +import os +import tempfile + +import pytest + +from databusclient.filehandling.format import TripleHandler, QuadHandler, TSDHandler +from databusclient.filehandling.mapping import ( + convert_triples_to_quads, + convert_quads_to_triples, + convert_rdf_to_csv, + convert_csv_to_rdf, + convert_quads_to_csv, +) + +# --------------------------------------------------------------------------- +# Shared test data and helpers +# --------------------------------------------------------------------------- + +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") + + +def resource(filename: str) -> str: + return os.path.join(RESOURCES, filename) + + +triple_handler = TripleHandler() +quad_handler = QuadHandler() +tsd_handler = TSDHandler() + +# Sample Turtle with typed literals, blank nodes, multi-valued predicates +SAMPLE_TTL_CONTENT = """\ +@base . +@prefix ex: . +@prefix foaf: . +@prefix xsd: . + + foaf:name "Alice" ; + ex:age 29 ; + ex:livesAt _:address1 . + +_:address1 ex:city "Leipzig" ; + ex:country "Germany" . + + foaf:name "Bob" ; + ex:age 34 ; + ex:knows . + + ex:title "Databus Example Project" ; + ex:member . +""" + +SAMPLE_NQ_CONTENT = """\ + "Alice" . + "29"^^ . + "Bob" . + "Databus Example Project" . + . 
class TestTriplesToQuads:
    """Direction 1: promote RDF triples into a single named graph."""

    def test_basic_conversion(self):
        """All triples are assigned to the specified named graph."""
        with tempfile.TemporaryDirectory() as tmpdir:
            src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT)
            out = os.path.join(tmpdir, "output.nq")
            convert_triples_to_quads(src, out, "turtle", "nquads",
                                     "https://example.org/graph/test")

            assert os.path.exists(out)
            d = quad_handler.read(out, "nquads")
            # Non-empty named graphs only; the default graph is ignored.
            graph_uris = [
                str(g.identifier) for g in d.graphs()
                if str(g.identifier) not in ("urn:x-rdflib:default", "")
                and len(g) > 0
            ]
            assert "https://example.org/graph/test" in graph_uris

    def test_triple_count_preserved(self):
        """All triples from input appear in the named graph."""
        # Function-scope import, as the other tests in this file do.
        from rdflib import URIRef

        with tempfile.TemporaryDirectory() as tmpdir:
            src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT)
            out = os.path.join(tmpdir, "output.nq")
            convert_triples_to_quads(src, out, "turtle", "nquads",
                                     "https://example.org/graph/test")

            g_original = triple_handler.read(src, "turtle")
            d = quad_handler.read(out, "nquads")
            named_graph = d.get_context(URIRef("https://example.org/graph/test"))
            assert len(g_original) == len(named_graph)

    def test_requires_graph_name(self):
        """Raises ValueError if graph_name is None or empty."""
        with tempfile.TemporaryDirectory() as tmpdir:
            src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT)
            out = os.path.join(tmpdir, "output.nq")

            with pytest.raises(ValueError, match="graph_name is required"):
                convert_triples_to_quads(src, out, "turtle", "nquads", None)

            with pytest.raises(ValueError, match="graph_name is required"):
                convert_triples_to_quads(src, out, "turtle", "nquads", "")

    def test_trig_output_format(self):
        """Triple -> Quad works with trig output format."""
        with tempfile.TemporaryDirectory() as tmpdir:
            src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT)
            out = os.path.join(tmpdir, "output.trig")
            convert_triples_to_quads(src, out, "turtle", "trig",
                                     "https://example.org/graph/trig_test")
            assert os.path.exists(out)
            d = quad_handler.read(out, "trig")
            assert len(d) > 0

    def test_uses_resource_files(self):
        """Conversion works correctly on the shared test resource files."""
        with tempfile.TemporaryDirectory() as tmpdir:
            out = os.path.join(tmpdir, "output.nq")
            convert_triples_to_quads(
                resource("sample.ttl"), out, "turtle", "nquads",
                "https://example.org/graph/resource_test"
            )
            assert os.path.exists(out)
            d = quad_handler.read(out, "nquads")
            assert len(d) > 0
test_all_triples_present(self): + """Total triple count across all output files matches input quad count.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + total_output_triples = sum( + len(triple_handler.read(f, "ntriples")) for f in files + ) + d_original = quad_handler.read(src, "nquads") + total_input_triples = sum( + len(g) for g in d_original.graphs() + if str(g.identifier) not in ("urn:x-rdflib:default", "") + ) + assert total_output_triples == total_input_triples + + def test_filename_from_graph_uri(self): + """Output filenames are derived from graph URI last segment.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT) + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + filenames = [os.path.basename(f) for f in files] + # people.nt and projects.nt expected from graph URIs + assert "people.nt" in filenames + assert "projects.nt" in filenames + + def test_empty_input_raises(self): + """Raises ValueError if input has no named graphs with triples.""" + empty_nq = "" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "empty.nq", empty_nq) + out_dir = os.path.join(tmpdir, "split") + with pytest.raises(ValueError, match="No named graphs"): + convert_quads_to_triples(src, out_dir, "nquads", "ntriples") + + def test_uses_resource_files(self): + """Conversion works correctly on shared resource sample.nq.""" + with tempfile.TemporaryDirectory() as tmpdir: + out_dir = os.path.join(tmpdir, "split") + files = convert_quads_to_triples(resource("sample.nq"), out_dir, "nquads", "ntriples") + assert len(files) >= 1 + for f in files: + g = triple_handler.read(f, "ntriples") + assert len(g) > 0 + + +# 
class TestTriplesToCSV:
    """Direction 3: flatten RDF triples into a wide CSV/TSV table."""

    @staticmethod
    def _convert_sample(tmpdir, output_name="output.csv", output_format="csv"):
        """Write the sample Turtle into tmpdir, convert it, and return (source, output)."""
        src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT)
        out = os.path.join(tmpdir, output_name)
        convert_rdf_to_csv(src, out, "turtle", output_format)
        return src, out

    def test_creates_csv_and_companion(self):
        """Both CSV and companion .meta.json are created."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)

            assert os.path.exists(out)
            assert os.path.exists(out + ".meta.json")

    def test_header_row_contains_predicates(self):
        """CSV header contains 'resource' and all predicate URIs."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)

            header = tsd_handler.read(out, "csv")[0]
            assert "resource" in header
            assert "http://xmlns.com/foaf/0.1/name" in header
            assert "https://example.org/vocab/age" in header

    def test_datatype_preserved_in_companion(self):
        """Companion file records xsd:integer datatype for age predicate."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)

            with open(out + ".meta.json", "r", encoding="utf-8") as f:
                columns = json.load(f)["columns"]

            age_meta = columns.get("https://example.org/vocab/age", {})
            assert "datatype" in age_meta
            assert "integer" in age_meta["datatype"]

    def test_one_row_per_subject(self):
        """CSV has one data row per unique subject."""
        with tempfile.TemporaryDirectory() as tmpdir:
            src, out = self._convert_sample(tmpdir)

            table = tsd_handler.read(out, "csv")
            graph = triple_handler.read(src, "turtle")
            distinct_subjects = {str(s) for s, p, o in graph}
            # table[0] is the header; everything after it is data.
            assert len(table) - 1 == len(distinct_subjects)

    def test_tsv_output(self):
        """Triple -> TSV also works correctly."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir, "output.tsv", "tsv")

            assert os.path.exists(out)
            assert len(tsd_handler.read(out, "tsv")) > 1

    def test_uses_resource_files(self):
        """Conversion works on shared resource sample.ttl."""
        with tempfile.TemporaryDirectory() as tmpdir:
            out = os.path.join(tmpdir, "output.csv")
            convert_rdf_to_csv(resource("sample.ttl"), out, "turtle", "csv")

            assert os.path.exists(out)
            assert len(tsd_handler.read(out, "csv")) > 1
csv_content) + out = os.path.join(tmpdir, "output.nt") + + with pytest.raises(ValueError, match="base_uri is required"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", None) + + with pytest.raises(ValueError, match="base_uri is required"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", "") + + def test_missing_resource_column_raises(self): + """Raises ValueError if CSV has no 'resource' column.""" + with tempfile.TemporaryDirectory() as tmpdir: + csv_content = "subject,predicate\nhttps://example.org/alice,Bob\n" + csv_path = write_temp_file(tmpdir, "input.csv", csv_content) + out = os.path.join(tmpdir, "output.nt") + + with pytest.raises(ValueError, match="missing 'resource' column"): + convert_csv_to_rdf(csv_path, out, "csv", "ntriples", + "https://example.org/data/") + + def test_blank_nodes_reconstructed(self): + """Blank node subjects (starting with '_:') are reconstructed as BNodes.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + base_uri="https://example.org/data/" + ) + g = triple_handler.read(nt_path, "ntriples") + from rdflib import BNode + blank_subjects = [s for s, p, o in g if isinstance(s, BNode)] + assert len(blank_subjects) > 0, ( + "Expected blank node subjects to be reconstructed" + ) + + def test_uri_objects_reconstructed(self): + """Object values starting with http:// are reconstructed as URIRef.""" + with tempfile.TemporaryDirectory() as tmpdir: + src = write_temp_file(tmpdir, "input.ttl", SAMPLE_TTL_CONTENT) + csv_path = os.path.join(tmpdir, "output.csv") + convert_rdf_to_csv(src, csv_path, "turtle", "csv") + + nt_path = os.path.join(tmpdir, "roundtrip.nt") + convert_csv_to_rdf( + csv_path, nt_path, "csv", "ntriples", + 
class TestQuadsToCSV:
    """Direction 5: flatten RDF quads into a wide CSV table with a graph column."""

    @staticmethod
    def _convert_sample(tmpdir):
        """Write the sample N-Quads into tmpdir, convert to CSV, and return (source, output)."""
        src = write_temp_file(tmpdir, "input.nq", SAMPLE_NQ_CONTENT)
        out = os.path.join(tmpdir, "output.csv")
        convert_quads_to_csv(src, out, "nquads", "csv")
        return src, out

    def test_creates_csv_with_graph_column(self):
        """Output CSV contains 'resource', 'graph', and predicate columns."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)

            assert os.path.exists(out)
            header = tsd_handler.read(out, "csv")[0]
            assert "resource" in header
            assert "graph" in header

    def test_companion_file_created(self):
        """Companion .meta.json is created alongside CSV."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)
            assert os.path.exists(out + ".meta.json")

    def test_graph_uris_in_csv(self):
        """All named graph URIs from input appear in the graph column."""
        with tempfile.TemporaryDirectory() as tmpdir:
            _, out = self._convert_sample(tmpdir)

            table = tsd_handler.read(out, "csv")
            graph_pos = table[0].index("graph")
            seen_graphs = {
                record[graph_pos] for record in table[1:] if len(record) > graph_pos
            }

            assert "https://example.org/graph/people" in seen_graphs
            assert "https://example.org/graph/projects" in seen_graphs

    def test_all_triples_represented(self):
        """Data row count equals the number of distinct (subject, graph) pairs."""
        with tempfile.TemporaryDirectory() as tmpdir:
            src, out = self._convert_sample(tmpdir)

            table = tsd_handler.read(out, "csv")
            # A row stands for one (subject, graph) pair, not one triple, so
            # the expected row count is the number of distinct pairs found in
            # the named graphs of the input.
            dataset = quad_handler.read(src, "nquads")
            expected_pairs = {
                (str(s), str(g.identifier))
                for g in dataset.graphs()
                for s, p, o in g
                if str(g.identifier) not in ("urn:x-rdflib:default", "")
            }
            assert len(table) - 1 == len(expected_pairs)

    def test_uses_resource_files(self):
        """Conversion works on shared resource sample.nq."""
        with tempfile.TemporaryDirectory() as tmpdir:
            out = os.path.join(tmpdir, "output.csv")
            convert_quads_to_csv(resource("sample.nq"), out, "nquads", "csv")

            assert os.path.exists(out)
            assert len(tsd_handler.read(out, "csv")) > 1