Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
History
=======

1.1.1 (2026-06-25)
------------------

* Made ``DiscoveryMatch.label`` optional (it is absent for non-sensitive and ignored matches).
* Added the ``finished_with_warnings`` status to ``AsyncRulesetGenerationTaskStatus``.
* ``get_db_discovery_result_report`` may now return ``bytes`` (a zip)
when the server splits a large DB-discovery report,
and ruleset generation from CSV now detects and forwards zip uploads.

1.1.0 (2026-06-24)
------------------

Expand Down
16 changes: 14 additions & 2 deletions datamasque/client/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ def start_async_ruleset_generation_from_csv(
- A text file handle (e.g. `open(path)`)
- A binary file handle (e.g. `open(path, 'rb')`)

If the content is a zip (for example a split report from `get_db_discovery_result_report()`),
it is detected by its magic bytes and uploaded as a zip;
otherwise it is uploaded as CSV.

Generation runs asynchronously on the server.
Poll `get_async_ruleset_generation_task_status` until it returns
`AsyncRulesetGenerationTaskStatus.finished`,
Expand All @@ -114,14 +118,22 @@ def start_async_ruleset_generation_from_csv(
else:
content = csv_content

is_zip = False
if content.seekable():
is_zip = content.read(4) == b"PK\x03\x04"
content.seek(0)
filename = "ruleset.zip" if is_zip else "ruleset.csv"
content_type = "application/zip" if is_zip else "text/csv"

files = [
UploadFile(
field_name="csv_or_zip_file",
filename="ruleset.csv",
filename=filename,
content=content,
content_type="text/csv",
content_type=content_type,
),
]

self.make_request(
method="POST",
path=f"/api/async-generate-ruleset/{connection_id}/from-csv/",
Expand Down
6 changes: 3 additions & 3 deletions datamasque/client/models/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class DiscoveryMatch(BaseModel):

model_config = ConfigDict(extra="allow")

label: str
label: Optional[str] = None
categories: list[str]
flagged_by: str
description: str
Expand Down Expand Up @@ -343,8 +343,8 @@ class FileDiscoveryMatch(BaseModel):

flagged_by: str
description: str
label: Optional[str] = None # Omitted for non-sensitive and ignored matches.
categories: Optional[list[str]] = None # Omitted for ignored matches.
label: Optional[str] = None # Omitted for non-sensitive matches.
categories: Optional[list[str]] = None
hit_ratio: Optional[int] = None # None for metadata matches, percentage 0-100 for IDD matches.


Expand Down
1 change: 1 addition & 0 deletions datamasque/client/models/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class AsyncRulesetGenerationTaskStatus(enum.Enum):
"""List of statuses of async ruleset generation tasks."""

finished = "finished"
finished_with_warnings = "finished_with_warnings"
failed = "failed"
running = "running"
queued = "queued"
Expand Down
11 changes: 9 additions & 2 deletions datamasque/client/runs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import re
from typing import Union

from datamasque.client.base import BaseClient
from datamasque.client.exceptions import (
Expand Down Expand Up @@ -43,9 +44,12 @@ def get_run_report(self, run_id: RunId) -> str:
response = self.make_request("GET", f"api/runs/{run_id}/run-report/")
return response.text

def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> str:
def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> Union[str, bytes]:
"""
Returns the database-discovery result report for the specified run as CSV.
Returns the database-discovery result report for the specified run.

Returns CSV text (`str`),
or a zip of numbered CSV parts as `bytes` when the server splits a large report.

When `include_selection_column` is true (the default),
the CSV includes a `selected` column suitable for feeding back into ruleset generation.
Expand All @@ -54,6 +58,9 @@ def get_db_discovery_result_report(self, run_id: RunId, include_selection_column
url = f"api/runs/{run_id}/db-discovery-results/report/"
params = None if include_selection_column else {"include_selection_column": "false"}
response = self.make_request("GET", url, params=params)

if response.headers.get("Content-Type", "").startswith("application/zip"):
return response.content
return response.text

def get_unfinished_runs(self) -> dict[str, UnfinishedRun]:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "datamasque-python"
version = "1.1.0"
version = "1.1.1"
description = "Official Python client for the DataMasque data-masking API."
authors = [
{ name = "DataMasque Ltd" },
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.1.0
current_version = 1.1.1
commit = True
tag = True

Expand Down
36 changes: 36 additions & 0 deletions tests/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,17 @@ def test_get_db_discovery_result_report(client):
assert result == "db discovery report without selection column"


def test_get_db_discovery_result_report_returns_zip_bytes_when_split(client):
    """A report served with an `application/zip` content type comes back as raw `bytes`."""
    run_id = RunId(1)
    expected_payload = b"PK\x03\x04 split report zip bytes"
    report_url = f"http://test-server/api/runs/{run_id}/db-discovery-results/report/"

    with requests_mock.Mocker() as mocker:
        # Serve the payload as a zip so the client takes its binary-return branch.
        mocker.get(
            report_url,
            status_code=200,
            headers={"Content-Type": "application/zip"},
            content=expected_payload,
        )
        report = client.get_db_discovery_result_report(run_id)

    assert isinstance(report, bytes)
    assert report == expected_payload


def test_poll_async_ruleset_generation(client):
connection_id = ConnectionId("1")
with requests_mock.Mocker() as m:
Expand Down Expand Up @@ -463,6 +474,31 @@ def test_start_async_ruleset_generation_from_csv_success(client, csv_content):
assert form_data["csv_or_zip_file"]["content"] == b"schema,table,column,selected\npublic,users,email,true"


@pytest.mark.parametrize(
    "zip_content",
    [
        pytest.param(b"PK\x03\x04 zipped discovery report", id="bytes"),
        pytest.param(BytesIO(b"PK\x03\x04 zipped discovery report"), id="BytesIO"),
    ],
)
def test_start_async_ruleset_generation_from_csv_uploads_zip_as_zip(client, zip_content):
    """Zip input, given as raw bytes or a binary stream, is sent with a `.zip` filename and zip content type."""
    connection_id = ConnectionId("1")
    endpoint = f"http://test-server/api/async-generate-ruleset/{connection_id}/from-csv/"

    with requests_mock.Mocker() as mocker:
        mocker.post(endpoint, status_code=201)
        client.start_async_ruleset_generation_from_csv(connection_id, zip_content)

        # Inspect the single uploaded multipart field rather than the whole form.
        uploaded = parse_multipart_form(mocker.last_request)["csv_or_zip_file"]
        assert uploaded["filename"] == "ruleset.zip"
        assert uploaded["content_type"] == "application/zip"
        assert uploaded["content"] == b"PK\x03\x04 zipped discovery report"


def test_start_async_ruleset_generation_from_csv_with_target_size(client):
"""Test async ruleset generation from CSV with target_size_bytes parameter."""
connection_id = ConnectionId("1")
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading