diff --git a/api/data_ingestion/routers/upload.py b/api/data_ingestion/routers/upload.py
index 4f731a05..f0de746f 100644
--- a/api/data_ingestion/routers/upload.py
+++ b/api/data_ingestion/routers/upload.py
@@ -944,3 +944,123 @@ async def validate_fuzzy_matching(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Error running fuzzy matching validation: {str(e)}",
         ) from e
+
+
+@router.get("/dq_kit/{upload_id}/download")
+async def download_dq_kit(
+    upload_id: str,
+    db: AsyncSession = Depends(get_db),
+    is_privileged: bool = Depends(IsPrivileged.raises(False)),
+    user: User = Depends(azure_scheme),
+):
+    """Download the DQ Kit ZIP for a given upload.
+
+    Responds with 404 when the upload is unknown or no kit artifact exists,
+    403 when a non-privileged caller is not the uploader, and 400 while the
+    data-quality run has not completed.
+    """
+    from data_ingestion.utils.dq_kit_generator import generate_dq_kit_zip
+
+    file_upload = await db.scalar(select(FileUpload).where(FileUpload.id == upload_id))
+    if file_upload is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="File Upload ID does not exist",
+        )
+
+    if (
+        not is_privileged
+        and file_upload.uploader_email != user.claims.get("emails", ["NONE"])[0]
+    ):
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="You do not have permission to access this file.",
+        )
+
+    if file_upload.dq_status != DQStatusEnum.COMPLETED:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"DQ Kit is not available. DQ Status: {file_upload.dq_status.value}",
+        )
+
+    try:
+        logger.info(f"Generating DQ Kit for upload_id: {upload_id}")
+        zip_buffer, filename = generate_dq_kit_zip(file_upload)
+
+        return StreamingResponse(
+            zip_buffer,
+            media_type="application/zip",
+            headers={"Content-Disposition": f"attachment; filename={filename}"},
+        )
+    except FileNotFoundError as e:
+        # Neither the pre-built ZIP nor any individual artifact exists.
+        logger.warning(f"DQ Kit not available for upload_id {upload_id}: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="DQ Kit not found. It may not have been generated yet.",
+        ) from e
+    except Exception as e:
+        logger.error(f"Error generating DQ Kit: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error generating DQ Kit: {str(e)}",
+        ) from e
+
+
+@router.get("/map/{upload_id}")
+async def get_school_map(
+    upload_id: str,
+    db: AsyncSession = Depends(get_db),
+    is_privileged: bool = Depends(IsPrivileged.raises(False)),
+    user: User = Depends(azure_scheme),
+):
+    """Serve the interactive school-location HTML map for a given upload.
+
+    Responds with 404 when the upload or its map blob does not exist and
+    403 when a non-privileged caller is not the uploader.
+    """
+    from data_ingestion.utils.dq_kit_generator import get_map_blob_path
+
+    file_upload = await db.scalar(select(FileUpload).where(FileUpload.id == upload_id))
+    if file_upload is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="File Upload ID does not exist",
+        )
+
+    if (
+        not is_privileged
+        and file_upload.uploader_email != user.claims.get("emails", ["NONE"])[0]
+    ):
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="You do not have permission to access this file.",
+        )
+
+    map_path = get_map_blob_path(file_upload)
+    map_filename = Path(map_path).name
+    logger.info(f"Attempting to serve map from: {map_path}")
+
+    blob = storage_client.get_blob_client(map_path)
+    if not blob.exists():
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Map not found. It may not have been generated yet.",
+        )
+
+    try:
+        stream = blob.download_blob()
+        return StreamingResponse(
+            stream.chunks(),
+            media_type="text/html",
+            headers={
+                "Content-Disposition": f"inline; filename={map_filename}",
+                "X-Frame-Options": "SAMEORIGIN",
+            },
+        )
+    except Exception as e:
+        logger.error(f"Error serving map: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error loading map: {str(e)}",
+        ) from e
diff --git a/api/data_ingestion/utils/dq_kit_generator.py b/api/data_ingestion/utils/dq_kit_generator.py
new file mode 100644
index 00000000..a2b0078b
--- /dev/null
+++ b/api/data_ingestion/utils/dq_kit_generator.py
@@ -0,0 +1,144 @@
+"""
+Utility for serving DQ Kit ZIP files in the ingestion API.
+
+Serves the pre-generated ZIP from Dagster when it exists; otherwise assembles
+a ZIP on demand from whichever individual artifacts are available in storage.
+"""
+
+import io
+import zipfile
+from pathlib import Path
+
+from loguru import logger
+
+from azure.core.exceptions import ResourceNotFoundError
+from data_ingestion.internal.storage import storage_client
+from data_ingestion.models.file_upload import FileUpload
+
+
+class DQKitManager:
+    """Serve DQ Kit ZIP files, pre-built or assembled from ADLS artifacts."""
+
+    def __init__(self, file_upload: FileUpload):
+        self.file_upload = file_upload
+        self.dataset = file_upload.dataset
+        self.country = file_upload.country
+
+        # Artifact paths are keyed on this stem: taken from the full DQ report
+        # path when set, otherwise from the originally uploaded filename.
+        if file_upload.dq_full_path:
+            self.stem = Path(file_upload.dq_full_path).stem
+        else:
+            self.stem = Path(file_upload.original_filename or "").stem
+
+    @staticmethod
+    def _get_blob_if_exists(blob_path: str | None) -> bytes | None:
+        """Return the blob's bytes, or None if it is missing or unreadable.
+
+        Download errors are logged and reported as None rather than raised.
+        """
+        if not blob_path:
+            return None
+        try:
+            blob_client = storage_client.get_blob_client(blob_path)
+            if blob_client.exists():
+                logger.info(f"Found file: {blob_path}")
+                return blob_client.download_blob().readall()
+            logger.warning(f"File not found: {blob_path}")
+            return None
+        except ResourceNotFoundError:
+            logger.warning(f"File not found: {blob_path}")
+            return None
+        except Exception as e:
+            logger.error(f"Error downloading {blob_path}: {e}")
+            return None
+
+    @property
+    def _dataset_prefix(self) -> str:
+        """Storage folder prefix for this upload's dataset."""
+        return (
+            f"school-{self.dataset}"
+            if self.dataset not in ("unstructured", "structured")
+            else self.dataset
+        )
+
+    def _file_paths(self) -> dict[str, str | None]:
+        """Map each DQ Kit artifact label to its blob path (None if unset)."""
+        prefix = self._dataset_prefix
+        country = self.country
+        stem = self.stem
+        dq_root = f"data-quality-results/{prefix}"
+
+        # Pre-built ZIP path (from Dagster `geolocation_dq_kit_zip` asset)
+        prebuilt_zip = (
+            f"{dq_root}/dq-kit/{country}/DQ_Kit_{country}_{self.dataset}_{stem}.zip"
+        )
+
+        return {
+            "prebuilt_zip": prebuilt_zip,
+            "raw_data": self.file_upload.upload_path,
+            "dq_summary_json": self.file_upload.dq_report_path,
+            "dq_report_txt": f"{dq_root}/dq-report/{country}/{stem}.txt",
+            "passed_rows": f"{dq_root}/dq-passed-rows-human-readable/{country}/{stem}.csv",
+            "failed_rows": f"{dq_root}/dq-failed-rows-human-readable/{country}/{stem}.csv",
+            "dq_full_report": self.file_upload.dq_full_path,
+            "map_html": f"{dq_root}/dq-map/{country}/school_map_{country}_{stem}.html",
+        }
+
+    def map_blob_path(self) -> str:
+        """Return the conventional map HTML blob path for this upload."""
+        return self._file_paths()["map_html"]  # type: ignore[return-value]
+
+    def generate_zip(self) -> io.BytesIO:
+        """
+        Return the DQ Kit ZIP as a buffer positioned at offset 0.
+
+        Prefers the pre-built ZIP from Dagster; otherwise builds one on-demand
+        from the available artifacts.
+
+        Raises:
+            FileNotFoundError: if neither the pre-built ZIP nor any individual
+                artifact could be read from storage.
+        """
+        paths = self._file_paths()
+
+        # Fast path: pre-built ZIP already exists
+        if prebuilt := self._get_blob_if_exists(paths.pop("prebuilt_zip")):
+            logger.info("Serving pre-built DQ Kit ZIP from Dagster")
+            return io.BytesIO(prebuilt)
+
+        logger.info("Pre-built DQ Kit not found. Building on-demand.")
+        buffer = io.BytesIO()
+        added = 0
+        with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
+            for label, blob_path in paths.items():
+                content = self._get_blob_if_exists(blob_path)
+                if content is None:
+                    continue
+                # Several artifacts share one basename, so each is stored in
+                # a folder named after its label to avoid collisions.
+                archive.writestr(f"{label}/{Path(blob_path).name}", content)
+                added += 1
+
+        if not added:
+            raise FileNotFoundError(
+                f"No DQ Kit artifacts found for upload {self.file_upload.id}"
+            )
+
+        buffer.seek(0)
+        return buffer
+
+    def get_zip_filename(self) -> str:
+        """Return the download filename presented to the client."""
+        return f"DQ_Kit_{self.country}_{self.dataset}_{self.file_upload.id}.zip"
+
+
+def generate_dq_kit_zip(file_upload: FileUpload) -> tuple[io.BytesIO, str]:
+    """Convenience function returning (zip_buffer, filename)."""
+    generator = DQKitManager(file_upload)
+    return generator.generate_zip(), generator.get_zip_filename()
+
+
+def get_map_blob_path(file_upload: FileUpload) -> str:
+    """Return the conventional map HTML blob path for an upload."""
+    return DQKitManager(file_upload).map_blob_path()
diff --git a/ui/src/api/routers/uploads.ts b/ui/src/api/routers/uploads.ts
index 91391b76..e12295de 100644
--- a/ui/src/api/routers/uploads.ts
+++ b/ui/src/api/routers/uploads.ts
@@ -159,6 +159,24 @@ export default function routes(axi: AxiosInstance) {
       });
     },
 
+    download_dq_kit: (params: {
+      upload_id: string;
+    }): Promise> => {
+      const { upload_id } = params;
+      return axi.get(`upload/dq_kit/${upload_id}/download`, {
+        responseType: "blob",
+      });
+    },
+
+    download_map: (params: {
+      upload_id: string;
+    }): Promise> => {
+      const { upload_id } = params;
+      return axi.get(`upload/map/${upload_id}`, {
+        responseType: "blob",
+      });
+    },
+
     list_basic_checks: (
       dataset: string,
       source: string | null,
diff --git a/ui/src/components/check-file-uploads/Downloadlogic.tsx b/ui/src/components/check-file-uploads/Downloadlogic.tsx
index 2fd9d677..4b0a15a3 100644
--- a/ui/src/components/check-file-uploads/Downloadlogic.tsx
+++ b/ui/src/components/check-file-uploads/Downloadlogic.tsx
@@ -21,6 +21,14 @@ export function useDownloadHelpers(uploadData: UploadResponse) {
     mutationFn: api.uploads.download_raw_file,
   });
 
+  const { mutateAsync: downloadDqKit } = useMutation({
+    mutationFn: api.uploads.download_dq_kit,
+  });
+
+  const { mutateAsync: downloadMap } = useMutation({
+    mutationFn: api.uploads.download_map,
+  });
+
   function getFilenameFromFullPath(): string {
     const pathParts = uploadData.dq_full_path?.split("/") || [];
return pathParts[pathParts.length - 1]; @@ -69,10 +77,22 @@ export function useDownloadHelpers(uploadData: UploadResponse) { if (blob) saveFile(blob); } + async function handleDownloadDqKit() { + const blob = await downloadDqKit({ upload_id: uploadData.id }); + if (blob) saveFile(blob); + } + + async function handleDownloadMap() { + const blob = await downloadMap({ upload_id: uploadData.id }); + if (blob) saveFile(blob); + } + return { handleDownloadFailedRows, handleDownloadPassedRows, handleDownloadDqSummary, handleDownloadRawFile, + handleDownloadDqKit, + handleDownloadMap, }; } diff --git a/ui/src/routes/upload/$uploadId/index.tsx b/ui/src/routes/upload/$uploadId/index.tsx index 9f83f030..57a978ea 100644 --- a/ui/src/routes/upload/$uploadId/index.tsx +++ b/ui/src/routes/upload/$uploadId/index.tsx @@ -1,7 +1,15 @@ -import { useMemo } from "react"; +import { useEffect, useMemo, useState } from "react"; import { Download } from "@carbon/icons-react"; -import { Button, Tab, TabList, TabPanel, TabPanels, Tabs } from "@carbon/react"; +import { + Button, + InlineLoading, + Tab, + TabList, + TabPanel, + TabPanels, + Tabs, +} from "@carbon/react"; import { useSuspenseQuery } from "@tanstack/react-query"; import { createFileRoute } from "@tanstack/react-router"; @@ -38,6 +46,9 @@ export const Route = createFileRoute("/upload/$uploadId/")({ function Index() { const { uploadId } = Route.useParams(); + const [mapUrl, setMapUrl] = useState(""); + const [mapLoading, setMapLoading] = useState(true); + const [mapError, setMapError] = useState(""); const { data: dqResultQuery } = useSuspenseQuery({ queryKey: ["dq_check", uploadId], @@ -71,8 +82,43 @@ function Index() { handleDownloadPassedRows, handleDownloadDqSummary, handleDownloadRawFile, + handleDownloadDqKit, + handleDownloadMap, } = useDownloadHelpers(uploadData); + // Fetch map HTML and create a blob URL for iframe preview + useEffect(() => { + if ( + uploadData.dq_status !== "COMPLETED" || + uploadData.dataset !== 
"geolocation" + ) { + return; + } + + let createdUrl: string | null = null; + setMapLoading(true); + setMapError(""); + + api.uploads + .download_map({ upload_id: uploadId }) + .then(response => { + createdUrl = window.URL.createObjectURL(response.data); + setMapUrl(createdUrl); + setMapLoading(false); + }) + .catch(error => { + console.error("Error loading map:", error); + setMapError("Map not available or not yet generated"); + setMapLoading(false); + }); + + return () => { + if (createdUrl) { + window.URL.revokeObjectURL(createdUrl); + } + }; + }, [uploadId, uploadData.dq_status, uploadData.dataset]); + // Extract checks from dqResultData const { summary: _summaryStats, @@ -144,7 +190,7 @@ function Index() {

-
+
+
@@ -236,6 +290,95 @@ function Index() { + + {uploadData.dq_status === "COMPLETED" && + uploadData.dataset === "geolocation" && ( +
+
+

School Location Map

+ +
+ +
+ {mapLoading && ( +
+ +
+ )} + + {mapError && !mapLoading && ( +
+

{mapError}

+

+ The map may not have been generated yet. +

+
+ )} + + {mapUrl && !mapLoading && !mapError && ( +