diff --git a/internal/handler/document.go b/internal/handler/document.go index 96e81c4005f..87f5369cb1e 100644 --- a/internal/handler/document.go +++ b/internal/handler/document.go @@ -25,6 +25,7 @@ import ( "path/filepath" "ragflow/internal/common" "ragflow/internal/entity" + "ragflow/internal/utility" "strconv" "strings" "time" @@ -55,6 +56,9 @@ type documentServiceIface interface { DeleteDocumentMetadata(docID string, keys []string) error DeleteDocumentAllMetadata(docID string) error GetDocumentMetadataByID(docID string) (map[string]interface{}, error) + GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) + GetDocumentPreview(docID string) (*service.DocumentPreview, error) + DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error) } // DocumentHandler document handler @@ -198,6 +202,68 @@ func (h *DocumentHandler) GetDocumentImage(c *gin.Context) { c.Data(http.StatusOK, contentType, data) } +func (h *DocumentHandler) GetDocumentArtifact(c *gin.Context) { + filename := c.Param("filename") + artifact, err := h.documentService.GetDocumentArtifact(filename) + if err != nil { + switch { + case errors.Is(err, service.ErrArtifactInvalidFilename), + errors.Is(err, service.ErrArtifactInvalidFileType), + errors.Is(err, service.ErrArtifactNotFound): + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": err.Error(), + }) + default: + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeExceptionError, + "data": nil, + "message": err.Error(), + }) + } + return + } + + c.Header("Content-Type", artifact.ContentType) + if artifact.ForceAttachment { + c.Header("X-Content-Type-Options", "nosniff") + c.Header("Content-Disposition", "attachment") + } else { + c.Header("Content-Disposition", fmt.Sprintf(`inline; filename="%s"`, artifact.SafeFilename)) + } + c.Data(http.StatusOK, artifact.ContentType, artifact.Data) +} + +func (h *DocumentHandler) GetDocumentPreview(c *gin.Context) { + docID := c.Param("id") + + if docID == "" { + jsonError(c, common.CodeParamError, "id is required") + return + } + + preview, err := h.documentService.GetDocumentPreview(docID) + if err != nil { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": "Document not found!", + }) + return + } + + ext := utility.GetFileExtension(preview.FileName) + if preview.ContentType != "" { + c.Header("Content-Type", preview.ContentType) + } + + if utility.ShouldForceAttachment(ext, preview.ContentType) { + c.Header("X-Content-Type-Options", "nosniff") + c.Header("Content-Disposition", "attachment") + } + + c.Data(http.StatusOK, preview.ContentType, preview.Data) +} + // UpdateDocument update document // @Summary Update Document // @Description Update document info @@ -382,6 +448,40 @@ func (h *DocumentHandler) ListDocuments(c *gin.Context) { }) } +func (h *DocumentHandler) DownloadDocument(c *gin.Context) { + datasetID := c.Param("dataset_id") + docID := c.Param("document_id") + + if docID == "" { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": "Specify document_id please.", + }) + return + } + if datasetID == "" { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": fmt.Sprintf("The dataset not own the document %s.", docID), + }) + return + } + + res, err := h.documentService.DownloadDocument(datasetID, docID) + + if err != nil { + c.JSON(http.StatusOK, gin.H{ + "code": common.CodeDataError, + "message": err.Error(), + }) + return + } + + c.Header("Content-Type", res.ContentType) + c.Header("Content-Disposition", fmt.Sprintf(`attachment; filename="%s"`, res.FileName)) + c.Data(http.StatusOK, res.ContentType, res.Data) +} + func mapDocumentListItem(doc *entity.DocumentListItem, metaFields map[string]interface{}) map[string]interface{} { item := map[string]interface{}{ "id": doc.ID, diff --git a/internal/handler/document_test.go b/internal/handler/document_test.go index 8717e7b51ba..68b79849017 100644 --- a/internal/handler/document_test.go +++ b/internal/handler/document_test.go @@ -42,6 +42,40 @@ type fakeDocumentService struct { stopErr error } +func (f *fakeDocumentService) GetDocumentArtifact(filename string) (*service.ArtifactResponse, error) { + if filename == "error.txt" { + return nil, service.ErrArtifactNotFound + } + if filename == "unexpected.txt" { + return nil, fmt.Errorf("unexpected error") + } + return &service.ArtifactResponse{ + Data: []byte("artifact content"), + ContentType: "text/plain", + SafeFilename: "safe.txt", + ForceAttachment: false, + }, nil +} +func (f *fakeDocumentService) GetDocumentPreview(docID string) (*service.DocumentPreview, error) { + if docID == "not-found" { + return nil, fmt.Errorf("not found") + } + return &service.DocumentPreview{ + Data: []byte("preview content"), + ContentType: "text/plain", + FileName: "preview.txt", + }, nil +} +func (f *fakeDocumentService) DownloadDocument(datasetID, docID string) (*service.DownloadDocumentResp, error) { + if docID == "not-found" { + return nil, fmt.Errorf("not found") + } + return &service.DownloadDocumentResp{ + Data: []byte("document data"), + ContentType: "application/pdf", + FileName: "doc.pdf", + }, nil +} func (f *fakeDocumentService) CreateDocument(req *service.CreateDocumentRequest) (*entity.Document, error) { return nil, nil } @@ -442,3 +476,146 @@ func TestStopParseDocumentsHandler_NotAccessible(t *testing.T) { t.Fatal("expected error for no authorization") } } + +func TestGetDocumentArtifact_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/test.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "test.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "text/plain" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "artifact content" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestGetDocumentArtifact_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/error.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "error.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} + +func TestGetDocumentArtifact_UnexpectedError(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/artifact/unexpected.txt", "") + c.Params = gin.Params{{Key: "filename", Value: "unexpected.txt"}} + + h.GetDocumentArtifact(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeExceptionError) { + t.Fatalf("expected code %d, got %v", common.CodeExceptionError, resp["code"]) + } +} + +func TestGetDocumentPreview_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/doc-1/preview", "") + c.Params = gin.Params{{Key: "id", Value: "doc-1"}} + + h.GetDocumentPreview(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "text/plain" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "preview content" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestGetDocumentPreview_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/documents/not-found/preview", "") + c.Params = gin.Params{{Key: "id", Value: "not-found"}} + + h.GetDocumentPreview(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} + +func TestDownloadDocument_Success(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/doc-1", "") + c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "doc-1"}} + + h.DownloadDocument(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if w.Header().Get("Content-Type") != "application/pdf" { + t.Fatalf("unexpected content type: %s", w.Header().Get("Content-Type")) + } + if w.Body.String() != "document data" { + t.Fatalf("unexpected body: %s", w.Body.String()) + } +} + +func TestDownloadDocument_NotFound(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &DocumentHandler{ + documentService: &fakeDocumentService{}, + } + c, w := setupGinContextWithUser("GET", "/api/v1/datasets/ds-1/documents/not-found", "") + c.Params = gin.Params{{Key: "dataset_id", Value: "ds-1"}, {Key: "document_id", Value: "not-found"}} + + h.DownloadDocument(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp map[string]interface{} + json.Unmarshal(w.Body.Bytes(), &resp) + if resp["code"] != float64(common.CodeDataError) { + t.Fatalf("expected code %d, got %v", common.CodeDataError, resp["code"]) + } +} diff --git a/internal/router/router.go b/internal/router/router.go index 9a84c7b55ca..865bfec6291 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -204,6 +204,8 @@ func (r *Router) Setup(engine *gin.Engine) { { documents.POST("", r.documentHandler.CreateDocument) documents.GET("", r.documentHandler.ListDocuments) + documents.GET("/artifact/:filename", r.documentHandler.GetDocumentArtifact) + documents.GET("/:id/preview", r.documentHandler.GetDocumentPreview) documents.GET("/:id", r.documentHandler.GetDocumentByID) documents.PUT("/:id", r.documentHandler.UpdateDocument) documents.DELETE("/:id", r.documentHandler.DeleteDocument) @@ -244,6 +246,7 @@ func (r *Router) Setup(engine *gin.Engine) { // Dataset documents datasets.GET("/:dataset_id/documents", r.documentHandler.ListDocuments) + datasets.GET("/:dataset_id/documents/:document_id", r.documentHandler.DownloadDocument) datasets.DELETE("/:dataset_id/documents", r.documentHandler.DeleteDocuments) // Dataset document chunk diff --git a/internal/service/document.go b/internal/service/document.go index 213c6edb39e..ceaef93ba83 100644 --- a/internal/service/document.go +++ b/internal/service/document.go @@ -19,10 +19,14 @@ package service import ( "context" "encoding/json" + "errors" "fmt" + "os" + "path/filepath" "ragflow/internal/common" "ragflow/internal/entity" "ragflow/internal/storage" + "ragflow/internal/utility" "regexp" "sort" "strings" @@ -117,6 +121,51 @@ type ThumbnailResponse struct { KbID string `json:"kb_id"` } +type ArtifactResponse struct { + Data []byte + ContentType string + SafeFilename string + ForceAttachment bool +} + +var ( + ErrArtifactInvalidFilename = errors.New("Invalid filename.") + ErrArtifactInvalidFileType = errors.New("Invalid file type.") + ErrArtifactNotFound = errors.New("Artifact not found.") +) + +var artifactContentTypes = map[string]string{ + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".svg": "image/svg+xml", + ".pdf": "application/pdf", + ".csv": "text/csv", + ".json": "application/json", + ".html": "text/html", +} + +var artifactForceAttachmentExtensions = map[string]struct{}{ + ".htm": {}, + ".html": {}, + ".shtml": {}, + ".xht": {}, + ".xhtml": {}, + ".xml": {}, + ".mhtml": {}, + ".svg": {}, +} +var artifactForceAttachmentContentTypes = map[string]struct{}{ + "text/html": {}, + "image/svg+xml": {}, + "application/xhtml+xml": {}, + "text/xml": {}, + "application/xml": {}, + "multipart/related": {}, +} + +var artifactUnsafeFilenameChars = regexp.MustCompile(`[^\pL\pN_.-]`) + // GetDocumentImage retrieves an image object from storage. func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) { parts := strings.Split(imageID, "-") @@ -132,6 +181,186 @@ func (s *DocumentService) GetDocumentImage(imageID string) ([]byte, error) { return storageImpl.Get(parts[0], parts[1]) } +// GetDocumentArtifact retrieves a sandbox artifact from object storage. +func (s *DocumentService) GetDocumentArtifact(filename string) (*ArtifactResponse, error) { + basename := filepath.Base(filename) + if basename != filename || strings.Contains(filename, "/") || strings.Contains(filename, "\\") { + return nil, ErrArtifactInvalidFilename + } + + ext := strings.ToLower(filepath.Ext(basename)) + contentType, ok := artifactContentTypes[ext] + if !ok { + return nil, ErrArtifactInvalidFileType + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + bucket := sandboxArtifactBucket() + if !storageImpl.ObjExist(bucket, basename) { + return nil, ErrArtifactNotFound + } + + data, err := storageImpl.Get(bucket, basename) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, ErrArtifactNotFound + } + + return &ArtifactResponse{ + Data: data, + ContentType: contentType, + SafeFilename: sanitizeArtifactFilename(basename), + ForceAttachment: shouldForceArtifactAttachment(ext, contentType), + }, nil +} + +func sandboxArtifactBucket() string { + if bucket := os.Getenv("SANDBOX_ARTIFACT_BUCKET"); bucket != "" { + return bucket + } + return "sandbox-artifacts" +} + +func sanitizeArtifactFilename(filename string) string { + return artifactUnsafeFilenameChars.ReplaceAllString(filename, "_") +} + +func shouldForceArtifactAttachment(ext, contentType string) bool { + if _, ok := artifactForceAttachmentExtensions[strings.ToLower(ext)]; ok { + return true + } + _, ok := artifactForceAttachmentContentTypes[strings.ToLower(contentType)] + return ok +} + +type DocumentPreview struct { + Data []byte + ContentType string + FileName string +} + +func (s *DocumentService) GetDocumentPreview(docID string) (*DocumentPreview, error) { + doc, err := s.documentDAO.GetByID(docID) + if err != nil { + return nil, err + } + + bucket, name, err := s.GetDocumentStorageAddress(doc) + if err != nil { + return nil, err + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + data, err := storageImpl.Get(bucket, name) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, ErrArtifactNotFound + } + + fileName := "" + if doc.Name != nil { + fileName = *doc.Name + } + + ext := utility.GetFileExtension(fileName) + contentType := utility.GetContentType(ext, doc.Type) + + return &DocumentPreview{ + Data: data, + ContentType: contentType, + FileName: fileName, + }, nil +} + +func (s *DocumentService) GetDocumentStorageAddress(doc *entity.Document) (string, string, error) { + if doc == nil { + return "", "", fmt.Errorf("document is nil") + } + + file2DocumentDAO := dao.NewFile2DocumentDAO() + fileDAO := dao.NewFileDAO() + + mappings, err := file2DocumentDAO.GetByDocumentID(doc.ID) + if err != nil { + return "", "", err + } + + if len(mappings) > 0 && mappings[0].FileID != nil { + file, err := fileDAO.GetByID(*mappings[0].FileID) + if err != nil { + return "", "", err + } + + if file.SourceType == "" || entity.FileSource(file.SourceType) == entity.FileSourceLocal { + if file.Location == nil || *file.Location == "" { + return "", "", fmt.Errorf("file location is empty") + } + return file.ParentID, *file.Location, nil + } + } + + if doc.Location == nil || *doc.Location == "" { + return "", "", fmt.Errorf("document location is empty") + } + return doc.KbID, *doc.Location, nil +} + +type DownloadDocumentResp struct { + Data []byte + FileName string + ContentType string +} + +func (s *DocumentService) DownloadDocument(datasetID, docID string) (*DownloadDocumentResp, error) { + if docID == "" { + return nil, fmt.Errorf("Specify document_id please.") + } + doc, err := s.documentDAO.GetByID(docID) + if err != nil || doc.KbID != datasetID { + return nil, fmt.Errorf("The dataset not own the document %s.", docID) + } + bucket, name, err := s.GetDocumentStorageAddress(doc) + if err != nil { + return nil, err + } + + storageImpl := storage.GetStorageFactory().GetStorage() + if storageImpl == nil { + return nil, fmt.Errorf("storage not initialized") + } + + data, err := storageImpl.Get(bucket, name) + if err != nil { + return nil, err + } + if len(data) == 0 { + return nil, fmt.Errorf("This file is empty.") + } + + fileName := "" + if doc.Name != nil { + fileName = *doc.Name + } + + return &DownloadDocumentResp{ + Data: data, + FileName: fileName, + ContentType: "application/octet-stream", + }, nil +} + // CreateDocument create document func (s *DocumentService) CreateDocument(req *CreateDocumentRequest) (*entity.Document, error) { document := &entity.Document{ diff --git a/internal/service/document_test.go b/internal/service/document_test.go index 31a3f6718b0..7f646964eeb 100644 --- a/internal/service/document_test.go +++ b/internal/service/document_test.go @@ -855,3 +855,67 @@ func TestCleanupFileReferences_SharedFileSurvives(t *testing.T) { t.Fatalf("expected 1 f2d for doc-2, got %d", len(mappings)) } } + +func TestArtifactHelpers(t *testing.T) { + // Test sanitizeArtifactFilename + safe := sanitizeArtifactFilename("test@#file.txt") + if safe != "test__file.txt" { + t.Errorf("expected test__file.txt, got %s", safe) + } + + // Test shouldForceArtifactAttachment + if !shouldForceArtifactAttachment(".html", "text/html") { + t.Error("expected true for .html") + } + if shouldForceArtifactAttachment(".txt", "text/plain") { + t.Error("expected false for .txt") + } +} + +func TestGetDocumentArtifact_InvalidFilename(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.GetDocumentArtifact("../test.txt") + if err != ErrArtifactInvalidFilename { + t.Errorf("expected ErrArtifactInvalidFilename, got %v", err) + } +} + +func TestGetDocumentArtifact_InvalidFileType(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.GetDocumentArtifact("test.exe") + if err != ErrArtifactInvalidFileType { + t.Errorf("expected ErrArtifactInvalidFileType, got %v", err) + } +} + +func TestGetDocumentPreview_DocumentNotFound(t *testing.T) { + db := setupServiceTestDB(t) + pushServiceDB(t, db) + svc := testDocumentService(t) + + _, err := svc.GetDocumentPreview("nonexistent") + if err == nil { + t.Error("expected error for nonexistent document") + } +} + +func TestDownloadDocument_MissingDocID(t *testing.T) { + svc := testDocumentService(t) + _, err := svc.DownloadDocument("ds-1", "") + if err == nil { + t.Error("expected error for missing docID") + } +} + +func TestDownloadDocument_WrongDataset(t *testing.T) { + db := setupServiceTestDB(t) + pushServiceDB(t, db) + insertTestKB(t, "kb-1", "tenant-1", 1, 5, 2) + insertTestDoc(t, "doc-1", "kb-1", 5, 2) + svc := testDocumentService(t) + + _, err := svc.DownloadDocument("wrong-ds", "doc-1") + if err == nil { + t.Error("expected error for wrong dataset") + } +}