Public/Add-DifyDocument.ps1

function Add-DifyDocument {
    <#
    .SYNOPSIS
    Uploads files and registers them as documents in a Dify knowledge (dataset).

    .DESCRIPTION
    Collects files from the pipeline (-Item) and/or from -Path, uploads them with
    Add-DifyFile, then POSTs to the console API endpoint
    "/console/api/datasets/<Knowledge.Id>/documents" to create the documents.
    With -Wait, the function blocks on Get-DifyDocumentIndexingStatus and then
    returns the refreshed documents obtained from Get-DifyDocument.

    .PARAMETER Item
    File objects to upload. Accepts pipeline input.

    .PARAMETER Path
    One or more paths that are expanded with Get-ChildItem and uploaded.

    .PARAMETER Knowledge
    The target knowledge object; its Id property is used. Exactly one is allowed.

    .PARAMETER ChunkMode
    "automatic" (default) or "custom". "custom" sends the fixed pre-processing and
    segmentation rules defined in this function.

    .PARAMETER IndexMode
    "high_quality" (default) or "economy".

    .PARAMETER EmbeddingModel
    Object whose Model and Provider properties are sent as the embedding model.
    When omitted with IndexMode "high_quality", the result of
    Get-DifySystemModel -Type "text-embedding" is used.

    .PARAMETER RetrievalMode
    Only "semantic_search" is implemented; "full_text_search" and "hybrid_search"
    are recognized but rejected.

    .PARAMETER Wait
    Wait for indexing to finish before returning.

    .PARAMETER Interval
    Passed to Get-DifyDocumentIndexingStatus -Interval when -Wait is used.

    .PARAMETER Timeout
    Passed to Get-DifyDocumentIndexingStatus -Timeout when -Wait is used.

    .OUTPUTS
    PSCustomObject per created document (Batch, KnowledgeId, Id, Name,
    DataSourceType, WordCount, HitCount, IndexingStatus, Enabled, Archived,
    CreatedBy, CreatedAt, UploadedBy, UploadedAt). With -Wait, the objects
    returned by Get-DifyDocument for the same document IDs.
    #>
    [CmdletBinding()]
    param(
        [Parameter(ValueFromPipeline = $true)]
        [PSCustomObject[]] $Item = @(),
        [String[]] $Path = @(),
        [PSCustomObject] $Knowledge,
        [String] $ChunkMode = "automatic",
        [String] $IndexMode = "high_quality",
        [PSCustomObject] $EmbeddingModel = $null,
        [String] $RetrievalMode = "semantic_search",
        [Switch] $Wait = $false,
        [Int] $Interval = 5,
        [Int] $Timeout = 300
    )

    begin {
        # A list avoids re-allocating an array for every piped item.
        $Files = [System.Collections.Generic.List[object]]::new()
    }

    process {
        foreach ($ItemObject in $Item) {
            $Files.Add($ItemObject)
        }
    }

    end {
        if ($Files.Count -eq 0 -and -not $Path) {
            throw "Path is required"
        }
        if ($Path) {
            foreach ($FoundItem in (Get-ChildItem -Path $Path)) {
                $Files.Add($FoundItem)
            }
        }
        # -Path was given but matched nothing: fail here instead of sending an
        # empty file list to the upload/document endpoints.
        if ($Files.Count -eq 0) {
            throw "No files found for the specified path"
        }

        if (-not $Knowledge) {
            throw "Knowledge is required"
        }
        if (@($Knowledge).Count -gt 1) {
            throw "Only one knowledge can be specified"
        }

        $ValidChunkModes = @("automatic", "custom")
        if (-not $ValidChunkModes.Contains($ChunkMode)) {
            throw "Invalid value for ChunkMode. Must be one of: $($ValidChunkModes -join ', ')"
        }
        $ValidIndexModes = @("high_quality", "economy")
        if (-not $ValidIndexModes.Contains($IndexMode)) {
            throw "Invalid value for IndexMode. Must be one of: $($ValidIndexModes -join ', ')"
        }
        $ValidRetrievalModes = @("semantic_search", "full_text_search", "hybrid_search")
        if (-not $ValidRetrievalModes.Contains($RetrievalMode)) {
            throw "Invalid value for RetrievalMode. Must be one of: $($ValidRetrievalModes -join ', ')"
        }

        # not implemented
        if ($RetrievalMode -eq "full_text_search") {
            throw "RetrievalMode: full_text_search is not implemented"
        }
        if ($RetrievalMode -eq "hybrid_search") {
            throw "RetrievalMode: hybrid_search is not implemented"
        }

        # Resolve the embedding model. The default is only looked up when it is
        # actually needed, so an explicit -EmbeddingModel (or "economy" indexing)
        # does not depend on the system default being available.
        if ($IndexMode -eq "high_quality" -and -not $EmbeddingModel) {
            $EmbeddingModel = Get-DifySystemModel -Type "text-embedding"
            if (-not $EmbeddingModel) {
                throw "Model is required for IndexMode: high_quality"
            }
        }

        # upload files
        $UploadedFiles = Add-DifyFile -Path $Files.ToArray() -Source "datasets"
        $UploadedFileIds = $UploadedFiles | Select-Object -ExpandProperty Id
        if (@($UploadedFileIds).Count -eq 0) {
            throw "Failed to upload files"
        }

        # rules: "automatic" sends an empty rule set, "custom" sends fixed defaults
        $Rules = @{}
        if ($ChunkMode -eq "custom") {
            $Rules = @{
                "pre_processing_rules" = @(
                    @{
                        "id"      = "remove_extra_spaces"
                        "enabled" = $true
                    },
                    @{
                        "id"      = "remove_urls_emails"
                        "enabled" = $false
                    }
                )
                "segmentation"         = @{
                    # Literal backslash-n characters (PowerShell does not treat "\n" as an escape).
                    "separator"     = "\n\n"
                    "max_tokens"    = 500
                    "chunk_overlap" = 50
                }
            }
        }

        # add document
        $Endpoint = Join-Url -Segments @($env:PSDIFY_URL, "/console/api/datasets", $Knowledge.Id, "/documents")
        $Method = "POST"
        $Body = @{
            "data_source"              = @{
                "type"      = "upload_file"
                "info_list" = @{
                    "data_source_type" = "upload_file"
                    "file_info_list"   = @{
                        "file_ids" = @($UploadedFileIds)
                    }
                }
            }
            "indexing_technique"       = $IndexMode
            "process_rule"             = @{
                "mode"  = $ChunkMode
                "rules" = $Rules
            }
            "doc_form"                 = "text_model"
            "doc_language"             = "English"
            "retrieval_model"          = @{
                "search_method"           = $RetrievalMode
                "reranking_enable"        = $false
                "reranking_mode"          = $null
                "reranking_model"         = @{
                    "reranking_provider_name" = $null
                    "reranking_model_name"    = $null
                }
                "weights"                 = $null
                "top_k"                   = 3
                "score_threshold_enabled" = $false
                "score_threshold"         = 0
            }
            "embedding_model"          = $EmbeddingModel.Model
            "embedding_model_provider" = $EmbeddingModel.Provider
        } | ConvertTo-Json -Depth 10
        try {
            $Response = Invoke-DifyRestMethod -Uri $Endpoint -Method $Method -Body $Body -Token $env:PSDIFY_CONSOLE_TOKEN
        }
        catch {
            throw "Failed to add documents to knowledge: $_"
        }

        if (-not $Response.documents) {
            throw "Failed to add documents to knowledge"
        }

        # Member list used to translate creator/uploader IDs into e-mail addresses.
        # Best effort: if the lookup fails, the raw IDs are reported instead.
        # NOTE(review): assumes Get-DifyMember returns objects with Id and Email
        # properties - confirm against the module.
        $Members = @()
        try {
            $Members = @(Get-DifyMember)
        }
        catch {
            Write-Verbose "Could not resolve members; falling back to user IDs: $_"
        }

        $Documents = @()
        foreach ($Document in $Response.documents) {
            $UploadInfo = $Document.data_source_detail_dict.upload_file

            $CreatedBy = $Members | Where-Object { $_.Id -eq $Document.created_by } | Select-Object -ExpandProperty Email
            if (-not $CreatedBy) {
                $CreatedBy = $Document.created_by
            }
            $UploadedBy = $Members | Where-Object { $_.Id -eq $UploadInfo.created_by } | Select-Object -ExpandProperty Email
            if (-not $UploadedBy) {
                $UploadedBy = $UploadInfo.created_by
            }

            $Documents += [PSCustomObject]@{
                Batch          = $Response.batch
                KnowledgeId    = $Knowledge.Id
                Id             = $Document.id
                Name           = $Document.name
                DataSourceType = $Document.data_source_type
                WordCount      = $Document.word_count
                HitCount       = $Document.hit_count
                IndexingStatus = $Document.indexing_status
                Enabled        = $Document.enabled
                Archived       = $Document.archived
                CreatedBy      = $CreatedBy
                CreatedAt      = Convert-UnixTimeToLocalDateTime($Document.created_at)
                UploadedBy     = $UploadedBy
                UploadedAt     = Convert-UnixTimeToLocalDateTime($UploadInfo.created_at)
            }
        }

        if (-not $Wait) {
            return $Documents
        }

        # Block until indexing finishes (or times out), then return the refreshed
        # view of exactly the documents created above.
        $null = Get-DifyDocumentIndexingStatus -Document $Documents -Wait -Interval $Interval -Timeout $Timeout
        $UpdatedAllDocuments = Get-DifyDocument -Knowledge $Knowledge
        $UpdatedDocuments = @()
        foreach ($Document in $Documents) {
            $UpdatedDocuments += $UpdatedAllDocuments | Where-Object { $_.Id -eq $Document.Id }
        }
        return $UpdatedDocuments
    }
}