# Metadata-Aware Embedding Clusters

## Problem
You need to cluster embeddings while preserving and utilizing metadata (categories, tags, timestamps) to create more meaningful clusters that respect domain boundaries and improve retrieval quality.
Pure embedding-based clustering groups documents by semantic similarity alone, but this can produce misleading clusters. The word “Python” in a programming tutorial and “Python” in a wildlife documentary produce similar embeddings, yet they belong to completely different domains. Metadata like categories, tags, and timestamps provides domain context that embeddings lack. Incorporating this metadata as clustering constraints produces groups that are both semantically coherent and domain-appropriate.
## Solution

Implement clustering algorithms that incorporate metadata as constraints or features, allowing you to create clusters that are both semantically similar and metadata-coherent. The approach uses a weighted combination of embedding similarity and metadata overlap, where the `metadataWeight` parameter controls the balance. This produces clusters that are more useful for real-world applications like topic-based navigation, duplicate detection, and retrieval pre-filtering.
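The weighted combination can be sketched in isolation before looking at the full recipe. The helper name `combinedDistance` below is hypothetical (not part of any library API); it simply mirrors the scoring idea: both similarities are converted to distances and blended by the weight.

```go
package main

import "fmt"

// combinedDistance blends embedding and metadata similarity into a single
// distance in [0, 1]. Both inputs are similarities in [0, 1]; a lower
// result means a better cluster fit. (Hypothetical standalone helper.)
func combinedDistance(embeddingSim, metadataSim, metadataWeight float64) float64 {
	return (1-metadataWeight)*(1-embeddingSim) + metadataWeight*(1-metadataSim)
}

func main() {
	// Near-identical embeddings (similarity 0.95) but zero metadata overlap:
	// the distance grows as metadataWeight increases.
	fmt.Printf("%.3f\n", combinedDistance(0.95, 0.0, 0.0)) // 0.050 (pure embedding)
	fmt.Printf("%.3f\n", combinedDistance(0.95, 0.0, 0.3)) // 0.335 (metadata-aware)
}
```

With a nonzero weight, two documents that embed almost identically but share no metadata are pushed apart, which is exactly the behavior the "Python the language vs. Python the snake" example calls for.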
## Code Example

```go
package main

import (
	"context"
	"fmt"
	"log"
	"math"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"

	"github.com/lookatitude/beluga-ai/rag/embedding"
	"github.com/lookatitude/beluga-ai/schema"
)

var tracer = otel.Tracer("beluga.embeddings.clustering")

// MetadataAwareClusterer clusters embeddings with metadata constraints.
type MetadataAwareClusterer struct {
	embedder       embedding.Embedder
	metadataWeight float64 // Weight for metadata similarity (0-1)
	minClusterSize int
}

// DocumentWithEmbedding represents a document with its embedding.
type DocumentWithEmbedding struct {
	Document  schema.Document
	Embedding []float32
}

// Cluster represents a group of similar documents.
type Cluster struct {
	ID        string
	Documents []DocumentWithEmbedding
	Centroid  []float32
	Metadata  map[string]interface{}
}

// NewMetadataAwareClusterer creates a new clusterer.
func NewMetadataAwareClusterer(embedder embedding.Embedder, metadataWeight float64, minClusterSize int) *MetadataAwareClusterer {
	return &MetadataAwareClusterer{
		embedder:       embedder,
		metadataWeight: metadataWeight,
		minClusterSize: minClusterSize,
	}
}

// ClusterDocuments clusters documents using embeddings and metadata.
func (mac *MetadataAwareClusterer) ClusterDocuments(ctx context.Context, documents []schema.Document, k int) ([]Cluster, error) {
	ctx, span := tracer.Start(ctx, "clusterer.cluster_documents")
	defer span.End()

	span.SetAttributes(
		attribute.Int("document_count", len(documents)),
		attribute.Int("k_clusters", k),
		attribute.Float64("metadata_weight", mac.metadataWeight),
	)

	// Generate embeddings for all document contents.
	texts := make([]string, len(documents))
	for i, doc := range documents {
		texts[i] = doc.GetContent()
	}

	embeddings, err := mac.embedder.EmbedDocuments(ctx, texts)
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return nil, err
	}

	// Create document-embedding pairs.
	docsWithEmbeddings := make([]DocumentWithEmbedding, len(documents))
	for i := range documents {
		docsWithEmbeddings[i] = DocumentWithEmbedding{
			Document:  documents[i],
			Embedding: embeddings[i],
		}
	}

	// Perform metadata-aware clustering.
	clusters, err := mac.performClustering(ctx, docsWithEmbeddings, k)
	if err != nil {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return nil, err
	}

	span.SetAttributes(attribute.Int("cluster_count", len(clusters)))
	span.SetStatus(codes.Ok, "clustering completed")

	return clusters, nil
}

// performClustering runs k-means with metadata constraints.
func (mac *MetadataAwareClusterer) performClustering(ctx context.Context, docs []DocumentWithEmbedding, k int) ([]Cluster, error) {
	clusters := make([]Cluster, k)
	for i := 0; i < k; i++ {
		clusters[i] = Cluster{
			ID:        fmt.Sprintf("cluster-%d", i),
			Documents: []DocumentWithEmbedding{},
			Metadata:  make(map[string]interface{}),
		}
	}

	// Initialize centroids from the documents themselves.
	for i := range clusters {
		clusters[i].Centroid = docs[i%len(docs)].Embedding
	}

	// K-means iteration.
	maxIterations := 100
	for iteration := 0; iteration < maxIterations; iteration++ {
		for i := range clusters {
			clusters[i].Documents = []DocumentWithEmbedding{}
		}

		// Assign each document to its best cluster.
		for _, doc := range docs {
			bestCluster := mac.findBestCluster(doc, clusters)
			clusters[bestCluster].Documents = append(clusters[bestCluster].Documents, doc)
		}

		// Update centroids; stop when no centroid moves.
		changed := false
		for i := range clusters {
			if len(clusters[i].Documents) > 0 {
				newCentroid := mac.calculateCentroid(clusters[i].Documents)
				if !mac.vectorsEqual(clusters[i].Centroid, newCentroid) {
					clusters[i].Centroid = newCentroid
					changed = true
				}
			}
		}

		if !changed {
			break
		}
	}

	// Filter small clusters and aggregate metadata.
	filteredClusters := []Cluster{}
	for _, cluster := range clusters {
		if len(cluster.Documents) >= mac.minClusterSize {
			cluster.Metadata = mac.aggregateMetadata(cluster.Documents)
			filteredClusters = append(filteredClusters, cluster)
		}
	}

	return filteredClusters, nil
}

// findBestCluster finds the best cluster for a document using the combined distance.
func (mac *MetadataAwareClusterer) findBestCluster(doc DocumentWithEmbedding, clusters []Cluster) int {
	bestIdx := 0
	bestScore := math.MaxFloat64

	for i, cluster := range clusters {
		embeddingSim := mac.cosineSimilarity(doc.Embedding, cluster.Centroid)
		metadataSim := mac.metadataSimilarity(doc.Document, cluster.Documents)

		// Lower is better: both similarities are converted to distances.
		score := (1-mac.metadataWeight)*(1-embeddingSim) + mac.metadataWeight*(1-metadataSim)

		if score < bestScore {
			bestScore = score
			bestIdx = i
		}
	}

	return bestIdx
}

// cosineSimilarity calculates cosine similarity between two vectors.
func (mac *MetadataAwareClusterer) cosineSimilarity(a, b []float32) float64 {
	var dotProduct, normA, normB float64
	for i := range a {
		dotProduct += float64(a[i] * b[i])
		normA += float64(a[i] * a[i])
		normB += float64(b[i] * b[i])
	}
	if normA == 0 || normB == 0 {
		return 0
	}
	return dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
}

// metadataSimilarity calculates metadata overlap between a document and a cluster.
func (mac *MetadataAwareClusterer) metadataSimilarity(doc schema.Document, clusterDocs []DocumentWithEmbedding) float64 {
	if len(clusterDocs) == 0 {
		return 0
	}

	docMeta := doc.GetMetadata()
	matches := 0

	for _, clusterDoc := range clusterDocs {
		clusterMeta := clusterDoc.Document.GetMetadata()
		for key, value := range docMeta {
			if clusterValue, exists := clusterMeta[key]; exists && clusterValue == value {
				matches++
			}
		}
	}

	totalComparisons := len(clusterDocs) * len(docMeta)
	if totalComparisons == 0 {
		return 0
	}

	return float64(matches) / float64(totalComparisons)
}

// calculateCentroid calculates the centroid of a cluster.
func (mac *MetadataAwareClusterer) calculateCentroid(docs []DocumentWithEmbedding) []float32 {
	if len(docs) == 0 {
		return nil
	}

	dim := len(docs[0].Embedding)
	centroid := make([]float32, dim)

	for _, doc := range docs {
		for i := range doc.Embedding {
			centroid[i] += doc.Embedding[i]
		}
	}

	for i := range centroid {
		centroid[i] /= float32(len(docs))
	}

	return centroid
}

// aggregateMetadata aggregates metadata from cluster documents by majority vote.
func (mac *MetadataAwareClusterer) aggregateMetadata(docs []DocumentWithEmbedding) map[string]interface{} {
	aggregated := make(map[string]interface{})
	valueCounts := make(map[string]map[interface{}]int)

	for _, doc := range docs {
		meta := doc.Document.GetMetadata()
		for key, value := range meta {
			if valueCounts[key] == nil {
				valueCounts[key] = make(map[interface{}]int)
			}
			valueCounts[key][value]++
		}
	}

	for key, counts := range valueCounts {
		maxCount := 0
		var maxValue interface{}
		for value, count := range counts {
			if count > maxCount {
				maxCount = count
				maxValue = value
			}
		}
		aggregated[key] = maxValue
	}

	return aggregated
}

// vectorsEqual checks if two vectors are approximately equal.
func (mac *MetadataAwareClusterer) vectorsEqual(a, b []float32) bool {
	if len(a) != len(b) {
		return false
	}
	for i := range a {
		if math.Abs(float64(a[i]-b[i])) > 0.0001 {
			return false
		}
	}
	return true
}

func main() {
	ctx := context.Background()

	var embedder embedding.Embedder // plug in a real embedding.Embedder; nil will panic at runtime
	clusterer := NewMetadataAwareClusterer(embedder, 0.3, 5)

	documents := []schema.Document{
		schema.NewDocument("Document 1", map[string]string{"category": "tech"}),
		// ... more documents
	}

	clusters, err := clusterer.ClusterDocuments(ctx, documents, 10)
	if err != nil {
		log.Fatalf("Failed to cluster: %v", err)
	}
	fmt.Printf("Created %d clusters\n", len(clusters))
}
```
## Explanation

- **Combined similarity metric** — Embedding similarity is combined with metadata similarity using a configurable weight parameter (`metadataWeight`). A weight of 0.0 produces pure embedding-based clustering, while 1.0 would cluster purely by metadata. A typical value of 0.2-0.4 adds domain awareness while keeping semantic similarity as the primary signal. This balance ensures that documents are grouped by meaning while respecting domain boundaries.
- **Metadata aggregation** — After clustering, metadata from cluster members is aggregated using majority voting (most common value per key). This creates cluster-level metadata that describes what each cluster represents (e.g., "category: tech, quarter: Q4"). Cluster metadata is useful for labeling, navigation, and as pre-filtering criteria for retrieval.
- **Minimum cluster size** — Clusters below a minimum size are filtered out. Small clusters often represent noise or outlier documents that don't share meaningful commonalities. Filtering them prevents creating too many tiny clusters that add complexity without improving retrieval.
- **Domain boundary respect** — The metadata similarity component prevents semantically similar but domain-different documents from clustering together. A document about "Python" in the "programming" category should not cluster with "Python" the snake in the "animals" category, even if their embeddings are similar. The metadata weight provides the domain signal that embeddings lack.
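The majority-voting aggregation can be demonstrated on its own. The helper `majorityMetadata` below is a hypothetical standalone sketch of that step, simplified to string-valued metadata:

```go
package main

import "fmt"

// majorityMetadata picks the most common value per key across cluster
// members — a standalone sketch of the recipe's aggregation step.
func majorityMetadata(metas []map[string]string) map[string]string {
	counts := make(map[string]map[string]int)
	for _, meta := range metas {
		for k, v := range meta {
			if counts[k] == nil {
				counts[k] = make(map[string]int)
			}
			counts[k][v]++
		}
	}
	out := make(map[string]string)
	for k, vs := range counts {
		best, bestN := "", 0
		for v, n := range vs {
			if n > bestN {
				best, bestN = v, n
			}
		}
		out[k] = best
	}
	return out
}

func main() {
	metas := []map[string]string{
		{"category": "tech", "quarter": "Q4"},
		{"category": "tech", "quarter": "Q3"},
		{"category": "finance", "quarter": "Q4"},
	}
	// "tech" and "Q4" each win 2-to-1.
	fmt.Println(majorityMetadata(metas)) // map[category:tech quarter:Q4]
}
```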
## Variations

### Hierarchical Clustering

Use hierarchical clustering for nested categories:

```go
type HierarchicalClusterer struct {
	// Build a tree of clusters keyed by the category hierarchy.
}
```

### Dynamic K Selection

Automatically determine the optimal k:

```go
func (mac *MetadataAwareClusterer) FindOptimalK(ctx context.Context, docs []DocumentWithEmbedding) int {
	// Use the elbow method or silhouette score.
	return 0 // placeholder
}
```
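One way to fill in that stub is the elbow method: run the clustering for each candidate k, compute the within-cluster sum of squares (WCSS), and pick the k where the curve bends. The `wcss` helper below is a hypothetical standalone sketch of that metric, assuming assignments and centroids have already been computed:

```go
package main

import "fmt"

// wcss computes the within-cluster sum of squared distances — the quantity
// the elbow method plots against k. assign[i] is the centroid index for
// points[i]. (Sketch; a real FindOptimalK would recluster per candidate k.)
func wcss(points [][]float64, assign []int, centroids [][]float64) float64 {
	var total float64
	for i, p := range points {
		c := centroids[assign[i]]
		for d := range p {
			diff := p[d] - c[d]
			total += diff * diff
		}
	}
	return total
}

func main() {
	points := [][]float64{{0, 0}, {0, 2}, {10, 10}}
	centroids := [][]float64{{0, 1}, {10, 10}}
	assign := []int{0, 0, 1}
	fmt.Println(wcss(points, assign, centroids)) // 1 + 1 + 0 = 2
}
```

WCSS always decreases as k grows, so the point of diminishing returns (the "elbow") rather than the minimum is what identifies a reasonable k.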
## Related Recipes

- Batch Embedding Optimization — Optimize batch embedding operations
- Advanced Metadata Filtering — Filter vector store results with metadata