Semantic Deduplication

Category: Lifecycle

Problem

Over time, memory stores accumulate duplicate or near-duplicate entries. A user who mentions "I prefer dark mode" in five different conversations creates five nearly identical memories. During recall, these duplicates waste retrieval slots and provide no additional value.

Architecture

Dakera detects semantic duplicates by comparing embedding similarity between new memories and existing ones. When a new memory is highly similar to an existing one, it can be merged rather than stored separately. The autopilot consolidation also periodically scans for and merges clusters of similar memories.

Flow

Implementation

# Example: automatic merge-on-store deduplication.
# NOTE(review): assumes a Dakera server listening on localhost:3300 — confirm
# the base_url/api_key before running.
from dakera import Dakera

client = Dakera(base_url="http://localhost:3300", api_key="dk-...")

# These memories are semantically equivalent:
client.memory.store(
    content="User prefers dark mode for all applications",
    namespace="user-prefs",
    metadata={"importance": 0.9}
)

# If deduplication detects this is a near-duplicate, it merges automatically
client.memory.store(
    content="The user likes dark mode and always enables it",
    namespace="user-prefs",
    metadata={"importance": 0.85}
)
# Result: single memory retained with highest importance (0.9)

# Run batch deduplication on a namespace
# This finds clusters of similar memories and consolidates them
# via REST API:
# POST /v1/memory/deduplicate
# {
#   "namespace": "user-prefs",
#   "similarity_threshold": 0.92,
#   "merge_strategy": "keep_highest_importance"
# }

Search-Before-Store Pattern

# Client setup for the search-before-store example below.
from dakera import Dakera

client = Dakera(base_url="http://localhost:3300", api_key="dk-...")

def store_with_dedup(
    content: str,
    namespace: str,
    metadata: dict,
    *,
    similarity_threshold: float = 0.92,
    api=None,
):
    """Store a memory, merging into a near-duplicate when one exists.

    Searches the namespace for the top-3 memories most similar to
    ``content``. If any result scores above ``similarity_threshold``,
    the existing memory is updated in place (metadata merged, the higher
    ``importance`` kept, ``reinforcement_count`` incremented) instead of
    creating a new entry.

    Args:
        content: Text of the memory to store.
        namespace: Namespace to search and store in.
        metadata: Metadata for the new memory; merged over the existing
            memory's metadata when a duplicate is found.
        similarity_threshold: Minimum search score treated as a
            near-duplicate (default 0.92, matching the REST example above).
        api: Optional client to use instead of the module-level ``client``
            — useful for testing with a stub.

    Returns:
        ``{"action": "merged", "memory_id": ...}`` when an existing memory
        was updated, or ``{"action": "created", "memory_id": ...}`` when a
        new memory was stored.
    """
    # Fall back to the module-level client only when no override is given.
    api = api if api is not None else client

    # Search for similar existing memories.
    existing = api.memory.search(
        query=content,
        namespace=namespace,
        top_k=3
    )

    # Check if any result is highly similar.
    for result in existing.get("results", []):
        if result.get("score", 0) > similarity_threshold:
            # Near-duplicate found — update existing instead of creating new.
            existing_meta = result.get("metadata", {})
            api.memory.update(
                memory_id=result["id"],
                metadata={
                    **existing_meta,
                    **metadata,
                    # "importance" is set last so the max wins regardless of
                    # what either metadata dict contained.
                    "importance": max(
                        existing_meta.get("importance", 0),
                        metadata.get("importance", 0.5)
                    ),
                    # Track how many times this fact has been re-asserted.
                    "reinforcement_count": existing_meta.get("reinforcement_count", 0) + 1
                }
            )
            return {"action": "merged", "memory_id": result["id"]}

    # No duplicate found — store as new.
    response = api.memory.store(
        content=content,
        namespace=namespace,
        metadata=metadata
    )
    return {"action": "created", "memory_id": response["id"]}

# Usage
# Side effect only: stores (or merges) this memory through the client above;
# the return value ({"action": ..., "memory_id": ...}) is discarded here.
store_with_dedup(
    content="User's timezone is America/New_York",
    namespace="user-alice",
    metadata={"type": "preference", "importance": 0.8}
)

TypeScript Example

// Client setup for the TypeScript deduplication example.
// NOTE(review): assumes a local Dakera server on port 3300 — confirm.
import { Dakera } from 'dakera';

const client = new Dakera({
  baseUrl: 'http://localhost:3300',
  apiKey: 'dk-...'
});

async function deduplicateNamespace(namespace: string) {
  // Fetch up to 100 memories from the namespace ('*' acts as a match-all query).
  const page = await client.memory.search({ query: '*', namespace, topK: 100 });
  const count = page.results.length;

  // Group by high similarity (this is simplified —
  // Dakera's autopilot does this automatically)
  console.log(`Found ${count} memories in ${namespace}`);
  console.log('Running deduplication...');

  // The autopilot handles this automatically, but you can also trigger manually
  // await client.autopilot.trigger({ namespace, action: 'deduplicate' });
}

When to Use This Pattern

Key Considerations