VectorDB

The main database class for managing vector collections.

simplevecdb.core.VectorDB

Dead-simple local vector database powered by usearch HNSW.

SQLite stores metadata and text; usearch stores vectors in separate .usearch files per collection. Provides Chroma-like API with built-in quantization for storage efficiency.

Storage layout:

- {path} - SQLite database (metadata, text, FTS)
- {path}.{collection}.usearch - usearch HNSW index per collection

Encryption (optional):

- SQLite encrypted via SQLCipher (transparent page-level AES-256)
- Index files encrypted via AES-256-GCM (at-rest only, zero runtime overhead)
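
For orientation, a minimal usage sketch of this API (toy 4-dimensional embeddings for brevity; real embeddings come from your own model or the optional local embedder):

from simplevecdb.core import VectorDB

# Open (or create) app.db; defaults are cosine distance and float32 vectors.
db = VectorDB("app.db")

# Each collection is an isolated namespace backed by its own .usearch file.
docs = db.collection("docs")

# Store texts together with metadata and pre-computed embeddings.
docs.add_texts(
    ["hello world", "goodbye world"],
    metadatas=[{"lang": "en"}, {"lang": "en"}],
    embeddings=[[0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1]],
)

# Nearest-neighbour search returns (Document, distance) pairs, lowest distance first.
for doc, dist in docs.similarity_search([0.1, 0.2, 0.3, 0.4], k=2):
    print(doc.page_content, dist)

# Saves all collection indexes and closes the SQLite connection.
db.close()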

Source code in src/simplevecdb/core.py
class VectorDB:
    """
    Dead-simple local vector database powered by usearch HNSW.

    SQLite stores metadata and text; usearch stores vectors in separate
    .usearch files per collection. Provides Chroma-like API with built-in
    quantization for storage efficiency.

    Storage layout:
    - {path} - SQLite database (metadata, text, FTS)
    - {path}.{collection}.usearch - usearch HNSW index per collection

    Encryption (optional):
    - SQLite encrypted via SQLCipher (transparent page-level AES-256)
    - Index files encrypted via AES-256-GCM (at-rest only, zero runtime overhead)
    """

    def __init__(
        self,
        path: str | Path = ":memory:",
        distance_strategy: DistanceStrategy = DistanceStrategy.COSINE,
        quantization: Quantization = Quantization.FLOAT,
        *,
        encryption_key: str | bytes | None = None,
        auto_migrate: bool = False,
    ):
        """Initialize the vector database.

        Args:
            path: Database file path or ":memory:" for in-memory database.
            distance_strategy: Default distance metric for similarity search.
            quantization: Default vector compression strategy.
            encryption_key: Optional passphrase or 32-byte key for at-rest encryption.
                Encrypts both SQLite (via SQLCipher) and usearch index files (via AES-256-GCM).
            auto_migrate: If True, automatically migrate v1.x sqlite-vec data
                to usearch. If False (default), raise MigrationRequiredError
                when legacy data is detected. Use check_migration() to preview.

        Raises:
            MigrationRequiredError: If auto_migrate=False and legacy sqlite-vec
                data is detected. Contains details about what needs migration.
            EncryptionUnavailableError: If encryption_key provided but encryption
                dependencies are missing.
            EncryptionError: If encrypted database cannot be opened (wrong key).
            ValueError: If encryption_key used with ":memory:" database.
        """
        self.path = str(path)
        self.distance_strategy = distance_strategy
        self.quantization = quantization
        self.auto_migrate = auto_migrate
        self._encryption_key = encryption_key
        self._collections: dict[str, VectorCollection] = {}

        # Create connection (encrypted or plain)
        if encryption_key is not None:
            if self.path == ":memory:":
                raise ValueError(
                    "In-memory databases cannot be encrypted. "
                    "Use a file path for encrypted databases."
                )
            self.conn = create_encrypted_connection(
                self.path,
                encryption_key,
                check_same_thread=False,
                timeout=30.0,
            )
            self._encrypted = True
            _logger.info("Opened encrypted database: %s", self.path)
        else:
            self.conn = sqlite3.connect(
                self.path, check_same_thread=False, timeout=30.0
            )
            self.conn.execute("PRAGMA journal_mode=WAL")
            self.conn.execute("PRAGMA synchronous=NORMAL")
            self._encrypted = False

        # Check for required migration before allowing collection access
        if not auto_migrate and self.path != ":memory:":
            migration_info = VectorDB.check_migration(self.path)
            if migration_info["needs_migration"]:
                self.conn.close()
                raise MigrationRequiredError(
                    path=self.path,
                    collections=migration_info["collections"],
                    total_vectors=migration_info["total_vectors"],
                    migration_info=migration_info,
                )

    def list_collections(self) -> list[str]:
        """
        Return names of all initialized collections.

        Only returns collections that have been accessed via `collection()` in this
        session. Does not scan the database for collections created in previous sessions.

        Returns:
            List of collection names currently cached in this VectorDB instance.

        Example:
            >>> db = VectorDB("app.db")
            >>> db.collection("users")
            >>> db.collection("products")
            >>> db.list_collections()
            ['users', 'products']
        """
        return list(self._collections.keys())

    def search_collections(
        self,
        query: Sequence[float],
        collections: list[str] | None = None,
        k: int = 10,
        filter: dict[str, Any] | None = None,
        *,
        normalize_scores: bool = True,
        parallel: bool = True,
    ) -> list[tuple[Document, float, str]]:
        """
        Search across multiple collections with merged, ranked results.

        Performs similarity search on each collection and merges results using
        score normalization for fair comparison across distance metrics.

        Args:
            query: Query vector (must match dimension of all searched collections).
            collections: List of collection names to search. None searches all
                initialized collections (from list_collections()).
            k: Number of top results to return after merging.
            filter: Optional metadata filter applied to all collections.
            normalize_scores: If True, convert distances to similarity scores
                in [0, 1] range using `1 / (1 + distance)`. Enables fair
                comparison across COSINE [0,2] and L2 [0,∞) metrics.
            parallel: If True, search collections concurrently using ThreadPoolExecutor.

        Returns:
            List of (Document, similarity_score, collection_name) tuples,
            sorted by descending similarity score (highest first).

        Raises:
            ValueError: If no collections specified and none initialized,
                or if collections have mismatched dimensions.
            KeyError: If a specified collection name doesn't exist.

        Example:
            >>> db = VectorDB("app.db")
            >>> db.collection("users").add_texts(["alice"], embeddings=[[0.1]*384])
            >>> db.collection("products").add_texts(["widget"], embeddings=[[0.2]*384])
            >>> results = db.search_collections([0.15]*384, k=2)
            >>> for doc, score, coll in results:
            ...     print(f"{coll}: {doc.page_content} ({score:.3f})")
        """
        target_names = (
            collections if collections is not None else self.list_collections()
        )

        if not target_names:
            return []

        # Resolve and validate collections
        targets: list[VectorCollection] = []
        dims: set[int | None] = set()
        for name in target_names:
            if name not in self._collections:
                raise KeyError(
                    f"Collection '{name}' not initialized. Call db.collection('{name}') first."
                )
            coll = self._collections[name]
            targets.append(coll)
            dims.add(coll._dim)

        # Check dimension consistency (ignore None for empty collections)
        dims.discard(None)
        if len(dims) > 1:
            raise ValueError(
                f"Dimension mismatch across collections: {dims}. "
                "All searched collections must have the same embedding dimension."
            )

        # Search function for each collection
        def _search_one(coll: VectorCollection) -> list[tuple[Document, float, str]]:
            results = coll.similarity_search(query, k=k, filter=filter)
            return [(doc, dist, coll.name) for doc, dist in results]

        # Execute searches
        all_results: list[tuple[Document, float, str]] = []
        if parallel and len(targets) > 1:
            from concurrent.futures import ThreadPoolExecutor

            with ThreadPoolExecutor(max_workers=min(len(targets), 8)) as executor:
                futures = [executor.submit(_search_one, coll) for coll in targets]
                for future in futures:
                    all_results.extend(future.result())
        else:
            for coll in targets:
                all_results.extend(_search_one(coll))

        # Normalize scores: similarity = 1 / (1 + distance)
        if normalize_scores:
            all_results = [
                (doc, 1.0 / (1.0 + dist), name) for doc, dist, name in all_results
            ]
        else:
            # Invert for sorting (lower distance = higher rank)
            all_results = [(doc, -dist, name) for doc, dist, name in all_results]

        # Sort by score descending and take top k
        all_results.sort(key=lambda x: x[1], reverse=True)
        return all_results[:k]

    def collection(
        self,
        name: str = "default",
        distance_strategy: DistanceStrategy | None = None,
        quantization: Quantization | None = None,
    ) -> VectorCollection:
        """
        Get or create a named collection.

        Collections provide isolated namespaces within a single database.
        Each collection has its own usearch index file.

        Args:
            name: Collection name (alphanumeric + underscore only).
            distance_strategy: Override database-level distance metric.
            quantization: Override database-level quantization.

        Returns:
            VectorCollection instance.

        Raises:
            ValueError: If collection name contains invalid characters.
        """
        cache_key = name
        if cache_key not in self._collections:
            self._collections[cache_key] = VectorCollection(
                conn=self.conn,
                db_path=self.path,
                name=name,
                distance_strategy=distance_strategy or self.distance_strategy,
                quantization=quantization or self.quantization,
                encryption_key=self._encryption_key,
            )
        return self._collections[cache_key]

    # ------------------------------------------------------------------ #
    # Integrations
    # ------------------------------------------------------------------ #
    def as_langchain(
        self, embeddings: Embeddings | None = None, collection_name: str = "default"
    ) -> SimpleVecDBVectorStore:
        """Return a LangChain-compatible vector store interface."""
        from .integrations.langchain import SimpleVecDBVectorStore

        return SimpleVecDBVectorStore(
            db_path=self.path, embedding=embeddings, collection_name=collection_name
        )

    def as_llama_index(self, collection_name: str = "default") -> SimpleVecDBLlamaStore:
        """Return a LlamaIndex-compatible vector store interface."""
        from .integrations.llamaindex import SimpleVecDBLlamaStore

        return SimpleVecDBLlamaStore(db_path=self.path, collection_name=collection_name)

    # ------------------------------------------------------------------ #
    # Convenience
    # ------------------------------------------------------------------ #
    @staticmethod
    def check_migration(path: str | Path) -> dict[str, Any]:
        """
        Check if a database needs migration from sqlite-vec (dry-run).

        Use this before opening a v1.x database to understand what will
        be migrated. Does not modify the database.

        Args:
            path: Path to the SQLite database file

        Returns:
            Dict with migration info:
            - needs_migration: bool
            - collections: list of collection names with legacy data
            - total_vectors: estimated total vector count
            - estimated_size_mb: approximate usearch index size
            - rollback_notes: instructions for reverting if needed

        Example:
            >>> info = VectorDB.check_migration("mydb.db")
            >>> if info["needs_migration"]:
            ...     print(f"Will migrate {info['total_vectors']} vectors")
            ...     print(info["rollback_notes"])
        """
        path = str(path)
        if path == ":memory:" or not Path(path).exists():
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            conn = sqlite3.connect(path, check_same_thread=False)
        except sqlite3.DatabaseError:
            # Database may be encrypted or corrupted - cannot check migration
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            # Check for legacy sqlite-vec tables
            tables = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        except sqlite3.DatabaseError:
            # Database is encrypted or corrupted - cannot check migration
            conn.close()
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            table_names = {t[0] for t in tables}

            legacy_collections = []
            total_vectors = 0
            total_bytes = 0

            # Check default collection
            if "vec_index" in table_names:
                try:
                    count = conn.execute("SELECT COUNT(*) FROM vec_index").fetchone()[0]
                    if count > 0:
                        legacy_collections.append("default")
                        total_vectors += count
                        # Estimate: rowid(8) + embedding blob
                        row = conn.execute(
                            "SELECT embedding FROM vec_index LIMIT 1"
                        ).fetchone()
                        if row and row[0]:
                            dim = len(row[0]) // 4
                            total_bytes += count * dim * 4  # float32
                except Exception:
                    pass

            # Check named collections (vectors_{name})
            for table in table_names:
                if table.startswith("vectors_") and table != "vec_index":
                    collection_name = table[8:]  # Remove "vectors_" prefix
                    try:
                        count = conn.execute(
                            f"SELECT COUNT(*) FROM {table}"
                        ).fetchone()[0]
                        if count > 0:
                            legacy_collections.append(collection_name)
                            total_vectors += count
                            row = conn.execute(
                                f"SELECT embedding FROM {table} LIMIT 1"
                            ).fetchone()
                            if row and row[0]:
                                dim = len(row[0]) // 4
                                total_bytes += count * dim * 4
                    except Exception:
                        pass

            estimated_mb = total_bytes / (1024 * 1024)

            rollback_notes = ""
            if legacy_collections:
                rollback_notes = f"""
MIGRATION ROLLBACK INSTRUCTIONS:
================================
1. BEFORE upgrading, backup your database:
   cp {path} {path}.backup

2. If migration fails or you need to revert:
   - Delete the new .usearch files: {path}.*.usearch
   - Restore from backup: cp {path}.backup {path}
   - Downgrade to simplevecdb<2.0.0

3. After successful migration, the legacy sqlite-vec tables are dropped.
   Keep your backup until you've verified the migration worked correctly.

4. New storage layout after migration:
   - {path} (SQLite: metadata, text, FTS, embeddings)
   - {path}.<collection>.usearch (usearch HNSW index per collection)
"""

            return {
                "needs_migration": len(legacy_collections) > 0,
                "collections": legacy_collections,
                "total_vectors": total_vectors,
                "estimated_size_mb": round(estimated_mb, 2),
                "rollback_notes": rollback_notes.strip(),
            }
        finally:
            conn.close()

    def vacuum(self, checkpoint_wal: bool = True) -> None:
        """
        Reclaim disk space by rebuilding the SQLite database file.

        Note: This only affects SQLite metadata storage. Usearch indexes
        don't support in-place compaction; use rebuild_index() for that.

        Args:
            checkpoint_wal: If True (default), also truncate the WAL file.
        """
        if checkpoint_wal:
            self.conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        self.conn.execute("VACUUM")
        self.conn.execute("PRAGMA optimize")

    def save(self) -> None:
        """Save all collection indexes to disk."""
        for collection in self._collections.values():
            collection.save()

    def close(self) -> None:
        """Close the database connection and save indexes."""
        self.save()
        self.conn.close()

    def __del__(self) -> None:
        try:
            self.close()
        except Exception:
            pass

collection(name='default', distance_strategy=None, quantization=None)

Get or create a named collection.

Collections provide isolated namespaces within a single database. Each collection has its own usearch index file.

Parameters:

- name (str, default 'default'): Collection name (alphanumeric + underscore only).
- distance_strategy (DistanceStrategy | None, default None): Override database-level distance metric.
- quantization (Quantization | None, default None): Override database-level quantization.

Returns:

- VectorCollection: VectorCollection instance.

Raises:

- ValueError: If collection name contains invalid characters.
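
A small sketch of how collection handles behave (this assumes DistanceStrategy and Quantization can be imported from simplevecdb.core, where this class references them):

from simplevecdb.core import VectorDB, DistanceStrategy, Quantization

db = VectorDB("app.db")

# Repeated calls with the same name return the cached VectorCollection.
users = db.collection("users")
assert db.collection("users") is users

# Per-collection overrides; omitted arguments inherit the database defaults.
articles = db.collection(
    "articles",
    distance_strategy=DistanceStrategy.COSINE,
    quantization=Quantization.FLOAT,
)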

Source code in src/simplevecdb/core.py
def collection(
    self,
    name: str = "default",
    distance_strategy: DistanceStrategy | None = None,
    quantization: Quantization | None = None,
) -> VectorCollection:
    """
    Get or create a named collection.

    Collections provide isolated namespaces within a single database.
    Each collection has its own usearch index file.

    Args:
        name: Collection name (alphanumeric + underscore only).
        distance_strategy: Override database-level distance metric.
        quantization: Override database-level quantization.

    Returns:
        VectorCollection instance.

    Raises:
        ValueError: If collection name contains invalid characters.
    """
    cache_key = name
    if cache_key not in self._collections:
        self._collections[cache_key] = VectorCollection(
            conn=self.conn,
            db_path=self.path,
            name=name,
            distance_strategy=distance_strategy or self.distance_strategy,
            quantization=quantization or self.quantization,
            encryption_key=self._encryption_key,
        )
    return self._collections[cache_key]

list_collections()

Return names of all initialized collections.

Only returns collections that have been accessed via collection() in this session. Does not scan the database for collections created in previous sessions.

Returns:

- list[str]: List of collection names currently cached in this VectorDB instance.

Example:

>>> db = VectorDB("app.db")
>>> db.collection("users")
>>> db.collection("products")
>>> db.list_collections()
['users', 'products']

Source code in src/simplevecdb/core.py
def list_collections(self) -> list[str]:
    """
    Return names of all initialized collections.

    Only returns collections that have been accessed via `collection()` in this
    session. Does not scan the database for collections created in previous sessions.

    Returns:
        List of collection names currently cached in this VectorDB instance.

    Example:
        >>> db = VectorDB("app.db")
        >>> db.collection("users")
        >>> db.collection("products")
        >>> db.list_collections()
        ['users', 'products']
    """
    return list(self._collections.keys())

search_collections(query, collections=None, k=10, filter=None, *, normalize_scores=True, parallel=True)

Search across multiple collections with merged, ranked results.

Performs similarity search on each collection and merges results using score normalization for fair comparison across distance metrics.

Parameters:

- query (Sequence[float]): Query vector (must match dimension of all searched collections). Required.
- collections (list[str] | None, default None): List of collection names to search. None searches all initialized collections (from list_collections()).
- k (int, default 10): Number of top results to return after merging.
- filter (dict[str, Any] | None, default None): Optional metadata filter applied to all collections.
- normalize_scores (bool, default True): If True, convert distances to similarity scores in [0, 1] range using 1 / (1 + distance). Enables fair comparison across COSINE [0, 2] and L2 [0, ∞) metrics.
- parallel (bool, default True): If True, search collections concurrently using ThreadPoolExecutor.

Returns:

- list[tuple[Document, float, str]]: List of (Document, similarity_score, collection_name) tuples, sorted by descending similarity score (highest first).

Raises:

- ValueError: If no collections specified and none initialized, or if collections have mismatched dimensions.
- KeyError: If a specified collection name doesn't exist.

Example:

>>> db = VectorDB("app.db")
>>> db.collection("users").add_texts(["alice"], embeddings=[[0.1]*384])
>>> db.collection("products").add_texts(["widget"], embeddings=[[0.2]*384])
>>> results = db.search_collections([0.15]*384, k=2)
>>> for doc, score, coll in results:
...     print(f"{coll}: {doc.page_content} ({score:.3f})")
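
To make the normalization concrete, a few spot values of the 1 / (1 + distance) mapping (illustrative arithmetic only):

>>> for d in (0.0, 1.0, 2.0):   # identical, orthogonal, opposite under COSINE
...     print(round(1.0 / (1.0 + d), 3))
1.0
0.5
0.333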

Source code in src/simplevecdb/core.py
def search_collections(
    self,
    query: Sequence[float],
    collections: list[str] | None = None,
    k: int = 10,
    filter: dict[str, Any] | None = None,
    *,
    normalize_scores: bool = True,
    parallel: bool = True,
) -> list[tuple[Document, float, str]]:
    """
    Search across multiple collections with merged, ranked results.

    Performs similarity search on each collection and merges results using
    score normalization for fair comparison across distance metrics.

    Args:
        query: Query vector (must match dimension of all searched collections).
        collections: List of collection names to search. None searches all
            initialized collections (from list_collections()).
        k: Number of top results to return after merging.
        filter: Optional metadata filter applied to all collections.
        normalize_scores: If True, convert distances to similarity scores
            in [0, 1] range using `1 / (1 + distance)`. Enables fair
            comparison across COSINE [0,2] and L2 [0,∞) metrics.
        parallel: If True, search collections concurrently using ThreadPoolExecutor.

    Returns:
        List of (Document, similarity_score, collection_name) tuples,
        sorted by descending similarity score (highest first).

    Raises:
        ValueError: If no collections specified and none initialized,
            or if collections have mismatched dimensions.
        KeyError: If a specified collection name doesn't exist.

    Example:
        >>> db = VectorDB("app.db")
        >>> db.collection("users").add_texts(["alice"], embeddings=[[0.1]*384])
        >>> db.collection("products").add_texts(["widget"], embeddings=[[0.2]*384])
        >>> results = db.search_collections([0.15]*384, k=2)
        >>> for doc, score, coll in results:
        ...     print(f"{coll}: {doc.page_content} ({score:.3f})")
    """
    target_names = (
        collections if collections is not None else self.list_collections()
    )

    if not target_names:
        return []

    # Resolve and validate collections
    targets: list[VectorCollection] = []
    dims: set[int | None] = set()
    for name in target_names:
        if name not in self._collections:
            raise KeyError(
                f"Collection '{name}' not initialized. Call db.collection('{name}') first."
            )
        coll = self._collections[name]
        targets.append(coll)
        dims.add(coll._dim)

    # Check dimension consistency (ignore None for empty collections)
    dims.discard(None)
    if len(dims) > 1:
        raise ValueError(
            f"Dimension mismatch across collections: {dims}. "
            "All searched collections must have the same embedding dimension."
        )

    # Search function for each collection
    def _search_one(coll: VectorCollection) -> list[tuple[Document, float, str]]:
        results = coll.similarity_search(query, k=k, filter=filter)
        return [(doc, dist, coll.name) for doc, dist in results]

    # Execute searches
    all_results: list[tuple[Document, float, str]] = []
    if parallel and len(targets) > 1:
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=min(len(targets), 8)) as executor:
            futures = [executor.submit(_search_one, coll) for coll in targets]
            for future in futures:
                all_results.extend(future.result())
    else:
        for coll in targets:
            all_results.extend(_search_one(coll))

    # Normalize scores: similarity = 1 / (1 + distance)
    if normalize_scores:
        all_results = [
            (doc, 1.0 / (1.0 + dist), name) for doc, dist, name in all_results
        ]
    else:
        # Invert for sorting (lower distance = higher rank)
        all_results = [(doc, -dist, name) for doc, dist, name in all_results]

    # Sort by score descending and take top k
    all_results.sort(key=lambda x: x[1], reverse=True)
    return all_results[:k]

vacuum(checkpoint_wal=True)

Reclaim disk space by rebuilding the SQLite database file.

Note: This only affects SQLite metadata storage. Usearch indexes don't support in-place compaction; use rebuild_index() for that.

Parameters:

- checkpoint_wal (bool, default True): If True (default), also truncate the WAL file.
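
For example (a sketch; vacuum only compacts the SQLite file, as noted above):

db = VectorDB("app.db")
# ... after deleting or updating many documents ...
db.vacuum()                        # checkpoint WAL, VACUUM, PRAGMA optimize
db.vacuum(checkpoint_wal=False)    # skip the explicit WAL checkpoint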
Source code in src/simplevecdb/core.py
def vacuum(self, checkpoint_wal: bool = True) -> None:
    """
    Reclaim disk space by rebuilding the SQLite database file.

    Note: This only affects SQLite metadata storage. Usearch indexes
    don't support in-place compaction; use rebuild_index() for that.

    Args:
        checkpoint_wal: If True (default), also truncate the WAL file.
    """
    if checkpoint_wal:
        self.conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
    self.conn.execute("VACUUM")
    self.conn.execute("PRAGMA optimize")

close()

Close the database connection and save indexes.

Source code in src/simplevecdb/core.py
def close(self) -> None:
    """Close the database connection and save indexes."""
    self.save()
    self.conn.close()

check_migration(path) staticmethod

Check if a database needs migration from sqlite-vec (dry-run).

Use this before opening a v1.x database to understand what will be migrated. Does not modify the database.

Parameters:

- path (str | Path): Path to the SQLite database file. Required.

Returns:

- dict[str, Any]: Dict with migration info:
  - needs_migration: bool
  - collections: list of collection names with legacy data
  - total_vectors: estimated total vector count
  - estimated_size_mb: approximate usearch index size
  - rollback_notes: instructions for reverting if needed

Example:

>>> info = VectorDB.check_migration("mydb.db")
>>> if info["needs_migration"]:
...     print(f"Will migrate {info['total_vectors']} vectors")
...     print(info["rollback_notes"])

Source code in src/simplevecdb/core.py
    @staticmethod
    def check_migration(path: str | Path) -> dict[str, Any]:
        """
        Check if a database needs migration from sqlite-vec (dry-run).

        Use this before opening a v1.x database to understand what will
        be migrated. Does not modify the database.

        Args:
            path: Path to the SQLite database file

        Returns:
            Dict with migration info:
            - needs_migration: bool
            - collections: list of collection names with legacy data
            - total_vectors: estimated total vector count
            - estimated_size_mb: approximate usearch index size
            - rollback_notes: instructions for reverting if needed

        Example:
            >>> info = VectorDB.check_migration("mydb.db")
            >>> if info["needs_migration"]:
            ...     print(f"Will migrate {info['total_vectors']} vectors")
            ...     print(info["rollback_notes"])
        """
        path = str(path)
        if path == ":memory:" or not Path(path).exists():
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            conn = sqlite3.connect(path, check_same_thread=False)
        except sqlite3.DatabaseError:
            # Database may be encrypted or corrupted - cannot check migration
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            # Check for legacy sqlite-vec tables
            tables = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        except sqlite3.DatabaseError:
            # Database is encrypted or corrupted - cannot check migration
            conn.close()
            return {
                "needs_migration": False,
                "collections": [],
                "total_vectors": 0,
                "estimated_size_mb": 0.0,
                "rollback_notes": "",
            }

        try:
            table_names = {t[0] for t in tables}

            legacy_collections = []
            total_vectors = 0
            total_bytes = 0

            # Check default collection
            if "vec_index" in table_names:
                try:
                    count = conn.execute("SELECT COUNT(*) FROM vec_index").fetchone()[0]
                    if count > 0:
                        legacy_collections.append("default")
                        total_vectors += count
                        # Estimate: rowid(8) + embedding blob
                        row = conn.execute(
                            "SELECT embedding FROM vec_index LIMIT 1"
                        ).fetchone()
                        if row and row[0]:
                            dim = len(row[0]) // 4
                            total_bytes += count * dim * 4  # float32
                except Exception:
                    pass

            # Check named collections (vectors_{name})
            for table in table_names:
                if table.startswith("vectors_") and table != "vec_index":
                    collection_name = table[8:]  # Remove "vectors_" prefix
                    try:
                        count = conn.execute(
                            f"SELECT COUNT(*) FROM {table}"
                        ).fetchone()[0]
                        if count > 0:
                            legacy_collections.append(collection_name)
                            total_vectors += count
                            row = conn.execute(
                                f"SELECT embedding FROM {table} LIMIT 1"
                            ).fetchone()
                            if row and row[0]:
                                dim = len(row[0]) // 4
                                total_bytes += count * dim * 4
                    except Exception:
                        pass

            estimated_mb = total_bytes / (1024 * 1024)

            rollback_notes = ""
            if legacy_collections:
                rollback_notes = f"""
MIGRATION ROLLBACK INSTRUCTIONS:
================================
1. BEFORE upgrading, backup your database:
   cp {path} {path}.backup

2. If migration fails or you need to revert:
   - Delete the new .usearch files: {path}.*.usearch
   - Restore from backup: cp {path}.backup {path}
   - Downgrade to simplevecdb<2.0.0

3. After successful migration, the legacy sqlite-vec tables are dropped.
   Keep your backup until you've verified the migration worked correctly.

4. New storage layout after migration:
   - {path} (SQLite: metadata, text, FTS, embeddings)
   - {path}.<collection>.usearch (usearch HNSW index per collection)
"""

            return {
                "needs_migration": len(legacy_collections) > 0,
                "collections": legacy_collections,
                "total_vectors": total_vectors,
                "estimated_size_mb": round(estimated_mb, 2),
                "rollback_notes": rollback_notes.strip(),
            }
        finally:
            conn.close()

VectorCollection

A named collection of vectors within a database.

simplevecdb.core.VectorCollection

Represents a single vector collection within the database.

Handles vector storage via usearch HNSW index and metadata via SQLite. Uses a facade pattern to delegate operations to specialized engine components (catalog, search, usearch_index).

Note

Collections are created via VectorDB.collection(). Do not instantiate directly.
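
A brief sketch of typical collection usage (toy 3-dimensional embeddings; the filter argument and return shapes follow the method docstrings below):

from simplevecdb.core import VectorDB

db = VectorDB("notes.db")
notes = db.collection("notes")   # facade over catalog, search engine, usearch index

ids = notes.add_texts(
    ["usearch powers the ANN index", "SQLite stores text and metadata"],
    metadatas=[{"topic": "vectors"}, {"topic": "storage"}],
    embeddings=[[0.9, 0.1, 0.0], [0.1, 0.9, 0.0]],
)

# Vector search: (Document, distance) pairs, lower distance = more similar.
hits = notes.similarity_search([0.8, 0.2, 0.0], k=1, filter={"topic": "vectors"})

# BM25 keyword search via SQLite FTS5: (Document, bm25_score), higher = more relevant.
matches = notes.keyword_search("usearch", k=1)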

Source code in src/simplevecdb/core.py
class VectorCollection:
    """
    Represents a single vector collection within the database.

    Handles vector storage via usearch HNSW index and metadata via SQLite.
    Uses a facade pattern to delegate operations to specialized engine
    components (catalog, search, usearch_index).

    Note:
        Collections are created via `VectorDB.collection()`. Do not instantiate directly.
    """

    def __init__(
        self,
        conn: sqlite3.Connection,
        db_path: str,
        name: str,
        distance_strategy: DistanceStrategy,
        quantization: Quantization,
        encryption_key: str | bytes | None = None,
    ):
        self.conn = conn
        self._db_path = db_path
        self.name = name
        self.distance_strategy = distance_strategy
        self.quantization = quantization
        self._quantizer = QuantizationStrategy(quantization)
        self._encryption_key = encryption_key

        # Sanitize name to prevent issues
        if not re.match(r"^[a-zA-Z0-9_]+$", name):
            raise ValueError(
                f"Invalid collection name '{name}'. Must be alphanumeric + underscores."
            )

        # Table names
        if name == "default":
            self._table_name = "tinyvec_items"
            self._legacy_vec_table = "vec_index"  # For migration
        else:
            self._table_name = f"items_{name}"
            self._legacy_vec_table = f"vectors_{name}"  # For migration

        self._fts_table_name = f"{self._table_name}_fts"

        # Usearch index path: {db_path}.{collection}.usearch
        if db_path == ":memory:":
            self._index_path = None  # In-memory index
        else:
            self._index_path = f"{db_path}.{name}.usearch"

        # Initialize components
        self._catalog = CatalogManager(
            conn=self.conn,
            table_name=self._table_name,
            fts_table_name=self._fts_table_name,
        )
        self._catalog.create_tables()

        # Handle encrypted index loading
        actual_index_path = self._resolve_index_path()

        # Create usearch index
        self._index = UsearchIndex(
            index_path=actual_index_path
            or os.path.join(
                tempfile.gettempdir(), f"simplevecdb_{uuid.uuid4().hex}.usearch"
            ),
            ndim=None,  # Will be set on first add
            distance_strategy=self.distance_strategy,
            quantization=self.quantization,
        )

        # Create search engine
        self._search = SearchEngine(
            index=self._index,
            catalog=self._catalog,
            distance_strategy=self.distance_strategy,
        )

        # Check for and perform migration from sqlite-vec
        self._migrate_from_sqlite_vec_if_needed()

    def _resolve_index_path(self) -> str | None:
        """
        Resolve the actual index path, handling encryption.

        If encryption is enabled and an encrypted index exists, decrypt it first.
        Returns the path to use for the usearch index.
        """
        if self._index_path is None:
            return None

        index_path = Path(self._index_path)

        # Check for encrypted index
        encrypted_path = get_encrypted_index_path(index_path)

        if encrypted_path is not None:
            if self._encryption_key is None:
                raise EncryptionError(
                    f"Encrypted index found at {encrypted_path} but no encryption_key provided. "
                    "Pass encryption_key to VectorDB to decrypt."
                )
            # Decrypt to the expected path
            decrypt_index_file(encrypted_path, self._encryption_key)
            _logger.info("Decrypted index file: %s", encrypted_path)

        return self._index_path

    def _migrate_from_sqlite_vec_if_needed(self) -> None:
        """Auto-migrate from sqlite-vec to usearch on first connection."""
        if not self._catalog.check_legacy_sqlite_vec(self._legacy_vec_table):
            return

        _logger.info(
            "Detected legacy sqlite-vec data in collection '%s'. Migrating to usearch...",
            self.name,
        )

        try:
            # Get legacy vectors
            legacy_data = self._catalog.get_legacy_vectors(self._legacy_vec_table)
            if not legacy_data:
                _logger.warning("No vectors found in legacy table")
                self._catalog.drop_legacy_vec_table(self._legacy_vec_table)
                return

            # Deserialize and add to usearch
            keys = []
            vectors = []
            for rowid, blob in legacy_data:
                vec = np.frombuffer(blob, dtype=np.float32)
                keys.append(rowid)
                vectors.append(vec)

            keys_arr = np.array(keys, dtype=np.uint64)
            vectors_arr = np.array(vectors, dtype=np.float32)

            self._index.add(keys_arr, vectors_arr)
            self._index.save()

            # Drop legacy table
            self._catalog.drop_legacy_vec_table(self._legacy_vec_table)

            _logger.info(
                "Migration complete: %d vectors migrated to usearch", len(keys)
            )

        except Exception as e:
            _logger.error("Migration failed: %s", e)
            raise RuntimeError(
                f"Failed to migrate from sqlite-vec: {e}. "
                "You may need to manually migrate or restore from backup."
            ) from e

    def add_texts(
        self,
        texts: Sequence[str],
        metadatas: Sequence[dict] | None = None,
        embeddings: Sequence[Sequence[float]] | None = None,
        ids: Sequence[int | None] | None = None,
        *,
        parent_ids: Sequence[int | None] | None = None,
        threads: int = 0,
    ) -> list[int]:
        """
        Add texts with optional embeddings and metadata to the collection.

        Automatically infers vector dimension from first batch. Supports upsert
        (update on conflict) when providing existing IDs. For COSINE distance,
        vectors are L2-normalized automatically by usearch.

        Args:
            texts: Document text content to store.
            metadatas: Optional metadata dicts (one per text).
            embeddings: Optional pre-computed embeddings (one per text).
                If None, attempts to use local embedding model.
            ids: Optional document IDs for upsert behavior.
            parent_ids: Optional parent document IDs for hierarchical relationships.
            threads: Number of threads for parallel insertion (0=auto).

        Returns:
            List of inserted/updated document IDs.

        Raises:
            ValueError: If embedding dimensions don't match, or if no embeddings
                provided and local embedder not available.
        """
        if not texts:
            return []

        # Resolve embeddings
        if embeddings is None:
            try:
                from simplevecdb.embeddings.models import embed_texts as embed_fn

                embeddings = embed_fn(list(texts))
            except Exception as e:
                raise ValueError(
                    "No embeddings provided and local embedder failed – "
                    "install with [server] extra"
                ) from e

        # Normalize metadatas
        if metadatas is None:
            metadatas = [{} for _ in texts]

        # Process in batches
        from simplevecdb import config

        batch_size = config.EMBEDDING_BATCH_SIZE
        all_ids: list[int] = []

        for batch_start in range(0, len(texts), batch_size):
            batch_end = min(batch_start + batch_size, len(texts))
            batch_texts = texts[batch_start:batch_end]
            batch_metas = metadatas[batch_start:batch_end]
            batch_embeds = embeddings[batch_start:batch_end]
            batch_ids = ids[batch_start:batch_end] if ids else None
            batch_parent_ids = parent_ids[batch_start:batch_end] if parent_ids else None

            # Add to SQLite metadata store (with embeddings for MMR support)
            doc_ids = self._catalog.add_documents(
                batch_texts,
                list(batch_metas),
                batch_ids,
                embeddings=batch_embeds,
                parent_ids=batch_parent_ids,
            )

            # Prepare vectors
            emb_np = np.array(batch_embeds, dtype=np.float32)

            # Add to usearch index
            self._index.add(np.array(doc_ids, dtype=np.uint64), emb_np, threads=threads)

            all_ids.extend(doc_ids)

        return all_ids

    def add_texts_streaming(
        self,
        items: Iterable[tuple[str, dict | None, Sequence[float] | None]],
        *,
        batch_size: int | None = None,
        threads: int = 0,
        on_progress: ProgressCallback | None = None,
    ) -> Generator[StreamingProgress, None, list[int]]:
        """
        Stream documents into the collection with controlled memory usage.

        Processes documents in batches from any iterable (generator, file reader,
        API paginator, etc.) without loading all data into memory. Yields progress
        after each batch for monitoring large ingestions.

        Args:
            items: Iterable of (text, metadata, embedding) tuples.
                - text: Document content (required)
                - metadata: Optional dict, use None for empty
                - embedding: Optional pre-computed vector, use None to auto-embed
            batch_size: Documents per batch (default: config.EMBEDDING_BATCH_SIZE).
            threads: Threads for parallel insertion (0=auto).
            on_progress: Optional callback invoked after each batch.

        Yields:
            StreamingProgress dict after each batch with:
            - batch_num: Current batch number (1-indexed)
            - total_batches: Estimated total (None if unknown)
            - docs_processed: Cumulative documents inserted
            - docs_in_batch: Documents in current batch
            - batch_ids: IDs of documents in current batch

        Returns:
            List of all inserted document IDs, available as the generator's
            return value (StopIteration.value when iterating manually, or via
            `yield from`); each progress dict also carries its batch_ids.

        Example:
            >>> def load_documents():
            ...     for line in open("large_file.jsonl"):
            ...         doc = json.loads(line)
            ...         yield (doc["text"], doc.get("meta"), None)
            ...
            >>> gen = collection.add_texts_streaming(load_documents())
            >>> for progress in gen:
            ...     print(f"Batch {progress['batch_num']}: {progress['docs_processed']} total")
            >>> # IDs for each batch are available in progress['batch_ids']

        Example with callback:
            >>> def log_progress(p):
            ...     print(f"{p['docs_processed']} docs inserted")
            >>> list(collection.add_texts_streaming(items, on_progress=log_progress))
        """
        from simplevecdb import config

        if batch_size is None:
            batch_size = config.EMBEDDING_BATCH_SIZE

        all_ids: list[int] = []
        batch_num = 0
        docs_processed = 0

        # Accumulate batch
        batch_texts: list[str] = []
        batch_metas: list[dict] = []
        batch_embeds: list[Sequence[float]] = []
        needs_embedding = False

        for text, metadata, embedding in items:
            batch_texts.append(text)
            batch_metas.append(metadata or {})
            if embedding is not None:
                batch_embeds.append(embedding)
            else:
                needs_embedding = True
                batch_embeds.append([])  # Placeholder

            # Process batch when full
            if len(batch_texts) >= batch_size:
                batch_ids = self._process_streaming_batch(
                    batch_texts, batch_metas, batch_embeds, needs_embedding, threads
                )
                all_ids.extend(batch_ids)
                batch_num += 1
                docs_processed += len(batch_ids)

                progress: StreamingProgress = {
                    "batch_num": batch_num,
                    "total_batches": None,
                    "docs_processed": docs_processed,
                    "docs_in_batch": len(batch_ids),
                    "batch_ids": batch_ids,
                }

                if on_progress:
                    on_progress(progress)

                yield progress

                # Reset batch
                batch_texts = []
                batch_metas = []
                batch_embeds = []
                needs_embedding = False

        # Process final partial batch
        if batch_texts:
            batch_ids = self._process_streaming_batch(
                batch_texts, batch_metas, batch_embeds, needs_embedding, threads
            )
            all_ids.extend(batch_ids)
            batch_num += 1
            docs_processed += len(batch_ids)

            progress = {
                "batch_num": batch_num,
                "total_batches": batch_num,  # Now we know total
                "docs_processed": docs_processed,
                "docs_in_batch": len(batch_ids),
                "batch_ids": batch_ids,
            }

            if on_progress:
                on_progress(progress)

            yield progress

        return all_ids

    def _process_streaming_batch(
        self,
        texts: list[str],
        metas: list[dict],
        embeds: list[Sequence[float]],
        needs_embedding: bool,
        threads: int,
    ) -> list[int]:
        """Process a single batch for streaming insert."""
        # Generate embeddings if needed
        if needs_embedding:
            try:
                from simplevecdb.embeddings.models import embed_texts as embed_fn

                generated = embed_fn(texts)
                # Replace placeholders with generated embeddings
                for i, emb in enumerate(embeds):
                    # Placeholders were appended as empty lists; check length
                    # explicitly rather than truthiness (safe for NumPy arrays).
                    if emb is None or len(emb) == 0:
                        embeds[i] = generated[i]
            except Exception as e:
                raise ValueError(
                    "Auto-embedding failed - install with [server] extra or provide embeddings"
                ) from e

        # Add to catalog and index
        doc_ids = self._catalog.add_documents(texts, metas, None, embeddings=embeds)
        emb_np = np.array(embeds, dtype=np.float32)
        self._index.add(np.array(doc_ids, dtype=np.uint64), emb_np, threads=threads)

        return doc_ids

    def similarity_search(
        self,
        query: str | Sequence[float],
        k: int = 5,
        filter: dict[str, Any] | None = None,
        *,
        exact: bool | None = None,
        threads: int = 0,
    ) -> list[tuple[Document, float]]:
        """
        Search for most similar vectors using HNSW approximate nearest neighbor.

        For COSINE distance, returns distance in [0, 2] (lower = more similar).
        For L2/L1, returns raw distance (lower = more similar).

        Args:
            query: Query vector or text string (auto-embedded if string).
            k: Number of nearest neighbors to return.
            filter: Optional metadata filter.
            exact: Force search mode. None=adaptive (brute-force for <10k vectors),
                   True=always brute-force (perfect recall), False=always HNSW.
            threads: Number of threads for parallel search (0=auto).

        Returns:
            List of (Document, distance) tuples, sorted by ascending distance.
        """
        return self._search.similarity_search(
            query, k, filter, exact=exact, threads=threads
        )

    def similarity_search_batch(
        self,
        queries: Sequence[Sequence[float]],
        k: int = 5,
        filter: dict[str, Any] | None = None,
        *,
        exact: bool | None = None,
        threads: int = 0,
    ) -> list[list[tuple[Document, float]]]:
        """
        Search for similar vectors across multiple queries in parallel.

        Automatically batches queries for ~10x throughput compared to
        sequential single-query searches. Uses usearch's native batch
        search optimization.

        Args:
            queries: List of query vectors.
            k: Number of nearest neighbors per query.
            filter: Optional metadata filter (applied to all queries).
            exact: Force search mode. None=adaptive, True=brute-force, False=HNSW.
            threads: Number of threads for parallel search (0=auto).

        Returns:
            List of result lists, one per query. Each result is (Document, distance).

        Example:
            >>> queries = [embedding1, embedding2, embedding3]
            >>> results = collection.similarity_search_batch(queries, k=5)
            >>> for query_results in results:
            ...     print(f"Found {len(query_results)} matches")
        """
        return self._search.similarity_search_batch(
            queries, k, filter, exact=exact, threads=threads
        )

    def keyword_search(
        self, query: str, k: int = 5, filter: dict[str, Any] | None = None
    ) -> list[tuple[Document, float]]:
        """
        Search using BM25 keyword ranking (full-text search).

        Uses SQLite's FTS5 extension for BM25-based ranking.

        Args:
            query: Text query using FTS5 syntax.
            k: Maximum number of results to return.
            filter: Optional metadata filter.

        Returns:
            List of (Document, bm25_score) tuples, sorted by descending relevance.

        Raises:
            RuntimeError: If FTS5 is not available.
        """
        return self._search.keyword_search(query, k, filter)

    def hybrid_search(
        self,
        query: str,
        k: int = 5,
        filter: dict[str, Any] | None = None,
        *,
        query_vector: Sequence[float] | None = None,
        vector_k: int | None = None,
        keyword_k: int | None = None,
        rrf_k: int = 60,
    ) -> list[tuple[Document, float]]:
        """
        Combine BM25 keyword search with vector similarity using Reciprocal Rank Fusion.

        Args:
            query: Text query for keyword search.
            k: Final number of results after fusion.
            filter: Optional metadata filter.
            query_vector: Optional pre-computed query embedding.
            vector_k: Number of vector search candidates.
            keyword_k: Number of keyword search candidates.
            rrf_k: RRF constant parameter (default: 60).

        Returns:
            List of (Document, rrf_score) tuples, sorted by descending RRF score.

        Raises:
            RuntimeError: If FTS5 is not available.
        """
        return self._search.hybrid_search(
            query,
            k,
            filter,
            query_vector=query_vector,
            vector_k=vector_k,
            keyword_k=keyword_k,
            rrf_k=rrf_k,
        )

    def max_marginal_relevance_search(
        self,
        query: str | Sequence[float],
        k: int = 5,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: dict[str, Any] | None = None,
    ) -> list[Document]:
        """
        Search with diversity - return relevant but non-redundant results.

        Args:
            query: Query vector or text string.
            k: Number of diverse results to return.
            fetch_k: Number of candidates to consider.
            lambda_mult: Diversity trade-off (0=diverse, 1=relevant).
            filter: Optional metadata filter.

        Returns:
            List of Documents ordered by MMR selection.
        """
        return self._search.max_marginal_relevance_search(
            query, k, fetch_k, lambda_mult, filter
        )

    def delete_by_ids(self, ids: Iterable[int]) -> None:
        """
        Delete documents by their IDs.

        Removes documents from both usearch index and SQLite metadata.
        Does NOT auto-vacuum; call `VectorDB.vacuum()` separately.

        Args:
            ids: Document IDs to delete
        """
        ids_list = list(ids)
        if not ids_list:
            return

        # Delete from usearch
        self._index.remove(ids_list)

        # Delete from SQLite
        self._catalog.delete_by_ids(ids_list)

    def remove_texts(
        self,
        texts: Sequence[str] | None = None,
        filter: dict[str, Any] | None = None,
    ) -> int:
        """
        Remove documents by text content or metadata filter.

        Args:
            texts: Optional list of exact text strings to remove
            filter: Optional metadata filter dict

        Returns:
            Number of documents deleted

        Raises:
            ValueError: If neither texts nor filter provided
        """
        if texts is None and filter is None:
            raise ValueError("Must provide either texts or filter to remove")

        ids_to_delete: list[int] = []

        if texts:
            ids_to_delete.extend(self._catalog.find_ids_by_texts(texts))

        if filter:
            ids_to_delete.extend(
                self._catalog.find_ids_by_filter(
                    filter, self._catalog.build_filter_clause
                )
            )

        unique_ids = list(set(ids_to_delete))
        if unique_ids:
            self.delete_by_ids(unique_ids)

        return len(unique_ids)

    def save(self) -> None:
        """
        Save the usearch index to disk.

        If encryption is enabled, the index is encrypted after saving.
        """
        self._index.save()

        # Encrypt index if encryption is enabled
        if self._encryption_key is not None and self._index_path is not None:
            index_path = Path(self._index_path)
            if index_path.exists():
                encrypt_index_file(index_path, self._encryption_key)

    def rebuild_index(
        self,
        *,
        connectivity: int | None = None,
        expansion_add: int | None = None,
        expansion_search: int | None = None,
    ) -> int:
        """
        Rebuild the usearch HNSW index from embeddings stored in SQLite.

        Useful for:
        - Recovering from index corruption
        - Tuning HNSW parameters (connectivity, expansion)
        - Reclaiming space after many deletions

        Args:
            connectivity: HNSW M parameter (edges per node). Default: 16
            expansion_add: efConstruction (build quality). Default: 128
            expansion_search: ef (search quality). Default: 64

        Returns:
            Number of vectors rebuilt

        Raises:
            RuntimeError: If no embeddings found in SQLite
        """
        _logger.info("Rebuilding usearch index for collection '%s'...", self.name)

        # Get all document IDs
        all_ids = self.conn.execute(f"SELECT id FROM {self._table_name}").fetchall()
        all_ids = [row[0] for row in all_ids]

        if not all_ids:
            _logger.warning("No documents found in collection")
            return 0

        # Fetch embeddings from SQLite
        embeddings_map = self._catalog.get_embeddings_by_ids(all_ids)

        # Filter to only docs with embeddings
        valid_pairs = [
            (doc_id, emb)
            for doc_id in all_ids
            if (emb := embeddings_map.get(doc_id)) is not None
        ]

        if not valid_pairs:
            raise RuntimeError(
                "No embeddings found in SQLite. Cannot rebuild index. "
                "This may happen if documents were added before v2.0.0."
            )

        keys = np.array([doc_id for doc_id, _ in valid_pairs], dtype=np.uint64)
        vectors = np.array([emb for _, emb in valid_pairs], dtype=np.float32)

        # Determine dimension
        ndim = vectors.shape[1]

        # Close old index
        old_path = self._index._path
        self._index.close()

        # Delete old index file
        if old_path.exists():
            old_path.unlink()
            _logger.debug("Deleted old index file: %s", old_path)

        # Create new index with optional custom parameters
        from .engine.usearch_index import (
            DEFAULT_CONNECTIVITY,
            DEFAULT_EXPANSION_ADD,
            DEFAULT_EXPANSION_SEARCH,
        )

        self._index = UsearchIndex(
            index_path=str(old_path),
            ndim=ndim,
            distance_strategy=self.distance_strategy,
            quantization=self.quantization,
            connectivity=connectivity or DEFAULT_CONNECTIVITY,
            expansion_add=expansion_add or DEFAULT_EXPANSION_ADD,
            expansion_search=expansion_search or DEFAULT_EXPANSION_SEARCH,
        )

        # Re-add all vectors
        self._index.add(keys, vectors)
        self._index.save()

        # Update search engine reference
        self._search._index = self._index

        _logger.info("Rebuilt index with %d vectors", len(keys))
        return len(keys)

    # ------------------------------------------------------------------ #
    # Hierarchical Relationships
    # ------------------------------------------------------------------ #

    def get_children(self, doc_id: int) -> list[Document]:
        """
        Get all direct children of a document.

        Args:
            doc_id: ID of the parent document

        Returns:
            List of child Documents

        Example:
            >>> # Add parent and children
            >>> parent_id = collection.add_texts(["Parent doc"], embeddings=[emb])[0]
            >>> collection.add_texts(
            ...     ["Child 1", "Child 2"],
            ...     embeddings=[emb1, emb2],
            ...     parent_ids=[parent_id, parent_id]
            ... )
            >>> children = collection.get_children(parent_id)
        """
        rows = self._catalog.get_children(doc_id)
        return [Document(page_content=text, metadata=meta) for _, text, meta in rows]

    def get_parent(self, doc_id: int) -> Document | None:
        """
        Get the parent document of a given document.

        Args:
            doc_id: ID of the child document

        Returns:
            Parent Document, or None if no parent
        """
        result = self._catalog.get_parent(doc_id)
        if result is None:
            return None
        _, text, meta = result
        return Document(page_content=text, metadata=meta)

    def get_descendants(
        self, doc_id: int, max_depth: int | None = None
    ) -> list[tuple[Document, int]]:
        """
        Get all descendants of a document (recursive).

        Args:
            doc_id: ID of the root document
            max_depth: Maximum depth to traverse (None for unlimited)

        Returns:
            List of (Document, depth) tuples, ordered by depth then ID
        """
        rows = self._catalog.get_descendants(doc_id, max_depth)
        return [
            (Document(page_content=text, metadata=meta), depth)
            for _, text, meta, depth in rows
        ]

    def get_ancestors(
        self, doc_id: int, max_depth: int | None = None
    ) -> list[tuple[Document, int]]:
        """
        Get all ancestors of a document (path to root).

        Args:
            doc_id: ID of the document
            max_depth: Maximum depth to traverse (None for unlimited)

        Returns:
            List of (Document, depth) tuples, from immediate parent to root
        """
        rows = self._catalog.get_ancestors(doc_id, max_depth)
        return [
            (Document(page_content=text, metadata=meta), depth)
            for _, text, meta, depth in rows
        ]

    def set_parent(self, doc_id: int, parent_id: int | None) -> bool:
        """
        Set or update the parent of a document.

        Args:
            doc_id: ID of the document to update
            parent_id: New parent ID (None to remove parent relationship)

        Returns:
            True if document was updated, False if document not found
        """
        return self._catalog.set_parent(doc_id, parent_id)

    # ─────────────────────────────────────────────────────────────────────────
    # Clustering Methods
    # ─────────────────────────────────────────────────────────────────────────

    def cluster(
        self,
        n_clusters: int | None = None,
        algorithm: ClusterAlgorithm = "minibatch_kmeans",
        *,
        filter: dict[str, Any] | None = None,
        sample_size: int | None = None,
        min_cluster_size: int = 5,
        random_state: int | None = None,
    ) -> ClusterResult:
        """
        Cluster documents in the collection by their embeddings.

        Requires scikit-learn and hdbscan (included in the standard install).

        Args:
            n_clusters: Number of clusters (required for kmeans/minibatch_kmeans).
            algorithm: Clustering algorithm - 'kmeans', 'minibatch_kmeans', or 'hdbscan'.
            filter: Optional metadata filter to cluster a subset of documents.
            sample_size: If set, cluster a random sample and assign rest to nearest centroid.
            min_cluster_size: Minimum cluster size (HDBSCAN only).
            random_state: Random seed for reproducibility.

        Returns:
            ClusterResult with labels, centroids, and doc_ids.

        Raises:
            ImportError: If scikit-learn or hdbscan (for HDBSCAN) not installed.
            ValueError: If n_clusters required but not provided.

        Example:
            >>> result = collection.cluster(n_clusters=5)
            >>> print(result.summary())  # {0: 42, 1: 38, 2: 20, ...}
        """
        engine = ClusterEngine()

        doc_ids = list(self._index.keys())
        if not doc_ids:
            return ClusterResult(
                labels=np.array([], dtype=np.int32),
                centroids=None,
                doc_ids=[],
                n_clusters=0,
                algorithm=algorithm,
            )

        if filter:
            filtered_ids = set(
                self._catalog.find_ids_by_filter(
                    filter, self._catalog.build_filter_clause
                )
            )
            doc_ids = [d for d in doc_ids if d in filtered_ids]

        vectors = self._index.get(np.array(doc_ids, dtype=np.uint64))

        effective_n_clusters = n_clusters
        if n_clusters is not None and algorithm in ("kmeans", "minibatch_kmeans"):
            effective_n_clusters = min(n_clusters, len(doc_ids))

        if sample_size and sample_size < len(doc_ids):
            rng = np.random.default_rng(random_state)
            sample_indices = rng.choice(len(doc_ids), sample_size, replace=False)
            sample_ids = [doc_ids[i] for i in sample_indices]
            sample_vectors = vectors[sample_indices]

            result = engine.cluster_vectors(
                sample_vectors,
                sample_ids,
                algorithm=algorithm,
                n_clusters=effective_n_clusters,
                min_cluster_size=min_cluster_size,
                random_state=random_state,
            )

            if result.centroids is not None:
                remaining_mask = np.ones(len(doc_ids), dtype=bool)
                remaining_mask[sample_indices] = False
                remaining_ids = [doc_ids[i] for i, m in enumerate(remaining_mask) if m]
                remaining_vectors = vectors[remaining_mask]

                remaining_labels = engine.assign_to_nearest_centroid(
                    remaining_vectors, result.centroids
                )

                all_ids = sample_ids + remaining_ids
                all_labels = np.concatenate([result.labels, remaining_labels])

                order = np.argsort(all_ids)
                return ClusterResult(
                    labels=all_labels[order],
                    centroids=result.centroids,
                    doc_ids=[all_ids[i] for i in order],
                    n_clusters=result.n_clusters,
                    algorithm=algorithm,
                )
            return result

        return engine.cluster_vectors(
            vectors,
            doc_ids,
            algorithm=algorithm,
            n_clusters=effective_n_clusters,
            min_cluster_size=min_cluster_size,
            random_state=random_state,
        )

    def auto_tag(
        self,
        cluster_result: ClusterResult,
        *,
        method: str = "keywords",
        n_keywords: int = 5,
        custom_callback: ClusterTagCallback | None = None,
    ) -> dict[int, str]:
        """
        Generate descriptive tags for each cluster.

        Args:
            cluster_result: Result from cluster() method.
            method: Tagging method - 'keywords' (TF-IDF) or 'custom'.
            n_keywords: Number of keywords per cluster (for 'keywords' method).
            custom_callback: Custom function (texts: list[str]) -> str for 'custom' method.

        Returns:
            Dict mapping cluster_id -> tag string.

        Example:
            >>> result = collection.cluster(n_clusters=3)
            >>> tags = collection.auto_tag(result)
            >>> print(tags)  # {0: 'machine learning, neural', 1: 'database, sql', ...}
        """
        docs = self._catalog.get_documents_by_ids(cluster_result.doc_ids)

        cluster_texts: dict[int, list[str]] = {}
        for doc_id, label in zip(cluster_result.doc_ids, cluster_result.labels):
            label_int = int(label)
            if label_int not in cluster_texts:
                cluster_texts[label_int] = []
            if doc_id in docs:
                cluster_texts[label_int].append(docs[doc_id][0])

        if method == "custom" and custom_callback:
            return {
                cluster_id: custom_callback(texts)
                for cluster_id, texts in cluster_texts.items()
            }

        engine = ClusterEngine()
        return engine.generate_keywords(cluster_texts, n_keywords)

    def assign_cluster_metadata(
        self,
        cluster_result: ClusterResult,
        tags: dict[int, str] | None = None,
        *,
        metadata_key: str = "cluster",
        tag_key: str = "cluster_tag",
    ) -> int:
        """
        Persist cluster assignments to document metadata.

        After calling this, you can filter by cluster: filter={"cluster": 2}

        Args:
            cluster_result: Result from cluster() method.
            tags: Optional cluster tags from auto_tag(). If provided, also sets tag_key.
            metadata_key: Metadata key for cluster ID (default: "cluster").
            tag_key: Metadata key for cluster tag (default: "cluster_tag").

        Returns:
            Number of documents updated.

        Example:
            >>> result = collection.cluster(n_clusters=5)
            >>> tags = collection.auto_tag(result)
            >>> collection.assign_cluster_metadata(result, tags)
            >>> # Now filter by cluster
            >>> docs = collection.similarity_search(query, filter={"cluster": 2})
        """
        updates: list[tuple[int, dict[str, Any]]] = []
        for doc_id, label in zip(cluster_result.doc_ids, cluster_result.labels):
            meta: dict[str, Any] = {metadata_key: int(label)}
            if tags and int(label) in tags:
                meta[tag_key] = tags[int(label)]
            updates.append((doc_id, meta))

        return self._catalog.update_metadata_batch(updates)

    def get_cluster_members(
        self,
        cluster_id: int,
        *,
        metadata_key: str = "cluster",
    ) -> list[Document]:
        """
        Get all documents in a cluster (requires assign_cluster_metadata first).

        Args:
            cluster_id: Cluster ID to retrieve.
            metadata_key: Metadata key where cluster is stored (default: "cluster").

        Returns:
            List of Documents in the cluster.
        """
        rows = self._catalog.get_all_docs_with_text(
            filter_dict={metadata_key: cluster_id},
            filter_builder=self._catalog.build_filter_clause,
        )
        return [Document(page_content=text, metadata=meta) for _, text, meta in rows]

    def save_cluster(
        self,
        name: str,
        cluster_result: ClusterResult,
        *,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Save cluster state for later reuse without re-clustering.

        Persists centroids and algorithm info so new documents can be assigned
        to existing clusters using assign_to_cluster().

        Args:
            name: Unique name for this cluster configuration.
            cluster_result: Result from cluster() method.
            metadata: Optional additional metadata (tags, metrics, etc.).

        Example:
            >>> result = collection.cluster(n_clusters=5)
            >>> tags = collection.auto_tag(result)
            >>> collection.save_cluster("product_categories", result, metadata={"tags": tags})
        """
        centroids_bytes = None
        if cluster_result.centroids is not None:
            centroids_bytes = cluster_result.centroids.tobytes()

        self._catalog.save_cluster_state(
            name=name,
            algorithm=cluster_result.algorithm,
            n_clusters=cluster_result.n_clusters,
            centroids=centroids_bytes,
            metadata=metadata,
        )

    def load_cluster(self, name: str) -> tuple[ClusterResult, dict[str, Any]] | None:
        """
        Load a saved cluster configuration.

        Args:
            name: Name of the saved cluster configuration.

        Returns:
            Tuple of (ClusterResult with centroids, metadata dict) or None if not found.

        Example:
            >>> saved = collection.load_cluster("product_categories")
            >>> if saved:
            ...     result, meta = saved
            ...     print(f"Loaded {result.n_clusters} clusters")
        """
        state = self._catalog.load_cluster_state(name)
        if state is None:
            return None

        algorithm, n_clusters, centroids_bytes, metadata = state

        centroids = None
        if centroids_bytes is not None:
            dim = self._dim
            if dim:
                centroids = np.frombuffer(centroids_bytes, dtype=np.float32).reshape(
                    n_clusters, dim
                )

        result = ClusterResult(
            labels=np.array([], dtype=np.int32),
            centroids=centroids,
            doc_ids=[],
            n_clusters=n_clusters,
            algorithm=algorithm,
        )
        return result, metadata

    def list_clusters(self) -> list[dict[str, Any]]:
        """List all saved cluster configurations."""
        return self._catalog.list_cluster_states()

    def delete_cluster(self, name: str) -> bool:
        """Delete a saved cluster configuration."""
        return self._catalog.delete_cluster_state(name)

    def assign_to_cluster(
        self,
        name: str,
        doc_ids: list[int] | None = None,
        *,
        metadata_key: str = "cluster",
    ) -> int:
        """
        Assign documents to clusters using saved centroids.

        Fast assignment without re-clustering - uses nearest centroid matching.
        Useful for assigning newly added documents to existing cluster structure.

        Args:
            name: Name of saved cluster configuration (from save_cluster).
            doc_ids: Document IDs to assign. If None, assigns all unassigned docs.
            metadata_key: Metadata key to store cluster assignment.

        Returns:
            Number of documents assigned.

        Raises:
            ValueError: If cluster not found or has no centroids (HDBSCAN).

        Example:
            >>> # Add new documents
            >>> new_ids = collection.add_texts(new_texts, embeddings=new_embs)
            >>> # Assign to existing clusters
            >>> collection.assign_to_cluster("product_categories", new_ids)
        """
        saved = self.load_cluster(name)
        if saved is None:
            raise ValueError(f"Cluster '{name}' not found")

        result, _ = saved
        if result.centroids is None:
            raise ValueError(
                f"Cluster '{name}' has no centroids (HDBSCAN clusters cannot be used for assignment)"
            )

        if doc_ids is None:
            all_ids = list(self._index.keys())
            # Get all documents to check for metadata key existence
            all_docs = self._catalog.get_all_docs_with_text()
            assigned_ids = {
                doc_id for doc_id, _, meta in all_docs if metadata_key in meta
            }
            doc_ids = [d for d in all_ids if d not in assigned_ids]

        if not doc_ids:
            return 0

        vectors = self._index.get(np.array(doc_ids, dtype=np.uint64))

        engine = ClusterEngine()
        labels = engine.assign_to_nearest_centroid(vectors, result.centroids)

        updates = [
            (doc_id, {metadata_key: int(label)})
            for doc_id, label in zip(doc_ids, labels)
        ]
        return self._catalog.update_metadata_batch(updates)

    def count(self) -> int:
        """Return the number of documents in the collection."""
        return self._catalog.count()

    @property
    def _dim(self) -> int | None:
        """Vector dimension (None if no vectors added yet)."""
        return self._index.ndim

add_texts(texts, metadatas=None, embeddings=None, ids=None, *, parent_ids=None, threads=0)

Add texts with optional embeddings and metadata to the collection.

Automatically infers vector dimension from first batch. Supports upsert (update on conflict) when providing existing IDs. For COSINE distance, vectors are L2-normalized automatically by usearch.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| texts | Sequence[str] | Document text content to store. | required |
| metadatas | Sequence[dict] \| None | Optional metadata dicts (one per text). | None |
| embeddings | Sequence[Sequence[float]] \| None | Optional pre-computed embeddings (one per text). If None, attempts to use local embedding model. | None |
| ids | Sequence[int \| None] \| None | Optional document IDs for upsert behavior. | None |
| parent_ids | Sequence[int \| None] \| None | Optional parent document IDs for hierarchical relationships. | None |
| threads | int | Number of threads for parallel insertion (0=auto). | 0 |

Returns:

| Type | Description |
| --- | --- |
| list[int] | List of inserted/updated document IDs. |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If embedding dimensions don't match, or if no embeddings provided and local embedder not available. |

Source code in src/simplevecdb/core.py
def add_texts(
    self,
    texts: Sequence[str],
    metadatas: Sequence[dict] | None = None,
    embeddings: Sequence[Sequence[float]] | None = None,
    ids: Sequence[int | None] | None = None,
    *,
    parent_ids: Sequence[int | None] | None = None,
    threads: int = 0,
) -> list[int]:
    """
    Add texts with optional embeddings and metadata to the collection.

    Automatically infers vector dimension from first batch. Supports upsert
    (update on conflict) when providing existing IDs. For COSINE distance,
    vectors are L2-normalized automatically by usearch.

    Args:
        texts: Document text content to store.
        metadatas: Optional metadata dicts (one per text).
        embeddings: Optional pre-computed embeddings (one per text).
            If None, attempts to use local embedding model.
        ids: Optional document IDs for upsert behavior.
        parent_ids: Optional parent document IDs for hierarchical relationships.
        threads: Number of threads for parallel insertion (0=auto).

    Returns:
        List of inserted/updated document IDs.

    Raises:
        ValueError: If embedding dimensions don't match, or if no embeddings
            provided and local embedder not available.
    """
    if not texts:
        return []

    # Resolve embeddings
    if embeddings is None:
        try:
            from simplevecdb.embeddings.models import embed_texts as embed_fn

            embeddings = embed_fn(list(texts))
        except Exception as e:
            raise ValueError(
                "No embeddings provided and local embedder failed – "
                "install with [server] extra"
            ) from e

    # Normalize metadatas
    if metadatas is None:
        metadatas = [{} for _ in texts]

    # Process in batches
    from simplevecdb import config

    batch_size = config.EMBEDDING_BATCH_SIZE
    all_ids: list[int] = []

    for batch_start in range(0, len(texts), batch_size):
        batch_end = min(batch_start + batch_size, len(texts))
        batch_texts = texts[batch_start:batch_end]
        batch_metas = metadatas[batch_start:batch_end]
        batch_embeds = embeddings[batch_start:batch_end]
        batch_ids = ids[batch_start:batch_end] if ids else None
        batch_parent_ids = parent_ids[batch_start:batch_end] if parent_ids else None

        # Add to SQLite metadata store (with embeddings for MMR support)
        doc_ids = self._catalog.add_documents(
            batch_texts,
            list(batch_metas),
            batch_ids,
            embeddings=batch_embeds,
            parent_ids=batch_parent_ids,
        )

        # Prepare vectors
        emb_np = np.array(batch_embeds, dtype=np.float32)

        # Add to usearch index
        self._index.add(np.array(doc_ids, dtype=np.uint64), emb_np, threads=threads)

        all_ids.extend(doc_ids)

    return all_ids
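
An illustrative sketch (not from the library docs): it assumes `collection` is an existing collection handle and uses tiny 4-dimensional toy embeddings purely for illustration; real collections would use model-sized vectors.

>>> texts = ["red apples", "green pears"]
>>> embs = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.1, 0.4, 0.3]]
>>> ids = collection.add_texts(texts, metadatas=[{"kind": "fruit"}] * 2, embeddings=embs)
>>> # Upsert: re-using a returned ID updates that document in place.
>>> collection.add_texts(["red apples (v2)"], embeddings=[embs[0]], ids=[ids[0]])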

add_texts_streaming(items, *, batch_size=None, threads=0, on_progress=None)

Stream documents into the collection with controlled memory usage.

Processes documents in batches from any iterable (generator, file reader, API paginator, etc.) without loading all data into memory. Yields progress after each batch for monitoring large ingestions.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| items | Iterable[tuple[str, dict \| None, Sequence[float] \| None]] | Iterable of (text, metadata, embedding) tuples: text is the document content (required); metadata is an optional dict (None for empty); embedding is an optional pre-computed vector (None to auto-embed). | required |
| batch_size | int \| None | Documents per batch (default: config.EMBEDDING_BATCH_SIZE). | None |
| threads | int | Threads for parallel insertion (0=auto). | 0 |
| on_progress | ProgressCallback \| None | Optional callback invoked after each batch. | None |

Yields:

| Type | Description |
| --- | --- |
| StreamingProgress | Progress dict after each batch with: batch_num (current batch number, 1-indexed), total_batches (estimated total, None if unknown), docs_processed (cumulative documents inserted), docs_in_batch (documents in current batch), batch_ids (IDs of documents in the current batch). |

Returns:

| Type | Description |
| --- | --- |
| list[int] | List of all inserted document IDs: the generator's return value, available as StopIteration.value after exhaustion or via yield from. |

Example:

>>> def load_documents():
...     for line in open("large_file.jsonl"):
...         doc = json.loads(line)
...         yield (doc["text"], doc.get("meta"), None)
...
>>> gen = collection.add_texts_streaming(load_documents())
>>> for progress in gen:
...     print(f"Batch {progress['batch_num']}: {progress['docs_processed']} total")
>>> # Per-batch IDs are available in progress['batch_ids']

Example with callback:

>>> def log_progress(p):
...     print(f"{p['docs_processed']} docs inserted")
>>> list(collection.add_texts_streaming(items, on_progress=log_progress))

Source code in src/simplevecdb/core.py
def add_texts_streaming(
    self,
    items: Iterable[tuple[str, dict | None, Sequence[float] | None]],
    *,
    batch_size: int | None = None,
    threads: int = 0,
    on_progress: ProgressCallback | None = None,
) -> Generator[StreamingProgress, None, list[int]]:
    """
    Stream documents into the collection with controlled memory usage.

    Processes documents in batches from any iterable (generator, file reader,
    API paginator, etc.) without loading all data into memory. Yields progress
    after each batch for monitoring large ingestions.

    Args:
        items: Iterable of (text, metadata, embedding) tuples.
            - text: Document content (required)
            - metadata: Optional dict, use None for empty
            - embedding: Optional pre-computed vector, use None to auto-embed
        batch_size: Documents per batch (default: config.EMBEDDING_BATCH_SIZE).
        threads: Threads for parallel insertion (0=auto).
        on_progress: Optional callback invoked after each batch.

    Yields:
        StreamingProgress dict after each batch with:
        - batch_num: Current batch number (1-indexed)
        - total_batches: Estimated total (None if unknown)
        - docs_processed: Cumulative documents inserted
        - docs_in_batch: Documents in current batch
        - batch_ids: IDs of documents in current batch

    Returns:
        List of all inserted document IDs (the generator's return value,
        available as StopIteration.value after exhaustion or via yield from).

    Example:
        >>> def load_documents():
        ...     for line in open("large_file.jsonl"):
        ...         doc = json.loads(line)
        ...         yield (doc["text"], doc.get("meta"), None)
        ...
        >>> gen = collection.add_texts_streaming(load_documents())
        >>> for progress in gen:
        ...     print(f"Batch {progress['batch_num']}: {progress['docs_processed']} total")
        >>> # Per-batch IDs are available in progress['batch_ids']

    Example with callback:
        >>> def log_progress(p):
        ...     print(f"{p['docs_processed']} docs inserted")
        >>> list(collection.add_texts_streaming(items, on_progress=log_progress))
    """
    from simplevecdb import config

    if batch_size is None:
        batch_size = config.EMBEDDING_BATCH_SIZE

    all_ids: list[int] = []
    batch_num = 0
    docs_processed = 0

    # Accumulate batch
    batch_texts: list[str] = []
    batch_metas: list[dict] = []
    batch_embeds: list[Sequence[float]] = []
    needs_embedding = False

    for text, metadata, embedding in items:
        batch_texts.append(text)
        batch_metas.append(metadata or {})
        if embedding is not None:
            batch_embeds.append(embedding)
        else:
            needs_embedding = True
            batch_embeds.append([])  # Placeholder

        # Process batch when full
        if len(batch_texts) >= batch_size:
            batch_ids = self._process_streaming_batch(
                batch_texts, batch_metas, batch_embeds, needs_embedding, threads
            )
            all_ids.extend(batch_ids)
            batch_num += 1
            docs_processed += len(batch_ids)

            progress: StreamingProgress = {
                "batch_num": batch_num,
                "total_batches": None,
                "docs_processed": docs_processed,
                "docs_in_batch": len(batch_ids),
                "batch_ids": batch_ids,
            }

            if on_progress:
                on_progress(progress)

            yield progress

            # Reset batch
            batch_texts = []
            batch_metas = []
            batch_embeds = []
            needs_embedding = False

    # Process final partial batch
    if batch_texts:
        batch_ids = self._process_streaming_batch(
            batch_texts, batch_metas, batch_embeds, needs_embedding, threads
        )
        all_ids.extend(batch_ids)
        batch_num += 1
        docs_processed += len(batch_ids)

        progress = {
            "batch_num": batch_num,
            "total_batches": batch_num,  # Now we know total
            "docs_processed": docs_processed,
            "docs_in_batch": len(batch_ids),
            "batch_ids": batch_ids,
        }

        if on_progress:
            on_progress(progress)

        yield progress

    return all_ids
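
Because add_texts_streaming is a generator, the complete list of inserted IDs is its return value rather than something it yields. A minimal sketch of one way to capture it, assuming `items` is an iterable of (text, metadata, embedding) tuples:

>>> gen = collection.add_texts_streaming(items)
>>> try:
...     while True:
...         progress = next(gen)  # drive the generator batch by batch
... except StopIteration as done:
...     all_ids = done.value  # the generator's return value

Alternatively, `ids = yield from collection.add_texts_streaming(items)` inside another generator captures the same value.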

similarity_search(query, k=5, filter=None, *, exact=None, threads=0)

Search for most similar vectors using HNSW approximate nearest neighbor.

For COSINE distance, returns distance in [0, 2] (lower = more similar). For L2/L1, returns raw distance (lower = more similar).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str \| Sequence[float] | Query vector or text string (auto-embedded if string). | required |
| k | int | Number of nearest neighbors to return. | 5 |
| filter | dict[str, Any] \| None | Optional metadata filter. | None |
| exact | bool \| None | Force search mode. None=adaptive (brute-force for <10k vectors), True=always brute-force (perfect recall), False=always HNSW. | None |
| threads | int | Number of threads for parallel search (0=auto). | 0 |

Returns:

| Type | Description |
| --- | --- |
| list[tuple[Document, float]] | List of (Document, distance) tuples, sorted by ascending distance. |

Source code in src/simplevecdb/core.py
def similarity_search(
    self,
    query: str | Sequence[float],
    k: int = 5,
    filter: dict[str, Any] | None = None,
    *,
    exact: bool | None = None,
    threads: int = 0,
) -> list[tuple[Document, float]]:
    """
    Search for most similar vectors using HNSW approximate nearest neighbor.

    For COSINE distance, returns distance in [0, 2] (lower = more similar).
    For L2/L1, returns raw distance (lower = more similar).

    Args:
        query: Query vector or text string (auto-embedded if string).
        k: Number of nearest neighbors to return.
        filter: Optional metadata filter.
        exact: Force search mode. None=adaptive (brute-force for <10k vectors),
               True=always brute-force (perfect recall), False=always HNSW.
        threads: Number of threads for parallel search (0=auto).

    Returns:
        List of (Document, distance) tuples, sorted by ascending distance.
    """
    return self._search.similarity_search(
        query, k, filter, exact=exact, threads=threads
    )
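
An illustrative query sketch: passing a string relies on the local embedding model being available (otherwise pass a pre-computed vector, for which `query_vector` stands in below), and exact=True is shown only to force brute-force search.

>>> hits = collection.similarity_search("how is the index rebuilt?", k=3)
>>> for doc, distance in hits:
...     print(round(distance, 3), doc.page_content[:60])
>>> # Perfect-recall brute force, useful for small collections or evaluation:
>>> hits = collection.similarity_search(query_vector, k=3, exact=True)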

similarity_search_batch(queries, k=5, filter=None, *, exact=None, threads=0)

Search for similar vectors across multiple queries in parallel.

Automatically batches queries for ~10x throughput compared to sequential single-query searches. Uses usearch's native batch search optimization.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| queries | Sequence[Sequence[float]] | List of query vectors. | required |
| k | int | Number of nearest neighbors per query. | 5 |
| filter | dict[str, Any] \| None | Optional metadata filter (applied to all queries). | None |
| exact | bool \| None | Force search mode. None=adaptive, True=brute-force, False=HNSW. | None |
| threads | int | Number of threads for parallel search (0=auto). | 0 |

Returns:

| Type | Description |
| --- | --- |
| list[list[tuple[Document, float]]] | List of result lists, one per query. Each result is (Document, distance). |

Example:

>>> queries = [embedding1, embedding2, embedding3]
>>> results = collection.similarity_search_batch(queries, k=5)
>>> for query_results in results:
...     print(f"Found {len(query_results)} matches")

Source code in src/simplevecdb/core.py
def similarity_search_batch(
    self,
    queries: Sequence[Sequence[float]],
    k: int = 5,
    filter: dict[str, Any] | None = None,
    *,
    exact: bool | None = None,
    threads: int = 0,
) -> list[list[tuple[Document, float]]]:
    """
    Search for similar vectors across multiple queries in parallel.

    Automatically batches queries for ~10x throughput compared to
    sequential single-query searches. Uses usearch's native batch
    search optimization.

    Args:
        queries: List of query vectors.
        k: Number of nearest neighbors per query.
        filter: Optional metadata filter (applied to all queries).
        exact: Force search mode. None=adaptive, True=brute-force, False=HNSW.
        threads: Number of threads for parallel search (0=auto).

    Returns:
        List of result lists, one per query. Each result is (Document, distance).

    Example:
        >>> queries = [embedding1, embedding2, embedding3]
        >>> results = collection.similarity_search_batch(queries, k=5)
        >>> for query_results in results:
        ...     print(f"Found {len(query_results)} matches")
    """
    return self._search.similarity_search_batch(
        queries, k, filter, exact=exact, threads=threads
    )

keyword_search(query, k=5, filter=None)

Search using BM25 keyword ranking (full-text search).

Uses SQLite's FTS5 extension for BM25-based ranking.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str | Text query using FTS5 syntax. | required |
| k | int | Maximum number of results to return. | 5 |
| filter | dict[str, Any] \| None | Optional metadata filter. | None |

Returns:

| Type | Description |
| --- | --- |
| list[tuple[Document, float]] | List of (Document, bm25_score) tuples, sorted by descending relevance. |

Raises:

| Type | Description |
| --- | --- |
| RuntimeError | If FTS5 is not available. |

Source code in src/simplevecdb/core.py
def keyword_search(
    self, query: str, k: int = 5, filter: dict[str, Any] | None = None
) -> list[tuple[Document, float]]:
    """
    Search using BM25 keyword ranking (full-text search).

    Uses SQLite's FTS5 extension for BM25-based ranking.

    Args:
        query: Text query using FTS5 syntax.
        k: Maximum number of results to return.
        filter: Optional metadata filter.

    Returns:
        List of (Document, bm25_score) tuples, sorted by descending relevance.

    Raises:
        RuntimeError: If FTS5 is not available.
    """
    return self._search.keyword_search(query, k, filter)
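
A few illustrative queries using standard FTS5 syntax (quoted phrases, boolean operators, prefix matching); the "kind" metadata key is hypothetical.

>>> collection.keyword_search('"vector database"', k=5)       # exact phrase
>>> collection.keyword_search("index AND rebuild", k=5)        # boolean AND
>>> collection.keyword_search("quant*", k=10, filter={"kind": "docs"})  # prefix match plus metadata filter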

hybrid_search(query, k=5, filter=None, *, query_vector=None, vector_k=None, keyword_k=None, rrf_k=60)

Combine BM25 keyword search with vector similarity using Reciprocal Rank Fusion.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str | Text query for keyword search. | required |
| k | int | Final number of results after fusion. | 5 |
| filter | dict[str, Any] \| None | Optional metadata filter. | None |
| query_vector | Sequence[float] \| None | Optional pre-computed query embedding. | None |
| vector_k | int \| None | Number of vector search candidates. | None |
| keyword_k | int \| None | Number of keyword search candidates. | None |
| rrf_k | int | RRF constant parameter (default: 60). | 60 |

Returns:

| Type | Description |
| --- | --- |
| list[tuple[Document, float]] | List of (Document, rrf_score) tuples, sorted by descending RRF score. |

Raises:

| Type | Description |
| --- | --- |
| RuntimeError | If FTS5 is not available. |

Source code in src/simplevecdb/core.py
def hybrid_search(
    self,
    query: str,
    k: int = 5,
    filter: dict[str, Any] | None = None,
    *,
    query_vector: Sequence[float] | None = None,
    vector_k: int | None = None,
    keyword_k: int | None = None,
    rrf_k: int = 60,
) -> list[tuple[Document, float]]:
    """
    Combine BM25 keyword search with vector similarity using Reciprocal Rank Fusion.

    Args:
        query: Text query for keyword search.
        k: Final number of results after fusion.
        filter: Optional metadata filter.
        query_vector: Optional pre-computed query embedding.
        vector_k: Number of vector search candidates.
        keyword_k: Number of keyword search candidates.
        rrf_k: RRF constant parameter (default: 60).

    Returns:
        List of (Document, rrf_score) tuples, sorted by descending RRF score.

    Raises:
        RuntimeError: If FTS5 is not available.
    """
    return self._search.hybrid_search(
        query,
        k,
        filter,
        query_vector=query_vector,
        vector_k=vector_k,
        keyword_k=keyword_k,
        rrf_k=rrf_k,
    )
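
In standard Reciprocal Rank Fusion, each document's score is the sum of 1 / (rrf_k + rank) over the keyword and vector result lists, so a larger rrf_k flattens the advantage of top-ranked hits. An illustrative call that widens both candidate pools beyond the final k:

>>> results = collection.hybrid_search(
...     "encrypting the sqlite database",
...     k=5,
...     vector_k=20,
...     keyword_k=20,
...     rrf_k=60,
... )
>>> for doc, rrf_score in results:
...     print(round(rrf_score, 4), doc.page_content[:60])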

max_marginal_relevance_search(query, k=5, fetch_k=20, lambda_mult=0.5, filter=None)

Search with diversity - return relevant but non-redundant results.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str \| Sequence[float] | Query vector or text string. | required |
| k | int | Number of diverse results to return. | 5 |
| fetch_k | int | Number of candidates to consider. | 20 |
| lambda_mult | float | Diversity trade-off (0=diverse, 1=relevant). | 0.5 |
| filter | dict[str, Any] \| None | Optional metadata filter. | None |

Returns:

| Type | Description |
| --- | --- |
| list[Document] | List of Documents ordered by MMR selection. |

Source code in src/simplevecdb/core.py
def max_marginal_relevance_search(
    self,
    query: str | Sequence[float],
    k: int = 5,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: dict[str, Any] | None = None,
) -> list[Document]:
    """
    Search with diversity - return relevant but non-redundant results.

    Args:
        query: Query vector or text string.
        k: Number of diverse results to return.
        fetch_k: Number of candidates to consider.
        lambda_mult: Diversity trade-off (0=diverse, 1=relevant).
        filter: Optional metadata filter.

    Returns:
        List of Documents ordered by MMR selection.
    """
    return self._search.max_marginal_relevance_search(
        query, k, fetch_k, lambda_mult, filter
    )
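
An illustrative MMR call that leans toward diversity (lambda_mult closer to 0) while drawing from a pool of 25 candidates:

>>> docs = collection.max_marginal_relevance_search(
...     "storage quantization options", k=5, fetch_k=25, lambda_mult=0.3
... )
>>> for doc in docs:
...     print(doc.page_content[:60])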

delete_by_ids(ids)

Delete documents by their IDs.

Removes documents from both usearch index and SQLite metadata. Does NOT auto-vacuum; call VectorDB.vacuum() separately.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | Iterable[int] | Document IDs to delete | required |

Source code in src/simplevecdb/core.py
def delete_by_ids(self, ids: Iterable[int]) -> None:
    """
    Delete documents by their IDs.

    Removes documents from both usearch index and SQLite metadata.
    Does NOT auto-vacuum; call `VectorDB.vacuum()` separately.

    Args:
        ids: Document IDs to delete
    """
    ids_list = list(ids)
    if not ids_list:
        return

    # Delete from usearch
    self._index.remove(ids_list)

    # Delete from SQLite
    self._catalog.delete_by_ids(ids_list)
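
An illustrative delete round-trip; `emb` stands in for an existing embedding and `db` for the owning VectorDB instance (vacuuming is a separate, explicit step):

>>> ids = collection.add_texts(["temporary note"], embeddings=[emb])
>>> collection.delete_by_ids(ids)
>>> db.vacuum()  # reclaim space; deletions alone do not auto-vacuum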

remove_texts(texts=None, filter=None)

Remove documents by text content or metadata filter.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| texts | Sequence[str] \| None | Optional list of exact text strings to remove | None |
| filter | dict[str, Any] \| None | Optional metadata filter dict | None |

Returns:

| Type | Description |
| --- | --- |
| int | Number of documents deleted |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If neither texts nor filter provided |

Source code in src/simplevecdb/core.py
def remove_texts(
    self,
    texts: Sequence[str] | None = None,
    filter: dict[str, Any] | None = None,
) -> int:
    """
    Remove documents by text content or metadata filter.

    Args:
        texts: Optional list of exact text strings to remove
        filter: Optional metadata filter dict

    Returns:
        Number of documents deleted

    Raises:
        ValueError: If neither texts nor filter provided
    """
    if texts is None and filter is None:
        raise ValueError("Must provide either texts or filter to remove")

    ids_to_delete: list[int] = []

    if texts:
        ids_to_delete.extend(self._catalog.find_ids_by_texts(texts))

    if filter:
        ids_to_delete.extend(
            self._catalog.find_ids_by_filter(
                filter, self._catalog.build_filter_clause
            )
        )

    unique_ids = list(set(ids_to_delete))
    if unique_ids:
        self.delete_by_ids(unique_ids)

    return len(unique_ids)
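
An illustrative cleanup sketch; the "status" metadata key is hypothetical:

>>> deleted = collection.remove_texts(filter={"status": "draft"})
>>> print(f"Removed {deleted} documents")
>>> collection.remove_texts(texts=["temporary note"])  # or remove by exact text match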

rebuild_index(*, connectivity=None, expansion_add=None, expansion_search=None)

Rebuild the usearch HNSW index from embeddings stored in SQLite.

Useful for:
- Recovering from index corruption
- Tuning HNSW parameters (connectivity, expansion)
- Reclaiming space after many deletions

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| connectivity | int \| None | HNSW M parameter (edges per node). Default: 16 | None |
| expansion_add | int \| None | efConstruction (build quality). Default: 128 | None |
| expansion_search | int \| None | ef (search quality). Default: 64 | None |

Returns:

| Type | Description |
| --- | --- |
| int | Number of vectors rebuilt |

Raises:

| Type | Description |
| --- | --- |
| RuntimeError | If no embeddings found in SQLite |

Source code in src/simplevecdb/core.py
def rebuild_index(
    self,
    *,
    connectivity: int | None = None,
    expansion_add: int | None = None,
    expansion_search: int | None = None,
) -> int:
    """
    Rebuild the usearch HNSW index from embeddings stored in SQLite.

    Useful for:
    - Recovering from index corruption
    - Tuning HNSW parameters (connectivity, expansion)
    - Reclaiming space after many deletions

    Args:
        connectivity: HNSW M parameter (edges per node). Default: 16
        expansion_add: efConstruction (build quality). Default: 128
        expansion_search: ef (search quality). Default: 64

    Returns:
        Number of vectors rebuilt

    Raises:
        RuntimeError: If no embeddings found in SQLite
    """
    _logger.info("Rebuilding usearch index for collection '%s'...", self.name)

    # Get all document IDs
    all_ids = self.conn.execute(f"SELECT id FROM {self._table_name}").fetchall()
    all_ids = [row[0] for row in all_ids]

    if not all_ids:
        _logger.warning("No documents found in collection")
        return 0

    # Fetch embeddings from SQLite
    embeddings_map = self._catalog.get_embeddings_by_ids(all_ids)

    # Filter to only docs with embeddings
    valid_pairs = [
        (doc_id, emb)
        for doc_id in all_ids
        if (emb := embeddings_map.get(doc_id)) is not None
    ]

    if not valid_pairs:
        raise RuntimeError(
            "No embeddings found in SQLite. Cannot rebuild index. "
            "This may happen if documents were added before v2.0.0."
        )

    keys = np.array([doc_id for doc_id, _ in valid_pairs], dtype=np.uint64)
    vectors = np.array([emb for _, emb in valid_pairs], dtype=np.float32)

    # Determine dimension
    ndim = vectors.shape[1]

    # Close old index
    old_path = self._index._path
    self._index.close()

    # Delete old index file
    if old_path.exists():
        old_path.unlink()
        _logger.debug("Deleted old index file: %s", old_path)

    # Create new index with optional custom parameters
    from .engine.usearch_index import (
        DEFAULT_CONNECTIVITY,
        DEFAULT_EXPANSION_ADD,
        DEFAULT_EXPANSION_SEARCH,
    )

    self._index = UsearchIndex(
        index_path=str(old_path),
        ndim=ndim,
        distance_strategy=self.distance_strategy,
        quantization=self.quantization,
        connectivity=connectivity or DEFAULT_CONNECTIVITY,
        expansion_add=expansion_add or DEFAULT_EXPANSION_ADD,
        expansion_search=expansion_search or DEFAULT_EXPANSION_SEARCH,
    )

    # Re-add all vectors
    self._index.add(keys, vectors)
    self._index.save()

    # Update search engine reference
    self._search._index = self._index

    _logger.info("Rebuilt index with %d vectors", len(keys))
    return len(keys)
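
An illustrative rebuild with a denser HNSW graph for higher recall (slower to build and larger on disk):

>>> n = collection.rebuild_index(connectivity=32, expansion_add=256, expansion_search=128)
>>> print(f"Rebuilt {n} vectors")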

get_children(doc_id)

Get all direct children of a document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_id | int | ID of the parent document | required |

Returns:

| Type | Description |
| --- | --- |
| list[Document] | List of child Documents |

Example:

>>> # Add parent and children
>>> parent_id = collection.add_texts(["Parent doc"], embeddings=[emb])[0]
>>> collection.add_texts(
...     ["Child 1", "Child 2"],
...     embeddings=[emb1, emb2],
...     parent_ids=[parent_id, parent_id]
... )
>>> children = collection.get_children(parent_id)

Source code in src/simplevecdb/core.py
def get_children(self, doc_id: int) -> list[Document]:
    """
    Get all direct children of a document.

    Args:
        doc_id: ID of the parent document

    Returns:
        List of child Documents

    Example:
        >>> # Add parent and children
        >>> parent_id = collection.add_texts(["Parent doc"], embeddings=[emb])[0]
        >>> collection.add_texts(
        ...     ["Child 1", "Child 2"],
        ...     embeddings=[emb1, emb2],
        ...     parent_ids=[parent_id, parent_id]
        ... )
        >>> children = collection.get_children(parent_id)
    """
    rows = self._catalog.get_children(doc_id)
    return [Document(page_content=text, metadata=meta) for _, text, meta in rows]

get_parent(doc_id)

Get the parent document of a given document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_id | int | ID of the child document | required |

Returns:

| Type | Description |
| --- | --- |
| Document \| None | Parent Document, or None if no parent |

Source code in src/simplevecdb/core.py
def get_parent(self, doc_id: int) -> Document | None:
    """
    Get the parent document of a given document.

    Args:
        doc_id: ID of the child document

    Returns:
        Parent Document, or None if no parent
    """
    result = self._catalog.get_parent(doc_id)
    if result is None:
        return None
    _, text, meta = result
    return Document(page_content=text, metadata=meta)

get_descendants(doc_id, max_depth=None)

Get all descendants of a document (recursive).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_id | int | ID of the root document | required |
| max_depth | int \| None | Maximum depth to traverse (None for unlimited) | None |

Returns:

| Type | Description |
| --- | --- |
| list[tuple[Document, int]] | List of (Document, depth) tuples, ordered by depth then ID |

Source code in src/simplevecdb/core.py
def get_descendants(
    self, doc_id: int, max_depth: int | None = None
) -> list[tuple[Document, int]]:
    """
    Get all descendants of a document (recursive).

    Args:
        doc_id: ID of the root document
        max_depth: Maximum depth to traverse (None for unlimited)

    Returns:
        List of (Document, depth) tuples, ordered by depth then ID
    """
    rows = self._catalog.get_descendants(doc_id, max_depth)
    return [
        (Document(page_content=text, metadata=meta), depth)
        for _, text, meta, depth in rows
    ]

get_ancestors(doc_id, max_depth=None)

Get all ancestors of a document (path to root).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_id | int | ID of the document | required |
| max_depth | int \| None | Maximum depth to traverse (None for unlimited) | None |

Returns:

| Type | Description |
| --- | --- |
| list[tuple[Document, int]] | List of (Document, depth) tuples, from immediate parent to root |

Source code in src/simplevecdb/core.py
def get_ancestors(
    self, doc_id: int, max_depth: int | None = None
) -> list[tuple[Document, int]]:
    """
    Get all ancestors of a document (path to root).

    Args:
        doc_id: ID of the document
        max_depth: Maximum depth to traverse (None for unlimited)

    Returns:
        List of (Document, depth) tuples, from immediate parent to root
    """
    rows = self._catalog.get_ancestors(doc_id, max_depth)
    return [
        (Document(page_content=text, metadata=meta), depth)
        for _, text, meta, depth in rows
    ]

set_parent(doc_id, parent_id)

Set or update the parent of a document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_id | int | ID of the document to update | required |
| parent_id | int \| None | New parent ID (None to remove parent relationship) | required |

Returns:

| Type | Description |
| --- | --- |
| bool | True if document was updated, False if document not found |

Source code in src/simplevecdb/core.py
def set_parent(self, doc_id: int, parent_id: int | None) -> bool:
    """
    Set or update the parent of a document.

    Args:
        doc_id: ID of the document to update
        parent_id: New parent ID (None to remove parent relationship)

    Returns:
        True if document was updated, False if document not found
    """
    return self._catalog.set_parent(doc_id, parent_id)
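
An illustrative hierarchy sketch; child_id and new_parent_id stand in for document IDs previously returned by add_texts:

>>> collection.set_parent(child_id, new_parent_id)
>>> for doc, depth in collection.get_ancestors(child_id):
...     print(depth, doc.page_content[:40])
>>> for doc, depth in collection.get_descendants(new_parent_id, max_depth=2):
...     print(depth, doc.page_content[:40])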

cluster(n_clusters=None, algorithm='minibatch_kmeans', *, filter=None, sample_size=None, min_cluster_size=5, random_state=None)

Cluster documents in the collection by their embeddings.

Requires scikit-learn and hdbscan (included in the standard install).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| n_clusters | int \| None | Number of clusters (required for kmeans/minibatch_kmeans). | None |
| algorithm | ClusterAlgorithm | Clustering algorithm - 'kmeans', 'minibatch_kmeans', or 'hdbscan'. | 'minibatch_kmeans' |
| filter | dict[str, Any] \| None | Optional metadata filter to cluster a subset of documents. | None |
| sample_size | int \| None | If set, cluster a random sample and assign rest to nearest centroid. | None |
| min_cluster_size | int | Minimum cluster size (HDBSCAN only). | 5 |
| random_state | int \| None | Random seed for reproducibility. | None |

Returns:

| Type | Description |
| --- | --- |
| ClusterResult | ClusterResult with labels, centroids, and doc_ids. |

Raises:

| Type | Description |
| --- | --- |
| ImportError | If scikit-learn or hdbscan (for HDBSCAN) not installed. |
| ValueError | If n_clusters required but not provided. |

Example

result = collection.cluster(n_clusters=5) print(result.summary()) # {0: 42, 1: 38, 2: 20, ...}

Source code in src/simplevecdb/core.py
def cluster(
    self,
    n_clusters: int | None = None,
    algorithm: ClusterAlgorithm = "minibatch_kmeans",
    *,
    filter: dict[str, Any] | None = None,
    sample_size: int | None = None,
    min_cluster_size: int = 5,
    random_state: int | None = None,
) -> ClusterResult:
    """
    Cluster documents in the collection by their embeddings.

    Requires scikit-learn and hdbscan (included in the standard install).

    Args:
        n_clusters: Number of clusters (required for kmeans/minibatch_kmeans).
        algorithm: Clustering algorithm - 'kmeans', 'minibatch_kmeans', or 'hdbscan'.
        filter: Optional metadata filter to cluster a subset of documents.
        sample_size: If set, cluster a random sample and assign rest to nearest centroid.
        min_cluster_size: Minimum cluster size (HDBSCAN only).
        random_state: Random seed for reproducibility.

    Returns:
        ClusterResult with labels, centroids, and doc_ids.

    Raises:
        ImportError: If scikit-learn or hdbscan (for HDBSCAN) not installed.
        ValueError: If n_clusters required but not provided.

    Example:
        >>> result = collection.cluster(n_clusters=5)
        >>> print(result.summary())  # {0: 42, 1: 38, 2: 20, ...}
    """
    engine = ClusterEngine()

    doc_ids = list(self._index.keys())
    if not doc_ids:
        return ClusterResult(
            labels=np.array([], dtype=np.int32),
            centroids=None,
            doc_ids=[],
            n_clusters=0,
            algorithm=algorithm,
        )

    if filter:
        filtered_ids = set(
            self._catalog.find_ids_by_filter(
                filter, self._catalog.build_filter_clause
            )
        )
        doc_ids = [d for d in doc_ids if d in filtered_ids]

    vectors = self._index.get(np.array(doc_ids, dtype=np.uint64))

    effective_n_clusters = n_clusters
    if n_clusters is not None and algorithm in ("kmeans", "minibatch_kmeans"):
        effective_n_clusters = min(n_clusters, len(doc_ids))

    if sample_size and sample_size < len(doc_ids):
        rng = np.random.default_rng(random_state)
        sample_indices = rng.choice(len(doc_ids), sample_size, replace=False)
        sample_ids = [doc_ids[i] for i in sample_indices]
        sample_vectors = vectors[sample_indices]

        result = engine.cluster_vectors(
            sample_vectors,
            sample_ids,
            algorithm=algorithm,
            n_clusters=effective_n_clusters,
            min_cluster_size=min_cluster_size,
            random_state=random_state,
        )

        if result.centroids is not None:
            remaining_mask = np.ones(len(doc_ids), dtype=bool)
            remaining_mask[sample_indices] = False
            remaining_ids = [doc_ids[i] for i, m in enumerate(remaining_mask) if m]
            remaining_vectors = vectors[remaining_mask]

            remaining_labels = engine.assign_to_nearest_centroid(
                remaining_vectors, result.centroids
            )

            all_ids = sample_ids + remaining_ids
            all_labels = np.concatenate([result.labels, remaining_labels])

            order = np.argsort(all_ids)
            return ClusterResult(
                labels=all_labels[order],
                centroids=result.centroids,
                doc_ids=[all_ids[i] for i in order],
                n_clusters=result.n_clusters,
                algorithm=algorithm,
            )
        return result

    return engine.cluster_vectors(
        vectors,
        doc_ids,
        algorithm=algorithm,
        n_clusters=effective_n_clusters,
        min_cluster_size=min_cluster_size,
        random_state=random_state,
    )

auto_tag(cluster_result, *, method='keywords', n_keywords=5, custom_callback=None)

Generate descriptive tags for each cluster.

Parameters:

    cluster_result (ClusterResult): Result from cluster() method. Required.
    method (str): Tagging method - 'keywords' (TF-IDF) or 'custom'. Default: 'keywords'.
    n_keywords (int): Number of keywords per cluster (for 'keywords' method). Default: 5.
    custom_callback (ClusterTagCallback | None): Custom function (texts: list[str]) -> str for 'custom' method. Default: None.

Returns:

    dict[int, str]: Dict mapping cluster_id -> tag string.

Example:

    result = collection.cluster(n_clusters=3)
    tags = collection.auto_tag(result)
    print(tags)  # {0: 'machine learning, neural', 1: 'database, sql', ...}

Source code in src/simplevecdb/core.py
def auto_tag(
    self,
    cluster_result: ClusterResult,
    *,
    method: str = "keywords",
    n_keywords: int = 5,
    custom_callback: ClusterTagCallback | None = None,
) -> dict[int, str]:
    """
    Generate descriptive tags for each cluster.

    Args:
        cluster_result: Result from cluster() method.
        method: Tagging method - 'keywords' (TF-IDF) or 'custom'.
        n_keywords: Number of keywords per cluster (for 'keywords' method).
        custom_callback: Custom function (texts: list[str]) -> str for 'custom' method.

    Returns:
        Dict mapping cluster_id -> tag string.

    Example:
        >>> result = collection.cluster(n_clusters=3)
        >>> tags = collection.auto_tag(result)
        >>> print(tags)  # {0: 'machine learning, neural', 1: 'database, sql', ...}
    """
    docs = self._catalog.get_documents_by_ids(cluster_result.doc_ids)

    cluster_texts: dict[int, list[str]] = {}
    for doc_id, label in zip(cluster_result.doc_ids, cluster_result.labels):
        label_int = int(label)
        if label_int not in cluster_texts:
            cluster_texts[label_int] = []
        if doc_id in docs:
            cluster_texts[label_int].append(docs[doc_id][0])

    if method == "custom" and custom_callback:
        return {
            cluster_id: custom_callback(texts)
            for cluster_id, texts in cluster_texts.items()
        }

    engine = ClusterEngine()
    return engine.generate_keywords(cluster_texts, n_keywords)

assign_cluster_metadata(cluster_result, tags=None, *, metadata_key='cluster', tag_key='cluster_tag')

Persist cluster assignments to document metadata.

After calling this, you can filter by cluster: filter={"cluster": 2}

Parameters:

    cluster_result (ClusterResult): Result from cluster() method. Required.
    tags (dict[int, str] | None): Optional cluster tags from auto_tag(). If provided, also sets tag_key. Default: None.
    metadata_key (str): Metadata key for cluster ID. Default: "cluster".
    tag_key (str): Metadata key for cluster tag. Default: "cluster_tag".

Returns:

    int: Number of documents updated.

Example:

    result = collection.cluster(n_clusters=5)
    tags = collection.auto_tag(result)
    collection.assign_cluster_metadata(result, tags)
    # Now filter by cluster
    docs = collection.similarity_search(query, filter={"cluster": 2})

Source code in src/simplevecdb/core.py
def assign_cluster_metadata(
    self,
    cluster_result: ClusterResult,
    tags: dict[int, str] | None = None,
    *,
    metadata_key: str = "cluster",
    tag_key: str = "cluster_tag",
) -> int:
    """
    Persist cluster assignments to document metadata.

    After calling this, you can filter by cluster: filter={"cluster": 2}

    Args:
        cluster_result: Result from cluster() method.
        tags: Optional cluster tags from auto_tag(). If provided, also sets tag_key.
        metadata_key: Metadata key for cluster ID (default: "cluster").
        tag_key: Metadata key for cluster tag (default: "cluster_tag").

    Returns:
        Number of documents updated.

    Example:
        >>> result = collection.cluster(n_clusters=5)
        >>> tags = collection.auto_tag(result)
        >>> collection.assign_cluster_metadata(result, tags)
        >>> # Now filter by cluster
        >>> docs = collection.similarity_search(query, filter={"cluster": 2})
    """
    updates: list[tuple[int, dict[str, Any]]] = []
    for doc_id, label in zip(cluster_result.doc_ids, cluster_result.labels):
        meta: dict[str, Any] = {metadata_key: int(label)}
        if tags and int(label) in tags:
            meta[tag_key] = tags[int(label)]
        updates.append((doc_id, meta))

    return self._catalog.update_metadata_batch(updates)

get_cluster_members(cluster_id, *, metadata_key='cluster')

Get all documents in a cluster (requires assign_cluster_metadata first).

Parameters:

    cluster_id (int): Cluster ID to retrieve. Required.
    metadata_key (str): Metadata key where cluster is stored. Default: "cluster".

Returns:

    list[Document]: List of Documents in the cluster.

Source code in src/simplevecdb/core.py
def get_cluster_members(
    self,
    cluster_id: int,
    *,
    metadata_key: str = "cluster",
) -> list[Document]:
    """
    Get all documents in a cluster (requires assign_cluster_metadata first).

    Args:
        cluster_id: Cluster ID to retrieve.
        metadata_key: Metadata key where cluster is stored (default: "cluster").

    Returns:
        List of Documents in the cluster.
    """
    rows = self._catalog.get_all_docs_with_text(
        filter_dict={metadata_key: cluster_id},
        filter_builder=self._catalog.build_filter_clause,
    )
    return [Document(page_content=text, metadata=meta) for _, text, meta in rows]
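
A quick sketch, assuming assign_cluster_metadata() has already been run with tags (so a cluster_tag key may be present in metadata):

for doc in collection.get_cluster_members(2):
    print(doc.metadata.get("cluster_tag"), doc.page_content[:60])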

save_cluster(name, cluster_result, *, metadata=None)

Save cluster state for later reuse without re-clustering.

Persists centroids and algorithm info so new documents can be assigned to existing clusters using assign_to_cluster().

Parameters:

    name (str): Unique name for this cluster configuration. Required.
    cluster_result (ClusterResult): Result from cluster() method. Required.
    metadata (dict[str, Any] | None): Optional additional metadata (tags, metrics, etc.). Default: None.

Example:

    result = collection.cluster(n_clusters=5)
    tags = collection.auto_tag(result)
    collection.save_cluster("product_categories", result, metadata={"tags": tags})

Source code in src/simplevecdb/core.py
def save_cluster(
    self,
    name: str,
    cluster_result: ClusterResult,
    *,
    metadata: dict[str, Any] | None = None,
) -> None:
    """
    Save cluster state for later reuse without re-clustering.

    Persists centroids and algorithm info so new documents can be assigned
    to existing clusters using assign_to_cluster().

    Args:
        name: Unique name for this cluster configuration.
        cluster_result: Result from cluster() method.
        metadata: Optional additional metadata (tags, metrics, etc.).

    Example:
        >>> result = collection.cluster(n_clusters=5)
        >>> tags = collection.auto_tag(result)
        >>> collection.save_cluster("product_categories", result, metadata={"tags": tags})
    """
    centroids_bytes = None
    if cluster_result.centroids is not None:
        centroids_bytes = cluster_result.centroids.tobytes()

    self._catalog.save_cluster_state(
        name=name,
        algorithm=cluster_result.algorithm,
        n_clusters=cluster_result.n_clusters,
        centroids=centroids_bytes,
        metadata=metadata,
    )

load_cluster(name)

Load a saved cluster configuration.

Parameters:

    name (str): Name of the saved cluster configuration. Required.

Returns:

    tuple[ClusterResult, dict[str, Any]] | None: Tuple of (ClusterResult with centroids, metadata dict), or None if not found.

Example:

    saved = collection.load_cluster("product_categories")
    if saved:
        result, meta = saved
        print(f"Loaded {result.n_clusters} clusters")

Source code in src/simplevecdb/core.py
def load_cluster(self, name: str) -> tuple[ClusterResult, dict[str, Any]] | None:
    """
    Load a saved cluster configuration.

    Args:
        name: Name of the saved cluster configuration.

    Returns:
        Tuple of (ClusterResult with centroids, metadata dict) or None if not found.

    Example:
        >>> saved = collection.load_cluster("product_categories")
        >>> if saved:
        ...     result, meta = saved
        ...     print(f"Loaded {result.n_clusters} clusters")
    """
    state = self._catalog.load_cluster_state(name)
    if state is None:
        return None

    algorithm, n_clusters, centroids_bytes, metadata = state

    centroids = None
    if centroids_bytes is not None:
        dim = self._dim
        if dim:
            centroids = np.frombuffer(centroids_bytes, dtype=np.float32).reshape(
                n_clusters, dim
            )

    result = ClusterResult(
        labels=np.array([], dtype=np.int32),
        centroids=centroids,
        doc_ids=[],
        n_clusters=n_clusters,
        algorithm=algorithm,
    )
    return result, metadata

list_clusters()

List all saved cluster configurations.

Source code in src/simplevecdb/core.py
def list_clusters(self) -> list[dict[str, Any]]:
    """List all saved cluster configurations."""
    return self._catalog.list_cluster_states()

delete_cluster(name)

Delete a saved cluster configuration.

Source code in src/simplevecdb/core.py
def delete_cluster(self, name: str) -> bool:
    """Delete a saved cluster configuration."""
    return self._catalog.delete_cluster_state(name)
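
A small housekeeping sketch covering list_clusters() and delete_cluster(); the "product_categories" name follows the examples above:

for state in collection.list_clusters():
    print(state["name"], state.get("algorithm"), state.get("n_clusters"))
collection.delete_cluster("product_categories")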

assign_to_cluster(name, doc_ids=None, *, metadata_key='cluster')

Assign documents to clusters using saved centroids.

Fast assignment without re-clustering - uses nearest centroid matching. Useful for assigning newly added documents to existing cluster structure.

Parameters:

    name (str): Name of saved cluster configuration (from save_cluster). Required.
    doc_ids (list[int] | None): Document IDs to assign. If None, assigns all unassigned docs. Default: None.
    metadata_key (str): Metadata key to store cluster assignment. Default: "cluster".

Returns:

    int: Number of documents assigned.

Raises:

    ValueError: If cluster not found or has no centroids (HDBSCAN).

Example:

    # Add new documents
    new_ids = collection.add_texts(new_texts, embeddings=new_embs)
    # Assign to existing clusters
    collection.assign_to_cluster("product_categories", new_ids)

Source code in src/simplevecdb/core.py
def assign_to_cluster(
    self,
    name: str,
    doc_ids: list[int] | None = None,
    *,
    metadata_key: str = "cluster",
) -> int:
    """
    Assign documents to clusters using saved centroids.

    Fast assignment without re-clustering - uses nearest centroid matching.
    Useful for assigning newly added documents to existing cluster structure.

    Args:
        name: Name of saved cluster configuration (from save_cluster).
        doc_ids: Document IDs to assign. If None, assigns all unassigned docs.
        metadata_key: Metadata key to store cluster assignment.

    Returns:
        Number of documents assigned.

    Raises:
        ValueError: If cluster not found or has no centroids (HDBSCAN).

    Example:
        >>> # Add new documents
        >>> new_ids = collection.add_texts(new_texts, embeddings=new_embs)
        >>> # Assign to existing clusters
        >>> collection.assign_to_cluster("product_categories", new_ids)
    """
    saved = self.load_cluster(name)
    if saved is None:
        raise ValueError(f"Cluster '{name}' not found")

    result, _ = saved
    if result.centroids is None:
        raise ValueError(
            f"Cluster '{name}' has no centroids (HDBSCAN clusters cannot be used for assignment)"
        )

    if doc_ids is None:
        all_ids = list(self._index.keys())
        # Get all documents to check for metadata key existence
        all_docs = self._catalog.get_all_docs_with_text()
        assigned_ids = {
            doc_id for doc_id, _, meta in all_docs if metadata_key in meta
        }
        doc_ids = [d for d in all_ids if d not in assigned_ids]

    if not doc_ids:
        return 0

    vectors = self._index.get(np.array(doc_ids, dtype=np.uint64))

    engine = ClusterEngine()
    labels = engine.assign_to_nearest_centroid(vectors, result.centroids)

    updates = [
        (doc_id, {metadata_key: int(label)})
        for doc_id, label in zip(doc_ids, labels)
    ]
    return self._catalog.update_metadata_batch(updates)

Quick Reference

Search Methods

Method                             Description                Use Case
similarity_search()                Vector similarity search   Single query, best match
similarity_search_batch()          Batch vector search        Multiple queries, ~10x throughput
keyword_search()                   BM25 full-text search      Keyword matching
hybrid_search()                    BM25 + vector fusion       Best of both worlds
max_marginal_relevance_search()    Diversity-aware search     Avoid redundant results

Search Parameters

# Adaptive search (default) - auto-selects brute-force or HNSW
results = collection.similarity_search(query, k=10)

# Force exact brute-force search (perfect recall)
results = collection.similarity_search(query, k=10, exact=True)

# Force HNSW approximate search (faster)
results = collection.similarity_search(query, k=10, exact=False)

# Parallel search with explicit thread count
results = collection.similarity_search(query, k=10, threads=4)

# Batch search for multiple queries
results = collection.similarity_search_batch(queries, k=10)
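
The keyword and hybrid methods from the table above follow the same collection-level pattern. Their full signatures are documented with each method; the calls below are an assumption-level sketch, not the definitive API:

# BM25 keyword search (call shape assumed; see keyword_search() reference)
results = collection.keyword_search("hiking boots", k=5)

# Hybrid BM25 + vector fusion (call shape assumed; see hybrid_search() reference)
results = collection.hybrid_search(query, "hiking boots", k=5)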

Quantization Options

from simplevecdb import Quantization

# Full precision (default)
collection = db.collection("docs", quantization=Quantization.FLOAT)

# Half precision - 2x memory savings, 1.5x faster
collection = db.collection("docs", quantization=Quantization.FLOAT16)

# 8-bit quantization - 4x memory savings
collection = db.collection("docs", quantization=Quantization.INT8)

# 1-bit quantization - 32x memory savings
collection = db.collection("docs", quantization=Quantization.BIT)

Streaming Insert

For large-scale ingestion without memory pressure:

import json

# From generator/iterator
def load_documents():
    for line in open("large_file.jsonl"):
        doc = json.loads(line)
        yield (doc["text"], doc.get("metadata"), doc.get("embedding"))

for progress in collection.add_texts_streaming(load_documents()):
    print(f"Batch {progress['batch_num']}: {progress['docs_processed']} total")

# With progress callback
def log_progress(p):
    print(f"{p['docs_processed']} docs, batch {p['batch_num']}")

list(collection.add_texts_streaming(items, batch_size=500, on_progress=log_progress))

Hierarchical Relationships

Organize documents in parent-child hierarchies for chunked documents, threaded conversations, or nested content:

# Add documents with parent relationships
parent_ids = collection.add_texts(["Main document"], metadatas=[{"type": "parent"}])
parent_id = parent_ids[0]

# Add children referencing the parent
child_ids = collection.add_texts(
    ["Chunk 1", "Chunk 2", "Chunk 3"],
    parent_ids=[parent_id, parent_id, parent_id]
)

# Navigate the hierarchy
children = collection.get_children(parent_id)         # Direct children
parent = collection.get_parent(child_ids[0])          # Get parent document
descendants = collection.get_descendants(parent_id)   # All nested children
ancestors = collection.get_ancestors(child_ids[0])    # Path to root

# Reparent or orphan documents
collection.set_parent(child_ids[0], new_parent_id)    # Move to new parent
collection.set_parent(child_ids[0], None)             # Make root document

# Search within a subtree
results = collection.similarity_search(
    query_embedding,
    k=5,
    filter={"parent_id": parent_id}  # Only search children
)

Method                                Description
get_children(doc_id)                  Direct children of a document
get_parent(doc_id)                    Parent document (or None if root)
get_descendants(doc_id, max_depth)    All nested children recursively
get_ancestors(doc_id)                 Path from document to root
set_parent(doc_id, parent_id)         Move document to new parent (or None to orphan)

Multi-Collection Search

Search across multiple collections with unified, ranked results:

from simplevecdb import VectorDB

db = VectorDB("app.db")

# Initialize collections
users = db.collection("users")
products = db.collection("products")
docs = db.collection("docs")

# Add data to each collection
users.add_texts(["Alice likes hiking"], embeddings=[[0.1]*384])
products.add_texts(["Hiking boots", "Trail map"], embeddings=[[0.2]*384, [0.15]*384])
docs.add_texts(["Mountain hiking guide"], embeddings=[[0.12]*384])

# List initialized collections
print(db.list_collections())  # ['users', 'products', 'docs']

# Search across ALL collections
results = db.search_collections([0.1]*384, k=5)
for doc, score, collection_name in results:
    print(f"[{collection_name}] {doc.page_content} (score: {score:.3f})")

# Search specific collections only
results = db.search_collections(
    [0.1]*384,
    collections=["users", "products"],  # Exclude 'docs'
    k=3
)

# With metadata filtering (applies to all collections)
results = db.search_collections(
    [0.1]*384,
    k=10,
    filter={"category": "outdoor"}
)

# Disable score normalization (returns inverted distances)
results = db.search_collections([0.1]*384, normalize_scores=False)

# Sequential search (disable parallelism)
results = db.search_collections([0.1]*384, parallel=False)

Method                                                                           Description
list_collections()                                                               Names of all initialized collections
search_collections(query, collections, k, filter, normalize_scores, parallel)   Search across multiple collections with merged results

Clustering & Auto-Tagging

Group similar documents and generate descriptive tags:

from simplevecdb import VectorDB

db = VectorDB("app.db")
collection = db.collection("docs")

# Add documents with embeddings
collection.add_texts(texts, embeddings=embeddings)

# Cluster documents into groups
result = collection.cluster(
    n_clusters=5,
    algorithm="minibatch_kmeans",  # or "kmeans", "hdbscan"
    random_state=42
)
print(result.summary())  # {0: 42, 1: 38, 2: 15, 3: 3, 4: 2}

# Generate keyword tags for each cluster
tags = collection.auto_tag(result, n_keywords=5)
# {0: 'machine learning, neural network, deep', 1: 'database, sql, query', ...}

# Persist cluster assignments to metadata
collection.assign_cluster_metadata(result, tags)

# Query documents by cluster
ml_docs = collection.get_cluster_members(0)
db_docs = collection.similarity_search(query, filter={"cluster": 1})

# Custom tagging callback
def summarize_cluster(texts: list[str]) -> str:
    return f"Group of {len(texts)} docs about {texts[0][:20]}..."

custom_tags = collection.auto_tag(result, method="custom", custom_callback=summarize_cluster)

Method                                                   Description
cluster(n_clusters, algorithm, filter, sample_size)      Cluster documents by embedding similarity
auto_tag(result, method, n_keywords, custom_callback)    Generate descriptive tags for clusters
assign_cluster_metadata(result, tags, metadata_key)      Persist cluster IDs to document metadata
get_cluster_members(cluster_id, metadata_key)            Retrieve all documents in a cluster
save_cluster(name, result, metadata)                     Save cluster centroids for later assignment
load_cluster(name)                                       Load a saved cluster configuration
list_clusters()                                          List all saved cluster configurations
delete_cluster(name)                                     Delete a saved cluster configuration
assign_to_cluster(name, doc_ids, metadata_key)           Assign documents to saved clusters

Algorithms:

Algorithm           Best For                                Requires n_clusters
minibatch_kmeans    Large datasets (default)                Yes
kmeans              Small datasets, precise centroids       Yes
hdbscan             Unknown cluster count, density-based    No

Clustering is included in the standard installation (no extras needed).
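
The examples above use the k-means variants; a brief density-based sketch using the cluster() parameters documented earlier (HDBSCAN conventionally labels noise points -1):

result = collection.cluster(algorithm="hdbscan", min_cluster_size=10)
print(result.n_clusters, result.summary())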

Cluster Metrics

Access clustering quality metrics to evaluate results:

result = collection.cluster(n_clusters=5, random_state=42)

# Inertia (K-means only): sum of squared distances to centroids
# Lower is better; indicates tighter clusters
print(f"Inertia: {result.inertia}")

# Silhouette score: measure of cluster separation (-1 to 1)
# Higher is better; >0.5 indicates good clustering
print(f"Silhouette: {result.silhouette_score}")

# Get all metrics as dict
metrics = result.metrics()
# {'inertia': 1523.45, 'silhouette_score': 0.62}

Cluster Persistence

Save cluster configurations for fast assignment of new documents:

# 1. Cluster your documents
result = collection.cluster(n_clusters=5, random_state=42)
tags = collection.auto_tag(result)

# 2. Save cluster state (centroids + metadata)
collection.save_cluster(
    "product_categories",
    result,
    metadata={"tags": tags, "version": 1}
)

# 3. Later: assign new documents without re-clustering
new_ids = collection.add_texts(new_texts, embeddings=new_embeddings)
collection.assign_to_cluster("product_categories", new_ids)

# List saved clusters
clusters = collection.list_clusters()
# [{'name': 'product_categories', 'n_clusters': 5, 'algorithm': 'minibatch_kmeans', ...}]

# Load cluster for inspection
saved = collection.load_cluster("product_categories")
if saved:
    result, meta = saved
    print(f"Loaded {result.n_clusters} clusters, tags: {meta['tags']}")

# Delete when no longer needed
collection.delete_cluster("product_categories")