Skip to content

Search Reference

bear.search

SearchEngine

Search engine for vector-based similarity search across resources.

Source code in bear/search.py
class SearchEngine:
    """Vector similarity search over resource collections, plus author-level search."""

    def __init__(self, client: MilvusClient | None = None, reranker: Reranker | None = None) -> None:
        # Use the injected collaborators when given; otherwise fall back to the module defaults.
        self.client = client or get_milvus_client()
        self.reranker = reranker or get_reranker()

    def search_resource(
        self,
        resource_name: str,
        query: str,
        top_k: int = 3,
        min_distance: float | None = None,
        since_year: int | None = None,
        author_ids: list[str] | None = None,
        output_fields: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """Run a filtered vector search against one resource collection.

        Args:
            resource_name: Collection to search; its capitalized form must name
                an attribute of ``model``.
            query: Text that is embedded with ``embed_query`` and used as the search vector.
            top_k: Passed to the client as the result ``limit``.
            min_distance: When given, hits whose ``distance`` is below this value are dropped.
            since_year: When given, adds a ``publication_year >= since_year`` filter clause.
            author_ids: When given, adds an ``array_contains_any(author_ids, ...)`` filter clause.
            output_fields: Fields to request. When None, every key of the resource
                class's ``model_fields`` except ``embedding`` is requested.

        Returns:
            The hits of the (single) query, ordered by ``distance`` from highest to lowest.

        Raises:
            ValueError: If ``model`` has no (truthy) attribute for the resource name.
        """
        # Resolve the resource class up front so an unknown name fails before any work is done.
        resource_class = getattr(model, resource_name.capitalize(), None)
        if not resource_class:
            raise ValueError(f"Resource class '{resource_name}' not found in model.")

        # Assemble the filter expression; the "ignore" clause is always present.
        clauses = ["ignore == false"]
        if since_year is not None:
            clauses += [f"publication_year >= {since_year}"]
        if author_ids is not None:
            clauses += [f"array_contains_any(author_ids, {author_ids})"]
        filter_expr = " and ".join(clauses)

        # Default projection: everything the resource class declares, minus the vector itself.
        if output_fields is None:
            output_fields = [name for name in resource_class.model_fields.keys() if name != "embedding"]

        response = self.client.search(
            collection_name=resource_name,
            data=[embed_query(query)],
            limit=top_k,
            output_fields=output_fields,
            filter=filter_expr,
        )
        # Only one query vector was sent, so only the first result set is relevant.
        hits = response[0]

        if min_distance is not None:
            hits = [hit for hit in hits if hit["distance"] >= min_distance]

        ranked = list(hits)
        ranked.sort(key=lambda hit: hit["distance"], reverse=True)
        return ranked

    def search_author(self, query: str, top_k: int = 1000, institutions: list[str] | None = None, **kwargs) -> list[dict]:
        """Search every resource collection, rerank, and keep authors of the given institutions.

        Args:
            query: Search query string, forwarded to ``search_resource``.
            top_k: Per-resource result limit, forwarded to ``search_resource``.
            institutions: Institution IDs to keep. When empty or None,
                ``config.OPENALEX_INSTITUTION_ID`` is used.
            **kwargs: Extra keyword arguments forwarded to ``search_resource``.

        Returns:
            The reranked results after ``filter_institution_authors`` has been applied.
        """
        institution_ids = institutions or [config.OPENALEX_INSTITUTION_ID]

        # One result list per resource name, keyed by that name.
        hits_by_resource = {}
        for name in model.ALL_RESOURCES_NAMES:
            hits_by_resource[name] = self.search_resource(name, query, top_k, **kwargs)

        reranked = self.reranker.rerank(hits_by_resource)
        return filter_institution_authors(institution_ids=institution_ids, results=reranked)

search_resource(resource_name, query, top_k=3, min_distance=None, since_year=None, author_ids=None, output_fields=None)

Search and filter for resource using a query.

Parameters:

Name Type Description Default
resource_name str

Name of the resource collection to search

required
query str

Search query string

required
top_k int

Maximum number of results to return

3
min_distance float | None

Minimum distance threshold for results

None
since_year int | None

Filter results from this year onwards

None
author_ids list[str] | None

Filter results by specific author IDs

None
output_fields list[str] | None

Fields to include in output. If None, all fields except embedding

None

Returns:

Type Description
list[dict[str, Any]]

List of search results sorted by distance (descending)

Raises:

Type Description
ValueError

If resource class is not found in model

Source code in bear/search.py
def search_resource(
    self,
    resource_name: str,
    query: str,
    top_k: int = 3,
    min_distance: float | None = None,
    since_year: int | None = None,
    author_ids: list[str] | None = None,
    output_fields: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Run a filtered vector search against one resource collection.

    Args:
        resource_name: Collection to search; its capitalized form must name
            an attribute of ``model``.
        query: Text that is embedded with ``embed_query`` and used as the search vector.
        top_k: Passed to the client as the result ``limit``.
        min_distance: When given, hits whose ``distance`` is below this value are dropped.
        since_year: When given, adds a ``publication_year >= since_year`` filter clause.
        author_ids: When given, adds an ``array_contains_any(author_ids, ...)`` filter clause.
        output_fields: Fields to request. When None, every key of the resource
            class's ``model_fields`` except ``embedding`` is requested.

    Returns:
        The hits of the (single) query, ordered by ``distance`` from highest to lowest.

    Raises:
        ValueError: If ``model`` has no (truthy) attribute for the resource name.
    """
    # Resolve the resource class up front so an unknown name fails before any work is done.
    resource_class = getattr(model, resource_name.capitalize(), None)
    if not resource_class:
        raise ValueError(f"Resource class '{resource_name}' not found in model.")

    # Assemble the filter expression; the "ignore" clause is always present.
    clauses = ["ignore == false"]
    if since_year is not None:
        clauses += [f"publication_year >= {since_year}"]
    if author_ids is not None:
        clauses += [f"array_contains_any(author_ids, {author_ids})"]
    filter_expr = " and ".join(clauses)

    # Default projection: everything the resource class declares, minus the vector itself.
    if output_fields is None:
        output_fields = [name for name in resource_class.model_fields.keys() if name != "embedding"]

    response = self.client.search(
        collection_name=resource_name,
        data=[embed_query(query)],
        limit=top_k,
        output_fields=output_fields,
        filter=filter_expr,
    )
    # Only one query vector was sent, so only the first result set is relevant.
    hits = response[0]

    if min_distance is not None:
        hits = [hit for hit in hits if hit["distance"] >= min_distance]

    ranked = list(hits)
    ranked.sort(key=lambda hit: hit["distance"], reverse=True)
    return ranked

search_author(query, top_k=1000, institutions=None, **kwargs)

Search for authors based on a query string.

Source code in bear/search.py
def search_author(self, query: str, top_k: int = 1000, institutions: list[str] | None = None, **kwargs) -> list[dict]:
    """Search every resource collection, rerank, and keep authors of the given institutions.

    Args:
        query: Search query string, forwarded to ``search_resource``.
        top_k: Per-resource result limit, forwarded to ``search_resource``.
        institutions: Institution IDs to keep. When empty or None,
            ``config.OPENALEX_INSTITUTION_ID`` is used.
        **kwargs: Extra keyword arguments forwarded to ``search_resource``.

    Returns:
        The reranked results after ``filter_institution_authors`` has been applied.
    """
    institution_ids = institutions or [config.OPENALEX_INSTITUTION_ID]

    # One result list per resource name, keyed by that name.
    hits_by_resource = {}
    for name in model.ALL_RESOURCES_NAMES:
        hits_by_resource[name] = self.search_resource(name, query, top_k, **kwargs)

    reranked = self.reranker.rerank(hits_by_resource)
    return filter_institution_authors(institution_ids=institution_ids, results=reranked)

load_institution_author_ids(institution_id=config.OPENALEX_INSTITUTION_ID)

Load author IDs associated with a specific institution.

Source code in bear/search.py
@cached(cache=TTLCache(maxsize=3, ttl=24 * 60 * 60))
def load_institution_author_ids(institution_id: str = config.OPENALEX_INSTITUTION_ID) -> set[str]:
    """Load author IDs associated with a specific institution.

    Pages through the ``person`` collection in batches of 1000, selecting rows
    whose ``institution_id`` equals the given ID, and collects each row's
    ``id`` after passing it through ``strip_oa_prefix``. Calls are memoized by
    the ``TTLCache`` decorator (``maxsize=3``, ``ttl=24 * 60 * 60``).

    Args:
        institution_id: Institution whose authors should be loaded. Defaults
            to ``config.OPENALEX_INSTITUTION_ID``.

    Returns:
        The set of author IDs found for the institution.
    """
    client = get_milvus_client()
    iterator = client.query_iterator(
        collection_name="person",
        filter=f"institution_id == '{institution_id}'",
        output_fields=["id"],
        batch_size=1000,
    )
    author_ids: set[str] = set()
    try:
        # An empty batch signals that the iterator is exhausted.
        while batch := iterator.next():
            author_ids.update(strip_oa_prefix(item["id"]) for item in batch)
    finally:
        # Close on every exit path so a failure mid-iteration does not leak the iterator.
        iterator.close()
    return author_ids

filter_institution_authors(institution_ids, results)

Filter authors by institution.

Source code in bear/search.py
def filter_institution_authors(institution_ids: list[str], results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only the results whose author belongs to one of the given institutions.

    Args:
        institution_ids: Institutions whose author IDs are loaded via
            ``load_institution_author_ids`` and merged into one allow-set.
        results: Result dicts; each must carry an ``author_id`` key.

    Returns:
        The results, in their original order, whose ``author_id`` (after
        ``strip_oa_prefix``) is in the allow-set.
    """
    logger.info(f"Filtering authors for institutions: {institution_ids}")
    logger.info(f"Total results before filtering: {len(results)}")

    # Merge the author IDs of every requested institution into a single set.
    allowed_author_ids = set().union(
        *(load_institution_author_ids(institution_id) for institution_id in institution_ids)
    )

    kept = []
    for entry in results:
        if strip_oa_prefix(entry["author_id"]) in allowed_author_ids:
            kept.append(entry)

    logger.info(f"Total results after filtering: {len(kept)}")
    return kept

Search Engine

The search module provides vector search capabilities for academic resources.

Features

  • Vector similarity search
  • Metadata filtering
  • Result ranking
  • Search across multiple entity types (authors and works)

Search Types

Resource search: search for academic works, papers, and publications.

Author search: search for authors based on research interests and expertise.

Filtering Options

  • Publication year filtering
  • Institution filtering
  • Citation count filtering
  • Distance threshold filtering

Performance

The search engine is optimized for:

  • Sub-second query response times
  • Accurate semantic matching
  • Scalable to millions of documents