Solr

Solr indexing utilities for GSO.

show_active_config(ctx)

Log the effective config we are using (purely informational).

Source code in gso/cli/solr.py
@app.callback(invoke_without_command=True)
def show_active_config(ctx: typer.Context) -> None:
    """Log the effective config we are using (purely informational)."""
    cfg = load_oss_params()
    logger.info(
        "solr-cli using settings",
        solr_url=cfg.SOLR.url,
        solr_update_url=getattr(idx, "SOLR_UPDATE_URL", None),
        solr_schema_url=getattr(idx, "SOLR_SCHEMA_URL", None),
    )
    if ctx.invoked_subcommand is None:
        # Show help if run without a subcommand
        typer.echo(ctx.get_help())
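
Because the callback is registered with invoke_without_command=True, running the CLI with no subcommand still logs the active settings and then prints the help text. A minimal sketch of exercising that behaviour with Typer's test runner, assuming app is the Typer application defined in gso/cli/solr.py:

from typer.testing import CliRunner

from gso.cli.solr import app

runner = CliRunner()
result = runner.invoke(app, [])  # no subcommand: settings are logged, help is echoed
print(result.output)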

_stream_write_json_array(out_path, docs)

Write a valid JSON array to disk from a stream without buffering all docs.

Returns number of documents written.

Source code in gso/cli/solr.py
def _stream_write_json_array(out_path: Path, docs: Iterable[dict]) -> int:
    """Write a valid JSON array to disk from a stream without buffering all docs.

    Returns number of documents written.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    with out_path.open("w", encoding="utf-8") as f:
        f.write("[\n")
        first = True
        for doc in docs:
            if not first:
                f.write(",\n")
            json.dump(doc, f, default=str, ensure_ascii=False)
            first = False
            count += 1
        f.write("\n]\n")
    return count
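
Because docs is consumed lazily, passing a generator keeps memory flat no matter how many documents flow through; default=str also stringifies values JSON cannot encode natively, such as UUIDs and datetimes. A minimal usage sketch with illustrative sample docs:

from pathlib import Path

def sample_docs():
    # Generator: documents are yielded one at a time, never held in a list.
    for i in range(3):
        yield {"id": str(i), "title": f"doc {i}"}

written = _stream_write_json_array(Path("/tmp/docs.json"), sample_docs())
print(written)  # 3; /tmp/docs.json now contains a valid JSON array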

ensure_copy_field()

Ensure the catch-all copy-field to _text_ exists on the configured core.

Source code in gso/cli/solr.py
@app.command("ensure-copy-field")
def ensure_copy_field() -> None:
    """Ensure the catch-all copy-field to `_text_` exists on the configured core."""
    idx.ensure_solr_copy_field()
    typer.echo("✅ Solr schema copy-field ensured.")
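
The body of idx.ensure_solr_copy_field is not shown on this page, but the conventional way to create such a rule is Solr's Schema API. A hedged sketch of the idea, assuming a plain requests call against the core's schema endpoint (the real code reads this from SOLR_SCHEMA_URL, logged by the callback above); ensure_copy_field_sketch is a hypothetical name:

import requests

def ensure_copy_field_sketch(schema_url: str) -> None:
    """Idempotently add a catch-all copy-field to `_text_` (illustrative only)."""
    # Inspect existing rules first so repeated runs are no-ops.
    current = requests.get(f"{schema_url}/copyfields", timeout=10).json()
    rules = current.get("copyFields", [])
    if any(r.get("source") == "*" and r.get("dest") == "_text_" for r in rules):
        return
    # Schema API command to add the copy-field rule.
    resp = requests.post(
        schema_url,
        json={"add-copy-field": {"source": "*", "dest": "_text_"}},
        timeout=10,
    )
    resp.raise_for_status()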

reindex_all(batch_size=500, skip_copy_field=False, dry_run_json=None)

Stream data from Postgres and index into Solr (no giant in-memory lists).

Source code in gso/cli/solr.py
@app.command("reindex")
def reindex_all(
    batch_size: Annotated[int, typer.Option(help="Docs per HTTP batch to Solr.")] = 500,
    skip_copy_field: Annotated[bool, typer.Option(help="Skip ensuring the `_text_` copy-field.")] = False,  # noqa: FBT002
    dry_run_json: Annotated[
        Path | None,
        typer.Option("--dry-run-json", help="Write generated docs to this JSON file instead of posting to Solr."),
    ] = None,
) -> None:
    """Stream data from Postgres and index into Solr (no giant in-memory lists)."""
    if not skip_copy_field:
        idx.ensure_solr_copy_field()

    if dry_run_json:
        typer.echo("⏳ Streaming docs for dry-run...")
        docs_stream = idx.stream_all_data()
        written = _stream_write_json_array(dry_run_json, docs_stream)
        typer.echo(f"📝 Wrote {written} docs to {dry_run_json} (dry-run, nothing sent to Solr).")
        raise typer.Exit(code=0)

    typer.echo(f"🚀 Streaming and posting to Solr in batches of {batch_size} ...")
    docs_stream = idx.stream_all_data()
    idx.post_to_solr(docs_stream, batch_size=batch_size)
    typer.echo("✅ Reindex complete.")
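
idx.post_to_solr is likewise not shown here, but the batching pattern the command implies is conventional: consume the stream in fixed-size chunks and POST each chunk to the update endpoint. A hedged sketch using itertools.islice, assuming update_url corresponds to SOLR_UPDATE_URL; post_to_solr_sketch is a hypothetical name and the real implementation may differ:

import itertools
import json
from collections.abc import Iterable

import requests

def post_to_solr_sketch(update_url: str, docs: Iterable[dict], batch_size: int = 500) -> None:
    it = iter(docs)
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            break
        # Solr's /update handler accepts a bare JSON array of documents.
        resp = requests.post(
            update_url,
            data=json.dumps(batch, default=str),
            headers={"Content-Type": "application/json"},
            timeout=60,
        )
        resp.raise_for_status()
    # Commit once at the end so the new documents become searchable.
    requests.get(update_url, params={"commit": "true"}, timeout=60).raise_for_status()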

dump_docs(out_json=typer.Argument(..., help='Path to write the generated JSON array (no Solr post).'))

Generate Solr docs (streaming) and write them to a JSON file for inspection.

Source code in gso/cli/solr.py
@app.command("dump")
def dump_docs(
    out_json: Path = typer.Argument(..., help="Path to write the generated JSON array (no Solr post)."),  # noqa: B008
) -> None:
    """Generate Solr docs (streaming) and write them to a JSON file for inspection."""
    typer.echo("⏳ Streaming docs to file...")
    docs_stream = idx.stream_all_data()
    written = _stream_write_json_array(out_json, docs_stream)
    typer.echo(f"📝 Wrote {written} docs to {out_json}.")
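
The dump is a plain JSON array, so it can be loaded back for inspection once written. A small sketch, assuming a previous run wrote /tmp/solr-docs.json (hypothetical path):

import json
from pathlib import Path

docs = json.loads(Path("/tmp/solr-docs.json").read_text(encoding="utf-8"))
print(len(docs), "docs in dump")
if docs:
    print(sorted(docs[0]))  # field names of the first document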