Quick Start
from qubots import BenchmarkSuite, AutoProblem, AutoOptimizer
# Create benchmark suite
suite = BenchmarkSuite("Algorithm Comparison")
# Add problem and optimizers
problem = AutoProblem.from_repo("examples/tsp")
genetic_opt = AutoOptimizer.from_repo("examples/genetic_tsp")
sa_opt = AutoOptimizer.from_repo("examples/simulated_annealing")
suite.add_problem("TSP-Berlin52", problem, config={"n_cities": 52})
suite.add_optimizer("Genetic Algorithm", genetic_opt, config={"population_size": 100})
suite.add_optimizer("Simulated Annealing", sa_opt, config={"initial_temp": 1000})
# Run benchmark (30 independent runs)
genetic_result = suite.run_benchmark("TSP-Berlin52", "Genetic Algorithm", num_runs=30)
sa_result = suite.run_benchmark("TSP-Berlin52", "Simulated Annealing", num_runs=30)
# Compare results
print(f"Genetic: {genetic_result.metrics.mean_value:.4f} ± {genetic_result.metrics.std_value:.4f}")
print(f"SA: {sa_result.metrics.mean_value:.4f} ± {sa_result.metrics.std_value:.4f}")
Key Classes
BenchmarkSuite
Organize problems and optimizers for systematic evaluation:
from qubots import BenchmarkSuite
suite = BenchmarkSuite("My Benchmark Study")
# Add multiple problems
tsp_problem = AutoProblem.from_repo("examples/tsp")
maxcut_problem = AutoProblem.from_repo("examples/maxcut")
suite.add_problem("TSP-50", tsp_problem, config={"n_cities": 50})
suite.add_problem("MaxCut-20", maxcut_problem, config={"n_vertices": 20})
# Add multiple optimizers
genetic = AutoOptimizer.from_repo("examples/genetic_tsp")
sa = AutoOptimizer.from_repo("examples/simulated_annealing")
suite.add_optimizer("GA", genetic, config={"population_size": 100})
suite.add_optimizer("SA", sa, config={"cooling_rate": 0.95})
# Run all combinations
all_results = suite.run_full_benchmark(num_runs=20)
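A minimal sketch for summarizing the full-benchmark output, assuming run_full_benchmark returns an iterable of BenchmarkResult objects and that problem_name and optimizer_name are the attribute names (both are assumptions; adjust to the return type of your installed version):
# Sketch: summarize every problem/optimizer combination
# (problem_name / optimizer_name are assumed attribute names)
for res in all_results:
    m = res.metrics
    print(f"{res.problem_name} + {res.optimizer_name}: "
          f"{m.mean_value:.4f} ± {m.std_value:.4f} (best {m.best_value:.4f})")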
BenchmarkResult
Access detailed statistics from benchmark runs:
result = suite.run_benchmark("TSP-50", "GA", num_runs=30)
# Key metrics
metrics = result.metrics
print(f"Best: {metrics.best_value:.4f}")
print(f"Mean: {metrics.mean_value:.4f} ± {metrics.std_value:.4f}")
print(f"Success rate: {metrics.success_rate:.1%}")
print(f"Avg runtime: {metrics.mean_runtime_seconds:.2f}s")
# Additional metrics
print(f"Convergence rate: {metrics.convergence_rate:.1%}")
print(f"Coefficient of variation: {metrics.coefficient_of_variation:.3f}")
print(f"Evaluations/sec: {metrics.evaluations_per_second:.0f}")
# Individual run details
for i, run in enumerate(result.individual_runs[:5]):  # First 5 runs
    print(f"Run {i+1}: {run.best_value:.4f} ({run.runtime_seconds:.2f}s)")
Benchmark Types
Performance Benchmarking
The default benchmark type, used for head-to-head algorithm comparison:
from qubots import BenchmarkType
# Standard performance benchmark
result = suite.run_benchmark(
    "TSP-50",
    "GA",
    num_runs=30,
    benchmark_type=BenchmarkType.PERFORMANCE
)
print(f"Mean quality: {result.metrics.mean_value:.4f}")
print(f"Best quality: {result.metrics.best_value:.4f}")
print(f"Success rate: {result.metrics.success_rate:.1%}")
Convergence Analysis
Track how algorithms converge over time:
# Convergence benchmark
result = suite.run_benchmark(
    "MaxCut-20",
    "SA",
    num_runs=20,
    benchmark_type=BenchmarkType.CONVERGENCE,
    convergence_threshold=1e-4,
    max_stagnation=100
)
# Analyze convergence
print(f"Convergence rate: {result.metrics.convergence_rate:.1%}")
print(f"Avg iterations to converge: {result.metrics.mean_iterations_to_convergence:.0f}")
Scalability Testing
Test performance across different problem sizes:
# Test scalability
sizes = [20, 50, 100, 200]
runtimes = []
for size in sizes:
    # Create problem with specific size
    problem = AutoProblem.from_repo("examples/tsp", override_params={"n_cities": size})
    temp_suite = BenchmarkSuite(f"Scale-{size}")
    temp_suite.add_problem(f"TSP-{size}", problem)
    temp_suite.add_optimizer("GA", genetic)
    result = temp_suite.run_benchmark(f"TSP-{size}", "GA", num_runs=10)
    runtimes.append((size, result.metrics.mean_runtime_seconds))
# Analyze scaling
for size, runtime in runtimes:
    print(f"Size {size}: {runtime:.2f}s average")
Statistical Analysis
Algorithm Comparison
Compare multiple algorithms statistically:
# Benchmark multiple algorithms
algorithms = {
    "GA": AutoOptimizer.from_repo("examples/genetic_tsp"),
    "SA": AutoOptimizer.from_repo("examples/simulated_annealing"),
    "TS": AutoOptimizer.from_repo("examples/tabu_search")
}
results = {}
for name, optimizer in algorithms.items():
    suite.add_optimizer(name, optimizer)
    results[name] = suite.run_benchmark("TSP-50", name, num_runs=30)
# Compare results
print("Algorithm Comparison:")
print("Algorithm | Mean ± Std | Best | Success Rate")
print("-" * 50)
for name, result in results.items():
    m = result.metrics
    print(f"{name:8} | {m.mean_value:.3f} ± {m.std_value:.3f} | {m.best_value:.3f} | {m.success_rate:.1%}")
Statistical Significance
Test if differences are statistically significant:
import numpy as np
from scipy import stats
# Get raw values from two algorithms
ga_values = [run.best_value for run in results["GA"].individual_runs]
sa_values = [run.best_value for run in results["SA"].individual_runs]
# Perform t-test
t_stat, p_value = stats.ttest_ind(ga_values, sa_values)
print(f"T-test p-value: {p_value:.4f}")
print(f"Significant difference: {p_value < 0.05}")
# Effect size (Cohen's d)
pooled_var = ((len(ga_values) - 1) * np.var(ga_values, ddof=1) +
              (len(sa_values) - 1) * np.var(sa_values, ddof=1)) / (len(ga_values) + len(sa_values) - 2)
cohens_d = (np.mean(ga_values) - np.mean(sa_values)) / np.sqrt(pooled_var)
print(f"Effect size (Cohen's d): {cohens_d:.3f}")
Cloud Benchmarking
Run large-scale benchmarks in the cloud:
from qubots import execute_playground_optimization
# Cloud benchmark with multiple runs
cloud_results = []
for run in range(50):  # Large-scale benchmark
    result = execute_playground_optimization(
        problem_name="tsp_berlin52",
        problem_username="benchmarks",
        optimizer_name="genetic_tsp",
        optimizer_username="algorithms",
        optimizer_params={"population_size": 100, "generations": 1000}
    )
    if result["success"]:
        cloud_results.append(result["best_value"])
# Analyze cloud results
import numpy as np
print(f"Cloud benchmark results:")
print(f"Mean: {np.mean(cloud_results):.4f}")
print(f"Std: {np.std(cloud_results):.4f}")
print(f"Best: {np.min(cloud_results):.4f}")
Leaderboard Submission
Submit your best results to the leaderboard:
from qubots import submit_to_leaderboard
# Get best run from benchmark
best_run = min(result.individual_runs, key=lambda r: r.best_value)
# Submit to leaderboard
submit_to_leaderboard(
    result=best_run,
    problem_id=1,
    solver_name="My Genetic Algorithm",
    solver_repository="username/my-ga-optimizer",
    solver_config=best_run.optimizer_config
)
Best Practices
Experimental Setup
- Use ≥30 independent runs so comparisons have statistical power
- Set a different random seed for each run (see the sketch after this list)
- Control for external factors (CPU load, etc.)
- Include baseline algorithms for comparison
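A minimal reproducibility sketch, independent of the qubots API: reseed the global random sources with a recorded seed before each manual run. Optimizers that manage their own RNG may instead accept a seed in their config; check the repository's parameter schema.
import random
import numpy as np
# Record one distinct seed per run so any run can be replayed exactly
for run_id in range(30):
    random.seed(run_id)
    np.random.seed(run_id)
    # ... execute one optimization run here and log run_id with its result ...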
Statistical Analysis
- Report confidence intervals, not just means
- Test for statistical significance (p-values)
- Consider effect size (practical significance)
- Analyze failure cases and outliers (see the sketch after this list)
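For the outlier check, a simple interquartile-range screen over the per-run values is usually enough to flag runs worth inspecting by hand; it assumes only the individual_runs structure shown earlier.
import numpy as np
# Flag runs outside 1.5 * IQR as candidate outliers for manual inspection
values = np.array([run.best_value for run in result.individual_runs])
q1, q3 = np.percentile(values, [25, 75])
iqr = q3 - q1
outliers = values[(values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)]
print(f"{outliers.size} of {values.size} runs flagged as potential outliers")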
Next Steps