Historical Cost Analyzer
/install historical-cost-analyzer
\r \r
Historical Cost Analyzer for Construction\r
\r
Overview\r
\r Analyze historical construction cost data for benchmarking, escalation tracking, and estimating calibration. Compare similar projects, identify cost drivers, and improve future estimates.\r \r
Business Case\r
\r Historical cost analysis enables:\r
- Benchmarking: Compare current estimates to past projects\r
- Calibration: Improve estimating accuracy using actual data\r
- Trends: Track cost escalation and market changes\r
- Risk Assessment: Identify cost drivers and overrun patterns\r \r
Technical Implementation\r
\r
from dataclasses import dataclass, field\r
from typing import List, Dict, Any, Optional, Tuple\r
import pandas as pd\r
import numpy as np\r
from datetime import datetime\r
from scipy import stats\r
\r
@dataclass\r
class CostBenchmark:\r
metric_name: str\r
value: float\r
unit: str\r
percentile_25: float\r
percentile_50: float\r
percentile_75: float\r
sample_size: int\r
project_types: List[str]\r
\r
@dataclass\r
class EscalationAnalysis:\r
from_year: int\r
to_year: int\r
annual_rate: float\r
total_change: float\r
category: str\r
confidence: float\r
\r
@dataclass\r
class CostDriver:\r
factor: str\r
impact_percentage: float\r
correlation: float\r
description: str\r
\r
class HistoricalCostAnalyzer:\r
"""Analyze historical construction costs."""\r
\r
# RSMeans City Cost Indexes (sample - would be loaded from database)\r
LOCATION_FACTORS = {\r
'New York': 1.32, 'San Francisco': 1.28, 'Los Angeles': 1.15,\r
'Chicago': 1.12, 'Houston': 0.92, 'Dallas': 0.89,\r
'Phoenix': 0.93, 'Atlanta': 0.91, 'Denver': 1.02,\r
'Seattle': 1.08, 'National Average': 1.00\r
}\r
\r
# Historical cost indices by year\r
COST_INDICES = {\r
2015: 100.0, 2016: 102.1, 2017: 105.3, 2018: 109.2,\r
2019: 112.5, 2020: 114.8, 2021: 121.4, 2022: 135.6,\r
2023: 142.3, 2024: 148.7, 2025: 154.2, 2026: 160.0\r
}\r
\r
def __init__(self, historical_data: pd.DataFrame = None):\r
self.data = historical_data\r
self.benchmarks: Dict[str, CostBenchmark] = {}\r
\r
def load_data(self, data: pd.DataFrame):\r
"""Load historical project data."""\r
self.data = data.copy()\r
\r
# Normalize data\r
if 'completion_year' not in self.data.columns and 'completion_date' in self.data.columns:\r
self.data['completion_year'] = pd.to_datetime(self.data['completion_date']).dt.year\r
\r
# Calculate key metrics\r
if 'gross_area' in self.data.columns and 'final_cost' in self.data.columns:\r
self.data['cost_per_sf'] = self.data['final_cost'] / self.data['gross_area']\r
\r
if 'original_estimate' in self.data.columns and 'final_cost' in self.data.columns:\r
self.data['overrun_pct'] = ((self.data['final_cost'] - self.data['original_estimate'])\r
/ self.data['original_estimate'] * 100)\r
\r
def normalize_to_year(self, costs: pd.Series, from_years: pd.Series,\r
to_year: int = 2026) -> pd.Series:\r
"""Normalize costs to a common year using cost indices."""\r
normalized = costs.copy()\r
\r
for i, (cost, year) in enumerate(zip(costs, from_years)):\r
if pd.notna(cost) and pd.notna(year):\r
year = int(year)\r
if year in self.COST_INDICES and to_year in self.COST_INDICES:\r
factor = self.COST_INDICES[to_year] / self.COST_INDICES[year]\r
normalized.iloc[i] = cost * factor\r
\r
return normalized\r
\r
def normalize_to_location(self, costs: pd.Series, locations: pd.Series,\r
to_location: str = 'National Average') -> pd.Series:\r
"""Normalize costs to a common location."""\r
normalized = costs.copy()\r
to_factor = self.LOCATION_FACTORS.get(to_location, 1.0)\r
\r
for i, (cost, loc) in enumerate(zip(costs, locations)):\r
if pd.notna(cost) and loc in self.LOCATION_FACTORS:\r
from_factor = self.LOCATION_FACTORS[loc]\r
normalized.iloc[i] = cost * (to_factor / from_factor)\r
\r
return normalized\r
\r
def calculate_benchmarks(self, project_type: str = None,\r
year_range: Tuple[int, int] = None) -> Dict[str, CostBenchmark]:\r
"""Calculate cost benchmarks from historical data."""\r
df = self.data.copy()\r
\r
# Filter by project type\r
if project_type and 'project_type' in df.columns:\r
df = df[df['project_type'] == project_type]\r
\r
# Filter by year range\r
if year_range and 'completion_year' in df.columns:\r
df = df[(df['completion_year'] >= year_range[0]) &\r
(df['completion_year'] \x3C= year_range[1])]\r
\r
benchmarks = {}\r
\r
# Cost per SF\r
if 'cost_per_sf' in df.columns:\r
values = df['cost_per_sf'].dropna()\r
if len(values) > 0:\r
benchmarks['cost_per_sf'] = CostBenchmark(\r
metric_name='Cost per SF',\r
value=values.median(),\r
unit='$/SF',\r
percentile_25=values.quantile(0.25),\r
percentile_50=values.quantile(0.50),\r
percentile_75=values.quantile(0.75),\r
sample_size=len(values),\r
project_types=[project_type] if project_type else df['project_type'].unique().tolist()\r
)\r
\r
# Overrun percentage\r
if 'overrun_pct' in df.columns:\r
values = df['overrun_pct'].dropna()\r
if len(values) > 0:\r
benchmarks['overrun_pct'] = CostBenchmark(\r
metric_name='Cost Overrun',\r
value=values.median(),\r
unit='%',\r
percentile_25=values.quantile(0.25),\r
percentile_50=values.quantile(0.50),\r
percentile_75=values.quantile(0.75),\r
sample_size=len(values),\r
project_types=[project_type] if project_type else df['project_type'].unique().tolist()\r
)\r
\r
self.benchmarks.update(benchmarks)\r
return benchmarks\r
\r
def calculate_escalation(self, category: str = 'overall',\r
from_year: int = 2020,\r
to_year: int = 2026) -> EscalationAnalysis:\r
"""Calculate cost escalation between years."""\r
if from_year in self.COST_INDICES and to_year in self.COST_INDICES:\r
from_index = self.COST_INDICES[from_year]\r
to_index = self.COST_INDICES[to_year]\r
\r
total_change = (to_index - from_index) / from_index\r
years = to_year - from_year\r
annual_rate = (to_index / from_index) ** (1 / years) - 1 if years > 0 else 0\r
\r
return EscalationAnalysis(\r
from_year=from_year,\r
to_year=to_year,\r
annual_rate=annual_rate,\r
total_change=total_change,\r
category=category,\r
confidence=0.95\r
)\r
\r
return None\r
\r
def identify_cost_drivers(self, target_col: str = 'cost_per_sf') -> List[CostDriver]:\r
"""Identify factors that drive costs."""\r
if self.data is None or target_col not in self.data.columns:\r
return []\r
\r
drivers = []\r
target = self.data[target_col].dropna()\r
\r
# Analyze numeric columns\r
numeric_cols = self.data.select_dtypes(include=[np.number]).columns\r
exclude = [target_col, 'final_cost', 'original_estimate']\r
\r
for col in numeric_cols:\r
if col not in exclude:\r
valid_mask = self.data[col].notna() & self.data[target_col].notna()\r
if valid_mask.sum() > 10:\r
corr, p_value = stats.pearsonr(\r
self.data.loc[valid_mask, col],\r
self.data.loc[valid_mask, target_col]\r
)\r
\r
if abs(corr) > 0.3 and p_value \x3C 0.05:\r
impact = corr * self.data[col].std() / target.std() * 100\r
\r
drivers.append(CostDriver(\r
factor=col,\r
impact_percentage=abs(impact),\r
correlation=corr,\r
description=f"{'Positive' if corr > 0 else 'Negative'} correlation with {target_col}"\r
))\r
\r
# Analyze categorical columns\r
categorical_cols = self.data.select_dtypes(include=['object', 'category']).columns\r
\r
for col in categorical_cols:\r
if col not in ['project_id', 'project_name']:\r
groups = self.data.groupby(col)[target_col].mean()\r
if len(groups) > 1:\r
variance = groups.var()\r
overall_var = target.var()\r
\r
if variance / overall_var > 0.1:\r
drivers.append(CostDriver(\r
factor=col,\r
impact_percentage=variance / overall_var * 100,\r
correlation=0,\r
description=f"Categorical factor with significant cost variation"\r
))\r
\r
return sorted(drivers, key=lambda x: -x.impact_percentage)\r
\r
def compare_to_benchmark(self, estimate: Dict, project_type: str = None) -> Dict:\r
"""Compare an estimate to historical benchmarks."""\r
if project_type:\r
self.calculate_benchmarks(project_type)\r
\r
comparison = {}\r
\r
# Cost per SF comparison\r
if 'cost_per_sf' in estimate and 'cost_per_sf' in self.benchmarks:\r
benchmark = self.benchmarks['cost_per_sf']\r
value = estimate['cost_per_sf']\r
\r
percentile = stats.percentileofscore(\r
self.data['cost_per_sf'].dropna(), value\r
)\r
\r
comparison['cost_per_sf'] = {\r
'estimate': value,\r
'benchmark_median': benchmark.value,\r
'benchmark_range': (benchmark.percentile_25, benchmark.percentile_75),\r
'percentile': percentile,\r
'status': 'within_range' if benchmark.percentile_25 \x3C= value \x3C= benchmark.percentile_75 else 'outside_range'\r
}\r
\r
return comparison\r
\r
def find_similar_projects(self, criteria: Dict, n: int = 10) -> pd.DataFrame:\r
"""Find similar historical projects."""\r
df = self.data.copy()\r
\r
# Filter by criteria\r
if 'project_type' in criteria:\r
df = df[df['project_type'] == criteria['project_type']]\r
\r
if 'gross_area' in criteria:\r
target = criteria['gross_area']\r
tolerance = criteria.get('area_tolerance', 0.3)\r
df = df[(df['gross_area'] >= target * (1 - tolerance)) &\r
(df['gross_area'] \x3C= target * (1 + tolerance))]\r
\r
if 'location' in criteria and 'location' in df.columns:\r
df = df[df['location'] == criteria['location']]\r
\r
if 'year_range' in criteria:\r
df = df[(df['completion_year'] >= criteria['year_range'][0]) &\r
(df['completion_year'] \x3C= criteria['year_range'][1])]\r
\r
# Sort by similarity (simple: by area difference)\r
if 'gross_area' in criteria and 'gross_area' in df.columns:\r
df['similarity'] = 1 - abs(df['gross_area'] - criteria['gross_area']) / criteria['gross_area']\r
df = df.sort_values('similarity', ascending=False)\r
\r
return df.head(n)\r
\r
def analyze_overrun_patterns(self) -> Dict:\r
"""Analyze patterns in cost overruns."""\r
if 'overrun_pct' not in self.data.columns:\r
return {}\r
\r
analysis = {}\r
\r
# Overall statistics\r
overruns = self.data['overrun_pct'].dropna()\r
analysis['overall'] = {\r
'mean': overruns.mean(),\r
'median': overruns.median(),\r
'std': overruns.std(),\r
'projects_over_budget': (overruns > 0).sum(),\r
'projects_under_budget': (overruns \x3C 0).sum(),\r
'pct_over_budget': (overruns > 0).mean() * 100\r
}\r
\r
# By project type\r
if 'project_type' in self.data.columns:\r
by_type = self.data.groupby('project_type')['overrun_pct'].agg(['mean', 'std', 'count'])\r
analysis['by_type'] = by_type.to_dict('index')\r
\r
# By size category\r
if 'gross_area' in self.data.columns:\r
self.data['size_category'] = pd.cut(\r
self.data['gross_area'],\r
bins=[0, 10000, 50000, 100000, np.inf],\r
labels=['Small (\x3C10k SF)', 'Medium (10-50k SF)', 'Large (50-100k SF)', 'Very Large (>100k SF)']\r
)\r
by_size = self.data.groupby('size_category')['overrun_pct'].agg(['mean', 'std', 'count'])\r
analysis['by_size'] = by_size.to_dict('index')\r
\r
return analysis\r
\r
def generate_report(self, project_type: str = None) -> str:\r
"""Generate comprehensive cost analysis report."""\r
lines = ["# Historical Cost Analysis Report", ""]\r
lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d')}")\r
lines.append(f"**Projects Analyzed:** {len(self.data):,}")\r
if project_type:\r
lines.append(f"**Project Type:** {project_type}")\r
lines.append("")\r
\r
# Benchmarks\r
benchmarks = self.calculate_benchmarks(project_type)\r
if benchmarks:\r
lines.append("## Cost Benchmarks")\r
for name, bm in benchmarks.items():\r
lines.append(f"\
### {bm.metric_name}")\r
lines.append(f"- **Median:** {bm.value:.2f} {bm.unit}")\r
lines.append(f"- **25th Percentile:** {bm.percentile_25:.2f} {bm.unit}")\r
lines.append(f"- **75th Percentile:** {bm.percentile_75:.2f} {bm.unit}")\r
lines.append(f"- **Sample Size:** {bm.sample_size}")\r
\r
# Escalation\r
lines.append("\
## Cost Escalation")\r
esc = self.calculate_escalation(from_year=2020, to_year=2026)\r
if esc:\r
lines.append(f"- **Period:** {esc.from_year} to {esc.to_year}")\r
lines.append(f"- **Annual Rate:** {esc.annual_rate:.1%}")\r
lines.append(f"- **Total Change:** {esc.total_change:.1%}")\r
\r
# Cost Drivers\r
drivers = self.identify_cost_drivers()\r
if drivers:\r
lines.append("\
## Key Cost Drivers")\r
for driver in drivers[:5]:\r
lines.append(f"- **{driver.factor}:** {driver.impact_percentage:.1f}% impact (r={driver.correlation:.2f})")\r
\r
# Overrun Analysis\r
overrun_analysis = self.analyze_overrun_patterns()\r
if 'overall' in overrun_analysis:\r
lines.append("\
## Overrun Analysis")\r
overall = overrun_analysis['overall']\r
lines.append(f"- **Average Overrun:** {overall['mean']:.1f}%")\r
lines.append(f"- **Projects Over Budget:** {overall['pct_over_budget']:.1f}%")\r
\r
return "\
".join(lines)\r
```\r
\r
## Quick Start\r
\r
```python\r
import pandas as pd\r
\r
# Load historical data\r
historical = pd.read_excel("historical_projects.xlsx")\r
\r
# Initialize analyzer\r
analyzer = HistoricalCostAnalyzer()\r
analyzer.load_data(historical)\r
\r
# Calculate benchmarks for office buildings\r
benchmarks = analyzer.calculate_benchmarks(project_type='Office')\r
print(f"Office median cost: ${benchmarks['cost_per_sf'].value:.2f}/SF")\r
\r
# Calculate escalation\r
escalation = analyzer.calculate_escalation(from_year=2020, to_year=2026)\r
print(f"Annual escalation: {escalation.annual_rate:.1%}")\r
\r
# Find similar projects\r
similar = analyzer.find_similar_projects({\r
'project_type': 'Office',\r
'gross_area': 50000,\r
'year_range': (2020, 2025)\r
})\r
print(f"Found {len(similar)} similar projects")\r
\r
# Compare estimate to benchmark\r
comparison = analyzer.compare_to_benchmark({'cost_per_sf': 250}, 'Office')\r
print(f"Estimate percentile: {comparison['cost_per_sf']['percentile']:.0f}th")\r
\r
# Generate report\r
report = analyzer.generate_report('Office')\r
print(report)\r
```\r
\r
## Dependencies\r
\r
```bash\r
pip install pandas numpy scipy\r
```\r
- Make sure OpenClaw is installed (local or Docker)
- Run the install command in chat:
/install historical-cost-analyzer - After installation, invoke the skill by name or use
/historical-cost-analyzer - Provide required inputs per the skill's parameter spec and get structured output
What is Historical Cost Analyzer?
Analyze historical construction costs for benchmarking, trend analysis, and estimating calibration. Compare projects, track escalation, identify patterns. It is an AI Agent Skill for Claude Code / OpenClaw, with 1233 downloads so far.
How do I install Historical Cost Analyzer?
Run "/install historical-cost-analyzer" in the OpenClaw or Claude Code chat to install it in one step — no extra setup required.
Is Historical Cost Analyzer free?
Yes, Historical Cost Analyzer is completely free (open-source). You can download, install and use it at no cost.
Which platforms does Historical Cost Analyzer support?
Historical Cost Analyzer is cross-platform and runs anywhere OpenClaw / Claude Code is available (cross-platform).
Who created Historical Cost Analyzer?
It is built and maintained by datadrivenconstruction (@datadrivenconstruction); the current version is v2.0.0.