Df Merger
/install df-merger
\r
DataFrame Merger for Construction Data\r
\r
Overview\r
Construction projects combine data from BIM, schedules, costs, and sensors. This skill merges DataFrames from disparate sources with intelligent key matching and schema reconciliation.\r \r
Python Implementation\r
\r
import pandas as pd\r
import numpy as np\r
from typing import Dict, Any, List, Optional, Tuple\r
from dataclasses import dataclass\r
from enum import Enum\r
from difflib import SequenceMatcher\r
\r
\r
class MergeStrategy(Enum):\r
"""DataFrame merge strategies."""\r
INNER = "inner" # Only matching rows\r
LEFT = "left" # All left, matching right\r
RIGHT = "right" # Matching left, all right\r
OUTER = "outer" # All rows from both\r
CROSS = "cross" # Cartesian product\r
\r
\r
@dataclass\r
class MergeResult:\r
"""Result of merge operation."""\r
merged_df: pd.DataFrame\r
matched_rows: int\r
left_only: int\r
right_only: int\r
merge_quality: float # 0-1 score\r
\r
\r
class ConstructionDFMerger:\r
"""Merge DataFrames from construction sources."""\r
\r
# Common construction column name mappings\r
COLUMN_MAPPINGS = {\r
'element_id': ['elementid', 'elem_id', 'id', 'guid', 'globalid'],\r
'type_name': ['typename', 'type', 'element_type', 'category'],\r
'level': ['level', 'floor', 'storey', 'building_storey'],\r
'material': ['material', 'mat', 'material_name'],\r
'volume': ['volume', 'vol', 'volume_m3', 'qty_volume'],\r
'area': ['area', 'surface_area', 'qty_area', 'area_m2'],\r
'cost': ['cost', 'price', 'total_cost', 'amount'],\r
'task_id': ['task_id', 'activity_id', 'wbs', 'activity'],\r
'start_date': ['start', 'start_date', 'planned_start', 'begin'],\r
'end_date': ['end', 'end_date', 'planned_finish', 'finish']\r
}\r
\r
def __init__(self):\r
self.column_cache: Dict[str, str] = {}\r
\r
def find_common_key(self, df1: pd.DataFrame,\r
df2: pd.DataFrame) -> Optional[str]:\r
"""Find common key column between DataFrames."""\r
\r
# Check exact matches first\r
common = set(df1.columns) & set(df2.columns)\r
if common:\r
# Prefer ID-like columns\r
for col in common:\r
if 'id' in col.lower() or 'code' in col.lower():\r
return col\r
return list(common)[0]\r
\r
# Try semantic matching\r
for col1 in df1.columns:\r
for col2 in df2.columns:\r
if self._columns_match(col1, col2):\r
return col1\r
\r
return None\r
\r
def _columns_match(self, col1: str, col2: str) -> bool:\r
"""Check if column names are semantically similar."""\r
col1_lower = col1.lower().replace('_', '').replace('-', '')\r
col2_lower = col2.lower().replace('_', '').replace('-', '')\r
\r
# Exact match after normalization\r
if col1_lower == col2_lower:\r
return True\r
\r
# Check against mappings\r
for standard, variants in self.COLUMN_MAPPINGS.items():\r
if col1_lower in variants and col2_lower in variants:\r
return True\r
\r
# Similarity check\r
similarity = SequenceMatcher(None, col1_lower, col2_lower).ratio()\r
return similarity > 0.8\r
\r
def harmonize_columns(self, df: pd.DataFrame) -> pd.DataFrame:\r
"""Standardize column names."""\r
df = df.copy()\r
rename_map = {}\r
\r
for col in df.columns:\r
col_lower = col.lower().replace('_', '').replace('-', '')\r
\r
for standard, variants in self.COLUMN_MAPPINGS.items():\r
if col_lower in variants:\r
rename_map[col] = standard\r
break\r
\r
return df.rename(columns=rename_map)\r
\r
def merge(self, left: pd.DataFrame,\r
right: pd.DataFrame,\r
on: Optional[str] = None,\r
left_on: Optional[str] = None,\r
right_on: Optional[str] = None,\r
how: MergeStrategy = MergeStrategy.LEFT,\r
harmonize: bool = True) -> MergeResult:\r
"""Merge two DataFrames."""\r
\r
if harmonize:\r
left = self.harmonize_columns(left)\r
right = self.harmonize_columns(right)\r
\r
# Determine merge keys\r
if on is None and left_on is None and right_on is None:\r
common_key = self.find_common_key(left, right)\r
if common_key is None:\r
raise ValueError("No common key found. Specify merge key manually.")\r
on = common_key\r
\r
# Perform merge\r
merged = pd.merge(\r
left, right,\r
on=on,\r
left_on=left_on,\r
right_on=right_on,\r
how=how.value,\r
indicator=True,\r
suffixes=('_left', '_right')\r
)\r
\r
# Calculate statistics\r
matched = len(merged[merged['_merge'] == 'both'])\r
left_only = len(merged[merged['_merge'] == 'left_only'])\r
right_only = len(merged[merged['_merge'] == 'right_only'])\r
\r
# Quality score\r
total = len(left) + len(right)\r
quality = (matched * 2) / total if total > 0 else 0\r
\r
# Clean up\r
merged = merged.drop('_merge', axis=1)\r
\r
return MergeResult(\r
merged_df=merged,\r
matched_rows=matched,\r
left_only=left_only,\r
right_only=right_only,\r
merge_quality=round(quality, 2)\r
)\r
\r
def merge_multiple(self, dfs: List[pd.DataFrame],\r
on: Optional[str] = None,\r
how: MergeStrategy = MergeStrategy.OUTER) -> pd.DataFrame:\r
"""Merge multiple DataFrames sequentially."""\r
\r
if not dfs:\r
return pd.DataFrame()\r
\r
result = dfs[0].copy()\r
\r
for i, df in enumerate(dfs[1:], 1):\r
result_obj = self.merge(result, df, on=on, how=how)\r
result = result_obj.merged_df\r
\r
return result\r
\r
def fuzzy_merge(self, left: pd.DataFrame,\r
right: pd.DataFrame,\r
left_on: str,\r
right_on: str,\r
threshold: float = 0.8) -> pd.DataFrame:\r
"""Merge using fuzzy string matching."""\r
\r
matches = []\r
\r
left_values = left[left_on].dropna().unique()\r
right_values = right[right_on].dropna().unique()\r
\r
for lval in left_values:\r
best_match = None\r
best_score = 0\r
\r
for rval in right_values:\r
score = SequenceMatcher(None, str(lval).lower(),\r
str(rval).lower()).ratio()\r
if score > best_score and score >= threshold:\r
best_score = score\r
best_match = rval\r
\r
if best_match:\r
matches.append({\r
'left_key': lval,\r
'right_key': best_match,\r
'match_score': best_score\r
})\r
\r
match_df = pd.DataFrame(matches)\r
\r
# Join using match mapping\r
left_with_key = left.merge(match_df, left_on=left_on, right_on='left_key', how='left')\r
result = left_with_key.merge(right, left_on='right_key', right_on=right_on, how='left')\r
\r
return result\r
\r
\r
class BIMScheduleMerger(ConstructionDFMerger):\r
"""Specialized merger for BIM and schedule data."""\r
\r
def merge_bim_schedule(self, bim_df: pd.DataFrame,\r
schedule_df: pd.DataFrame,\r
bim_type_col: str = 'Type Name',\r
schedule_wbs_col: str = 'WBS') -> pd.DataFrame:\r
"""Merge BIM elements with schedule activities."""\r
\r
# This typically requires a mapping table\r
# For now, use fuzzy matching on descriptions\r
\r
bim_df = self.harmonize_columns(bim_df)\r
schedule_df = self.harmonize_columns(schedule_df)\r
\r
# Try to match type names to WBS descriptions\r
result = self.fuzzy_merge(\r
bim_df, schedule_df,\r
left_on=bim_type_col,\r
right_on=schedule_wbs_col,\r
threshold=0.6\r
)\r
\r
return result\r
\r
\r
class CostQTOMerger(ConstructionDFMerger):\r
"""Merge cost data with quantity takeoffs."""\r
\r
def merge_cost_qto(self, cost_df: pd.DataFrame,\r
qto_df: pd.DataFrame) -> pd.DataFrame:\r
"""Merge cost rates with QTO quantities."""\r
\r
cost_df = self.harmonize_columns(cost_df)\r
qto_df = self.harmonize_columns(qto_df)\r
\r
# Try common merge keys\r
for key in ['work_item_code', 'type_name', 'material', 'element_id']:\r
if key in cost_df.columns and key in qto_df.columns:\r
result = self.merge(cost_df, qto_df, on=key)\r
\r
# Calculate extended costs\r
result.merged_df['extended_cost'] = (\r
result.merged_df.get('quantity', 0) *\r
result.merged_df.get('unit_price', 0)\r
)\r
\r
return result.merged_df\r
\r
# Fallback to fuzzy merge\r
return self.fuzzy_merge(\r
qto_df, cost_df,\r
left_on='type_name' if 'type_name' in qto_df.columns else qto_df.columns[0],\r
right_on='description' if 'description' in cost_df.columns else cost_df.columns[0]\r
)\r
```\r
\r
## Quick Start\r
\r
```python\r
merger = ConstructionDFMerger()\r
\r
# Merge two DataFrames\r
result = merger.merge(bim_df, schedule_df)\r
print(f"Matched: {result.matched_rows}, Quality: {result.merge_quality}")\r
\r
# Access merged data\r
merged = result.merged_df\r
```\r
\r
## Common Use Cases\r
\r
### 1. BIM + Schedule Integration\r
```python\r
bim_schedule = BIMScheduleMerger()\r
integrated = bim_schedule.merge_bim_schedule(bim_elements, schedule_activities)\r
```\r
\r
### 2. Cost + QTO\r
```python\r
cost_merger = CostQTOMerger()\r
priced_qto = cost_merger.merge_cost_qto(cost_database, quantities)\r
print(f"Total: ${priced_qto['extended_cost'].sum():,.2f}")\r
```\r
\r
### 3. Multiple Sources\r
```python\r
all_data = merger.merge_multiple(\r
[bim_df, schedule_df, cost_df, resource_df],\r
on='element_id'\r
)\r
```\r
\r
## Resources\r
- **DDC Book**: Chapter 2.3 - Pandas DataFrame\r
- Make sure OpenClaw is installed (local or Docker)
- Run the install command in chat:
/install df-merger - After installation, invoke the skill by name or use
/df-merger - Provide required inputs per the skill's parameter spec and get structured output
What is Df Merger?
Merge pandas DataFrames from multiple construction sources. Handle different schemas, keys, and data quality issues. It is an AI Agent Skill for Claude Code / OpenClaw, with 1092 downloads so far.
How do I install Df Merger?
Run "/install df-merger" in the OpenClaw or Claude Code chat to install it in one step — no extra setup required.
Is Df Merger free?
Yes, Df Merger is completely free (open-source). You can download, install and use it at no cost.
Which platforms does Df Merger support?
Df Merger is cross-platform and runs anywhere OpenClaw / Claude Code is available (darwin, linux, win32).
Who created Df Merger?
It is built and maintained by datadrivenconstruction (@datadrivenconstruction); the current version is v2.1.0.