Expert-Level Feature Engineering: Advanced Techniques for High-Stakes Models

Iván Palomares Carrascosa introduces three advanced feature engineering techniques in a 2025 article, addressing robustness and explainability in high-stakes ML models. The methods include counterfactual features, domain-constrained autoencoders, and causal-invariant features.

Why This Matters

Traditional feature engineering often fails in high-stakes domains like healthcare and finance, where model errors can have severe consequences. These advanced techniques explicitly encode domain constraints and causal relationships, reducing failure risks and improving interpretability. Without such methods, models may rely on spurious correlations, leading to unstable predictions and regulatory non-compliance.

Key Insights

“2025 article by Iván Palomares Carrascosa on MachineLearningMastery.com”
“Counterfactual features for decision-boundary awareness in healthcare diagnostics”
“Constrained autoencoders used to enforce monotonicity rules in financial models”

Working Example

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

# Toy data and baseline linear classifier
X, y = make_classification(n_samples=500, n_features=5, random_state=42)
df = pd.DataFrame(X, columns=[f"feat_{i}" for i in range(X.shape[1])])
df['target'] = y
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop(columns="target"))
clf = LogisticRegression().fit(X_scaled, y)

# Decision boundary parameters
weights = clf.coef_[0]
bias = clf.intercept_[0]

def counterfactual_delta_feat0(x, eps=1e-9):
    """Minimal change to feature 0, holding other features fixed, required to move the linear logit score to the decision boundary (0)."""
    score = np.dot(weights, x) + bias
    w0 = weights[0]
    return -score / (w0 + eps)

df['cf_delta_feat0'] = [counterfactual_delta_feat0(x) for x in X_scaled]
df.head()

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Supervised split using the earlier DataFrame `df`
X_train, X_val, y_train, y_val = train_test_split(
    df.drop(columns="target").values, df['target'].values, test_size=0.2, random_state=42
)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
torch.manual_seed(42)

class ConstrainedAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=3):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 8), nn.ReLU(), nn.Linear(8, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 8), nn.ReLU(), nn.Linear(8, input_dim)
        )
        self.predictor = nn.Linear(latent_dim, 1)

    def forward(self, x):
        z = self.encoder(x)
        recon = self.decoder(z)
        logit = self.predictor(z)
        return recon, z, logit

model = ConstrainedAutoencoder(input_dim=X_train.shape[1])
optimizer = optim.Adam(model.parameters(), lr=1e-3)
recon_loss_fn = nn.MSELoss()
pred_loss_fn = nn.BCEWithLogitsLoss()
epsilon = 1e-2

for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    recon, z, logit = model(X_train)
    loss_recon = recon_loss_fn(recon, X_train)
    loss_pred = pred_loss_fn(logit, y_train)
    X_plus = X_train.clone()
    X_plus[:, 0] = X_plus[:, 0] + epsilon
    _, _, logit_plus = model(X_plus)
    mono_violation = torch.relu(logit - logit_plus)
    loss_mono = mono_violation.mean()
    loss = loss_recon + 0.5 * loss_pred + 0.1 * loss_mono
    loss.backward()
    optimizer.step()

# Latent features now reflect the monotonic constraint
with torch.no_grad():
    _, latent_feats, _ = model(X_train)
    latent_feats[:5]

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(42)
np.random.seed(42)

# Two environments with a spurious signal in env1
n = 300
X_env1 = np.random.randn(n, 2)
X_env2 = np.random.randn(n, 2)
y_env1 = (X_env1[:, 0] + 0.1 * np.random.randn(n) > 0).astype(int)
y_env2 = (X_env2[:, 0] + 0.1 * np.random.randn(n) > 0).astype(int)
X_env1[:, 1] = y_env1 + 0.1 * np.random.randn(n)

X1, y1 = torch.tensor(X_env1, dtype=torch.float32), torch.tensor(y_env1, dtype=torch.float32)
X2, y2 = torch.tensor(X_env2, dtype=torch.float32), torch.tensor(y_env2, dtype=torch.float32)

class LinearModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = nn.Parameter(torch.randn(2, 1))

    def forward(self, x):
        return x @ self.w

model = LinearModel()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

def env_risk(x, y, w):
    logits = x @ w
    return torch.mean((logits.squeeze() - y) ** 2)

for epoch in range(2000):
    optimizer.zero_grad()
    risk1 = env_risk(X1, y1, model.w)
    risk2 = env_risk(X2, y2, model.w)
    grad1 = torch.autograd.grad(risk1, model.w, create_graph=True)[0]
    grad2 = torch.autograd.grad(risk2, model.w, create_graph=True)[0]
    penalty = torch.sum((grad1 - grad2) ** 2)
    loss = (risk1 + risk2) + 100.0 * penalty
    loss.backward()
    optimizer.step()

print("Learned weights:", model.w.data.numpy().ravel())

Practical Applications

Use Case: Healthcare diagnosis models using counterfactual features to assess patient risk thresholds.
Pitfall: Overlooking spurious correlations in data without causal-invariant features, leading to poor out-of-distribution generalization.

References:

https://machinelearningmastery.com/expert-level-feature-engineering-advanced-techniques-for-high-stakes-models/

On This Page

Expert-Level Feature Engineering: Advanced Techniques for High-Stakes Models