Incident Response Examples

Automated incident detection, notification, and remediation with elastic-script.

Basic Alert Handler

Detect and notify on critical errors:

-- check_critical_errors: scans logs-* for services that logged more than 10
-- ERROR entries in the last 5 minutes, prints an alert for each one, and
-- posts the same alert to the #alerts Slack channel.
-- Takes no parameters and returns nothing.
CREATE PROCEDURE check_critical_errors()
BEGIN
    -- One row per service whose ERROR count over the last 5 minutes exceeds 10.
    -- ES|QL tests equality with "=="; a single "=" is only valid for
    -- assignment (as in the STATS clause below).
    DECLARE errors CURSOR FOR ESQL_QUERY('
        FROM logs-*
        | WHERE level == "ERROR" AND @timestamp > NOW() - 5 minutes
        | STATS count = COUNT(*) BY service.name
        | WHERE count > 10
    ');

    FOR svc IN errors LOOP
        DECLARE service STRING = DOCUMENT_GET(svc, 'service.name');
        DECLARE count NUMBER = DOCUMENT_GET(svc, 'count');

        PRINT '🚨 ALERT: ' || service || ' has ' || count || ' errors';

        -- Notify via Slack. Guarded so that one failed post does not stop
        -- the alerts for the remaining services.
        TRY
            SLACK_SEND('#alerts', '🚨 *' || service || '* has ' || count || ' errors in the last 5 minutes');
        CATCH error
            PRINT 'Warning: Slack notification failed: ' || error;
        END TRY;
    END LOOP;
END PROCEDURE;

Full Incident Workflow

Complete incident lifecycle management:

-- handle_incident: opens an incident for a service and runs the initial
-- response workflow.
--
-- Parameters:
--   service     - service name; stored on the incident and used to filter logs-*
--   description - free-text description of the problem
--   severity    - severity label; stored on the incident and passed to PagerDuty
--
-- Steps: index the incident into 'incidents', create a PagerDuty incident,
-- post to #incidents on Slack, collect the 10 most recent ERROR logs for the
-- service, and attach them (plus the PagerDuty key) to the incident document.
-- PagerDuty and Slack failures are printed as warnings and do not abort the
-- workflow.
--
-- Returns: the generated incident ID.
CREATE PROCEDURE handle_incident(service STRING, description STRING, severity STRING)
BEGIN
    -- ID is 'INC-' plus the first 10 characters of the current timestamp.
    -- NOTE(review): if those characters are only the date, two incidents
    -- opened on the same day share an ID and the second ES_INDEX call below
    -- would overwrite the first - confirm, and add a unique suffix if so.
    DECLARE incident_id STRING = 'INC-' || SUBSTR(CURRENT_TIMESTAMP(), 1, 10);

    PRINT '=== Creating Incident: ' || incident_id || ' ===';

    -- 1. Log the incident
    DECLARE incident DOCUMENT = {
        "incident_id": incident_id,
        "service": service,
        "description": description,
        "severity": severity,
        "status": "open",
        "created_at": CURRENT_TIMESTAMP()
    };
    ES_INDEX('incidents', incident_id, incident);

    -- 2. Create PagerDuty incident
    -- pd_key is only assigned when the call succeeds; on failure it keeps
    -- its initial (unassigned) value and is stored as such in step 5.
    DECLARE pd_key STRING;
    TRY
        SET pd_key = PAGERDUTY_CREATE_INCIDENT(
            '[' || severity || '] ' || service || ': ' || description,
            severity,
            service
        );
        PRINT 'PagerDuty incident created: ' || pd_key;
    CATCH error
        PRINT 'Warning: PagerDuty notification failed: ' || error;
    END TRY;

    -- 3. Notify Slack
    DECLARE slack_message STRING = '🚨 *New Incident: ' || incident_id || '*\n'
        || '• Service: ' || service || '\n'
        || '• Severity: ' || severity || '\n'
        || '• Description: ' || description;

    TRY
        SLACK_SEND('#incidents', slack_message);
    CATCH error
        PRINT 'Warning: Slack notification failed: ' || error;
    END TRY;

    -- 4. Gather context: the 10 most recent ERROR logs for this service.
    -- ES|QL tests equality with "==", not "=".
    -- NOTE(review): service is concatenated straight into the query text; a
    -- value containing a double quote will break or alter the query. Validate
    -- or escape it if callers can pass untrusted input.
    PRINT 'Gathering incident context...';
    DECLARE recent_errors CURSOR FOR ESQL_QUERY('
        FROM logs-*
        | WHERE service.name == "' || service || '" AND level == "ERROR"
        | SORT @timestamp DESC
        | LIMIT 10
    ');

    -- Each entry is "<timestamp>: <message>". The loop variable is named
    -- log_entry so it is not confused with the CATCH variable "error" above.
    DECLARE context_messages ARRAY = [];
    FOR log_entry IN recent_errors LOOP
        SET context_messages = ARRAY_APPEND(context_messages,
            DOCUMENT_GET(log_entry, '@timestamp') || ': ' || DOCUMENT_GET(log_entry, 'message'));
    END LOOP;

    -- 5. Update incident with context
    ES_UPDATE('incidents', incident_id, {
        "pagerduty_key": pd_key,
        "context": {
            "recent_errors": context_messages
        }
    });

    RETURN incident_id;
END PROCEDURE;

Auto-Remediation

Automated recovery actions:

-- auto_remediate: applies a predefined recovery action to a service in the
-- 'production' namespace, records the attempt in 'remediation-logs', and
-- reports the outcome to #incidents on Slack.
--
-- Parameters:
--   service    - deployment name to act on
--   issue_type - 'high_memory' (restart), 'high_load' (scale up),
--                'connection_pool_exhausted' (restart); any other value
--                results in 'manual_required' with no action taken
--
-- Returns: the action taken - 'deployment_restart', 'scale_up_to_<n>',
-- 'connection_reset', 'manual_required', or 'failed: <error>'.
CREATE PROCEDURE auto_remediate(service STRING, issue_type STRING)
BEGIN
    PRINT '=== Auto-Remediation for ' || service || ' ===';
    PRINT 'Issue type: ' || issue_type;

    DECLARE action_taken STRING = 'none';
    DECLARE success BOOLEAN = FALSE;

    TRY
        IF issue_type = 'high_memory' THEN
            -- Restart pods with high memory
            PRINT 'Action: Restarting deployment...';
            K8S_RESTART_DEPLOYMENT('production', service);
            SET action_taken = 'deployment_restart';
            SET success = TRUE;

        ELSEIF issue_type = 'high_load' THEN
            -- Scale up the service by two replicas
            PRINT 'Action: Scaling up deployment...';
            -- NOTE(review): K8S_GET_PODS is called with the namespace only, so
            -- this count presumably covers every pod in 'production', not just
            -- this service's pods - confirm, and narrow the lookup if the
            -- function supports it, otherwise new_count is far too large.
            DECLARE current_pods ARRAY = K8S_GET_PODS('production');
            DECLARE current_count NUMBER = ARRAY_LENGTH(current_pods);
            DECLARE new_count NUMBER = current_count + 2;

            K8S_SCALE_DEPLOYMENT('production', service, new_count);
            SET action_taken = 'scale_up_to_' || new_count;
            SET success = TRUE;

        ELSEIF issue_type = 'connection_pool_exhausted' THEN
            -- Restart to reset connections
            PRINT 'Action: Rolling restart to reset connections...';
            K8S_RESTART_DEPLOYMENT('production', service);
            SET action_taken = 'connection_reset';
            SET success = TRUE;

        ELSE
            -- Unknown issue type: success stays FALSE so the team is told
            -- that manual intervention is required.
            PRINT 'No automatic remediation available for: ' || issue_type;
            SET action_taken = 'manual_required';
        END IF;

    CATCH error
        PRINT 'Remediation failed: ' || error;
        SET action_taken = 'failed: ' || error;
    END TRY;

    -- Log remediation attempt (NULL id is passed to ES_INDEX; presumably the
    -- document ID is then auto-generated - TODO confirm)
    DECLARE log DOCUMENT = {
        "service": service,
        "issue_type": issue_type,
        "action_taken": action_taken,
        "success": success,
        "timestamp": CURRENT_TIMESTAMP()
    };
    ES_INDEX('remediation-logs', NULL, log);

    -- Notify team. Guarded so that a Slack failure cannot hide the
    -- remediation result from the caller.
    TRY
        IF success THEN
            SLACK_SEND('#incidents', '✅ Auto-remediation for *' || service || '*: ' || action_taken);
        ELSE
            SLACK_SEND('#incidents', '❌ Auto-remediation failed for *' || service || '* - manual intervention required');
        END IF;
    CATCH error
        PRINT 'Warning: Slack notification failed: ' || error;
    END TRY;

    RETURN action_taken;
END PROCEDURE;

Escalation Workflow

Progressive escalation based on severity and time:

-- escalate_incident: raises the escalation level of an unresolved incident
-- once it has been open longer than the threshold for its severity.
--
-- Parameters:
--   incident_id - ID of a document in the 'incidents' index
--
-- Thresholds (age in minutes / maximum level reached):
--   critical: > 15 / level 3,  high: > 30 / level 2,  medium: > 60 / level 1
--
-- Returns: 'resolved' (a string) when the incident is already resolved,
-- otherwise the numeric escalation level after this run.
CREATE PROCEDURE escalate_incident(incident_id STRING)
BEGIN
    -- Get incident details
    DECLARE incident DOCUMENT = ES_GET('incidents', incident_id);
    DECLARE status STRING = DOCUMENT_GET(incident, 'status');
    DECLARE severity STRING = DOCUMENT_GET(incident, 'severity');
    DECLARE created DATE = DOCUMENT_GET(incident, 'created_at');
    DECLARE escalation_level NUMBER = DOCUMENT_GET(incident, 'escalation_level');

    -- An incident that has never been escalated has no escalation_level
    -- field. Test for a missing value with IS NULL rather than "= NULL".
    IF escalation_level IS NULL THEN
        SET escalation_level = 0;
    END IF;

    IF status = 'resolved' THEN
        PRINT 'Incident already resolved, no escalation needed';
        RETURN 'resolved';
    END IF;

    -- Calculate age in minutes
    -- NOTE(review): assumes DATE_DIFF returns a (possibly fractional) number
    -- of days - TODO confirm. If it returns whole days, age_minutes stays 0
    -- for the first 24 hours and none of the thresholds below can trigger.
    DECLARE age_minutes NUMBER = DATE_DIFF(CURRENT_TIMESTAMP(), created) * 24 * 60;

    PRINT 'Incident ' || incident_id || ' age: ' || age_minutes || ' minutes';
    PRINT 'Current escalation level: ' || escalation_level;

    -- Determine if escalation is needed. Each run raises the level by at
    -- most one step, up to the cap for the incident's severity.
    DECLARE escalate BOOLEAN = FALSE;

    IF severity = 'critical' AND age_minutes > 15 AND escalation_level < 3 THEN
        SET escalate = TRUE;
    ELSEIF severity = 'high' AND age_minutes > 30 AND escalation_level < 2 THEN
        SET escalate = TRUE;
    ELSEIF severity = 'medium' AND age_minutes > 60 AND escalation_level < 1 THEN
        SET escalate = TRUE;
    END IF;

    IF escalate THEN
        SET escalation_level = escalation_level + 1;

        -- Notify escalation
        DECLARE escalation_message STRING = '⬆️ *Escalation Level ' || escalation_level || '*\n'
            || 'Incident: ' || incident_id || '\n'
            || 'Severity: ' || severity || '\n'
            || 'Age: ' || age_minutes || ' minutes\n'
            || 'Awaiting response...';

        -- Different channels for different levels
        IF escalation_level = 1 THEN
            SLACK_SEND('#oncall-primary', escalation_message);
        ELSEIF escalation_level = 2 THEN
            SLACK_SEND('#oncall-secondary', escalation_message);
            SLACK_SEND('#engineering-leads', escalation_message);
        ELSE
            SLACK_SEND('#incidents-critical', escalation_message);
            SLACK_SEND('#leadership', escalation_message);
        END IF;

        -- Update incident
        ES_UPDATE('incidents', incident_id, {
            "escalation_level": escalation_level,
            "last_escalation": CURRENT_TIMESTAMP()
        });

        PRINT 'Escalated to level ' || escalation_level;
    ELSE
        PRINT 'No escalation needed at this time';
    END IF;

    RETURN escalation_level;
END PROCEDURE;

Post-Incident Report Generation

Generate an incident summary with AI:

-- generate_postmortem: builds a post-incident report for an incident with
-- an LLM and stores it on the incident document.
--
-- Parameters:
--   incident_id - ID of a document in the 'incidents' index
--
-- Reads the incident's service, description, created_at and resolved_at,
-- collects up to 50 ERROR/WARN log entries for the service between those two
-- timestamps, sends them to LLM_COMPLETE together with a fixed report
-- outline, writes the result to the incident's "postmortem" field, and
-- prints it.
--
-- Returns: the generated report text.
CREATE PROCEDURE generate_postmortem(incident_id STRING)
BEGIN
    -- Get incident data
    -- NOTE(review): resolved_at is read unconditionally; for an incident
    -- that is still open it is presumably missing, which would break the
    -- time-range filter below - confirm and guard if needed.
    DECLARE incident DOCUMENT = ES_GET('incidents', incident_id);
    DECLARE service STRING = DOCUMENT_GET(incident, 'service');
    DECLARE description STRING = DOCUMENT_GET(incident, 'description');
    DECLARE created DATE = DOCUMENT_GET(incident, 'created_at');
    DECLARE resolved DATE = DOCUMENT_GET(incident, 'resolved_at');

    -- Get related logs: ERROR/WARN entries for the service within the
    -- incident window, oldest first. ES|QL tests equality with "==", not "=".
    -- NOTE(review): service, created and resolved are concatenated straight
    -- into the query text; a value containing a double quote will break or
    -- alter the query.
    DECLARE logs CURSOR FOR ESQL_QUERY('
        FROM logs-*
        | WHERE service.name == "' || service || '"
          AND @timestamp >= "' || created || '"
          AND @timestamp <= "' || resolved || '"
          AND level IN ("ERROR", "WARN")
        | SORT @timestamp
        | LIMIT 50
    ');

    -- Each entry is "<timestamp> [<level>] <message>"
    DECLARE log_entries ARRAY = [];
    FOR log IN logs LOOP
        DECLARE entry STRING = DOCUMENT_GET(log, '@timestamp')
            || ' [' || DOCUMENT_GET(log, 'level') || '] '
            || DOCUMENT_GET(log, 'message');
        SET log_entries = ARRAY_APPEND(log_entries, entry);
    END LOOP;

    -- Generate postmortem with AI
    -- NOTE(review): raw log messages are embedded in the prompt, so log
    -- content can influence the generated report; treat the output as a
    -- draft to be reviewed by a person.
    DECLARE prompt STRING = 'Generate a post-incident report (postmortem) for this incident:

Incident ID: ' || incident_id || '
Service: ' || service || '
Description: ' || description || '
Start Time: ' || created || '
End Time: ' || resolved || '

Related log entries:
' || ARRAY_JOIN(log_entries, '\n') || '

Please include:
1. Executive Summary
2. Timeline of Events  
3. Root Cause Analysis
4. Impact Assessment
5. Action Items for Prevention

Format as Markdown.';

    DECLARE postmortem STRING = LLM_COMPLETE(prompt);

    -- Store postmortem
    ES_UPDATE('incidents', incident_id, {
        "postmortem": postmortem,
        "postmortem_generated_at": CURRENT_TIMESTAMP()
    });

    PRINT '=== Post-Incident Report ===';
    PRINT postmortem;

    RETURN postmortem;
END PROCEDURE;

Usage

-- Create an incident: indexes it into 'incidents', notifies PagerDuty and
-- Slack, and returns the generated incident ID
CALL handle_incident('api-gateway', 'Connection timeout errors spiking', 'high');

-- Attempt auto-remediation ('connection_pool_exhausted' restarts the deployment)
CALL auto_remediate('api-gateway', 'connection_pool_exhausted');

-- Check for escalation (run periodically); pass the ID of an existing incident.
-- IDs are 'INC-' followed by the first 10 characters of the creation timestamp.
CALL escalate_incident('INC-2026-01-09');

-- Generate post-incident report and store it on the incident document
CALL generate_postmortem('INC-2026-01-09');