AlkantarClanX12

Your IP : 3.145.164.47


Current Path : /lib64/nagios/plugins/nccustom/
Upload File :
Current File : //lib64/nagios/plugins/nccustom/check_stalled_procs.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
import psutil
import time
import json

# Defaults used when the matching command-line flag is omitted:
#   WARNNUM - default for -w, the process count that yields WARNING
#   CRITNUM - default for -c, the process count that yields CRITICAL
#   MONTIME - default for -t, the monitoring time in minutes
WARNNUM = 10
CRITNUM = 20
MONTIME = 15

# State file shared between runs: one JSON object per line, appended by
# write_log() and read back (last line) by read_last_log().
LOG_FILE = '/var/log/check_stalled_procs.json'

def parse_arguments():
    """
    Build the command-line parser and return the parsed options.

    The returned namespace exposes three integers: ``w`` (warning threshold),
    ``c`` (critical threshold) and ``t`` (monitoring time in minutes). Each
    one falls back to the corresponding module-level default when its flag
    is not given.
    """
    cli = argparse.ArgumentParser()
    # (flag, default, help text) for every integer option the plugin accepts
    option_table = (
        ('-w', WARNNUM, 'Warning threshold for process count'),
        ('-c', CRITNUM, 'Critical threshold for process count'),
        ('-t', MONTIME, 'Monitoring time in minutes'),
    )
    for flag, fallback, description in option_table:
        cli.add_argument(flag, type=int, default=fallback, help=description)
    return cli.parse_args()

def get_process_info(state):
    """
    Retrieve information about processes in a given state.

    Args:
        state: a psutil status value (e.g. ``psutil.STATUS_ZOMBIE``) that a
            process's ``status`` must equal to be counted.

    Returns:
        A ``(count, descriptions)`` tuple. ``descriptions`` holds one dict per
        matching process with the keys ``pid``, ``user`` and ``cmd``; ``cmd``
        is the first three command-line elements (a list) when a command
        line is available, otherwise the process name (a string).
        On any error the function prints a message and returns ``(0, [])``.
    """
    try:
        descriptions = []
        for proc in psutil.process_iter(['pid', 'status', 'cmdline', 'username', 'name']):
            info = proc.info
            if info['status'] != state:
                continue
            # psutil always creates the requested keys and stores None for
            # attributes it could not read, so a dict.get() default never
            # kicks in; `or` is what actually supplies the fallback.
            cmdline = info['cmdline']
            descriptions.append({
                'pid': info['pid'],
                'user': info.get('username') or 'unknown',
                'cmd': cmdline[:3] if cmdline else (info.get('name') or 'unknown'),
            })
        return len(descriptions), descriptions
    except Exception as e:
        # Deliberate best effort: a failure to enumerate processes is reported
        # and treated as "no matching processes" rather than aborting the check.
        print(f"Error retrieving process information: {e}")
        return 0, []

def read_last_log(log_file=None):
    """
    Return the most recent valid entry from the log file.

    The file is expected to hold one JSON object per line. Lines are examined
    from the end of the file backwards; blank lines, lines that are not valid
    JSON (for example a write that was cut short) and JSON values that are
    not objects are skipped, so one damaged trailing line does not discard
    the history or crash the check.

    Args:
        log_file: path of the log to read; defaults to ``LOG_FILE``.

    Returns:
        The decoded entry as a dict, or None if the file does not exist, is
        empty, or contains no usable entry.
    """
    path = LOG_FILE if log_file is None else log_file
    try:
        # Open directly instead of checking existence first: the file may
        # disappear between a check and the open.
        with open(path, 'r') as log:
            lines = log.readlines()
    except FileNotFoundError:
        return None

    for raw_line in reversed(lines):
        text = raw_line.strip()
        if not text:
            continue
        try:
            entry = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(entry, dict):
            return entry
    return None

def write_log(current_time, d_count, z_count, status, d_desc, z_desc, log_file=None):
    """
    Append one entry to the log file as a single line of JSON.

    Args:
        current_time: timestamp stored under ``time``.
        d_count: count stored under ``d_count``.
        z_count: count stored under ``z_count``.
        status: status string stored under ``status``.
        d_desc: process descriptions stored under ``d_desc``.
        z_desc: process descriptions stored under ``z_desc``.
        log_file: path of the log to append to; defaults to ``LOG_FILE``.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    path = LOG_FILE if log_file is None else log_file
    log_entry = {
        'time': current_time,
        'd_count': d_count,
        'z_count': z_count,
        'status': status,
        'd_desc': d_desc,
        'z_desc': z_desc,
    }
    # Append mode keeps earlier entries; readers only need the last line.
    with open(path, 'a') as log:
        log.write(json.dumps(log_entry) + '\n')

def _classify_counts(d_count, z_count, warn, crit):
    """Return "CRITICAL", "WARNING" or "OK" for the larger of the two counts."""
    worst = max(d_count, z_count)
    if worst >= crit:
        return "CRITICAL"
    if worst >= warn:
        return "WARNING"
    return "OK"


def main():
    """
    Main function to monitor stalled processes and report their status.

    Counts processes in disk-sleep (D) and zombie (Z) state, compares them
    with the last logged entry and prints one status line with perfdata.
    The status is only re-evaluated once the monitoring time (``-t`` minutes)
    has passed since the last logged entry; before that the previous status
    is kept, except that a non-OK status is reset to OK as soon as both
    counts are below the warning threshold.

    Exits with 0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (any other status).
    """
    args = parse_arguments()

    current_time = int(time.time())
    # Get the count and description of processes in disk sleep (D) state and zombie (Z) state
    d_count, d_desc = get_process_info(psutil.STATUS_DISK_SLEEP)
    z_count, z_desc = get_process_info(psutil.STATUS_ZOMBIE)

    # Read the last log entry; an entry with missing or non-numeric fields
    # is treated the same as having no entry at all instead of crashing.
    last_log = read_last_log()
    baseline = None
    if last_log:
        try:
            baseline = (
                int(last_log['time']),
                int(last_log['d_count']),
                int(last_log['z_count']),
                # str() keeps the printed text unchanged and guarantees the
                # value is usable as a key in the exit-code lookup below.
                str(last_log['status']),
            )
        except (KeyError, TypeError, ValueError):
            baseline = None
    if baseline is None:
        # No usable history: record the current counts as the starting point
        baseline = (current_time, d_count, z_count, "OK")
        write_log(*baseline, d_desc, z_desc)
    last_time, last_d_count, last_z_count, last_status = baseline

    time_diff = current_time - last_time
    status = last_status

    # Check if the monitoring time has elapsed
    if time_diff >= args.t * 60:
        # Use the maximum count between current and last counts to determine status
        status = _classify_counts(
            max(d_count, last_d_count),
            max(z_count, last_z_count),
            args.w,
            args.c,
        )
        write_log(current_time, d_count, z_count, status, d_desc, z_desc)
    elif last_status != "OK" and d_count < args.w and z_count < args.w:
        # Reset status to OK if counts drop below warning thresholds and previous status was not OK
        status = "OK"
        write_log(current_time, d_count, z_count, status, d_desc, z_desc)

    # Output the status, counts and perfdata
    output = f"{status} - Processes in D state: {d_count}, Z state: {z_count} | D={d_count};{args.w};{args.c}; Z={z_count};{args.w};{args.c};"
    print(output)

    # Exit with the appropriate code. SystemExit is raised directly because
    # the exit() builtin is injected by the site module and is not available
    # in every interpreter configuration.
    exit_codes = {"OK": 0, "WARNING": 1, "CRITICAL": 2}
    raise SystemExit(exit_codes.get(status, 3))


if __name__ == "__main__":
    main()