scripts-monitor

监控_rabbitmq

monitor_rabbitmq.sh

#!/bin/bash
# 20140102
# AUTHOR coolber
# Monitor RabbitMQ queue depth: dump `rabbitmqctl list_queues` into a log,
# then mail an alarm for every queue whose message count exceeds METRIC_NUM.
# If rabbitmqctl itself fails, mail a "service unavailable" alarm instead.

DATE=$(date "+%F %T")
MONITOR_LOG=/root/scripts/log/monitor_rabbitmq_queue.log
# fix: the assignment operator was missing (RECEIVE_MAILER2[email protected]
# is parsed as a command, not an assignment)
RECEIVE_MAILER2='[email protected]'
METRIC_NUM=100


echo "### ${DATE} Rabbitmq queue ###" >"$MONITOR_LOG"

if /usr/local/rabbitmq/sbin/rabbitmqctl list_queues >>"$MONITOR_LOG"
then
    # each interesting line ends with the queue's message count
    while read -r i
    do
        # last field must be purely numeric; header/comment/empty lines
        # produce an empty CUR_NUM and are skipped by the -n guard below
        CUR_NUM=$( echo "$i" | awk '{ print $NF }' | grep -vE '^#|^$' | grep -E '^[0-9]+$' )
        if [[ -n $CUR_NUM && $METRIC_NUM -lt $CUR_NUM ]]
        then
            echo -e "### $DATE #### \n${i} " | mail -s "Host: nameserver,rabbitmq queue is Abnormal" "$RECEIVE_MAILER2"
        fi
    done <"${MONITOR_LOG}"
else
    echo "rabbitmq queue info Abnormal" | mail -s  " rabbitmq service is maybe Unavailable " "$RECEIVE_MAILER2"
fi

监控_实时文件对比

#!/bin/bash
# Watch /mnt/logs/consumer/apps_consumer.log: snapshot the last line's
# date / time / millisecond fields, sleep, re-snapshot, and mail an alarm
# when nothing changed (i.e. the consumer stopped writing).
DATE=$(date "+%F %T")
i=0
interval=1700

# split "DATE TIME,MS[..." into "DATE TIME MS [..." and load into an array
old_date=( $(tail -1 /mnt/logs/consumer/apps_consumer.log | sed 's#\([^ ]*\) \([^,]*\),\([0-9]*\)\(\[.*\)#\1 \2 \3 \4#g') )
old_date_d=${old_date[0]}
old_date_h=${old_date[1]}
old_java_n=${old_date[2]}

# integer division -- fix: $[ ] is deprecated bash arithmetic
count=$(( 1700 / interval ))
# fix: assignment operators were missing on both mailer lines
RECEIVE_MAILER1='[email protected]'
RECEIVE_MAILER2='[email protected]'

while [ $i -lt $count ]
do
    sleep $interval
    new_date=( $(tail -1 /mnt/logs/consumer/apps_consumer.log | sed 's#\([^ ]*\) \([^,]*\),\([0-9]*\)\(\[.*\)#\1 \2 \3 \4#g') )
    new_date_d=${new_date[0]}
    new_date_h=${new_date[1]}
    new_java_n=${new_date[2]}
    # fix: the original unquoted `[ a == b -a ... ]` test errored out
    # whenever a field came back empty; [[ ]] with quoting is safe
    if [[ "${old_date_d}" == "${new_date_d}" && "${old_date_h}" == "${new_date_h}" && "${old_java_n}" == "${new_java_n}" ]]; then
        echo -e "### $DATE #### \n'/mnt/logs/consumer/apps_consumer.log  not updated' \n consumer_info:\n ${new_date[@]} " | mail -s "Host(134): searchlog,rabbitmq queue is Abnormal" "$RECEIVE_MAILER1"
        echo -e "### $DATE #### \n'/mnt/logs/consumer/apps_consumer.log  not updated' \n consumer_info:\n ${new_date[@]} " | mail -s "Host(134): searchlog,rabbitmq queue is Abnormal" "$RECEIVE_MAILER2"
    fi
    ((i++))
done

监控_网络流量

监控_网络流量1

#!/bin/bash
# Interactive per-interface traffic monitor (see traffic_monitor below).
PATH=/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin;
export PATH

# local NIC names
#local nic_arr=(`ifconfig | grep -E -o "^[a-z0-9]+" | grep -v "lo" | uniq`)
# number of local NICs
#local nicLen=${#nic_arr[@]}
#eth=$nic_arr
# current traffic counters
#traffic_be=(`awk -v eth=$eth -F'[: ]+' '{if ($0 ~eth){print $3,$11}}' /proc/net/dev`)

# Continuously display RX/TX throughput of interface $1, refreshed once a
# second.  Exits with status 5 when the interface does not exist.
function traffic_monitor {
  # OS release string (read but not used further; kept for compatibility)
  OS_NAME=$(sed -n '1p' /etc/issue)
  # interface name
  eth=$1
  # bail out when the interface does not exist, listing the real ones
  if [ ! -d "/sys/class/net/$eth" ];then
      echo -e "Network-Interface Not Found"
      echo -e "You system have network-interface:\n`ls /sys/class/net`"
      exit 5
  fi
  while true
  do
    STATUS="fine"
    # byte counters at T0 (column 2 = RX bytes, column 10 = TX bytes)
    RXpre=$(grep "$eth" /proc/net/dev | tr : " " | awk '{print $2}')
    TXpre=$(grep "$eth" /proc/net/dev | tr : " " | awk '{print $10}')
    # counters again one second later
    sleep 1
    RXnext=$(grep "$eth" /proc/net/dev | tr : " " | awk '{print $2}')
    TXnext=$(grep "$eth" /proc/net/dev | tr : " " | awk '{print $10}')
    clear
    # per-second deltas in bytes
    RX=$((RXnext - RXpre))
    TX=$((TXnext - TXpre))
    # scale RX to B/s, KB/s or MB/s
    if [[ $RX -lt 1024 ]];then
      RX="${RX}B/s"
    elif [[ $RX -gt 1048576 ]];then
      RX=$(echo $RX | awk '{print $1/1048576 "MB/s"}')
      # fix: was `$STATUS="busy"`, which expanded to the command `fine="busy"`
      # and failed with "command not found" instead of setting the flag
      STATUS="busy"
    else
      RX=$(echo $RX | awk '{print $1/1024 "KB/s"}')
    fi
    # scale TX the same way
    if [[ $TX -lt 1024 ]];then
      TX="${TX}B/s"
      elif [[ $TX -gt 1048576 ]];then
      TX=$(echo $TX | awk '{print $1/1048576 "MB/s"}')
    else
      TX=$(echo $TX | awk '{print $1/1024 "KB/s"}')
    fi
    # header echoes kept disabled, as in the original
 #   echo -e "Time:   `date +%k:%M:%S`"
 #   echo -e "Port:   $1"
 #   echo -e "Status: $STATUS"
 #   echo -e  " \t     RX \tTX"
    echo "------------------------------"
    # live throughput line
    echo -e "Time:`date +%k:%M:%S` $eth \t 'RX:'$RX   'TX:'$TX "
    echo "------------------------------"
    echo -e "Press 'Ctrl+C' to exit"
  done
}
# Entry point: a NIC name is required as the first argument.
if [[ -z "$1" ]];then
  echo -e "None parameter,please add system netport after run the script! \nExample: 'sh traffic_monitor eth0'"
else
  # run the monitor loop on the requested interface
  traffic_monitor $1
fi

监控_网络流量2

#!/bin/bash
# Print per-second TX/RX rates (in KB) of a network interface, once a second.
# Usage: $0 <interface>
if [ -z "$1" ]; then
        echo
        echo usage: $0 network-interface
        echo
        echo e.g. $0 eth0
        echo
        exit
fi
IF=$1
NUM=100000
# previous counter snapshot -- fix: originally these were read BEFORE the
# argument check, so a missing $1 produced cat errors before the usage text
R2=$(cat /sys/class/net/$IF/statistics/rx_bytes)
T2=$(cat /sys/class/net/$IF/statistics/tx_bytes)
while true
do
        R1=$(cat /sys/class/net/$IF/statistics/rx_bytes)
        T1=$(cat /sys/class/net/$IF/statistics/tx_bytes)
        # deltas since the previous snapshot (~1 s), bytes then KB
        # (fix: `expr` replaced by builtin arithmetic)
        TBPS=$(( T1 - T2 ))
        RBPS=$(( R1 - R2 ))
        TKBPS=$(( TBPS / 1024 ))
        RKBPS=$(( RBPS / 1024 ))
        #RKBPS1=`echo "scale=3; $RBPS/$NUM"|bc`
        eval `date "+day=%d; month=%m; year=%Y; hour=%H; minute=%M second=%S"`
        INSTFIL4="$hour:$minute:$second"
        # fix: output typo "kb/" -> "kb/s" for the tx rate
        echo "$INSTFIL4 tx $IF: $TKBPS kb/s rx $RKBPS kb/s "

        # reuse the values already read instead of re-reading the files,
        # which left a window where some bytes were counted twice
        R2=$R1
        T2=$T1
        sleep 1s
done

监控_连接数检查

# Generate monitor_conntrack.sh: it mails an alarm when the kernel's
# nf_conntrack entry count reaches ALARM_NUM.  The backslash-quoted
# delimiter (<<\EOF) keeps all $-expansions literal in the generated file.
cat <<\EOF> monitor_conntrack.sh
#!/bin/bash

#set -x
#CON_NUM=$(wc -l /proc/net/ip_conntrack | awk '{print $1}')
#CON_NUM=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count)
CON_NUM=$(cat /proc/sys/net/netfilter/nf_conntrack_count)
TRACK_LOG=/root/tools/logs/conntrack.log
ALARM_NUM=50000
REC_MAILER='[email protected]'
DATE=$(date '+%F %T')
HOST_NAME=$(hostname)

echo "$DATE  $HOST_NAME Current conntrack number is $CON_NUM " >>$TRACK_LOG
if [ $ALARM_NUM -le $CON_NUM ]
then
    echo "$DATE $HOST_NAME Current conntrack number is $CON_NUM "| mail -s 'Conntrack alarm' $REC_MAILER
fi
EOF

监控-marathon-check

#!/bin/python
# -*- coding: utf-8 -*-
import urllib2
import sys
import subprocess
import urllib
import time
import socket
import os,re
import requests,json
import commands

def get_host_ip():
    # Return the local IP chosen for outbound traffic: "connect" a UDP
    # socket to a public address (no packet is sent for UDP connect) and
    # read back the selected source address.
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        probe.connect(('8.8.8.8', 80))
        ip = probe.getsockname()[0]
    finally:
        probe.close()
    return ip
def get_nginx_conf():
    # Parse the locally generated nginx upstream config into a list of
    # [app_name, [backend_ip, ...]] pairs.
    # NOTE(review): relies on the Python 2-only `commands` module and
    # Python 2 print semantics elsewhere in this file.
    apps = []
    try:
       # regenerate the dump of upstream blocks, then read it back
       if os.path.exists("/home/sh/nginx_marathon/nginx_conf.sh"):
           os.system('/usr/bin/sh /home/sh/nginx_marathon/nginx_conf.sh')
       if os.path.exists("/home/sh/nginx_marathon/nginx_conf"):
           fs = commands.getoutput('cat /home/sh/nginx_marathon/nginx_conf')
           if fs:
               # capture "upstream NAME {" plus the first body line
               b=re.compile('^upstream\s*(.\S*)\s*{\n\s+(.*)',re.MULTILINE)
               tags = (b.findall(fs))
               #print tags
               for i in tags:
                   try:
                       #b=re.findall('(.*)-\d+',i[0])[0]
                       # strip the trailing "-<digits>" from the upstream
                       # name and collect every dotted IPv4 in the body line
                       d=re.compile('(\d+.\d+.\d+.\d+)+',re.MULTILINE)
                       apps.append([re.findall('(.*)-\d+',i[0])[0],d.findall(i[1])])
                   except:
                       # upstream name without a -<digits> suffix: skip it
                       continue
    except Exception as e:
        print(str(e))
    return apps

def dindin(token,msg):
    # Post `msg` as a text message to the DingTalk group robot identified
    # by `token`, @-ing everyone.  Errors are printed, never raised.
    try:
        url = 'https://oapi.dingtalk.com/robot/send?access_token=%s' % token
        hdrs = {'content-type': 'application/json','charset':'utf-8'}
        payload = {
            "msgtype": "text",
            "text": {"content": msg},
            "at": {"atMobiles": [], "isAtAll": True},
        }
        requests.post(url,json=payload,headers=hdrs)
    except Exception as e:
        print(str(e))
def check_marathon():
    # For every upstream found in the local nginx config, verify that each
    # backend IP is registered as a task host of the matching Marathon app;
    # alert via DingTalk for every IP that is missing.
    try:
        marathon_ip = '192.168.101.150'
        host_ip = get_host_ip()
        hostname = commands.getoutput('hostname')
        # DingTalk robot access token
        tokens = 'a6184389010ffc41f045ab23eab3d40f8ebe1cd9526e7832a131d718aa13057c'
        apps = get_nginx_conf()
        r = requests.Session()
        r.auth = ('wowotuan','wowotuan')
        for i in apps:
            # i == [app_name, [backend_ip, ...]]
            if i[0]:
                try:
                    s = r.get('http://%s:8080/v2/apps/%s' % (marathon_ip,i[0].encode('utf8')))
                    app_infos = s.json()['app']
                    hosts = []
                    if app_infos:
                        # hosts the app's tasks actually run on
                        for task in app_infos['tasks']:
                            hosts.append(task['host'].encode('utf8'))
                    for ip in i[1]:
                        if ip.encode('utf8') not in hosts:
                            msg = 'tc hb2  nginx:%s-%s app:%s ip:%s 注册失败' % (hostname,host_ip,i[0],ip)
                            print msg
                            dindin(tokens,msg)
                        else:
                            msg = 'tc hb2  nginx:%s-%s app:%s ip:%s  注册成功' % (hostname,host_ip,i[0],ip)
                            print msg
                except:
                   # one bad app must not stop the remaining checks
                   continue
    except Exception as e:
        print(str(e))
#print get_nginx_conf()
# entry point: run one full check pass
check_marathon()
[root@tc-xy-nginx002 ~]# /usr/bin/python  /home/sh/nginx_marathon/nginx_marathon_check.py
tc hb2  nginx:tc-xy-nginx002-192.168.100.81 app:xy-qd-api-yzprod ip:192.168.101.248  注册成功
tc hb2  nginx:tc-xy-nginx002-192.168.100.81 app:yzbizcenter-admin-yzprod ip:192.168.101.26  注册成功
tc hb2  nginx:tc-xy-nginx002-192.168.100.81 app:yzbizcenter-admin-yzprod ip:192.168.101.2  注册成功

高峰关闭服务

18 19 20 21 22 高峰时间停止服务运行

[root@jumpserver01 ~]# cat /alidata/xuchangwei/checkpy.sh
#!/bin/bash
#
# Stop the batch jobs during peak hours (18-22) and keep them running
# otherwise.  The JAVA_HOME setup below is needed by the imagedeal jar.
export JAVA_HOME=/usr/java/jdk1.8.0_40
export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin

# During peak hours the three batch jobs must not run: kill them.
# Off-peak, delegate to check_pid to keep them alive.
function check_time() {
    # peak hours, matched as substrings of `date +%H`
    high="18 19 20 21 22"
    if [[ "$high" =~ "$(date +%H)" ]]; then
        echo "kill service"
        # fix: the original ran `kill -9 $(...)` even when the command
        # substitution was empty, which made kill print a usage error
        for pat in updateCover monitorjob.py imagedeal-jar-with-dependencies.jar; do
            pids=$(ps -ef | grep "$pat" | grep -v grep | awk '{print $2}')
            [ -n "$pids" ] && kill -9 $pids
        done
    else
        check_pid
    fi
}

# Ensure the three off-peak jobs are running: poll every $interval seconds
# for one minute and restart anything that died.
function check_pid() {
    i=0
    interval=10
    # fix: $[ ] is deprecated bash arithmetic
    count=$(( 60 / interval ))

    while [ $i -lt $count ]; do
        p_num=$(ps -ef |grep updateCover |grep -v grep |wc -l)
        m_num=$(ps -ef |grep monitorjob.py |grep -v grep |wc -l)
        i_num=$(ps -ef |grep imagedeal-jar-with-dependencies.jar|grep -v grep|wc -l)
        # fix: numeric -eq instead of the [ x == y ] bashism
        if [ "$p_num" -eq 0 ]; then
            cd /alidata/xuchangwei/dealcover
            nohup python updateCover.py 1 300000000 &>>to.log &
        fi
        if [ "$m_num" -eq 0 ]; then
            cd /alidata/xuchangwei/monitorcv
            nohup python monitorjob.py 10000 &>m.log &
        fi

        if [ "$i_num" -eq 0 ]; then
            cd /alidata/imagedeal/project/waterDeal/target/
            nohup java -jar /alidata/imagedeal/project/waterDeal/target/imagedeal-jar-with-dependencies.jar online 45 >/alidata/imagedeal/project/imagedeal.log 2>&1 &
        fi


        ((i++))
        sleep $interval
    done
}

# entry point
check_time

prometheus

阿里SLS分析

# -*- coding:UTF-8 -*-
import datetime
import os
import time

from alibabacloud_cms20190101 import models as cms_20190101_models
from alibabacloud_cms20190101.client import Client as Cms20190101Client
from alibabacloud_sls20201230 import models as sls_20201230_models
from alibabacloud_sls20201230.client import Client as Sls20201230Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient
from prometheus_client import Gauge, start_http_server

# Prometheus gauges exported by this collector (scraped on port 9111).
g4_1 = Gauge('sls_undertow_service_qps', 'service qps', ['service'])
g4_2 = Gauge('sls_undertow_service_avg_rt', 'service avg rt', ['service'])
# fix: description typo "graater" -> "greater"
g4_3_0 = Gauge('sls_undertow_api_slow_level0', 'api slow greater than 200ms ', ['service', 'uri'])
g4_3_4 = Gauge('sls_undertow_api_slow_detail', 'slow query detail',
               ['service', 'uri', 'avg_timeused', 'min_timeused', 'max_timeused'])

g6 = Gauge('sls_exception_qpm', 'exception qpm', ['service'])


def get_content_from_file(file_path):
    """Return the first line of `file_path`, stripped.

    Returns "" when the file does not exist or is empty.
    """
    if not os.path.exists(file_path):
        return ""
    with open(file_path, 'r') as fh:
        lines = fh.readlines()
    return lines[0].strip() if lines else ""


def set_content_to_file(content, file_path):
    """Overwrite `file_path` with `content` (file is created if absent)."""
    # fix: the explicit f.close() inside the `with` block was redundant --
    # the context manager already closes the file
    with open(file_path, 'w') as f:
        f.write(content)


class getSlsLogData:
    """Pull request/exception statistics from Aliyun SLS (Log Service)
    and publish them through the module-level Prometheus gauges.

    NOTE(review): the access key pair below looks like a redacted
    placeholder; move real credentials to env/config before use.
    """
    access_key_id = "Lxx"
    access_key_secret = "fxxx"
    # SLS project/logstore with undertow access logs
    undertow_project = "singapore"
    undertow_logstore = "undertow"
    # SLS project/logstore with logback exception logs
    logback_project = "overseas-logback-server"
    logback_logstore = "server-exception"
    # intranet endpoint of the ap-southeast-1 region (f-string has no
    # placeholders; kept as written)
    sls_endpoint = f'ap-southeast-1-intranet.log.aliyuncs.com'
    # services that must always appear in the exception gauge, zeroed
    service_all = [
        "attribute-server",
        "attribute-server-manager",
    ]
    # URI prefixes whose trailing path segments are request parameters and
    # get collapsed into "/{param}" placeholders in set_metric_slow_detail
    prefix_uris = [
        "/episodic_drama",
        "/ads_unlock/episodic_drama",
    ]

    def __init__(self):
        # build an SLS client from the class-level credentials
        config = open_api_models.Config(
            access_key_id=self.access_key_id,
            access_key_secret=self.access_key_secret
        )
        config.endpoint = self.sls_endpoint
        self.client = Sls20201230Client(config)
        # set True by set_metric_slow_detail when the current row was
        # folded into a prefix bucket instead of being reported directly
        self.is_pre = False

    def get_sls_data(self, current=0, project="", logstore="", query=""):
        """Run `query` over the last 60 seconds and return the response
        body (iterable of result rows), or [] on error."""
        if not current:
            current = int(time.time())
        if not project:
            project = self.undertow_project
        if not logstore:
            logstore = self.undertow_logstore
        from_time = current - 60
        to_time = current
        headers = {}
        runtime = util_models.RuntimeOptions()
        get_logs_request = sls_20201230_models.GetLogsRequest(from_=from_time, to=to_time, query=query)
        try:
            res = self.client.get_logs_with_options(project, logstore, get_logs_request, headers, runtime)
            return res.body
        except Exception as error:
            UtilClient.assert_as_string(error)
            return []

    def set_metric_service_qps(self):
        """Set the per-service request-count gauge (g4_1) from the last minute."""
        query = '*|select "__tag__:_container_name_" as service ,COUNT(*) as cnt group by service order by cnt desc '
        res_list = self.get_sls_data(query=query)
        for data in res_list:
            service = data["service"]
            cnt = float(data["cnt"])
            g4_1.labels(service).set(cnt)

    def set_metric_service_avg_rt(self):
        """Set the per-service average response-time gauge (g4_2)."""
        query = '* |select "__tag__:_container_name_" as service , avg(timeUsed) as avg_time  group by service order by avg_time desc '
        res_list = self.get_sls_data(query=query)
        for data in res_list:
            service = data["service"]
            try:
                avg_time_ns = float(data["avg_time"])
            except:
                # non-numeric avg_time (e.g. "null"): skip this service
                continue
            # timeUsed appears to be microseconds; /1000 yields ms --
            # NOTE(review): variable is named *_ns, confirm the actual unit
            avg_time_ms = int(avg_time_ns / 1000)
            g4_2.labels(service).set(avg_time_ms)

    def get_undertow_data_daily(self, query=""):
        """Run `query` over a fixed 30-minute window ending 2 hours before
        midnight of today (from midnight-9000s, span 1800s); [] on error."""
        now = datetime.datetime.now()
        midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
        midnight_ts = int(midnight.timestamp())
        from_time = midnight_ts - 9000
        to_time = from_time + 1800
        project = self.undertow_project
        logstore = self.undertow_logstore
        headers = {}
        runtime = util_models.RuntimeOptions()
        get_logs_request = sls_20201230_models.GetLogsRequest(from_=from_time, to=to_time, query=query)
        try:
            res = self.client.get_logs_with_options(project, logstore, get_logs_request, headers, runtime)
            return res.body
        except Exception as error:
            UtilClient.assert_as_string(error)
            return []

    def set_metric_slow_detail(self):
        """Rebuild the slow-query detail gauge (g4_3_4): per service, group
        slow URIs (avg > 200ms), folding parameterized URIs that match
        prefix_uris into aggregated "/{param}" buckets."""
        g4_3_4.clear()
        for service in self.get_service_list():
            query = '* and __tag__:_container_name_: {} |select COUNT(*) as cnt, avg(timeUsed)/1000 as avg_timeused, min(timeUsed)/1000 as min_timeused, max(timeUsed)/1000 as max_timeused , sum(timeUsed)/1000 as sum_timeused, "__tag__:_container_name_" as service ,uri group by service,uri  HAVING  avg_timeused >200 order by avg_timeused desc limit 100000'.format(
                service)
            res_list = self.get_undertow_data_daily(query=query)
            # accumulators per collapsed-prefix URI
            pre_map = {}
            # total slow-request count (shadows builtin `sum`; unused after loop)
            sum = float(0)
            for data in res_list:
                self.is_pre = False
                uri = data["uri"]
                cnt = float(data["cnt"])
                min_timeused = float(data["min_timeused"])
                max_timeused = float(data["max_timeused"])
                sum += cnt

                for prefix_uri in self.prefix_uris:
                    if not uri.startswith(prefix_uri):
                        continue
                    # replace the path segments beyond the prefix with
                    # "/{param}" placeholders, one per extra segment
                    uri_len = len(uri.split("/"))
                    prefix_uri_len = len(prefix_uri.split("/"))
                    param_str = ""
                    for i in range(uri_len - prefix_uri_len):
                        param_str = param_str + "/{param}"
                    sum_timeused = float(data["sum_timeused"])
                    key = prefix_uri + param_str
                    # merge this row into the bucket for `key`
                    pre_uri_data = pre_map.get(key, {})
                    pre_uri_data["sum_timeused"] = float(pre_uri_data.get("sum_timeused", 0)) + sum_timeused
                    pre_uri_data["cnt"] = float(pre_uri_data.get("cnt", 0)) + cnt
                    pre_uri_data["min_timeused"] = min(float(pre_uri_data.get("min_timeused", min_timeused)),
                                                       min_timeused)
                    pre_uri_data["max_timeused"] = max(float(pre_uri_data.get("max_timeused", max_timeused)),
                                                       max_timeused)
                    pre_map[key] = pre_uri_data
                    self.is_pre = True
                    break
                if not self.is_pre:
                    # non-parameterized URI: report the row directly
                    avg_timeused = data["avg_timeused"]
                    min_timeused = str(min_timeused)
                    max_timeused = str(max_timeused)
                    g4_3_4.labels(service, uri, avg_timeused, min_timeused, max_timeused).set(cnt)

            # flush the aggregated prefix buckets
            for uri, pre_uri_data in pre_map.items():
                cnt = pre_uri_data["cnt"]
                sum_timeused = pre_uri_data["sum_timeused"]
                min_timeused = str(pre_uri_data["min_timeused"])
                max_timeused = str(pre_uri_data["max_timeused"])
                avg_timeused = str(round(sum_timeused / cnt, 2))
                g4_3_4.labels(service, uri, avg_timeused, min_timeused, max_timeused).set(cnt)

    def get_service_list(self):
        """Return the distinct service (container) names seen in the daily
        undertow window."""
        service_list = []
        query = '*|select DISTINCT "__tag__:_container_name_" as service'
        res_list = self.get_undertow_data_daily(query=query)
        for data in res_list:
            service = data["service"]
            service_list.append(service)
        return service_list

    def set_metric_api_slow_level0(self):
        """Rebuild the slow-API gauge (g4_3_0): per service/uri count of
        requests with timeUsed >= 200000 in the last minute."""
        query = '* and timeUsed >= 200000|select "__tag__:_container_name_" as service ,uri, COUNT(*) as cnt group by service,uri order by cnt desc '
        res_list = self.get_sls_data(query=query)
        g4_3_0.clear()
        for data in res_list:
            service = data["service"]
            uri = data["uri"]
            cnt = float(data["cnt"])
            g4_3_0.labels(service, uri).set(cnt)

    def set_metric_logback_exception(self):
        """Rebuild the per-service exception-count gauge (g6), pre-seeding
        known services with 0 so absent series do not go stale."""
        query = "* | select __topic__ as service , COUNT(*) as cnt group by service"
        res_list = self.get_sls_data(project=self.logback_project, logstore=self.logback_logstore, query=query)
        g6.clear()
        g6.labels("content-distribution-platform").set(0)
        for service in self.service_all:
            g6.labels(service).set(0)
        for data in res_list:
            service = data["service"]
            cnt = data["cnt"]
            g6.labels(service).set(cnt)


if __name__ == '__main__':
    # marker file holding the day (YYYYMMDD) the daily slow-API scan last ran
    already_file = "/opt/ops/others/already_run"
    start_http_server(9111)  # expose Prometheus metrics on port 9111 (original comment said 8006; the code uses 9111)
    print("监控程序启动...")
    data4 = getSlsLogData()
    while True:
        try:
            current_day = datetime.datetime.now().strftime("%Y%m%d")
            already_day = get_content_from_file(already_file)
            if current_day != already_day:
                # first pass of a new day: refresh the daily slow-API stats
                set_content_to_file(current_day, already_file)
                print("执行慢接口统计任务")
                data4.set_metric_slow_detail()
                print("慢接口统计任务执行完毕")
            # per-minute gauges, refreshed every 40 s
            data4.set_metric_service_qps()
            data4.set_metric_service_avg_rt()
            data4.set_metric_api_slow_level0()
            data4.set_metric_logback_exception()
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print("{} 监控数据采集成功".format(current_time))
            time.sleep(40)
        except Exception as e:
            # keep the exporter alive across transient SLS/API failures
            print(e)
            time.sleep(40)

open-falcon

网络质量监控脚本

脚本逻辑:通过定量 ping 内网地址,将丢包比例上报到 open-falcon,并在 open-falcon 中设定告警阈值;

#!/bin/sh
#########################
# Report LAN packet-loss percentages (10 pings per target) to the local
# open-falcon agent; alert thresholds are configured inside open-falcon.
#system variable
timestamp=`date +%s`
host_name=`hostname`
# field 6 of ping's summary line is "<N>%"; sed strips the percent sign
tc_ping=`ping -c 10 192.168.100.43 |grep "packet loss" |awk '{print $6}' |sed 's/%//g'`
sc_ping=`ping -c 10 192.168.10.65 |grep "packet loss" |awk '{print $6}' |sed 's/%//g'`
# fix: a completely unreachable target can produce no summary line at all,
# leaving the value empty and the JSON below invalid -- report 100% loss
[ -z "$tc_ping" ] && tc_ping=100
[ -z "$sc_ping" ] && sc_ping=100

#########################
main () {
        curl -X POST -d '[{"metric": "tc high-speed channel", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$tc_ping',"counterType": "GAUGE","tags": ""}]' http://127.0.0.1:1988/v1/push
        curl -X POST -d '[{"metric": "sc high-speed channel", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$sc_ping',"counterType": "GAUGE","tags": ""}]' http://127.0.0.1:1988/v1/push
}

main

open-falcon-zookeeper

[root@zk-1 ~]# crontab -l
################## monitor falcon agent ######################
*/1 * * * * /usr/bin/sh /data/work/open-falcon/script/report_agent.sh

[root@zk-1 ~]# cat /data/work/open-falcon/script/report_agent.sh
#!/bin/bash
# Collect ZooKeeper health metrics via the `mntr` four-letter command and
# push them to the local open-falcon agent (http://127.0.0.1:1988).
timestamp=`date +%s`
host_name=`hostname`

# POST one GAUGE metric to the agent.  $1 = metric name, $2 = value.
push_metric() {
    curl -X POST -d "[{\"metric\": \"$1\", \"endpoint\": \"$host_name\", \"timestamp\": $timestamp, \"step\": 60,\"value\": $2,\"counterType\": \"GAUGE\",\"tags\": \"\"}]" http://127.0.0.1:1988/v1/push
}

# Read one field from the `mntr` output.  $1 = field name.
zk_mntr() {
    echo mntr | nc 127.0.0.1 2181 | grep "$1" | awk '{print $2}'
}

### check zk process and client port ###
zk_pid=`ps -ef|grep 'zookeeper'|grep -v 'grep'|awk '{print $2}'`
zk_port_2181=`netstat -antlp|grep "$zk_pid"|grep 2181|wc -l`
zk_num=`ps -ef|grep 'zookeeper'|grep -v grep | wc -l`
push_metric "zookeeper.status" "$zk_num"
push_metric "zookeeper.connect_port_2181" "$zk_port_2181"

# Metrics reported by every node; the leader additionally reports
# zk_followers.  (This replaces the original leader/follower branches,
# which duplicated the same thirteen curl invocations.)
metrics="zk_avg_latency zk_max_latency zk_min_latency zk_packets_received \
zk_packets_sent zk_num_alive_connections zk_outstanding_requests \
zk_znode_count zk_watch_count zk_ephemerals_count zk_approximate_data_size \
zk_open_file_descriptor_count zk_max_file_descriptor_count"

role=`zk_mntr zk_server_state`
# fix: the unquoted `[ $role = "leader" ]` failed with "unary operator
# expected" whenever mntr returned nothing
if [ "$role" = "leader" ]; then
    metrics="$metrics zk_followers"
fi

for m in $metrics; do
    push_metric "ZK.$m" "`zk_mntr $m`"
done

服务监控治理

监控脚本说明

参数说明:
--service:服务类型
--ports:服务端口,多端口间,逗号分隔。没有端口则填0。
--names: 监控指标。多个指标以,逗号分隔。

bash脚本模板

monitor[root@nginx001 vhosts]# crontab -l
#Ansible: zookeeper.connect_port
*/1 * * * * /home/scripts/zookeeper/zookeeper.connect_port.sh  --service zk  --ports 2181,3888  --names  zk.connect_port
#Ansible: zookeeper.daemon_status
*/1 * * * * /home/scripts/zookeeper/zookeeper.daemon_status.sh  --service zk  --ports 0  --names  zk.daemon_status
#!/usr/bin/env bash
#
source /etc/profile
timestamp=`date +%s`
host_name=`hostname`
exit_status=0

# Print the CLI help text for this monitor template to stdout.
usage() {
  printf '%s\n' \
    "Usage: $0 [MODE] [OPTION]" \
    "" \
    "OPTION:" \
    "  -s, --service <server_name>     指定服务" \
    "  -p, --ports <port1,port2...>    指定端口号" \
    "  -n, --names <tag_name>          指定监控指标" \
    "  --help                   show help"
}

# Normalize the command line with util-linux getopt (needed for the
# long --service/--ports/--names options; the getopts builtin cannot
# parse long options).
parameters=$(getopt -o s:p:n: --long service:,ports:,names:,help -n "$0" -- "$@")
[ $? -ne 0 ] && { echo "Try '$0 --help' for more information."; exit 1; }

# Re-inject the normalized option string as the positional parameters.
eval set -- "$parameters"

while true; do
    case "$1" in
    -s|--service) service=$2; shift 2;;
    -p|--ports) ports=$2 ; shift 2 ;;
    -n|--names) names=$2 ; shift 2 ;;
    --help) usage;exit ;;
    --)
        shift
        # Up to three trailing non-option arguments are captured here;
        # NOTE(review): FIRST/SECOND/LAST are never read later in this
        # template -- presumably kept for copy-paste extension.
        FIRST=$1
        SECOND=$2
        LAST=$3
        break ;;
    *) usage;exit 1 ;;
    esac
done

#######################################
# Push one metric to the local Open-Falcon agent.
# Globals:   host_name, timestamp (set at script start)
# Arguments: $1 metric name, $2 tag string, $3 metric value
# Outputs:   curl response on stdout
#######################################
function open_falcon_push() {
    local metric=$1
    local tag=$2
    local value=$3
    local payload
    # Build the JSON body with printf and pass it as ONE quoted word.
    # The original concatenated unquoted expansions into -d, so any
    # space in metric/tag/value would word-split the curl argument.
    printf -v payload '[{"metric": "%s", "endpoint": "%s", "timestamp": %s, "step": 60,"value": %s,"counterType": "GAUGE","tags": "%s"}]' \
        "$metric" "$host_name" "$timestamp" "$value" "$tag"
    curl --connect-timeout 5 -X POST -d "$payload" http://127.0.0.1:1988/v1/push
}

# Fail fast when a required option is missing. All three checks run so
# every missing option is reported before exiting; diagnostics go to
# stderr (the original sent them to stdout and only set the flag when
# echo succeeded, via `echo ... && exit_status=1`).
[ -z "$service" ] && { echo -e "\033[31;1;4mERR\033[0m: the service is not found, please --help" >&2; exit_status=1; }
[ -z "$ports" ] && { echo -e "\033[31;1;4mERR\033[0m: the ports are not found, please --help" >&2; exit_status=1; }
[ -z "$names" ] && { echo -e "\033[31;1;4mERR\033[0m: the names are not found, please --help" >&2; exit_status=1; }
[ "$exit_status" = "1" ] && exit 1


#----------------------------------change
# Probe inputs for the zookeeper checks.
[ -z "$(which nc)" ] && yum install -y nc
# Four-letter "ruok" command: a healthy zookeeper answers "imok", so
# daemon_status is 1 when healthy and 0 otherwise.
daemon_status=`echo ruok | nc 127.0.0.1 2181|grep imok|wc -l`

# PID of the zookeeper server process (QuorumPeerMain); used by
# ports_check to match its listen sockets in netstat output.
zk_pid=`jps |grep QuorumPeerMain| grep -v 'grep'|awk '{print $1}'`

#######################################
# Push daemon_status once per metric name ("no port" mode: the tag
# always reports port=0, whatever --ports contains).
# Globals:   ports, names, service, daemon_status
# Outputs:   curl responses on stdout, errors on stderr
#######################################
function noport_check() {
    local port name
    for port in ${ports//,/ }; do   # split comma list (no useless echo subshell)
        # Reject non-numeric ports; -ge fails loudly on non-integers,
        # so its stderr is silenced and we branch on the status.
        if ! [ "$port" -ge 0 ] 2>/dev/null; then
            echo -e "\033[31;1;4mERR\033[0m: port $port is not number" >&2
            exit 1
        fi
        for name in ${names//,/ }; do
            open_falcon_push "$name" "name=$name,port=0,service=$service" "$daemon_status"
        done
    done
}

#######################################
# For each configured port, push 1/0 depending on whether the zookeeper
# process ($zk_pid) is listening on it.
# Globals:   ports, names, service, zk_pid
# Outputs:   curl responses on stdout, errors on stderr
#######################################
function ports_check() {
    local port name zk_port
    for port in ${ports//,/ }; do
        if ! [ "$port" -ge 0 ] 2>/dev/null; then
            echo -e "\033[31;1;4mERR\033[0m: port $port is not number" >&2
            exit 1
        fi
        # Hoisted out of the names loop: the listen check depends only on
        # the port, not the metric name (the original recomputed it per
        # name). grep -c replaces grep|wc -l.
        # NOTE(review): plain substring grep on $zk_pid/$port can match
        # unrelated netstat columns -- confirm acceptable for this env.
        zk_port=$(netstat -antlp | grep "$zk_pid" | grep -c "$port")
        for name in ${names//,/ }; do
            open_falcon_push "$name" "name=$name,port=$port,service=$service" "$zk_port"
        done
    done
}

#----------------------------------end
# Entry point of the template: only the daemon-status check is enabled;
# uncomment ports_check (or call both) for per-port listen checks.
function monitor() {
   noport_check
   #ports_check
}

python脚本模板

[root@nginx001 vhosts]# /home/scripts/nginx/nginx.openfiles_check.py  --service nginx  --ports 0  --names  nginx.openfiles_check
success
[{'metric': 'nginx.openfiles_check', 'endpoint': 'nginx001', 'timestamp': 1575447496, 'step': 120, 'value': 1, 'counterType': 'GAUGE', 'tags': 'name=nginx.openfiles_check,port=0,service=nginx'}]
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import time,datetime
import json
import socket
import requests
import os

import argparse
# CLI contract shared by all monitor templates: --service/--ports/--names,
# each a comma-separated string, all required.
parser = argparse.ArgumentParser(description='monitor script')
parser.add_argument("-s","--service", help="指定服务", type=str, required=True)
parser.add_argument("-p","--ports", help="指定端口号", type=str, required=True)
parser.add_argument("-n","--names", help="指定监控指标", type=str, required=True)
# Parsed at import time, so merely importing this file requires the options.
args = parser.parse_args()

def open_falcon_push(payload):
    """POST a list of metric dicts to the local Open-Falcon agent.

    Prints the agent response on success, a small JSON error envelope on a
    non-200 status, and the traceback text if the request itself fails.
    """
    import traceback  # fix: used in the except branch but never imported

    url = 'http://127.0.0.1:1988/v1/push'
    headers = {"Content-Type": "application/json"}
    try:
        r = requests.post(url, data=json.dumps(payload), headers=headers, timeout=3)
        if r.status_code == 200:
            print(r.text)
        else:
            print('{"err":1,"msg":"%s"}' % r.text)
    except Exception:
        # Fix: the original printed the literal string
        # "Exception: {traceback.format_exc(chain=False)}" (missing f-prefix).
        print("Exception: {}".format(traceback.format_exc(chain=False)))

#---------------------------------------------change
def main():
    """Build an Open-Falcon payload from the configured checks and push it.

    Reads --service/--ports/--names from the module-level ``args`` and
    always prints the assembled payload (the ``finally`` clause), matching
    the sample output in the docs above.
    """
    import traceback  # fix: used in the except branch but never imported

    service = args.service
    ports = args.ports
    names = args.names
    hostname = socket.gethostname()
    timestamp = int(time.time())
    step = 120
    payload = []

    try:
        for port in ports.split(','):
            for name in names.split(','):
                tag = "name={},port={},service={}".format(name, port, service)
                # Only the open-files check is wired up in this template.
                if "openfiles_check" in name:
                    payload.append({
                        'metric': name,
                        'endpoint': hostname,
                        'timestamp': timestamp,
                        'step': step,
                        'value': openfiles(),
                        'counterType': "GAUGE",
                        # Fix: original key was misspelled 'tages', so the
                        # agent never saw the tags (sample output uses 'tags').
                        'tags': tag
                    })
        open_falcon_push(payload)
    except Exception:
        # Fix: original printed the literal "Exception: {msg}" (missing
        # f-prefix) and referenced the never-imported traceback module.
        print("Exception: {}".format(traceback.format_exc(chain=False)))
    finally:
        print(payload)

def openfiles():
    """Scan current nginx error logs for a recent "Too many open files".

    Returns:
        1 when healthy (no recent occurrence, or log dir absent),
        0 when the message appears in a line stamped within the last
        five minutes.
    """
    nginx_path = "/data/logs/"
    value = 1

    # Collect full paths of current error logs, skipping rotated files
    # containing '2019'. Fix: the original stored only basenames and later
    # opened nginx_path + basename, which crashed for logs found in
    # subdirectories during the walk.
    log_files = []
    for dirpath, _dirs, file_list in os.walk(nginx_path):
        for fname in file_list:
            if 'error.log' in fname and '2019' not in fname:
                log_files.append(os.path.join(dirpath, fname))

    # Hoisted: the original recomputed this threshold for every log line.
    # NOTE(review): comparing bare HH:MM:SS strings breaks across midnight
    # and assumes lines start with "DATE TIME ..." -- confirm log format.
    fiveminago = (datetime.datetime.now()
                  - datetime.timedelta(minutes=5)).strftime("%H:%M:%S")

    for logpath in log_files:
        # with-statement guarantees the handle is closed even on error.
        with open(logpath, "r", encoding='utf-8', errors='replace') as f:
            for line in f:
                parts = line.rstrip("\n").split(" ")
                if len(parts) < 2:
                    continue  # guard malformed lines (orig raised IndexError)
                if parts[1] >= fiveminago and "Too many open files" in line:
                    value = 0
    return value
#--------------------------------------end

# Script entry point: everything happens in main(); argparse already ran
# at import time above.
if __name__ == '__main__':
    # print(sys.argv)
    main()

go脚本模板


emacs

Emacs

org-mode

Orgmode

Donations

打赏

Copyright

© 2025 Jasper Hsu

Creative Commons

Creative Commons

Attribute

Attribute

Noncommercial

Noncommercial

Share Alike

Share Alike