scripts-monitor
- TAGS: Script
监控_rabbitmq
monitor_rabbitmq.sh
#!/bin/bash
# 20140102
# AUTHOR coolber
# Checks every RabbitMQ queue depth via `rabbitmqctl list_queues` and mails an
# alarm for each queue holding more than METRIC_NUM messages.  If rabbitmqctl
# itself fails, mails a "service unavailable" alarm instead.
DATE=$(date "+%F %T")
MONITOR_LOG=/root/scripts/log/monitor_rabbitmq_queue.log
# fix: original line was "RECEIVE_MAILER2[email protected]" -- the '=' was missing,
# so the variable was never set.  NOTE(review): address looks anonymized; confirm.
RECEIVE_MAILER2=[email protected]
METRIC_NUM=100   # alarm threshold: messages per queue

echo "### ${DATE} Rabbitmq queue ###" >"$MONITOR_LOG"
/usr/local/rabbitmq/sbin/rabbitmqctl list_queues >>"$MONITOR_LOG"
if [ $? -eq 0 ]
then
    while read -r i
    do
        # Last column is the message count; skip header/blank/non-numeric lines.
        CUR_NUM=$(echo "$i" | awk '{print $NF}' | grep -vE '^#|^$' | grep -E '^[0-9]+$')
        # fix: with an empty CUR_NUM the numeric test errored on every header line
        [ -z "$CUR_NUM" ] && continue
        if [ "$METRIC_NUM" -lt "$CUR_NUM" ]
        then
            echo -e "### $DATE #### \n${i} " | mail -s "Host: nameserver,rabbitmq queue is Abnormal" "$RECEIVE_MAILER2"
        fi
    done <"${MONITOR_LOG}"
else
    echo "rabbitmq queue info Abnormal" | mail -s " rabbitmq service is maybe Unavailable " "$RECEIVE_MAILER2"
fi
监控_实时文件对比
#!/bin/bash
# Watches /mnt/logs/consumer/apps_consumer.log: every $interval seconds it
# re-reads the last line (date, time, log sequence number) and mails an alarm
# when nothing changed, i.e. the consumer stopped writing.
DATE=$(date "+%F %T")
i=0
interval=1700
count=$((1700 / interval))   # integer division; number of checks per run
# fix: original lines were "RECEIVE_MAILER1[email protected]" -- missing '='.
# NOTE(review): addresses look anonymized; confirm real recipients.
RECEIVE_MAILER1=[email protected]
RECEIVE_MAILER2=[email protected]

# Split the log's last line into "date time seqno rest-of-line".
last_fields() {
    tail -1 /mnt/logs/consumer/apps_consumer.log | sed 's#\([^ ]*\) \([^,]*\),\([0-9]*\)\(\[.*\)#\1 \2 \3 \4#g'
}

old_date=($(last_fields))
old_date_d=${old_date[0]}
old_date_h=${old_date[1]}
old_java_n=${old_date[2]}

while [ "$i" -lt "$count" ]
do
    sleep "$interval"
    new_date=($(last_fields))
    new_date_d=${new_date[0]}
    new_date_h=${new_date[1]}
    new_java_n=${new_date[2]}
    # Unchanged date + time + sequence number means the log file is stale.
    # fix: replaced deprecated `[ ... == ... -a ... ]` with `[[ ... && ... ]]`.
    if [[ "${old_date_d}" == "${new_date_d}" && "${old_date_h}" == "${new_date_h}" && "${old_java_n}" == "${new_java_n}" ]]; then
        echo -e "### $DATE #### \n'/mnt/logs/consumer/apps_consumer.log not updated' \n consumer_info:\n ${new_date[@]} " | mail -s "Host(134): searchlog,rabbitmq queue is Abnormal" "$RECEIVE_MAILER1"
        echo -e "### $DATE #### \n'/mnt/logs/consumer/apps_consumer.log not updated' \n consumer_info:\n ${new_date[@]} " | mail -s "Host(134): searchlog,rabbitmq queue is Abnormal" "$RECEIVE_MAILER2"
    fi
    ((i++))
done
监控_网络流量
监控_网络流量1
#!/bin/bash
# Interactive NIC traffic monitor: samples /proc/net/dev once per second and
# prints the RX/TX rate of the given interface with auto-scaled B/KB/MB units.
# Usage: sh traffic_monitor.sh eth0
PATH=/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin; export PATH

function traffic_monitor
{
    # interface name, e.g. eth0
    eth=$1
    # Bail out if the interface does not exist on this host.
    if [ ! -d "/sys/class/net/$eth" ];then
        echo -e "Network-Interface Not Found"
        echo -e "You system have network-interface:\n`ls /sys/class/net`"
        exit 5
    fi
    while [ "1" ]
    do
        STATUS="fine"
        # Byte counters now ...
        RXpre=$(cat /proc/net/dev | grep "$eth" | tr : " " | awk '{print $2}')
        TXpre=$(cat /proc/net/dev | grep "$eth" | tr : " " | awk '{print $10}')
        # ... and one second later: the delta is the rate in bytes/second.
        sleep 1
        RXnext=$(cat /proc/net/dev | grep "$eth" | tr : " " | awk '{print $2}')
        TXnext=$(cat /proc/net/dev | grep "$eth" | tr : " " | awk '{print $10}')
        clear
        RX=$((RXnext - RXpre))
        TX=$((TXnext - TXpre))
        # Scale RX: B/s below 1 KiB, MB/s above 1 MiB, otherwise KB/s.
        if [[ $RX -lt 1024 ]];then
            RX="${RX}B/s"
        elif [[ $RX -gt 1048576 ]];then
            RX=$(echo $RX | awk '{print $1/1048576 "MB/s"}')
            # fix: was `$STATUS="busy"`, which expanded to the command `fine=busy`
            # and never updated STATUS.
            STATUS="busy"
        else
            RX=$(echo $RX | awk '{print $1/1024 "KB/s"}')
        fi
        # Scale TX the same way.
        if [[ $TX -lt 1024 ]];then
            TX="${TX}B/s"
        elif [[ $TX -gt 1048576 ]];then
            TX=$(echo $TX | awk '{print $1/1048576 "MB/s"}')
        else
            TX=$(echo $TX | awk '{print $1/1024 "KB/s"}')
        fi
        echo -e "Date: `date +%F`"
        echo "------------------------------"
        echo -e "Time:`date +%k:%M:%S` $eth \t 'RX:'$RX 'TX:'$TX "
        echo "------------------------------"
        echo -e "Press 'Ctrl+C' to exit"
    done
}

# Require the interface name as the first argument.
if [[ -n "$1" ]];then
    traffic_monitor $1
else
    echo -e "None parameter,please add system netport after run the script! \nExample: 'sh traffic_monitor eth0'"
fi
监控_网络流量2
#!/bin/bash
# Prints per-second TX/RX throughput (KB/s) for a network interface by
# diffing /sys/class/net/<if>/statistics counters once per second.
# usage: $0 network-interface   e.g. $0 eth0

# fix: validate the argument FIRST -- the original read
# /sys/class/net/$1/statistics/* before the -z check, so running the script
# with no argument produced cat errors instead of the usage message.
if [ -z "$1" ]; then
    echo
    echo usage: $0 network-interface
    echo
    echo e.g. $0 eth0
    echo
    exit
fi
IF=$1
R2=$(cat /sys/class/net/"$IF"/statistics/rx_bytes)
T2=$(cat /sys/class/net/"$IF"/statistics/tx_bytes)
while true
do
    R1=$(cat /sys/class/net/"$IF"/statistics/rx_bytes)
    T1=$(cat /sys/class/net/"$IF"/statistics/tx_bytes)
    # fix: arithmetic via $(( )) instead of spawning expr four times per loop
    TBPS=$((T1 - T2))
    RBPS=$((R1 - R2))
    TKBPS=$((TBPS / 1024))
    RKBPS=$((RBPS / 1024))
    eval $(date "+day=%d; month=%m; year=%Y; hour=%H; minute=%M; second=%S")
    INSTFIL4="$hour:$minute:$second"
    echo "$INSTFIL4 tx $IF: $TKBPS kb/ rx $RKBPS kb/s "
    # Reuse this second's readings as the next baseline instead of re-reading.
    R2=$R1
    T2=$T1
    sleep 1s
done
监控_连接数检查
# Write the conntrack monitor script to disk (quoted heredoc: no expansion here).
cat <<\EOF> monitor_conntrack.sh
#!/bin/bash
# Logs the current conntrack table size and mails an alarm once it reaches
# ALARM_NUM (risk of "nf_conntrack: table full" packet drops).
#set -x
CON_NUM=$(cat /proc/sys/net/netfilter/nf_conntrack_count)
TRACK_LOG=/root/tools/logs/conntrack.log
ALARM_NUM=50000
REC_MAILER='[email protected]'
DATE=$(date '+%F %T')
HOST_NAME=$(hostname)
echo "$DATE $HOST_NAME Current conntrack number is $CON_NUM " >>$TRACK_LOG
if [ $ALARM_NUM -le $CON_NUM ]
then
    echo "$DATE $HOST_NAME Current conntrack number is $CON_NUM "| mail -s 'Conntrack alarm' $REC_MAILER
fi
EOF
监控-marathon-check
#!/bin/python # -*- coding: utf-8 -*- import urllib2 import sys import subprocess import urllib import time import socket import os,re import requests,json import commands def get_host_ip(): try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(('8.8.8.8', 80)) ip = s.getsockname()[0] finally: s.close() return ip def get_nginx_conf(): apps = [] try: if os.path.exists("/home/sh/nginx_marathon/nginx_conf.sh"): os.system('/usr/bin/sh /home/sh/nginx_marathon/nginx_conf.sh') if os.path.exists("/home/sh/nginx_marathon/nginx_conf"): fs = commands.getoutput('cat /home/sh/nginx_marathon/nginx_conf') if fs: b=re.compile('^upstream\s*(.\S*)\s*{\n\s+(.*)',re.MULTILINE) tags = (b.findall(fs)) #print tags for i in tags: try: #b=re.findall('(.*)-\d+',i[0])[0] d=re.compile('(\d+.\d+.\d+.\d+)+',re.MULTILINE) apps.append([re.findall('(.*)-\d+',i[0])[0],d.findall(i[1])]) except: continue except Exception as e: print(str(e)) return apps def dindin(token,msg): try: dindinapi = 'https://oapi.dingtalk.com/robot/send?access_token=%s' % token headers = {'content-type': 'application/json','charset':'utf-8'} data = { "msgtype": "text", "text": { "content": msg }, "at": { "atMobiles": [ ], "isAtAll": True } } req=requests.post(dindinapi,json=data,headers=headers) except Exception as e: print(str(e)) def check_marathon(): try: marathon_ip = '192.168.101.150' host_ip = get_host_ip() hostname = commands.getoutput('hostname') tokens = 'a6184389010ffc41f045ab23eab3d40f8ebe1cd9526e7832a131d718aa13057c' apps = get_nginx_conf() r = requests.Session() r.auth = ('wowotuan','wowotuan') for i in apps: if i[0]: try: s = r.get('http://%s:8080/v2/apps/%s' % (marathon_ip,i[0].encode('utf8'))) app_infos = s.json()['app'] hosts = [] if app_infos: for task in app_infos['tasks']: hosts.append(task['host'].encode('utf8')) for ip in i[1]: if ip.encode('utf8') not in hosts: msg = 'tc hb2 nginx:%s-%s app:%s ip:%s 注册失败' % (hostname,host_ip,i[0],ip) print msg dindin(tokens,msg) else: msg = 'tc hb2 
nginx:%s-%s app:%s ip:%s 注册成功' % (hostname,host_ip,i[0],ip) print msg except: continue except Exception as e: print(str(e)) #print get_nginx_conf() check_marathon()
[root@tc-xy-nginx002 ~]# /usr/bin/python /home/sh/nginx_marathon/nginx_marathon_check.py tc hb2 nginx:tc-xy-nginx002-192.168.100.81 app:xy-qd-api-yzprod ip:192.168.101.248 注册成功 tc hb2 nginx:tc-xy-nginx002-192.168.100.81 app:yzbizcenter-admin-yzprod ip:192.168.101.26 注册成功 tc hb2 nginx:tc-xy-nginx002-192.168.100.81 app:yzbizcenter-admin-yzprod ip:192.168.101.2 注册成功
高峰关闭服务
18 19 20 21 22 高峰时间停止服务运行
[root@jumpserver01 ~]# cat /alidata/xuchangwei/checkpy.sh
#!/bin/bash
# During peak hours (18:00-22:59) kill the batch jobs; outside peak hours make
# sure they are running, restarting any that died.  check_pid loops for ~one
# minute in 10s steps, matching a once-per-minute cron schedule.
export JAVA_HOME=/usr/java/jdk1.8.0_40
export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin

function check_time() {
    # Peak hours as produced by `date +%H` (substring match against the list).
    high="18 19 20 21 22"
    if [[ "$high" =~ "$(date +%H)" ]]; then
        echo "kill service"
        kill -9 $(ps -ef |grep updateCover |grep -v grep|awk '{print $2}')
        kill -9 $(ps -ef |grep monitorjob.py |grep -v grep|awk '{print $2}')
        kill -9 $(ps -ef |grep imagedeal-jar-with-dependencies.jar|grep -v grep| awk '{print $2}')
    else
        check_pid
    fi
}

function check_pid() {
    local i=0
    local interval=10
    # fix: replaced deprecated $[...] arithmetic with $(( ))
    local count=$((60 / interval))
    while [ "$i" -lt "$count" ]; do
        p_num=$(ps -ef |grep updateCover |grep -v grep |wc -l)
        m_num=$(ps -ef |grep monitorjob.py |grep -v grep |wc -l)
        i_num=$(ps -ef |grep imagedeal-jar-with-dependencies.jar|grep -v grep|wc -l)
        if [ "$p_num" == '0' ]; then
            cd /alidata/xuchangwei/dealcover
            nohup python updateCover.py 1 300000000 &>>to.log &
        fi
        if [ "$m_num" == '0' ]; then
            cd /alidata/xuchangwei/monitorcv
            nohup python monitorjob.py 10000 &>m.log &
        fi
        if [ "$i_num" == '0' ]; then
            cd /alidata/imagedeal/project/waterDeal/target/
            nohup java -jar /alidata/imagedeal/project/waterDeal/target/imagedeal-jar-with-dependencies.jar online 45 >/alidata/imagedeal/project/imagedeal.log 2>&1 &
        fi
        ((i++))
        sleep "$interval"
    done
}

check_time
prometheus
阿里SLS分析
# -*- coding:UTF-8 -*-
# Pulls service QPS / latency / slow-API / exception metrics from Aliyun SLS
# and exposes them as Prometheus gauges, refreshed roughly every 40 seconds.
import datetime
import os
import time

from alibabacloud_cms20190101 import models as cms_20190101_models
from alibabacloud_cms20190101.client import Client as Cms20190101Client
from alibabacloud_sls20201230 import models as sls_20201230_models
from alibabacloud_sls20201230.client import Client as Sls20201230Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient
from prometheus_client import Gauge, start_http_server

g4_1 = Gauge('sls_undertow_service_qps', 'service qps', ['service'])
g4_2 = Gauge('sls_undertow_service_avg_rt', 'service avg rt', ['service'])
g4_3_0 = Gauge('sls_undertow_api_slow_level0', 'api slow graater than 200ms ', ['service', 'uri'])
g4_3_4 = Gauge('sls_undertow_api_slow_detail', 'slow query detail',
               ['service', 'uri', 'avg_timeused', 'min_timeused', 'max_timeused'])
g6 = Gauge('sls_exception_qpm', 'exception qpm', ['service'])


def get_content_from_file(file_path):
    """Return the first line of file_path (stripped), or "" if missing/empty."""
    if not os.path.exists(file_path):
        return ""
    with open(file_path, 'r') as f:
        content = f.readlines()
        if content:
            return content[0].strip()
    return ""


def set_content_to_file(content, file_path):
    """Overwrite file_path with content."""
    # fix: dropped redundant f.close() inside the with-block
    with open(file_path, 'w') as f:
        f.write(content)


class getSlsLogData:
    # SLS credentials and targets.
    # NOTE(review): credentials hardcoded (anonymized here) -- move to env/config.
    access_key_id = "Lxx"
    access_key_secret = "fxxx"
    undertow_project = "singapore"
    undertow_logstore = "undertow"
    logback_project = "overseas-logback-server"
    logback_logstore = "server-exception"
    sls_endpoint = 'ap-southeast-1-intranet.log.aliyuncs.com'  # fix: pointless f-string removed
    # Services that must always appear in the exception gauge (zeroed each run).
    service_all = [
        "attribute-server",
        "attribute-server-manager",
    ]
    # URI prefixes whose trailing path segments are collapsed into /{param}
    # so per-id URIs aggregate into one label.
    prefix_uris = [
        "/episodic_drama",
        "/ads_unlock/episodic_drama",
    ]

    def __init__(self):
        config = open_api_models.Config(
            access_key_id=self.access_key_id,
            access_key_secret=self.access_key_secret
        )
        config.endpoint = self.sls_endpoint
        self.client = Sls20201230Client(config)
        self.is_pre = False  # whether the last row matched a prefix_uri

    def get_sls_data(self, current=0, project="", logstore="", query=""):
        """Run `query` over the last 60 seconds of project/logstore.

        Returns the SLS response body (iterable of row dicts) or [] on error.
        """
        if not current:
            current = int(time.time())
        if not project:
            project = self.undertow_project
        if not logstore:
            logstore = self.undertow_logstore
        from_time = current - 60
        to_time = current
        headers = {}
        runtime = util_models.RuntimeOptions()
        get_logs_request = sls_20201230_models.GetLogsRequest(from_=from_time, to=to_time, query=query)
        try:
            res = self.client.get_logs_with_options(project, logstore, get_logs_request, headers, runtime)
            return res.body
        except Exception as error:
            UtilClient.assert_as_string(error)
            return []

    def set_metric_service_qps(self):
        """Per-service request count over the last minute."""
        query = '*|select "__tag__:_container_name_" as service ,COUNT(*) as cnt group by service order by cnt desc '
        res_list = self.get_sls_data(query=query)
        for data in res_list:
            service = data["service"]
            cnt = float(data["cnt"])
            g4_1.labels(service).set(cnt)

    def set_metric_service_avg_rt(self):
        """Per-service average response time, converted to milliseconds."""
        query = '* |select "__tag__:_container_name_" as service , avg(timeUsed) as avg_time group by service order by avg_time desc '
        res_list = self.get_sls_data(query=query)
        for data in res_list:
            service = data["service"]
            try:
                avg_time_ns = float(data["avg_time"])
            except (TypeError, ValueError):
                # avg_time can be null/non-numeric for idle services -- skip
                continue
            avg_time_ms = int(avg_time_ns / 1000)
            g4_2.labels(service).set(avg_time_ms)

    def get_undertow_data_daily(self, query=""):
        """Run `query` over a fixed 30-minute window before today's midnight
        (from midnight-9000s to midnight-7200s).  Returns body or []."""
        now = datetime.datetime.now()
        midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
        midnight_ts = int(midnight.timestamp())
        from_time = midnight_ts - 9000
        to_time = from_time + 1800
        project = self.undertow_project
        logstore = self.undertow_logstore
        headers = {}
        runtime = util_models.RuntimeOptions()
        get_logs_request = sls_20201230_models.GetLogsRequest(from_=from_time, to=to_time, query=query)
        try:
            res = self.client.get_logs_with_options(project, logstore, get_logs_request, headers, runtime)
            return res.body
        except Exception as error:
            UtilClient.assert_as_string(error)
            return []

    def set_metric_slow_detail(self):
        """Per-service slow-URI detail (>200ms avg); prefix_uris are aggregated."""
        g4_3_4.clear()
        for service in self.get_service_list():
            query = ('* and __tag__:_container_name_: {} |select COUNT(*) as cnt, '
                     'avg(timeUsed)/1000 as avg_timeused, min(timeUsed)/1000 as min_timeused, '
                     'max(timeUsed)/1000 as max_timeused , sum(timeUsed)/1000 as sum_timeused, '
                     '"__tag__:_container_name_" as service ,uri group by service,uri '
                     'HAVING avg_timeused >200 order by avg_timeused desc limit 100000').format(service)
            res_list = self.get_undertow_data_daily(query=query)
            pre_map = {}
            # fix: removed dead `sum` accumulator that shadowed the builtin and
            # was never read.
            for data in res_list:
                self.is_pre = False
                uri = data["uri"]
                cnt = float(data["cnt"])
                min_timeused = float(data["min_timeused"])
                max_timeused = float(data["max_timeused"])
                for prefix_uri in self.prefix_uris:
                    if not uri.startswith(prefix_uri):
                        continue
                    # Collapse trailing path segments into "/{param}" placeholders.
                    uri_len = len(uri.split("/"))
                    prefix_uri_len = len(prefix_uri.split("/"))
                    param_str = ""
                    for _ in range(uri_len - prefix_uri_len):
                        param_str = param_str + "/{param}"
                    sum_timeused = float(data["sum_timeused"])
                    key = prefix_uri + param_str
                    pre_uri_data = pre_map.get(key, {})
                    pre_uri_data["sum_timeused"] = float(pre_uri_data.get("sum_timeused", 0)) + sum_timeused
                    pre_uri_data["cnt"] = float(pre_uri_data.get("cnt", 0)) + cnt
                    pre_uri_data["min_timeused"] = min(float(pre_uri_data.get("min_timeused", min_timeused)), min_timeused)
                    pre_uri_data["max_timeused"] = max(float(pre_uri_data.get("max_timeused", max_timeused)), max_timeused)
                    pre_map[key] = pre_uri_data
                    self.is_pre = True
                    break
                if not self.is_pre:
                    # Non-aggregated URI: emit as-is.
                    avg_timeused = data["avg_timeused"]
                    min_timeused = str(min_timeused)
                    max_timeused = str(max_timeused)
                    g4_3_4.labels(service, uri, avg_timeused, min_timeused, max_timeused).set(cnt)
            # Emit the aggregated prefix buckets.
            for uri, pre_uri_data in pre_map.items():
                cnt = pre_uri_data["cnt"]
                sum_timeused = pre_uri_data["sum_timeused"]
                min_timeused = str(pre_uri_data["min_timeused"])
                max_timeused = str(pre_uri_data["max_timeused"])
                avg_timeused = str(round(sum_timeused / cnt, 2))
                g4_3_4.labels(service, uri, avg_timeused, min_timeused, max_timeused).set(cnt)

    def get_service_list(self):
        """Distinct container/service names seen in the daily window."""
        service_list = []
        query = '*|select DISTINCT "__tag__:_container_name_" as service'
        res_list = self.get_undertow_data_daily(query=query)
        for data in res_list:
            service_list.append(data["service"])
        return service_list

    def set_metric_api_slow_level0(self):
        """Count of requests slower than 200ms (timeUsed in microseconds)."""
        query = '* and timeUsed >= 200000|select "__tag__:_container_name_" as service ,uri, COUNT(*) as cnt group by service,uri order by cnt desc '
        res_list = self.get_sls_data(query=query)
        g4_3_0.clear()
        for data in res_list:
            g4_3_0.labels(data["service"], data["uri"]).set(float(data["cnt"]))

    def set_metric_logback_exception(self):
        """Exceptions per minute per service; known services default to 0."""
        query = "* | select __topic__ as service , COUNT(*) as cnt group by service"
        res_list = self.get_sls_data(project=self.logback_project, logstore=self.logback_logstore, query=query)
        g6.clear()
        g6.labels("content-distribution-platform").set(0)
        for service in self.service_all:
            g6.labels(service).set(0)
        for data in res_list:
            g6.labels(data["service"]).set(data["cnt"])


if __name__ == '__main__':
    already_file = "/opt/ops/others/already_run"
    start_http_server(9111)  # exporter listens on :9111 (original comment said 8006)
    print("监控程序启动...")
    data4 = getSlsLogData()
    while True:
        try:
            # The expensive slow-detail scan runs once per day, guarded by a
            # marker file holding the last day it ran.
            current_day = datetime.datetime.now().strftime("%Y%m%d")
            already_day = get_content_from_file(already_file)
            if current_day != already_day:
                set_content_to_file(current_day, already_file)
                print("执行慢接口统计任务")
                data4.set_metric_slow_detail()
                print("慢接口统计任务执行完毕")
            data4.set_metric_service_qps()
            data4.set_metric_service_avg_rt()
            data4.set_metric_api_slow_level0()
            data4.set_metric_logback_exception()
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print("{} 监控数据采集成功".format(current_time))
            time.sleep(40)
        except Exception as e:
            print(e)
            time.sleep(40)
open-falcon
网络质量监控脚本
脚本逻辑:通过定量ping内网地址,将丢包的比例上报到open-falcon上,并在open-falcon中设定阈值;
#!/bin/sh
#########################
# Pings two internal gateways ten times each and pushes the packet-loss
# percentage to the local open-falcon agent; alarm thresholds live in
# open-falcon itself.
#system variable
timestamp=$(date +%s)
host_name=$(hostname)
# Field 6 of the "packet loss" summary line, e.g. "30%" -> "30".
tc_ping=$(ping -c 10 192.168.100.43 |grep "packet loss" |awk '{print $6}' |sed 's/%//g')
sc_ping=$(ping -c 10 192.168.10.65 |grep "packet loss" |awk '{print $6}' |sed 's/%//g')
#########################
main () {
    curl -X POST -d '[{"metric": "tc high-speed channel", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$tc_ping',"counterType": "GAUGE","tags": ""}]' http://127.0.0.1:1988/v1/push
    curl -X POST -d '[{"metric": "sc high-speed channel", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$sc_ping',"counterType": "GAUGE","tags": ""}]' http://127.0.0.1:1988/v1/push
}
main
open-falcon-zookeeper
[root@zk-1 ~]# crontab -l
################## monitor falcon agent ######################
*/1 * * * * /usr/bin/sh /data/work/open-falcon/script/report_agent.sh
[root@zk-1 ~]# cat /data/work/open-falcon/script/report_agent.sh
#!/bin/bash
# Pushes ZooKeeper health metrics (from the four-letter "mntr" command) to the
# local open-falcon agent.  Leader nodes additionally report zk_followers.
# fix: the original repeated the same nc+grep+curl pair ~26 times for leader
# and follower branches; factored into push()/mntr_value() and a metric loop.
timestamp=$(date +%s)
host_name=$(hostname)

### check zk
zk_pid=$(ps -ef|grep 'zookeeper'|grep -v 'grep'|awk '{print $2}')
zk_port_2181=$(netstat -antlp|grep "$zk_pid"|grep 2181|wc -l)
zk_num=$(ps -ef|grep 'zookeeper'|grep -v grep | wc -l)

# push <metric> <value>: send one GAUGE sample to the local falcon agent.
push() {
    curl -X POST -d "[{\"metric\": \"$1\", \"endpoint\": \"$host_name\", \"timestamp\": $timestamp, \"step\": 60,\"value\": $2,\"counterType\": \"GAUGE\",\"tags\": \"\"}]" http://127.0.0.1:1988/v1/push
}

# mntr_value <key>: extract one statistic from `echo mntr | nc`.
mntr_value() {
    echo mntr | nc 127.0.0.1 2181 | grep "$1" | awk '{print $2}'
}

push zookeeper.status "$zk_num"
push zookeeper.connect_port_2181 "$zk_port_2181"

# Statistics reported by every node; the leader also exposes zk_followers.
metrics="zk_avg_latency zk_max_latency zk_min_latency zk_packets_received zk_packets_sent zk_num_alive_connections zk_outstanding_requests zk_znode_count zk_watch_count zk_ephemerals_count zk_approximate_data_size zk_open_file_descriptor_count zk_max_file_descriptor_count"
role=$(mntr_value zk_server_state)
[ "$role" = "leader" ] && metrics="$metrics zk_followers"

for m in $metrics; do
    push "ZK.$m" "$(mntr_value "$m")"
done
服务监控治理
监控脚本说明
参数说明: --service:服务类型 --ports:服务端口,多端口间,逗号分隔。没有端口则填0。 --names: 监控指标。多个指标以,逗号分隔。
bash脚本模板
monitor[root@nginx001 vhosts]# crontab -l #Ansible: zookeeper.connect_port */1 * * * * /home/scripts/zookeeper/zookeeper.connect_port.sh --service zk --ports 2181,3888 --names zk.connect_port #Ansible: zookeeper.daemon_status */1 * * * * /home/scripts/zookeeper/zookeeper.daemon_status.sh --service zk --ports 0 --names zk.daemon_status
#!/usr/bin/env bash
# Generic open-falcon push template.
# Usage: script.sh --service zk --ports 2181,3888 --names zk.connect_port
# source /etc/profile
timestamp=$(date +%s)
host_name=$(hostname)
exit_status=0

function usage(){
cat <<EOF
Usage: $0 [MODE] [OPTION]
OPTION:
    -s, --service <server_name>   指定服务
    -p, --ports <port1,port2...>  指定端口号
    -n, --names <tag_name>        指定监控指标
    --help                        show help
EOF
}

parameters=$(getopt -o s:p:n: --long service:,ports:,names:,help -n "$0" -- "$@")
[ $? -ne 0 ] && { echo "Try '$0 --help' for more information."; exit 1; }
eval set -- "$parameters"
while true; do
    case "$1" in
        -s|--service) service=$2; shift 2;;
        -p|--ports) ports=$2; shift 2;;
        -n|--names) names=$2; shift 2;;
        --help) usage; exit;;
        --) shift
            FIRST=$1
            SECOND=$2
            LAST=$3
            break;;
        *) usage; exit 1;;
    esac
done

# open_falcon_push <metric> <tag> <value>: send one GAUGE sample.
function open_falcon_push() {
    local metric=$1
    local tag=$2
    local value=$3
    # fix: the JSON key was written as '"'metric'"', relying on an unquoted
    # bareword to produce "metric"; write the literal key inside the quotes.
    curl --connect-timeout 5 -X POST -d '[{"metric": "'$metric'", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$value',"counterType": "GAUGE","tags": "'$tag'"}]' http://127.0.0.1:1988/v1/push
    #echo '[{"metric": "'$metric'", "endpoint": "'$host_name'", "timestamp": '$timestamp', "step": 60,"value": '$value',"counterType": "GAUGE","tags": "'$tag'"}]' http://127.0.0.1:1988/v1/push
}

[ -z "$service" ] && { echo -e "\033[31;1;4mERR\033[0m: the service is not found, please --help" && exit_status=1; }
[ -z "$ports" ] && { echo -e "\033[31;1;4mERR\033[0m: the ports are not found, please --help" && exit_status=1; }
[ -z "$names" ] && { echo -e "\033[31;1;4mERR\033[0m: the names are not found, please --help" && exit_status=1; }
[ "$exit_status" == 1 ] && exit 1

#----------------------------------change
[ -z "$(which nc)" ] && yum install -y nc
# 1 if zookeeper answers "imok" to the four-letter "ruok" probe, else 0.
daemon_status=$(echo ruok | nc 127.0.0.1 2181|grep imok|wc -l)
# NOTE(review): if no QuorumPeerMain is running zk_pid is empty and the
# netstat grep below matches everything -- confirm intended behavior.
zk_pid=$(jps |grep QuorumPeerMain|grep -v 'grep'|awk '{print $1}')

# Push daemon_status once per requested name (port fixed to 0 in the tag).
function noport_check() {
    for port in $(echo ${ports//,/ }); do
        if [ "$port" -ge 0 ] 2>/dev/null; then :;else
            echo -e "\033[31;1;4mERR\033[0m: port $port is not number" && exit 1
        fi
        for name in $(echo ${names//,/ }); do
            open_falcon_push "$name" "name=$name,port=0,service=$service" "$daemon_status"
        done
    done
}

# Push the per-port listen state (netstat line count) for each name.
function ports_check() {
    for port in $(echo ${ports//,/ }); do
        if [ "$port" -ge 0 ] 2>/dev/null; then :;else
            echo -e "\033[31;1;4mERR\033[0m: port $port is not number" && exit 1
        fi
        for name in $(echo ${names//,/ }); do
            zk_port=$(netstat -antlp|grep "$zk_pid"|grep "$port"|wc -l)
            open_falcon_push "$name" "name=$name,port=$port,service=$service" "$zk_port"
        done
    done
}
#----------------------------------end

function monitor() {
    noport_check
    #ports_check
}
# fix: monitor() was defined but never invoked, so the script pushed nothing.
monitor
python脚本模板
[root@nginx001 vhosts]# /home/scripts/nginx/nginx.openfiles_check.py --service nginx --ports 0 --names nginx.openfiles_check success [{'metric': 'nginx.openfiles_check', 'endpoint': 'nginx001', 'timestamp': 1575447496, 'step': 120, 'value': 1, 'counterType': 'GAUGE', 'tags': 'name=nginx.openfiles_check,port=0,service=nginx'}]
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Generic open-falcon push template (python flavor).
# Usage: script.py --service nginx --ports 0 --names nginx.openfiles_check
import time, datetime
import json
import socket
import requests
import os
import argparse
import traceback  # fix: traceback was used in except-handlers but never imported

parser = argparse.ArgumentParser(description='monitor script')
parser.add_argument("-s", "--service", help="指定服务", type=str, required=True)
parser.add_argument("-p", "--ports", help="指定端口号", type=str, required=True)
parser.add_argument("-n", "--names", help="指定监控指标", type=str, required=True)
args = parser.parse_args()


def open_falcon_push(payload):
    """POST the sample list to the local falcon agent and print the result."""
    url = 'http://127.0.0.1:1988/v1/push'
    headers = {"Content-Type": "application/json"}
    try:
        r = requests.post(url, data=json.dumps(payload), headers=headers, timeout=3)
        if r.status_code == 200:
            print(r.text)
        else:
            print('{"err":1,"msg":"%s"}' % r.text)
    except Exception:
        # fix: the message was a plain string missing the f prefix
        print(f"Exception: {traceback.format_exc(chain=False)}")


#---------------------------------------------change
def main():
    """Build one GAUGE sample per (port, name) pair and push the batch."""
    service = args.service
    ports = args.ports
    names = args.names
    hostname = socket.gethostname()
    timestamp = int(time.time())
    step = 120
    payload = []
    try:
        for port in ports.split(','):
            for name in names.split(','):
                tag = "name={},port={},service={}".format(name, port, service)
                if "openfiles_check" in name:
                    value = openfiles()
                    payload.append({
                        'metric': name,
                        'endpoint': hostname,
                        'timestamp': timestamp,
                        'step': step,
                        'value': value,
                        'counterType': "GAUGE",
                        # fix: key was misspelled 'tages', so falcon never saw
                        # the tags (sample output shows 'tags').
                        'tags': tag,
                    })
        open_falcon_push(payload)
    except Exception:
        msg = traceback.format_exc(chain=False)
        # fix: missing f prefix
        print(f"Exception: {msg}")
    finally:
        print(payload)


def openfiles():
    """Scan current nginx error logs under /data/logs/; return 0 if a
    "Too many open files" line appeared in the last five minutes, else 1."""
    nginx_conf = []
    nginx_path = "/data/logs/"
    value = 1
    for path, dir_list, file_list in os.walk(nginx_path):
        for file in file_list:
            if 'error.log' in file and '2019' not in file:
                nginx_conf.append(file)
    # Hoisted out of the per-line loop (loop-invariant).
    fiveminago = (datetime.datetime.now() - datetime.timedelta(minutes=5)).strftime("%H:%M:%S")
    for conf_name in nginx_conf:
        nginxpath = nginx_path + conf_name
        with open(nginxpath, "r", encoding='utf-8', errors='replace') as f:
            for i in f:
                i = i.strip("\n")
                time_date = i.split(" ")
                # assumes field[1] is HH:MM:SS; plain string compare, so it
                # misbehaves across midnight (original behavior kept) -- TODO confirm
                if time_date[1] < fiveminago:
                    pass
                else:
                    if "Too many open files" in i:
                        value = 0
    return value


#--------------------------------------end
if __name__ == '__main__':
    # print(sys.argv)
    main()
go脚本模板