一、缘由
本脚本是阿里云资源利用率定期统计方案中的其中一个脚本。
本脚本可实现:从每天的96个平均值数据(每15分钟一个数据点)中取中位数,再对15天的15个中位数取平均值,得到最终的15天内CPU和内存使用率数值。
进而根据阈值判断资源是否处于低利用率状态。
二、环境
Python3.7 + 阿里云云监控SDK + 阿里云ECS的SDK + pandas + numpy
三、代码实现
# -*- coding: utf-8 -*-
import json
import time
from typing import List

import numpy
import pandas as pd
from alibabacloud_cms20190101 import models as cms_20190101_models
from alibabacloud_cms20190101.client import Client as Cms20190101Client
from alibabacloud_ecs20140526 import models as ecs_20140526_models
from alibabacloud_ecs20140526.client import Client as Ecs20140526Client
# Misspelled legacy alias of the same client class, kept for compatibility.
from alibabacloud_ecs20140526.client import Client as Ecs20140527Client
from alibabacloud_tea_openapi import models as open_api_models
# Region IDs that get_id_name_dict() walks when listing ECS instances.
REGION_ID = [
    'cn-beijing',
    'cn-hangzhou',
    'cn-zhangjiakou',
    'cn-shenzhen',
    'cn-shanghai',
    'cn-hongkong',
    'ap-southeast-1',
    'ap-northeast-1',
]
class GetMonitorData:
    """Helper for querying metric datapoints via the CMS DescribeMetricList API."""

    def __init__(self):
        pass

    @staticmethod
    def create_client(
        access_key_id: str,
        access_key_secret: str,
    ) -> Cms20190101Client:
        """Build a CMS client authenticated with the given credentials.

        Args:
            access_key_id: Alibaba Cloud AccessKey ID.
            access_key_secret: Alibaba Cloud AccessKey secret.

        Returns:
            A client bound to the ``metrics.cn-hangzhou.aliyuncs.com`` endpoint.
        """
        # Use the caller-supplied credentials instead of silently ignoring
        # them in favour of hard-coded literals.
        config = open_api_models.Config(
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
        )
        config.endpoint = 'metrics.cn-hangzhou.aliyuncs.com'
        return Cms20190101Client(config)

    @staticmethod
    def main(
        args: List[str],
    ) -> "cms_20190101_models.DescribeMetricListResponseBody":
        """Fetch the datapoints of one metric for one instance.

        Args:
            args: Positional values in this order:
                ``[namespace, metric_name, start_time, end_time, instance_id]``.
                ``start_time`` / ``end_time`` are passed through unchanged.

        Returns:
            The body of the DescribeMetricList response.
        """
        # NOTE: placeholders - replace with real credentials before running.
        client = GetMonitorData.create_client('accessKeyId', 'accessKeySecret')
        describe_metric_list_request = cms_20190101_models.DescribeMetricListRequest(
            metric_name=args[1],
            namespace=args[0],
            # presumably the aggregation period in seconds (15 minutes) - confirm
            period='900',
            start_time=args[2],
            end_time=args[3],
            length='100',
            # json.dumps quotes the instance ID, so the dimensions string is
            # valid JSON; plain str.format left the value unquoted.
            dimensions=json.dumps({'instanceId': args[4]}),
        )
        res = client.describe_metric_list(describe_metric_list_request)
        return res.body
class GetInstanceIdName:
    """Helper for listing ECS instances via the DescribeInstances API."""

    def __init__(self):
        pass

    @staticmethod
    def create_client(
        access_key_id: str,
        access_key_secret: str,
    ) -> Ecs20140526Client:
        """Build an ECS client authenticated with the given credentials.

        Args:
            access_key_id: Alibaba Cloud AccessKey ID.
            access_key_secret: Alibaba Cloud AccessKey secret.

        Returns:
            A client bound to the ``ecs-cn-hangzhou.aliyuncs.com`` endpoint.
        """
        # Use the caller-supplied credentials instead of silently ignoring
        # them in favour of hard-coded literals.
        config = open_api_models.Config(
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
        )
        config.endpoint = 'ecs-cn-hangzhou.aliyuncs.com'
        return Ecs20140526Client(config)

    @staticmethod
    def main(
        args: List[str],
    ) -> "ecs_20140526_models.DescribeInstancesResponseBody":
        """Fetch one page (up to 50 entries) of instances for a region.

        Args:
            args: Positional values in this order: ``[next_token, region_id]``.
                ``next_token`` is forwarded unchanged as the pagination token.

        Returns:
            The body of the DescribeInstances response.
        """
        # NOTE: placeholders - replace with real credentials before running.
        client = GetInstanceIdName.create_client('accessKeyId', 'accessKeySecret')
        describe_instances_request = ecs_20140526_models.DescribeInstancesRequest(
            region_id=args[1],
            next_token=args[0],
            max_results=50,
        )
        res = client.describe_instances(describe_instances_request)
        return res.body
def get_id_name_dict():
    """Collect every ECS instance in ``REGION_ID`` as an ID -> name mapping.

    Pages through ``GetInstanceIdName.main`` for each region until the
    response carries no further ``next_token``.

    Returns:
        dict mapping instance ID to instance name.
    """
    instance_dict = {}
    for region in REGION_ID:
        # The first page is requested without a token; a made-up sentinel
        # value must not be sent to the API as if it were a real NextToken.
        token = None
        while True:
            result = GetInstanceIdName.main([token, region])
            for instance in result.instances.instance:
                instance_dict[instance.instance_id] = instance.instance_name
            token = result.next_token
            if not token:
                break
    print('实例ID和名字的字典:', instance_dict)
    return instance_dict
def get_median_24h(instance_dict, pre_days, metric_name):
    """Return the per-instance median of one day's ``Average`` datapoints.

    The day queried is the one that started ``pre_days`` days before today's
    local midnight (``pre_days=1`` is yesterday), from 00:00:00 to 23:59:59.

    Args:
        instance_dict: Mapping whose keys are the instance IDs to query.
        pre_days: How many days back from today the queried day lies.
        metric_name: Metric name forwarded to ``GetMonitorData.main``.

    Returns:
        dict mapping instance ID to the median of that day's averages
        (0.0 when no datapoints were returned).
    """
    # Today's local midnight as a Unix timestamp in seconds.
    date_text = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    midnight_today = time.mktime(time.strptime(date_text, '%Y-%m-%d'))

    # Query window boundaries as millisecond timestamps rendered as strings.
    start_time = str(round((midnight_today - 86400 * pre_days) * 1000))
    end_time = str(round((midnight_today - 86400 * (pre_days - 1) - 1) * 1000))

    medians = {}
    for instance_id in instance_dict:
        response = GetMonitorData.main(
            ['acs_ecs_dashboard', metric_name, start_time, end_time, instance_id]
        )
        averages = [
            round(point['Average'], 2)
            for point in json.loads(response.datapoints)
        ]
        # Stopped hosts, or hosts without the monitoring agent, return no
        # datapoints; fall back to a single zero so the median is defined.
        if not averages:
            averages = [0.00]
        medians[instance_id] = numpy.median(averages)
        # Pause between consecutive API calls.
        time.sleep(0.2)
    return medians
def get_average_15days(instance_dict, metric, days=15):
    """Average each instance's daily medians over the most recent days.

    Calls ``get_median_24h`` once per day, for 1 through ``days`` days back
    in that order, and averages the collected medians per instance.

    Args:
        instance_dict: Mapping whose keys are the instance IDs to query.
        metric: Metric name forwarded to ``get_median_24h``.
        days: Number of past days to include; defaults to 15.

    Returns:
        dict mapping instance ID to the mean of its daily medians, rounded
        to three decimal places.

    Raises:
        ValueError: If ``days`` is smaller than 1.
    """
    if days < 1:
        raise ValueError('days must be at least 1')

    medians_by_instance = {}
    for pre_days in range(1, days + 1):
        daily = get_median_24h(instance_dict, pre_days, metric)
        for instance_id, median in daily.items():
            medians_by_instance.setdefault(instance_id, []).append(median)

    averages = {
        instance_id: round(numpy.mean(medians), 3)
        for instance_id, medians in medians_by_instance.items()
    }
    print(averages)
    return averages
def write_to_execl(data, filename='cpu_mem_15days.xlsx'):
    """Write the per-instance usage figures to an Excel workbook.

    Args:
        data: dict mapping a row label to a two-item sequence
            ``[cpu_usage, memory_usage]``.
        filename: Output path; defaults to ``cpu_mem_15days.xlsx`` in the
            current working directory.
    """
    df = pd.DataFrame.from_dict(data, orient='index', columns=['CPU使用率', '内存使用率'])
    df.to_excel(filename)
def label_rows_by_name(instance_dict, rows_by_id):
    """Re-key per-instance rows from instance ID to instance name.

    Instance names are not guaranteed to be unique. When several instances
    share a name, each of them is labelled ``"name (instance_id)"`` so that
    no row overwrites another; unique names are used unchanged.

    Args:
        instance_dict: Mapping of instance ID to instance name.
        rows_by_id: Mapping of instance ID to that instance's row values.

    Returns:
        dict mapping a row label to the row values, in ``instance_dict``
        order. IDs missing from ``rows_by_id`` are skipped.
    """
    name_counts = {}
    for name in instance_dict.values():
        name_counts[name] = name_counts.get(name, 0) + 1

    labelled = {}
    for instance_id, name in instance_dict.items():
        if instance_id not in rows_by_id:
            continue
        if name_counts[name] == 1:
            label = name
        else:
            label = '{} ({})'.format(name, instance_id)
        labelled[label] = rows_by_id[instance_id]
    return labelled


if __name__ == '__main__':
    # Instance ID -> instance name for every instance in the listed regions.
    instance_dict = get_id_name_dict()

    # One row per instance; each metric appends one column value in order.
    id_list_dict = {instance_id: [] for instance_id in instance_dict}
    ecs_metric = ['CPUUtilization', 'memory_usedutilization']
    for metric in ecs_metric:
        metric_data = get_average_15days(instance_dict, metric)
        for instance_id, value in metric_data.items():
            id_list_dict[instance_id].append(value)

    # Label rows by instance name and write them to the Excel workbook.
    write_to_execl(label_rows_by_name(instance_dict, id_list_dict))
注意:以上代码中24小时内的数据,是一个实例一个循环取的数据,调用接口次数多,耗时较长。经实践400个ECS,执行完本脚本要5个小时左右,可耐心等待。
转载请注明来源,欢迎对文章中的引用来源进行考证,欢迎指出任何有错误或不够清晰的表达。可以在下面评论区评论,也可以邮件至 lxwno.1@163.com