User request => Nginx => Ingress => uwsgi
{
    "bool": {
        "must": [
            {
                "match_all": {}
            },
            {
                "match_phrase": {
                    "app_name": {
                        "query": "xxxx"
                    }
                }
            },
            {
                "match_phrase": {
                    "path": {
                        "query": "/app/v1/user/ping"
                    }
                }
            },
            {
                "range": {
                    "request_time": {
                        "gte": 1,
                        "lt": 10
                    }
                }
            },
            {
                "range": {
                    "@timestamp": {
                        "gt": "2020-11-09 00:00:00",
                        "lte": "2020-11-12 00:00:00",
                        "format": "yyyy-MM-dd HH:mm:ss",
                        "time_zone": "+08:00"
                    }
                }
            }
        ]
    }
}
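For context, this is roughly how the query above can be issued from Python with the official elasticsearch client, wrapped in the "query" key that the search API expects. The cluster address and the index pattern ingress-nginx-* are placeholders of mine, not the actual values used.

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder cluster address

query = {
    "query": {
        "bool": {
            "must": [
                {"match_all": {}},
                {"match_phrase": {"app_name": {"query": "xxxx"}}},
                {"match_phrase": {"path": {"query": "/app/v1/user/ping"}}},
                {"range": {"request_time": {"gte": 1, "lt": 10}}},
                {"range": {"@timestamp": {
                    "gt": "2020-11-09 00:00:00",
                    "lte": "2020-11-12 00:00:00",
                    "format": "yyyy-MM-dd HH:mm:ss",
                    "time_zone": "+08:00",
                }}},
            ]
        }
    }
}

# "ingress-nginx-*" is an assumed index pattern; adjust to the real one.
resp = es.search(index="ingress-nginx-*", body=query, size=100)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["trace_id"], hit["_source"]["request_time"])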
# This data structure records the bucketed results:
# [[0, 0.1], 3] means 3 records fall into the 0~0.1 bucket.
# Comparing floats and ranges is awkward, so integers are used instead;
# the 0~35 below actually covers the 0~3.5s range (values are multiplied by 10).
# ingress_cal_map = [
#     [[0, 0.1], 0],
#     [[0.1, 0.2], 0],
#     [[0.2, 0.3], 0],
#     [[0.3, 0.4], 0],
#     [[0.4, 0.5], 0],
#     [[0.5, 1], 0],
# ]
import copy

ingress_cal_map = []
for x in range(0, 35, 1):
    ingress_cal_map.append(
        [[x, (x + 1)], 0]
    )
nginx_cal_map = copy.deepcopy(ingress_cal_map)
nginx_ingress_gap = copy.deepcopy(ingress_cal_map)
ingress_upstream_gap = copy.deepcopy(ingress_cal_map)
def trace_statisics():
    trace_ids = []
    # These trace_ids were collected in advance: they belong to the requests
    # with the longest response times.
    with open(trace_id_file) as f:
        data = f.readlines()
    for d in data:
        trace_ids.append(d.strip())

    cnt = 0
    for trace_id in trace_ids:
        try:
            access_data, ingress_data = get_igor_trace(trace_id)
        except TypeError as e:
            # Retry once
            try:
                access_data, ingress_data = get_igor_trace.force_refresh(trace_id)
            except TypeError as e:
                print("Can't process trace {}: {}".format(trace_id, e))
                continue
        if access_data['path'] != "/app/v1/user/ping":  # filter out dirty data
            continue
        if 'request_time' not in ingress_data:
            continue

        def get_int_num(data):  # all values are multiplied by 10 and truncated to int
            return int(float(data) * 10)

        # Count each value into its bucket. A bit verbose and repetitive,
        # but it was good enough for the statistics I needed at the time.
        ingress_req_time = get_int_num(ingress_data['request_time'])
        ingress_upstream_time = get_int_num(ingress_data['upstream_response_time'])
        for cal in ingress_cal_map:
            if ingress_req_time >= cal[0][0] and ingress_req_time < cal[0][1]:
                cal[1] += 1
                break

        nginx_req_time = get_int_num(access_data['request_time'])
        for cal in nginx_cal_map:
            if nginx_req_time >= cal[0][0] and nginx_req_time < cal[0][1]:
                cal[1] += 1
                break

        gap = nginx_req_time - ingress_req_time
        for cal in nginx_ingress_gap:
            if gap >= cal[0][0] and gap <= cal[0][1]:
                cal[1] += 1
                break

        gap = ingress_req_time - ingress_upstream_time
        for cal in ingress_upstream_gap:
            if gap >= cal[0][0] and gap <= cal[0][1]:
                cal[1] += 1
                break
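After the loop finishes, the four bucket maps can be dumped to see where the latency accumulates. A minimal sketch of such a dump (the output formatting here is my own, not part of the original script):

def print_buckets(name, cal_map):
    # Each bucket is [[low, high], count], where low/high are tenths of a second.
    total = sum(cal[1] for cal in cal_map)
    print(name)
    for (low, high), count in cal_map:
        if count:
            print("  {:.1f}s ~ {:.1f}s: {}".format(low / 10, high / 10, count))
    print("  total: {}".format(total))

print_buckets("ingress request_time", ingress_cal_map)
print_buckets("nginx request_time", nginx_cal_map)
print_buckets("nginx - ingress gap", nginx_ingress_gap)
print_buckets("ingress - upstream gap", ingress_upstream_gap)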
{
    "_source": {
        "INDEX": "51",
        "path": "/app/v1/media/",
        "referer": "",
        "user_agent": "okhttp/4.8.1",
        "upstream_connect_time": "1.288",
        "upstream_response_time": "1.400",
        "TIMESTAMP": "1605776490465",
        "request": "POST /app/v1/media/ HTTP/1.0",
        "status": "200",
        "proxy_upstream_name": "default-prod-XXX-80",
        "response_size": "68",
        "client_ip": "XXXXX",
        "upstream_addr": "172.32.18.194:6000",
        "request_size": "1661",
        "@source": "XXXX",
        "domain": "XXX",
        "upstream_status": "200",
        "@version": "1",
        "request_time": "1.403",
        "protocol": "HTTP/1.0",
        "tags": ["_dateparsefailure"],
        "@timestamp": "2020-11-19T09:01:29.000Z",
        "request_method": "POST",
        "trace_id": "87bad3cf9d184df0:87bad3cf9d184df0:0:1"
    }
}
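The statistics script relies on a helper get_igor_trace(trace_id) that returns the outer Nginx access-log document and the Ingress document sharing the same trace_id. Its implementation is not shown in the article; the sketch below is a hypothetical reconstruction, assuming elasticsearch-py and placeholder index patterns (access-nginx-* and ingress-nginx-* are my assumptions, as is the caching behaviour implied by force_refresh).

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder cluster address

def get_igor_trace(trace_id):
    """Return (access_data, ingress_data) for a single trace_id.

    Hypothetical reimplementation; the real helper also caches results,
    which is why the script above can call get_igor_trace.force_refresh().
    """
    def first_hit(index_pattern):
        resp = es.search(
            index=index_pattern,
            body={"query": {"match_phrase": {"trace_id": {"query": trace_id}}}},
            size=1,
        )
        hits = resp["hits"]["hits"]
        return hits[0]["_source"] if hits else None

    access_data = first_hit("access-nginx-*")    # assumed index for the outer Nginx logs
    ingress_data = first_hit("ingress-nginx-*")  # assumed index for the Ingress logs
    if access_data is None or ingress_data is None:
        # Returning None makes the caller's tuple unpacking raise TypeError,
        # which matches the except TypeError branch in trace_statisics().
        return None
    return access_data, ingress_data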
This looks very much like the cgroup issue we were chasing: the host runs some periodic tasks, and as the number of executions grows they hold more and more kernel resources, until at some point network latency starts to suffer. Dropping the kernel caches (page cache plus dentries and inodes) serves as a temporary mitigation:
sync && echo 3 > /proc/sys/vm/drop_caches
This article was first published on the WeChat public account 分布式实验室: Troubleshooting a Kubernetes Node Kernel Issue.