# echo

**Repository Path**: lihang212010/echo

## Basic Information

- **Project Name**: echo
- **Description**: echo监控中心test
- **Primary Language**: Unknown
- **License**: Not specified
- **Default Branch**: master
- **Homepage**: https://gitee.com/
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 0
- **Created**: 2021-06-17
- **Last Updated**: 2022-11-03

## Categories & Tags

**Categories**: Uncategorized

**Tags**: None

## README

## echo文档

[toc]

# 功能简介

echo按功能划分主要有3大功能:图表,报警,数据统计

# 环境安装

echo需要kube-prometheus,详细安装教程可参考该博客
[https://blog.csdn.net/ljx1528/article/details/112971378](https://blog.csdn.net/ljx1528/article/details/112971378)

# 快速开始

# 源码解析

## 原理

### 基本概念

#### 模版

模版是echo的最重要构成,它储存了图表的属性,报警规则,每一个应用,接口,主机都会将模版里的{{.AppName}}等属性替换成真实AppName,产生各个图表

#### 图表

模版自动为每个app或者api,或者host产生对应的图,图中会存储具体的报警信息,图表数据等,报警时会根据图中的token发送至具体钉钉群,并@相关人员

#### dashboard

dashboard是自定义的图,即用于自行创建的图

### 报警原理

按照kube-prometheus的方式将规则写入k8s指定位置后,alertmanager会自动将报警信息发送到echo的hook接口,hook会根据报警信息查询到具体图,根据图中信息发送指定报警信息

### 数据统计

echo定时访问prometheus获得qps,tp99,tp95等信息存入sql

## 底层模块

### k8s创建规则

echo的报警规则是以prometheusrules格式存在k8s中的,每一次模版的创建更新删除,都会通过k8s创建或者删除老的规则

```go
// DeleteAlertResource deletes the PrometheusRule named
// "prometheus-<name>-rules" from the "thanos" namespace.
//
// A panic raised while talking to the cluster is recovered, logged and
// reported to the caller as an error instead of being returned as success.
func (s *k8sService) DeleteAlertResource(ctx context.Context, name string) (err error) {
	defer func() {
		if r := recover(); r != nil {
			s.logger.Error("recover DeleteAlertResource", zap.Any("panic", r))
			err = fmt.Errorf("k8s delete resource panic: %v", r)
		}
	}()

	name = "prometheus-" + name + "-rules"
	resource := schema.GroupVersionResource{
		Group:    "monitoring.coreos.com",
		Version:  "v1",
		Resource: "prometheusrules",
	}
	err = s.dynamicClient.Resource(resource).Namespace("thanos").Delete(name, &metav1.DeleteOptions{})
	if err != nil {
		return errors.Wrap(err, "k8s delete resource error")
	}
	return nil
}

// UpdateAlertResource rebuilds the PrometheusRule "prometheus-<name>-rules"
// in the "thanos" namespace from the given templates.
//
// name is the rule name: the app name, api-<ApiId>, the ip or the dashboard ID.
// kind is the matching type: 0 app, 1 api, 2 ip, 3 dashboardID.
//
// Templates whose alert fails ValidateAlert or has an empty threshold are
// skipped; when no rule remains nothing is written and nil is returned.
// A recovered panic is logged and returned as an error.
func (s *k8sService) UpdateAlertResource(ctx context.Context, name, kind string, templates []*models.Template) (err error) {
	defer func() {
		if r := recover(); r != nil {
			s.logger.Error("recover UpdateAlertResource", zap.Any("panic", r))
			err = fmt.Errorf("k8s update resource panic: %v", r)
		}
	}()

	resource := schema.GroupVersionResource{
		Group:    "monitoring.coreos.com",
		Version:  "v1",
		Resource: "prometheusrules",
	}

	var ruleMap []map[string]interface{}
	for _, tpl := range templates {
		rule := tpl.Alert
		if !rule.ValidateAlert() {
			continue
		}
		if len(rule.Threshold) == 0 {
			continue
		}
		// The stored expression may end with a "%s" placeholder for the
		// threshold; drop the placeholder and append the threshold itself.
		expression := strings.TrimSpace(rule.Expression)
		if strings.HasSuffix(expression, "%s") {
			expression = strings.ReplaceAll(expression, "%s", "")
		}
		ruleMap = append(ruleMap, map[string]interface{}{
			"alert": rule.RuleName,
			"expr":  expression + rule.Threshold,
			"for":   strconv.Itoa(rule.For) + "m",
			"labels": map[string]interface{}{
				"template_id":  strconv.Itoa(int(tpl.ID)),
				"name":         name,
				"app_id":       strconv.Itoa(int(tpl.AppID)),
				"api_id":       strconv.Itoa(int(tpl.ApiID)),
				"ip":           tpl.Ip,
				"dashboard_id": strconv.Itoa(int(tpl.DashboardID)),
				"kind":         kind,
				"tag":          tpl.Tag,
			},
			"annotations": map[string]interface{}{
				"message": rule.Message,
				"value":   "{{ $value }}",
			},
		})
	}
	if len(ruleMap) == 0 {
		return nil
	}

	name = "prometheus-" + name + "-rules"
	conf := map[string]interface{}{
		"apiVersion": "monitoring.coreos.com/v1",
		"kind":       "PrometheusRule",
		"metadata": map[string]interface{}{
			"name":      name,
			"namespace": "thanos",
			"labels": map[string]interface{}{
				"prometheus": "k8s",
				"role":       "thanos-rules",
			},
		},
		"spec": map[string]interface{}{
			"groups": []map[string]interface{}{
				{
					"name":  fmt.Sprintf("./%s.rules", name),
					"rules": ruleMap,
				},
			},
		},
	}

	client := s.dynamicClient.Resource(resource).Namespace("thanos")
	// Update is implemented as delete-then-create. A failed delete (for
	// example when the rule does not exist yet) is only logged so that the
	// create below still runs.
	if delErr := client.Delete(name, &metav1.DeleteOptions{}); delErr != nil {
		s.logger.Error("delete resource error", zap.Error(delErr))
	}
	obj := unstructured.Unstructured{Object: conf}
	if _, err = client.Create(&obj, metav1.CreateOptions{}); err != nil {
		s.logger.Error("update resource error", zap.Error(err))
		return err
	}
	s.logger.Info("update resource success, " + name)
	return nil
}
```

### prometheus查询
prometheus查询使用的是github.com/prometheus/common和github.com/prometheus/client_golang这两个包,在本项目中大量运用了prometheus查询,分别用来搜索新增api和图表查询,收集qps等数据

### 钉钉报警

本项目中钉钉报警的匹配规则是根据部门匹配的,每一个图都会存储对应的钉钉token,通过token发送到指定群钉钉消息。

## 定时任务

echo需要实时同步app,api,api_data,host等数据,保证数据与limos一致

echo的定时任务是在[cron](https://github.com/robfig/cron)(github.com/robfig/cron/v3)的基础上进一步封装的

```go
type Job interface {
	cron.Job
	Cron() string
	Name() string
}
```

在使用时你需要给struct补充Run(),Cron(),Name()方法,详细请参考下面例子

### app同步

echo需要定期同步limos上的app数据,以方便dashboard的开发以及模版规则的写入。

```
查询自身app所有数据
->调用limos接口FindAppForPage查询所有最新app数据
-->保存所有app数据
---->删除多余app数据,调用k8s删除对应规则
---->sql查询所有需要写入k8s的规则,创建一个prometheus-{AppName}-rules的规则
```

```go
// StoreAppInfo syncs the application list from limos into the database.
//
// Every app returned by limos is saved; apps that were not stored before get
// their alert rules written to k8s; apps that are stored locally but no longer
// returned by limos are deleted together with their k8s rule.
// Per-app failures are logged and do not stop the sync; an error is returned
// only when the local or the limos app list cannot be loaded.
func (a *AppSyncJob) StoreAppInfo() error {
	ctx, cancel := context2.WithTimeout(context2.Background(), 1000*time.Second)
	// Release the timeout's timer as soon as the sync finishes.
	defer cancel()

	// Without the local snapshot the "new app" / "removed app" decisions
	// below would be wrong, so a failure here aborts the run.
	all, err := a.appRepo.FindAll(ctx)
	if err != nil {
		return errors.Wrap(err, "query local apps error")
	}
	apps, err := zclients.FindApp(ctx, a.limosService)
	if err != nil {
		return errors.Wrap(err, "query limos error")
	}

	appMap := map[uint64]*models.Application{}
	for _, application := range all {
		appMap[application.ID] = application
	}

	for _, app := range apps.Apps {
		var owners []*models.ApplicationOwner
		for i, id := range app.OwnerIds {
			owner := &models.ApplicationOwner{UserID: id}
			// Owner may be shorter than OwnerIds; never index past its end.
			if i < len(app.Owner) {
				owner.UserName = app.Owner[i]
			}
			owners = append(owners, owner)
		}
		application := a.convert(app, owners)

		// Update the application info.
		if err := a.appRepo.Save(ctx, application); err != nil {
			a.logger.Warn("sync app info error", zap.Error(err))
		}

		if appMap[application.ID] != nil {
			// Still present in limos: whatever stays in appMap is stale.
			delete(appMap, application.ID)
			continue
		}

		// New application: write its monitoring rules.
		templates, err := a.template.FindByAppIDIsForce(ctx, uint64(app.ID))
		if err != nil {
			a.logger.Warn("find app templates err", zap.Error(err))
		}
		if err := a.k8s.UpdateAlertResource(ctx, app.Name, "0", templates); err != nil {
			a.logger.Warn("update app rules err", zap.Error(err))
		}
	}

	// Delete the applications that no longer exist in limos.
	for _, app := range appMap {
		if err := a.appRepo.DeleteById(ctx, app.ID); err != nil {
			a.logger.Warn("delete app err", zap.Error(err))
		}
		// Remove the matching monitoring rule as well.
		if err := a.k8s.DeleteAlertResource(ctx, app.Name); err != nil {
			a.logger.Warn("delete graph err", zap.Error(err))
		}
	}
	return nil
}
```

### api同步
echo中api的同步原理是通过查询prometheus的埋点`sum(rate(http_server_requests_seconds_count[1h])) by(app, uri,method)`和`sum(rate(zeus_rpc_seconds_count[1h])) by(app, service, method,kind)`获取api并存储数据库

### 数据收集

每隔10分钟echo会访问prometheus进行qps,tp99,tp95,错误数,请求数的收集并存储sql中

### reload规则

reload是通过访问prometheus的/api/v1/alerts接口获取prometheus中的报警,并与数据库中的当前报警进行比对删除

### host同步

通过调用HostService的zeus接口进行host同步,因为host的id主机号ip可能产生动态变化,所以每隔24小时会进行host同步

## 功能实现

### 模版

模版是一个比较复杂的struct结构,它使用了大量的json结构

#### 创建

创建图表是一个相对复杂的过程,前端将对应json储存至后端,后端会对部分数据做处理后解析为一个Template结构,然后储存至mysql中,储存成功后启动一个新的协程,并返回前端成功的消息,新启动的协程会调用各种zeus方法,去尝试获取每个app或者api的{{.AppName}},{{.Group}},{{.Ip}}等信息,获取成功后将对应模版转化成图储存至sql,将告警规则放入k8s中,每成功一个会将进度加1,前端可以通过访问/template/progress接口获得进度

```go
// Create stores the template and then syncs it to the graphs in the
// background, so the caller gets its answer as soon as the row is saved.
func (s *templateService) Create(ctx context.Context, t *models.Template) error {
	if err := s.repository.Create(ctx, t); err != nil {
		return errors.Wrap(err, "call template.Create error")
	}
	// Sync the template to its graphs; 0 means "newly created".
	// A fresh background context is used because the goroutine is started
	// right before Create returns to its caller.
	go func() {
		s.RefreshTemplate(context.Background(), t, 0)
	}()
	return nil
}
```

#### 删除

通过template_id删除所有图的对应模版,删除对应模版

#### 修改

修改和创建过程相似,不过会通过template查询原有图表的id,进行update操作

### 图

图操作经常会被用来做各种查询获得对应的信息,其中dashboard会利用图的增加修改操作进行,图进行增加删除修改操作后,需要对应的k8s读写操作,以下是一个图的创建操作

```go
graph, err := s.repository.Create(ctx, graph)
if err != nil {
	s.logger.Error("call GraphRepository Create error", zap.Error(err))
	return nil, errors.Wrap(err, "call GraphRepository Create error")
}
// Sync to k8s; a failure is only logged, the graph itself is already saved.
if err := s.UpdateK8s(ctx, graph); err != nil {
	s.logger.Error("call GraphRepository UpdateK8s error", zap.Error(err))
}
```

#### 统计图

通过图的图表信息进行对应的prometheus查询,前端拿到数据后渲染

### 钉钉群管理

钉钉群管理是一个简单的增删改查操作,其中Echo-SLA群必须存在,5分钟一次的告警和日报会打到该群中,模版在同步图的过程会根据钉钉群部门进行同步过程

#### 告警历史

告警历史分为当前正在告警和告警历史2个表,分别为t_alert_current和t_alert_history,在查询时一般查询t_alert_history表,sla是通过t_alert_history表中的duration字段进行统计的,它的结构如下,会根据app_id,api_id,host_id,dashboard_id查询对应类型的告警,其中,host暂时在echo中无报警设置

```sql
create table t_alert_history
(
    id             bigint unsigned auto_increment primary key comment 'id',
    template_id    bigint unsigned not null comment 'template_id',
    graph_id       bigint       default 0 comment 'app id',
    app_id         bigint       default 0 comment 'app id',
    api_id         bigint       default 0 comment 'api id',
    host_id        int          default 0 comment 'ip',
    dashboard_id   bigint       default 0 comment 'dashboard_id',
    kind           int          not null comment '告警类型 0:app,1:api,2:host,3:dashboard',
    alerts         json comment '正在报警具体信息',
    alert_name     varchar(255) default '' comment '告警名称(已经废弃那么代替)',
    alert_level    int          default 0 comment '告警级别',
    name           varchar(255) default '' comment '图表名称',
    owner          varchar(255) default '' comment '责任人',
    threshold      varchar(64)  default '0' comment '告警阀值',
    duration       bigint       default 0 comment '时长',
    Level          int          default 0 comment '级别',
    date           varchar(32)  default '' comment '日期',
    dev_group_id   varchar(255) default '' comment 'group ID',
    dev_group_name varchar(255) default '' comment '研发组名',
    message        varchar(255) default '' comment '报警消息',
    generator_url  text comment 'url',
    `value`        varchar(255) default '' comment 'value',
    `type`         tinyint(1)   default 0 comment '接口类型0:http 1:zeusserver 2zeusclient',
    start_time     timestamp    default CURRENT_TIMESTAMP comment '开始时间',
    end_time       timestamp    default CURRENT_TIMESTAMP comment '结束时间',
    KEY AppID (app_id),
    KEY ApiID (api_id),
    KEY GraphID (graph_id),
    KEY HostID (host_id),
    Key Template (template_id),
    KEY DashboardID (dashboard_id),
    KEY Data (date),
    UNIQUE INDEX ` t_graph_alert_index` (`graph_id`, `start_time`)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4 comment '告警历史表';
```

### 仪表盘

仪表盘是一个不通过模版生成的图,它可以自行添加告警和图表信息,是一个简单的sql操作

### 重点接口解析

#### /hook

/hook是echo报警的接口,alertmanager会聚合各种告警调用该接口

首先,控制器会开启处理协程并直接返回200给alertmanager,以保证alertmanager不会阻塞消息

```go
// Hook receives the alert payload pushed by alertmanager.
//
// The payload is decoded, handed to hookService.Dispatch in a new goroutine,
// and 200 is returned immediately so that alertmanager is never blocked by
// the processing. A body that cannot be decoded is answered with 400.
func (h *HookController) Hook(c *gin.Context) {
	defer func() {
		if r := recover(); r != nil {
			fmt.Println(r)
		}
	}()

	var info promentheus.AlertInfo
	// ShouldBindJSON leaves the response to this handler; BindJSON would
	// already abort the request with 400 before the message below is written.
	if err := c.ShouldBindJSON(&info); err != nil {
		c.String(http.StatusBadRequest, "err=%v", err)
		return
	}
	// The request context ends with the request, so the background work gets
	// its own context.
	go h.hookService.Dispatch(context.Background(), info)
	c.AbortWithStatus(http.StatusOK)
}
```
处理过程中,首先根据kind判断告警类型,然后app,dashboard_id的告警通过对应id查询到对应的图,并将告警信息存储sql中,进行对应的钉钉告警,如果是api告警,则需要通过app,method,uri,service等查询出具体api,比对是否超过阈值,再进行以上过程

```go
// Dispatch collects and processes the alert messages pushed by alertmanager.
//
// For every alert it resolves the graph the alert belongs to, re-checks the
// reported value against the graph's threshold, stores the alert as "current"
// (firing) or moves it to the history (resolved), and sends the DingTalk
// message. Alerts that cannot be resolved are logged and skipped; the method
// always returns nil.
//
// NOTE(review): the receiver is a value; this only serializes callers if
// s.mutex is shared through a pointer — confirm against the struct definition.
func (s hookService) Dispatch(ctx context.Context, infos promentheus.AlertInfo) error {
	defer func() {
		if r := recover(); r != nil {
			s.logger.Error("recover Dispatch", zap.Any("panic", r))
		}
	}()
	s.mutex.Lock()
	defer s.mutex.Unlock()

	// infos.Alerts is a slice, but it actually carries the alerts of a single
	// graph; alertmanager may have split them into a group because labels
	// such as host differ.
	// They could be merged into one record up front, or only the first/last
	// one could be stored while every message is still sent to DingTalk.
	// Current strategy: store one record, send a DingTalk alert for all.
	for _, info := range infos.Alerts {
		history := new(models.AlertHistory)
		alert := new(models.AlertCurrent)
		application := new(models.Application)

		// kind distinguishes app, api, dashboard and host alerts.
		kind, err := strconv.Atoi(string(info.Labels["kind"]))
		if err != nil {
			s.logger.Error("kind don't int type", zap.Error(err))
			continue
		}
		history.Kind = kind

		// templateID is used to find the graph of an api.
		templateID, err := strconv.ParseUint(string(info.Labels["template_id"]), 10, 64)
		if err != nil {
			s.logger.Error("templateID don't int type", zap.Error(err))
			continue
		}
		history.TemplateID = templateID

		switch kind {
		case 0:
			GraphID, err := strconv.ParseUint(string(info.Labels["graph_id"]), 10, 64)
			if err != nil {
				s.logger.Error("graph_id don't int type", zap.Error(err))
				continue
			}
			history.GraphID = GraphID
			alert.GraphID = GraphID
			appID, err := strconv.ParseUint(string(info.Labels["app_id"]), 10, 64)
			if err != nil {
				s.logger.Error("appID don't int type", zap.Error(err))
				continue
			}
			alert.AppID = appID
			history.AppID = appID
			app, err := s.appRepository.FindByID(ctx, history.AppID)
			if err != nil {
				s.logger.Error("call appRepository FindByID", zap.Error(err))
				continue
			}
			application = app
			alert.Level = app.Level
			alert.AppID = app.ID
			alert.DevGroupID = app.DevGroupID
			history.AppID = app.ID
			history.Level = app.Level
			history.DevGroupName = app.DevGroupName
			history.DevGroupID = app.DevGroupID
		case 1:
			// An http api is identified by app/method/uri, a zeus api by
			// app/service/method.
			var api *models.Api
			if _, ok := info.Labels["uri"]; ok {
				uri := string(info.Labels["uri"])
				method := string(info.Labels["method"])
				app := string(info.Labels["app"])
				api, err = s.apiRepository.FindByHttpPath(ctx, app, method, uri)
				if err != nil {
					s.logger.Error("call apiRepository FindByHttpPath", zap.Error(err))
					continue
				}
			} else {
				service := string(info.Labels["service"])
				method := string(info.Labels["method"])
				app := string(info.Labels["app"])
				api, err = s.apiRepository.FindByZeus(ctx, app, service, method, 3)
				if err != nil {
					s.logger.Error("call apiRepository FindByZeus", zap.Error(err))
					continue
				}
			}
			graph, err := s.graphRepository.FindGraphByApiIDAndTemplateID(ctx, api.ID, templateID)
			if err != nil {
				s.logger.Error("call graphRepository FindGraphByApiIDAndTemplateID", zap.Error(err))
				continue
			}
			if graph.ID == 0 {
				continue
			}
			history.GraphID = graph.ID
			alert.GraphID = graph.ID
			alert.AppID = api.AppID
			alert.ApiID = api.ID
			alert.Level = api.Level
			alert.DevGroupID = api.DevGroupID
			history.AppID = api.AppID
			history.ApiID = api.ID
			history.Level = api.Level
			history.Type = api.Type
			history.DevGroupName = api.DevGroupName
			history.DevGroupID = api.DevGroupID
		case 2:
			// Host alerts carry no extra lookup.
		case 3:
			GraphID, err := strconv.ParseUint(string(info.Labels["graph_id"]), 10, 64)
			if err != nil {
				s.logger.Error("graph_id don't int type", zap.Error(err))
				continue
			}
			history.GraphID = GraphID
			alert.GraphID = GraphID
			dashboardID, err := strconv.ParseUint(string(info.Labels["dashboard_id"]), 10, 64)
			if err != nil {
				s.logger.Error("dashboardID don't int type", zap.Error(err))
				continue
			}
			alert.DashboardID = dashboardID
			history.DashboardID = dashboardID
		}

		graph, err := s.graphRepository.FindByID(ctx, history.GraphID)
		if err != nil {
			s.logger.Error("call graphRepository FindByID", zap.Error(err))
			continue
		}
		// Everything below reads graph.Alert; a graph without an alert
		// definition must not panic and abort the rest of the batch.
		if graph.Alert == nil {
			s.logger.Error("graph has no alert config, graph_id=" + strconv.Itoa(int(history.GraphID)))
			continue
		}

		// Compare the reported value with the graph's own threshold. The last
		// character of the expression, once placeholders and spaces are
		// removed, is taken as the comparison operator.
		threshold, _ := data.ComputeThreshold(graph.Alert.Threshold)
		value, _ := strconv.ParseFloat(string(info.Annotations["value"]), 64)
		graph.Alert.Expression = strings.ReplaceAll(graph.Alert.Expression, "%s", "")
		graph.Alert.Expression = strings.ReplaceAll(graph.Alert.Expression, " ", "")
		si := ""
		if len(graph.Alert.Expression) > 0 {
			si = graph.Alert.Expression[len(graph.Alert.Expression)-1 : len(graph.Alert.Expression)]
		}
		if si == "<" {
			if threshold < value {
				continue
			}
		} else {
			if threshold >= value {
				continue
			}
		}

		alert.Alerts = new(models.Alerts)
		info.State = string(info.Alert.Status())
		info.StartAlertTime = time.Time{info.StartsAt.Local()}
		*alert.Alerts = append(*alert.Alerts, info)
		alert.StartAlertTime = time.Time{info.StartsAt.Local()}
		alert.Duration = time.Now().Sub(info.StartsAt).String()
		history.Alerts = new(models.Alerts)
		*history.Alerts = append(*history.Alerts, info)
		history.Message = string(info.Annotations["message"])
		history.AlertLevel = graph.Alert.AlertLevel
		alert.AlertLevel = graph.Alert.AlertLevel
		history.Owner = graph.Alert.GetOwner()
		alert.Owner = graph.Alert.GetOwner()
		history.Name = graph.Name
		history.AlertName = graph.Name
		alert.AlertName = graph.Name
		history.Threshold = graph.Alert.Threshold
		alert.Threshold = graph.Alert.Threshold

		// Drop language-specific app alerts that do not match the app's language.
		if kind == 0 && graph.TypeID == constants.Java && application.Language != "java" {
			continue
		}
		if kind == 0 && graph.TypeID == constants.Go && application.Language != "go" {
			continue
		}

		history.StartTime = time.Time{info.StartsAt.Local()}
		history.EndTime = time.Time{info.EndsAt.Local()}
		history.GeneratorURL = info.GeneratorURL
		history.Value = info.Value
		// Duration is stored in milliseconds.
		history.Duration = (history.EndTime.UnixNano() - history.StartTime.UnixNano()) / 1e6
		history.Date = data.FormatDate(history.StartTime.Time.Local(), data.Format_yyyyMMddHH)

		// Decide whether the alert is resolved or firing.
		if info.Alert.Status() != model.AlertFiring {
			err := s.history.DeleteByCreate(ctx, history.GraphID, history.StartTime)
			if err != nil {
				_, err := s.history.FindByGraphIDAndStart(ctx, history.GraphID, history.StartTime)
				if err != gorm.ErrRecordNotFound {
					s.logger.Info("跳过一个重复的消息:" + strconv.Itoa(int(history.GraphID)))
					continue
				}
			}
			// Carry the claim state of the current alert over to the history.
			currents, err := s.alertRepository.FindByGraphID(ctx, history.GraphID, history.StartTime, history.EndTime)
			if err == nil {
				for _, c := range currents {
					if c.ClaimStatus == 1 {
						history.ClaimStatus = c.ClaimStatus
						history.ClaimUser = c.ClaimUser
						break
					}
				}
			}
			if err := s.history.Create(ctx, history); err != nil {
				s.logger.Error("call history.create error ", zap.Error(err))
				continue
			}
			if err := s.alertRepository.Delete(ctx, history.GraphID); err != nil {
				s.logger.Error("call alertRepository.Delete error ", zap.Error(err))
				continue
			}
			// Send the alert.
			s.SendAlert(ctx, info)
		} else {
			alert.Name = info.Name()
			// Replace the graph's current alert with the new one.
			if err := s.alertRepository.Delete(ctx, history.GraphID); err != nil {
				s.logger.Error("call alertRepository.Delete error ", zap.Error(err))
				continue
			}
			if err := s.alertRepository.Create(ctx, alert); err != nil {
				s.logger.Error("call alertRepository.Create error ", zap.Error(err))
				continue
			}
			// Send the alert.
			s.SendAlert(ctx, info)
		}
	}
	//s.SendAlerts(context.Background(), infos.Alerts)
	return nil
}
```

#### /graphs/data

该接口是所有前端展示时使用,通过前端传入的表达式和时间在prometheus中查询数据,使用比较频繁,但逻辑相对简单,主要是将prometheus中数据处理为[]map[string]interface{}结构

### 前端相关接口

#### 查询app

查询app,是一个根据app表和告警历史表进行数据处理的查询,因为数据量不多,它的排序是在内存进行的

```go
// Find returns one page of applications together with their SLA and the
// per-template alert figures, plus the number of applications that matched
// the filters before paging.
//
// query.Start and query.End must use the layout "2006-01-02 15:04:05".
// A page outside the result (or PageNum < 1) yields an empty page instead of
// a slice-bounds panic.
func (s appService) Find(ctx context.Context, query *request.SlaQuery) ([]map[string]interface{}, int, error) {
	start, err := time.ParseInLocation("2006-01-02 15:04:05", query.Start, time.Local)
	if err != nil {
		s.logger.Error("call time ParseInLocation error")
		return nil, 0, err
	}
	end, err := time.ParseInLocation("2006-01-02 15:04:05", query.End, time.Local)
	if err != nil {
		s.logger.Error("call time ParseInLocation error")
		return nil, 0, err
	}
	query.StartDate = data2.FormatDate(start, data2.Format_yyyyMMddHH)
	query.EndDate = data2.FormatDate(end, data2.Format_yyyyMMddHH)
	// Length of the queried window in milliseconds; the SLA is computed
	// against it.
	base := float64(end.Sub(start).Milliseconds())

	if query.GroupID != "" {
		groupIDS, err := zclients.GetNodeIDs(ctx, s.department, query.GroupID)
		if err != nil {
			s.logger.Error("call appService getNodeIDs error")
			return nil, 0, err
		}
		query.GroupIDS = groupIDS
		s.logger.Debug(fmt.Sprint(query.GroupIDS))
	}

	// Query the applications.
	applications, err := s.repository.Find(ctx, query.AppName, query.Level, query.GroupIDS)
	if err != nil {
		s.logger.Error("call ApplicationRepository Find error", zap.Error(err))
		return nil, 0, err
	}

	// Query the IDs of the templates that are flagged for SLA.
	templates, err := s.templateRepository.FindByIsSla(ctx)
	if err != nil {
		s.logger.Error("call templateRepository FindByIsSla error", zap.Error(err))
		return nil, 0, errors.Wrap(err, "call templateRepository FindByIsSla error")
	}
	ids := make([]uint64, 0)
	tagsMap := make(map[uint64]int)
	for _, h := range templates {
		tagsMap[h.ID] = 0
		ids = append(ids, h.ID)
	}

	// Template IDs passed in by the front end; entries that are not numbers
	// are ignored.
	ids2 := make([]uint64, 0)
	if query.TemplateIDS != "" {
		for _, a := range strings.Split(query.TemplateIDS, ",") {
			id, err := strconv.ParseUint(a, 10, 64)
			if err == nil {
				ids2 = append(ids2, id)
			}
		}
	}
	for _, id := range ids2 {
		tagsMap[id] = 0
	}

	// Alert history of the requested templates, grouped as appID -> history.
	histories2, err := s.history.FindByDateGroupApp(ctx, ids2, query.Start, query.End)
	if err != nil {
		s.logger.Error("call AlertHistoryRepository FindByDateGroupApp error", zap.Error(err))
		return nil, 0, err
	}
	appIDMap2 := make(map[uint64]map[uint64]*response.History)
	for _, h := range histories2 {
		if _, ok := appIDMap2[h.AppID]; !ok {
			appIDMap2[h.AppID] = make(map[uint64]*response.History)
		}
		appIDMap2[h.AppID][h.ID] = h
	}

	// Alert history of the SLA templates, grouped the same way.
	histories, err := s.history.FindByDateGroupApp(ctx, ids, query.Start, query.End)
	if err != nil {
		s.logger.Error("call AlertHistoryRepository FindByDateGroupApp error", zap.Error(err))
		return nil, 0, err
	}
	appIDMap := make(map[uint64]map[uint64]*response.History)
	for _, h := range histories {
		if _, ok := appIDMap[h.AppID]; !ok {
			appIDMap[h.AppID] = make(map[uint64]*response.History)
		}
		appIDMap[h.AppID][h.ID] = h
	}

	data := make([]map[string]interface{}, 0)
	for _, app := range applications {
		m := make(map[string]interface{})
		m["app_id"] = app.ID
		m["app_name"] = app.Name
		m["description"] = app.Description
		m["git_url"] = app.GitURL
		m["owner"] = app.GetOwners()
		users := make([]string, 0)
		userIDs := make([]string, 0)
		for _, user := range app.GetOwners() {
			users = append(users, user.UserName)
			userIDs = append(userIDs, user.UserID)
		}
		m["users"] = users
		m["language"] = app.Language
		m["level"] = app.Level
		m["dev_group_id"] = app.DevGroupID
		m["dev_group_name"] = app.DevGroupName
		m["sort"] = query.Sort
		m["order"] = query.Order

		// One column per template: the duration when query.Type == 1,
		// otherwise the duration count; 0 when the app has no history for
		// that key. Indexing the missing (nil) inner map is safe.
		for k := range tagsMap {
			var v float64
			if h, ok := appIDMap2[app.ID][k]; ok {
				if query.Type == 1 {
					v = float64(h.Duration)
				} else {
					v = float64(h.DurationCount)
				}
			}
			m[strconv.Itoa(int(k))] = v
		}

		// Compute the SLA.
		m["sla"] = s.Compute(appIDMap[app.ID], base)

		// The owner filter takes precedence over the group filter.
		if query.UID != "" {
			if !data2.ContainString(userIDs, query.UID) {
				continue
			}
		} else if query.GroupID != "" && !data2.ContainString(query.GroupIDS, app.DevGroupID) {
			continue
		}
		data = append(data, m)
	}
	count := len(data)

	// Sort in memory, driven by the "order" entry stored in each map.
	sort.Sort(data2.HistorySlice(data))

	// Page [from, to), clamped to the available rows.
	from := (query.PageNum - 1) * query.PageSize
	to := query.PageNum * query.PageSize
	if to > len(data) {
		to = len(data)
	}
	if to < 0 {
		to = 0
	}
	if from < 0 {
		from = 0
	}
	if from > to {
		from = to
	}
	data = data[from:to]

	for _, d := range data {
		d["sla"] = fmt.Sprint(d["sla"]) + "%"
	}
	return data, count, nil
}

// Compute returns the SLA in percent, rounded to two decimals:
// (base - total alert duration) / base * 100.
//
// It returns 100 when there is no alert history, and 0 when the alert time
// exceeds base or base is not positive.
func (s appService) Compute(historys map[uint64]*response.History, base float64) float64 {
	if len(historys) == 0 {
		return 100
	}
	var sum float64
	for _, h := range historys {
		sum += float64(h.Duration)
	}
	// base <= 0 would divide by zero and yield NaN.
	if base <= 0 || base-sum < 0 {
		return 0
	}
	value2 := (base - sum) / base * 100
	// Round to two decimals; the formatted string always parses, so the
	// error is ignored.
	value, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", value2), 64)
	s.logger.Debug("计算sla" + fmt.Sprint(value))
	return value
}
```

#### 查询api

查询api是一个相对复杂的接口,因为api的统计信息表t_api_data中存储数据量过大,进行连表操作速度太慢,因此,对此进行多协程查询

该接口首先会开启3个协程,分别查询api,告警历史,api_data,处理结束后再进行合并操作

#### 其他

其他前端接口逻辑相对简单,无须讲解

### 特殊逻辑说明

#### api

api不会将对应的图信息写入k8s,原因是api数量过多,如果全部写入k8s中,会导致prometheus速度减慢很多,因此只会将模版进行处理后写入k8s规则,然后hook接口会根据alertmanager发送的告警消息与图的阈值对比判断是否应该告警