找回密码
 立即注册
首页 业界区 业界 Monit-基于非容器服务自恢复程序实践

Monit-基于非容器服务自恢复程序实践

康器 2025-11-23 15:40:01
1.需求:由于历史原因和软件自身的原因,有上百台服务器及其上的服务未运行在容器中,需要在程序崩溃时自动拉起(以Java、Python、C++为主)。
2.目的:能够非人为干预快速自动恢复,要求检测频率在10s一次
3.实现方式
3.1 根据不同语言自己开发脚本实现自动拉起和通知(不够标准化-弃用)
1.gif
2.gif
  #!/bin/bash

  # ========== Usage ==========
  # 1. Adjust the settings below: ENV (fat/prod/...), the Nacos address, the
  #    Lark webhook, and so on.
  # 2. Make sure every Java service has a start script (e.g. deploy-xxx.sh).
  # 3. List the services to monitor (see the SERVICES_PROCESS_ID arrays below).
  # 4. Run this script from crontab, e.g. once a minute. A lock file prevents a
  #    second copy from running, so the cron entry only brings the monitor back
  #    when it is not running:
  # */1 * * * * /bin/bash /path/to/this/monitor.sh >> /path/to/monitor.log 2>&1

  # ========== Environment loading ==========
  # cron starts jobs with a minimal environment, so the usual profile files are
  # sourced explicitly, in priority order.
  ENV_FILES=(
    "/etc/profile"
    "/etc/bashrc"
    "/root/.bash_profile"
    "/root/.bashrc"
  )
  # Add the per-user files only when they differ from the /root ones above;
  # otherwise every file would be sourced twice when the script runs as root.
  if [[ -n "${HOME:-}" && "${HOME}" != "/root" ]]; then
    ENV_FILES+=("$HOME/.bash_profile" "$HOME/.bashrc")
  fi

  for env_file in "${ENV_FILES[@]}"; do
    if [[ -f "$env_file" ]]; then
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] 加载环境变量文件: $env_file" >> /tmp/monitor_env.log
      # shellcheck disable=SC1090  # path is only known at run time
      source "$env_file"
    fi
  done

  # Fallback when none of the files above defined JAVA_HOME.
  if [[ -z "$JAVA_HOME" ]]; then
    export JAVA_HOME="/opt/jdk-17.0.8"
    export CLASSPATH=".:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar"
    export PATH="$JAVA_HOME/bin:$PATH"
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 手动设置 JAVA_HOME: $JAVA_HOME" >> /tmp/monitor_env.log
  fi

  # Variables exported to the service start scripts.
  export env_nacos_address="nacos.test.com:8848"
  export env_nacos_namespace="fat"
  export ENV="fat"

  # Dump the effective environment to the debug log.
  {
    echo "=== 环境变量检查 ==="
    echo "时间: $(date)"
    echo "JAVA_HOME: $JAVA_HOME"
    echo "PATH: $PATH"
    echo "ENV: $ENV"
    echo "env_nacos_address: $env_nacos_address"
    command -v java
    java -version 2>&1
    echo "=== 环境变量检查结束 ==="
  } >> /tmp/monitor_env.log 2>&1

  # ========== Script configuration ==========
  # NOTE(review): target_folder is relative, so lark.log lands in whatever
  # directory the script is started from (under cron usually $HOME) - confirm.
  target_folder="./"

  # The webhook can be supplied through the environment so the real token does
  # not have to be committed with the script.
  LARK_WEBHOOK="${LARK_WEBHOOK:-https://open.larksuite.com/open-apis/bot/v2/hook/token}"
  LARK_LOG_FILE="${target_folder%/}/lark.log"
  MONITOR_LOG_FILE="/tmp/check.logs"

  LARK_MAX_RETRY=3
  LARK_RETRY_DELAY=10
  LARK_ENV="fat"
  65. 65
  # ========== Single-instance lock ==========
  LOCK_FILE="/tmp/service_monitor.lock"

  # Without flock the lock attempt below would fail and be misreported as
  # "already running", so check for it explicitly.
  if ! command -v flock > /dev/null 2>&1; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 未找到 flock 命令,无法保证单实例运行,退出" >> "$MONITOR_LOG_FILE"
    exit 1
  fi

  # Open in append mode: a plain '>' truncates the file before the lock is even
  # tried, wiping the PID written by the instance that currently holds it.
  exec 200>> "$LOCK_FILE"
  if ! flock -n 200; then
    holder_pid=$(cat "$LOCK_FILE" 2> /dev/null)
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 监控脚本已在运行,退出本次执行 (PID: ${holder_pid:-未知})" >> "$MONITOR_LOG_FILE"
    exit 0
  fi

  # We own the lock now; record our PID for the message above.
  echo $$ > "$LOCK_FILE"

  # On exit only empty the file instead of deleting it. Removing the path while
  # another process may already have it open lets two instances lock two
  # different inodes; the kernel drops the flock by itself when fd 200 closes.
  trap ': > "$LOCK_FILE"' EXIT
  trap 'exit' INT TERM

  echo "[$(date '+%Y-%m-%d %H:%M:%S')] 启动服务监控进程 (PID: $$)" | tee -a "$MONITOR_LOG_FILE"
  83. 83
  # ========== Lark alerting ==========

  #######################################
  # Escape a string so it can be placed inside a JSON string literal.
  # Arguments: $1 - raw text
  # Outputs:   escaped text on stdout
  #######################################
  _lark_json_escape() {
    local s=$1
    s=${s//\\/\\\\}     # backslash first, so later escapes are not doubled
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
  }

  #######################################
  # Send an alert card to the Lark webhook, retrying on failure.
  # Globals:
  #   LARK_WEBHOOK      (read) webhook URL
  #   LARK_ENV          (read) environment tag put in front of the message
  #   LARK_MAX_RETRY    (read) maximum number of attempts, default 3
  #   LARK_RETRY_DELAY  (read) seconds to wait between attempts, default 10
  #   LARK_LOG_FILE     (read) file receiving the success/failure log line
  # Arguments:
  #   $1 - alert text; it is JSON-escaped, so pass real newlines for line breaks
  # Returns:
  #   0 once the webhook answers HTTP 200, 1 after all attempts failed
  #######################################
  send_lark_alert() {
    local message=$1
    local max_retry=${LARK_MAX_RETRY:-3}
    local retry_delay=${LARK_RETRY_DELAY:-10}
    local log_file=${LARK_LOG_FILE:-/dev/null}
    local timestamp host content json_data http_status attempt

    timestamp=$(date "+%Y-%m-%d %H:%M:%S")
    host=$(hostname)

    # "\\n" yields a literal backslash-n, i.e. the JSON escape for a newline.
    content="$(_lark_json_escape "[$LARK_ENV][进程监控] $message")\\n"
    content+="服务器名: $(_lark_json_escape "$host")\\n"
    content+="报警时间: $timestamp\\n"

    # Build the payload with printf so the JSON quotes survive; nesting them
    # unescaped inside a double-quoted shell string silently drops them.
    printf -v json_data \
      '{"msg_type":"interactive","card":{"elements":[{"tag":"div","text":{"content":"%s","tag":"lark_md"}}]}}' \
      "$content"

    for ((attempt = 1; attempt <= max_retry; attempt++)); do
      # The payload goes through stdin, not argv: the alert text contains the
      # monitored process identifier, and a curl command line carrying it would
      # be matched by the monitor's own `pgrep -f` check.
      http_status=$(printf '%s' "$json_data" | curl -s -o /dev/null -w "%{http_code}" \
        -m 3 \
        -X POST "$LARK_WEBHOOK" \
        -H "Content-Type: application/json" \
        --data-binary @-)

      if [[ "$http_status" == "200" ]]; then
        echo "[$(date "+%F %T")] Lark报警发送成功" >> "$log_file"
        return 0
      fi

      # Wait only when another attempt follows.
      if ((attempt < max_retry)); then
        sleep "$retry_delay"
      fi
    done

    echo "[$(date "+%F %T")] 错误: Lark报警发送失败,已达最大重试次数: $max_retry" >> "$log_file"
    return 1
  }
  128. 128
  #######################################
  # Fire-and-forget variant of send_lark_alert: the alert is sent from a
  # background job with all of its output discarded, so the caller never waits
  # for the webhook or its retries.
  # Arguments: $1 - alert text handed to send_lark_alert
  #######################################
  send_alert_async() {
    local alert_text=$1
    { send_lark_alert "$alert_text"; } > /dev/null 2>&1 &
  }
  134. 134
  # ========== Service configuration ==========
  # Directory the monitor changes into before running a start command.
  WORK_DIR="/data/scripts"

  # Parallel arrays read by the main loop: element i of each array describes the
  # same service (pgrep pattern, display name, start command).
  SERVICES_PROCESS_ID=()
  SERVICES_FRIENDLY_NAME=()
  SERVICES_START_CMD=()

  #######################################
  # Register one service to monitor.
  # Arguments:
  #   $1 - process identifier searched for with `pgrep -f` (the jar file name)
  #   $2 - human-readable name used in logs and alerts
  #   $3 - start command, run from WORK_DIR
  #######################################
  add_service() {
    SERVICES_PROCESS_ID+=("$1")
    SERVICES_FRIENDLY_NAME+=("$2")
    SERVICES_START_CMD+=("$3")
  }

  # Service list
  #add_service "pb-trading-engine-1.0-SNAPSHOT.jar" "交易引擎 (Trading Engine)" "./deploy-engine-new.sh start"
  add_service "pb-trading-market-1.0-SNAPSHOT.jar" "行情服务 (Trading Market)" "./pb-trading-market.sh start"
  add_service "rapidx-trading-query-realtime-1.0-SNAPSHOT.jar" "实时查询 (Query Realtime)" "./deploy-realtime-new.sh start"
  add_service "router-server-1.0.0-SNAPSHOT.jar" "路由服务 (Router Server)" "./router-server.sh start"
  add_service "pb-trading-query-1.0-SNAPSHOT.jar" "交易查询 (Trading Query)" "./pb-trading-query.sh start"
  add_service "rapidx-trading-market-gateway-1.0-SNAPSHOT.jar" "行情网关 (Market Gateway)" "./rapidx-trading-market-gateway.sh start"
  add_service "ltp-ems-1.0.0-master-SNAPSHOT.jar" "订单管理 (EMS)" "./deploy-ems-new.sh start"
  add_service "pb-trading-push-0.0.1-SNAPSHOT.jar" "推送服务 (Trading Push)" "./pb-trading-push.sh start"
  add_service "pb-trading-gateway-1.0-SNAPSHOT.jar" "交易网关 (Trading Gateway)" "./pb-trading-gateway.sh start"
  add_service "exchange-data-server-1.0.0-SNAPSHOT.jar" "exchange-data-server" "./ltp-exchange-data-server.sh start"
  # rapidtrade-storage is registered once; a second identical entry makes the
  # monitor launch it twice and alert twice whenever it is down.
  add_service "rapidtrade-storage-1.0-SNAPSHOT.jar" "rapidtrade-storage" "./rapidtrade-storage.sh start"
  add_service "rapidx-trading-algo-server-1.0-SNAPSHOT.jar" "rapidx-trading-algo-server" "./rapidx-trading-algo-server.sh start"
  add_service "rapidtrade-mock-1.0-SNAPSHOT.jar" "rapidtrade-mock" "./rapidtrade-mock.sh start"
  add_service "rapidx-ws-simulator-1.0-SNAPSHOT.jar" "rapidx-ws-simulator" "./rapidx-ws-simulator.sh start"
  add_service "pb-trading-statistics-1.0-SNAPSHOT.jar" "pb-trading-statistics" "./pb-trading-statistics.sh start"
  add_service "rapidx-trading-onezero-maker-1.0-SNAPSHOT.jar" "rapidx-trading-onezero-maker" "./rapidx-trading-onezero-maker.sh start"
  add_service "pb-trading-transfer-1.0-SNAPSHOT.jar" "pb-trading-transfer" "./pb-trading-transfer.sh start"
  add_service "rapidx-trading-clearing-1.0-SNAPSHOT.jar" "rapidx-trading-clearing" "./rapidx-trading-clearing.sh start"
  add_service "pb-trading-monitor-1.0-SNAPSHOT.jar" "pb-trading-monitor" "./pb-trading-monitor.sh start"
  add_service "rapidx-trading-query-persistent-1.0-SNAPSHOT.jar" "rapidx-trading-query-persistent" "./deploy-persistent-new.sh start"
  # The start command needs the "./" prefix and the "start" argument like every
  # other entry: it is executed after `cd "$WORK_DIR"`, where a bare
  # "bitu-trade.sh" is only looked up on PATH.
  # NOTE(review): confirm bitu-trade.sh accepts "start" like its siblings.
  add_service "bitu-trade-1.0-SNAPSHOT.jar" "bitu-trade" "./bitu-trade.sh start"
  #add_service "bitu-trade-1.0-SNAPSHOT.jar" "bitu-trade" "./deploy.sh start"
  add_service "ltp-data-integration-1.0-SNAPSHOT.jar" "ltp-data-integration" "./ltp-data-integration.sh start"
  add_service "ingest-server-app-1.0-SNAPSHOT.jar" "ingest-server-app" "./data-ingest-server.sh start"
  add_service "ltp-data-visual-1.0-SNAPSHOT.jar" "ltp-data-visual" "./data-cam-visual.sh start"
  239. 239
  240. 240
  # ========== Main loop ==========
  mkdir -p "$(dirname "$MONITOR_LOG_FILE")"

  #######################################
  # Launch one service in a background subshell.
  # Globals:   ENV_FILES, WORK_DIR, MONITOR_LOG_FILE (read)
  # Arguments: $1 - start command string ("<script> <args>"), run from WORK_DIR
  #######################################
  start_service() {
    local start_cmd=$1
    (
      # Reload the profile files so the service starts with a fresh environment.
      for env_file in "${ENV_FILES[@]}"; do
        if [[ -f "$env_file" ]]; then
          # shellcheck disable=SC1090  # path is only known at run time
          source "$env_file"
        fi
      done

      # These values are applied unconditionally, overriding whatever the
      # profile files set.
      export JAVA_HOME="/opt/jdk-17.0.8"
      export CLASSPATH=".:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar"
      export PATH="$JAVA_HOME/bin:$PATH"
      export env_nacos_address="nacos.test.com:8848"
      export env_nacos_namespace="fat"
      export ENV="fat"

      # A relative start command run from the wrong directory would fail (or
      # run something else), so give up if WORK_DIR is not reachable.
      if ! cd "$WORK_DIR"; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 无法进入目录 $WORK_DIR,放弃启动: $start_cmd" >> "$MONITOR_LOG_FILE"
        exit 1
      fi
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 启动目录: $(pwd), JAVA_HOME: $JAVA_HOME" >> "$MONITOR_LOG_FILE"
      # Deliberately unquoted: the string is split into script and arguments.
      $start_cmd >> "$MONITOR_LOG_FILE" 2>&1
    ) 200>&- &
    # "200>&-" closes the monitor's lock descriptor in the child. Otherwise the
    # started service inherits it and keeps the flock after the monitor itself
    # has died, so cron could never start a new monitor.
  }

  while true; do
    for i in "${!SERVICES_PROCESS_ID[@]}"; do
      process_id="${SERVICES_PROCESS_ID[$i]}"
      friendly_name="${SERVICES_FRIENDLY_NAME[$i]}"
      start_cmd="${SERVICES_START_CMD[$i]}"

      # pgrep -f takes a regular expression; escape the dots of the jar name so
      # they only match literal dots.
      if pgrep -f -- "${process_id//./\\.}" > /dev/null; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 服务 [$friendly_name] 运行正常." >> "$MONITOR_LOG_FILE"
        continue
      fi

      echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 服务 [$friendly_name] (进程标识: $process_id) 未运行,正在重启..." | tee -a "$MONITOR_LOG_FILE"
      start_service "$start_cmd"
      send_alert_async "服务 [$friendly_name] (进程标识: $process_id) 已停止运行!正在尝试重启。"
    done

    sleep 10
  done
复制代码
View Code 
 
3.2 使用开源通用软件统一维护自动拉起(开发+运维都可以简单维护和使用)        
3.gif
4.gif
  # Install Monit 5.34.0 from source into /usr/local/monit.
  # Build dependencies - use the line that matches your distribution.
  dnf install -y gcc make openssl-devel bison flex zlib-devel
  #apt install -y gcc make libssl-dev bison flex zlib1g-dev
  #yum install -y gcc make openssl-devel bison flex zlib-devel

  # One && chain: a failed download, unpack or cd must not let configure/make
  # run in the wrong directory.
  wget https://mmonit.com/monit/dist/monit-5.34.0.tar.gz \
    && tar xf monit-5.34.0.tar.gz \
    && cd monit-5.34.0/ \
    && ./configure --prefix=/usr/local/monit --without-pam \
    && make \
    && make install \
    && mkdir -p /usr/local/monit/etc \
    && cp monitrc /usr/local/monit/etc/ \
    && chmod 600 /usr/local/monit/etc/monitrc
  # monitrc is the main configuration file (poll interval, included check files).

  # Make the binary reachable from the standard locations; -f keeps the step
  # repeatable when the link already exists.
  ln -sf /usr/local/monit/bin/monit /usr/sbin/monit
  cp /usr/local/monit/bin/monit /usr/bin/
  monit --version

  # Per-process check files live here; edit them when a start script changes.
  mkdir -p /etc/monit/conf.d/
  15. 15
  16. 16
  # systemd unit for Monit: /etc/systemd/system/monit.service
  # Comments inside the unit sit on their own lines: systemd only recognises
  # '#' or ';' at the start of a line, so a trailing comment after
  # "WantedBy=multi-user.target" would become part of the value.
  printf '%s\n' \
    '[Unit]' \
    'Description=Monit process monitor' \
    'Documentation=https://mmonit.com/monit/' \
    'After=network.target' \
    '' \
    '[Service]' \
    'Type=forking' \
    'ExecStart=/usr/bin/monit -c /usr/local/monit/etc/monitrc' \
    'ExecReload=/usr/bin/monit -c /usr/local/monit/etc/monitrc reload' \
    'ExecStop=/usr/bin/monit -c /usr/local/monit/etc/monitrc quit' \
    'PIDFile=/var/run/monit.pid' \
    'Restart=on-failure' \
    'User=root' \
    'Group=root' \
    '' \
    '[Install]' \
    '# Start at boot in multi-user mode.' \
    'WantedBy=multi-user.target' \
    > /etc/systemd/system/monit.service

  # Make systemd read the new unit file. "systemctl reload monit" would instead
  # ask the (not yet running) service to reload its own configuration.
  systemctl daemon-reload
  systemctl enable monit
  systemctl start monit
复制代码
3.2.1程序安装.sh 
5.gif
6.gif
  # Main configuration: /usr/local/monit/etc/monitrc
  set daemon 10                     # run the checks every 10 seconds
  set logfile /var/log/monit.log
  # Load the per-service check files. Without this line nothing under
  # /etc/monit/conf.d/ is read.
  include /etc/monit/conf.d/*.conf

  # Process check: /etc/monit/conf.d/rapidtrade-mock.conf
  # NOTE(review): "matching" is a pattern over the process list; a more specific
  # pattern (e.g. the jar file name) may be safer - verify with `monit procmatch`.
  check process rapidtrade_mock matching "rapidtrade-mock"
      start program = "/data/scripts/rapidtrade-mock.sh start"
      stop program = "/data/scripts/rapidtrade-mock.sh stop"
      if does not exist then start



  # Port check (alternative to the process check above).
  # It has its own service name because Monit service names must be unique.
  check host rapidtrade_mock_port with address 127.0.0.1
      if failed
          port 7040
          type tcp
          timeout 5 seconds
          for 2 cycles
      then start
      start program = "/data/scripts/rapidtrade-mock.sh start" as uid root and gid root
      stop program = "/data/scripts/rapidtrade-mock.sh stop" as uid root and gid root
      if 3 restarts within 5 cycles then timeout




  # HTTP health check (port + path). In the author's tests this variant did not
  # work: the service was not started and did not recover.
  # NOTE(review): confirm that port 80 is the port the service really listens on.
  check host my_web_service with address 127.0.0.1
      if failed
          port 80
          protocol http
          request "/actuator/prometheus" # health endpoint path to request
          with timeout 10 seconds
          for 3 cycles
      then restart
      start program = "/usr/bin/systemctl start my-service"
      stop program = "/usr/bin/systemctl stop my-service"
复制代码
3.2.2 配置使用 
 
4.测试和使用
7.jpg
8.jpg
这样对于传统(非容器)服务,只需要该程序有对应的 start.sh、stop.sh 脚本,就可以通过简单配置进程健康检测来实现服务自动拉起,简单高效,不需要每个团队各自开发、维护很多脚本。
来源:程序园用户自行投稿发布,如果侵权,请联系站长删除
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!

相关推荐

您需要登录后才可以回帖 登录 | 立即注册