Compare commits

...

5 Commits

Author SHA1 Message Date
f77d618553 Block more bad bot activity
Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
2025-09-28 16:22:24 +01:00
c4263762c7 Added 'investigating-and-fixing-stoppodsandbox' post
Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
2025-09-28 12:17:11 +01:00
1a9df45e55 Added advice from Josh Clark
Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
2025-05-10 17:14:00 +01:00
d5d7f74fd0 Added some advice from lobste.rs
Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
2025-05-03 07:11:07 +01:00
934c4f4de3 Added advice from Aaron Patterson
Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
2025-05-03 07:04:38 +01:00
6 changed files with 351 additions and 0 deletions

View File

@@ -33,6 +33,7 @@ router.all(/.*\.\.\/.*/, blackHole);
router.all('/__MACOSX/*?', blackHole);
// Block access to WordPress files
router.all('(/*)?/wp-admin/', blackHole);
router.all('(/*)?/wp-admin/.*', blackHole);
router.all('(/*)?/wp-includes/?(*)?', blackHole);
router.all('(/*)?/wp-content/?(*)?', blackHole);
router.all('/wordpress/', blackHole);
@@ -59,6 +60,7 @@ router.all('/bak/', blackHole);
router.all('archive.zip', blackHole);
// Block access to possible credentials
router.all('/env.test', blackHole);
router.all('(/.*)?/env$', blackHole);
router.all('/admin(/.*)?', blackHole)
router.all('/credentials(/*)?', blackHole);
router.all(/.*credentials\.json$/, blackHole);

View File

@@ -42,6 +42,17 @@
"Segoe UI Symbol", "Noto Color Emoji";
}
::-moz-selection {
background-color: var(--primary);
color: var(--primary-inverse);
}
::selection {
background-color: var(--primary);
color: var(--primary-inverse);
}
h2 {
margin-bottom: 10px;
}

22
src/js/promql.js Normal file
View File

@@ -0,0 +1,22 @@
var module=module?module:{};
function hljsDefinePromQL(hljs){
const terms = ['avg', 'bottomk', 'count', 'count_values', 'group', 'limitk', 'limit_ratio', 'max', 'min', 'quantile', 'stddev', 'stdvar', 'sum', 'topk', 'abs', 'absent', 'absent_over_time', 'acos', 'acosh', 'asin', 'asinh', 'atan', 'atanh', 'avg_over_time', 'ceil', 'changes', 'clamp', 'clamp_max', 'clamp_min', 'cos', 'cosh', 'count_over_time', 'days_in_month', 'day_of_month', 'day_of_week', 'day_of_year', 'deg', 'delta', 'deriv', 'exp', 'floor', 'histogram_avg', 'histogram_count', 'histogram_fraction', 'histogram_quantile', 'histogram_sum', 'histogram_stddev', 'histogram_stdvar', 'double_exponential_smoothing', 'hour', 'idelta', 'increase', 'info', 'irate', 'label_replace', 'label_join', 'last_over_time', 'ln', 'log10', 'log2', 'mad_over_time', 'max_over_time', 'min_over_time', 'ts_of_max_over_time', 'ts_of_min_over_time', 'ts_of_last_over_time', 'minute', 'month', 'pi', 'predict_linear', 'present_over_time', 'quantile_over_time', 'rad', 'rate', 'resets', 'round', 'scalar', 'sgn', 'sin', 'sinh', 'sort', 'sort_desc', 'sort_by_label', 'sort_by_label_desc', 'sqrt', 'stddev_over_time', 'stdvar_over_time', 'sum_over_time', 'tan', 'tanh', 'time', 'timestamp', 'vector', 'year', 'on', 'ignoring', 'group_left', 'group_right', 'start()', 'end()', 'by', 'without', 'nan', 'inf'];
const DURATION_MODE = {
scope: 'number',
match: new RegExp(`\\d+[${['y','w','d','h','m','s','ms'].join()}]`)
};
return {
name: "PromQL",
aliases: "promql prom prometheus",
keywords: terms,
contains: [
hljs.COMMENT('#', '$'),
hljs.QUOTE_STRING_MODE,
DURATION_MODE,
hljs.NUMBER_MODE
]
};
}
module.exports=function(hljs){hljs.registerLanguage('promql',hljsDefinePromQL)};
module.exports.definer=hljsDefinePromQL;

View File

@@ -11,6 +11,12 @@ summary: |
<details>
<summary>Changelog</summary>
2025-05-10: Added advice from Josh Clark
2025-05-03: Added some advice from Lobste.rs comments
2025-05-03: Added advice from Aaron Patterson about repeating questions back
2025-05-01: Added advice from Márk Sági-Kazár
2025-05-01: Added suggestion from Oriel about going to the toilet before speaking.
@@ -65,6 +71,14 @@ I also recommend having a final slide with links to get a copy of your slides if
Always remember that **your slides are only there to reinforce what you're saying** and are not the total content. They should enhance and summarise your points as you go so keep them brief and snappy.
On a similar note to avoiding paragraphs, [stip](https://lobste.rs/~stip) points out on [lobste.rs](https://lobste.rs/s/ioyjfm/my_tips_on_giving_technical_talks#c_jixtpt) that:
> you should limit code on slides as much as possible, ideally only to a few lines. In my opinion, most points can be made with just a function signature or a several-line toy function. Showing the audience more than that risks losing them while they try to figure out what is going on.
> If you really need to put a long function on a slide, try and highlight the relevant section as you're talking about it so the audience can follow along better.
This is great advice! Techies in the audience will automatically try and read all the code and figure out what it does rather than listen to you talking. For this I like to use contrast of text colour to highlight the bits I want people to focus on and "fade out" the rest of the code that is just boiler plate or will be relevant later. I will often have the same code on multiple slides but each slide highlights a different bit of the code so I can walk the audience through it rather than dumping it on them all in one go.
### Technical
Most presentation software has the ability to write **speakers notes**. I don't personally use these myself as I can't read fast enough to make them useful but if you feel more confident with them then I totally recommend using them. The main thing to keep in mind though is not to use them as a script but rather just short notes to remind you of your place in the talk.
@@ -120,6 +134,10 @@ Be aware that you don't need to be all knowing - if you don't have an answer for
If you think an answer might require considerable time or be confusing to explain from the stage then suggest that the person asking the question meets you after the session to discuss it further.
One thing [Aaron Patterson pointed out on Bluesky](https://bsky.app/profile/tenderlove.dev/post/3lo7btf5gfs2u) that I hadn't included here is:
> during Q&A try to repeat the questions. Audience members don't always have a mic so repeating will help everyone, it makes sure you understand the Q, and buys you time to think of an answer
## After the talk
So, you did it! You gave the talk! Congratulations! 🎉
@@ -193,3 +211,21 @@ Bigger is always better here. Assume the person sat at the back of the room also
> Disable true tone (sometimes it makes screen recording yellowish)
Solid practical tip if you use true tone on your machine. Not something I've ever thought of before.
### [tomhukins](https://lobste.rs/~tomhukins) on [lobste.rs](https://lobste.rs/s/ioyjfm/my_tips_on_giving_technical_talks#c_dklog7)
> In one of my talks, I completely forgot what I wanted to say. I noticed myself panicking, so took a deep breath, paused and looked at the audience. When I watched the video of my talk, it looked like a natural pause in the presentation. Although saying “um” as a pause is okay, silence works better.
> I'm a native English speaker and I often present to people who aren't. I try to speak slowly and clearly, and to use simple or common words whenever possible. I want to communicate an idea, not to sound clever.
> Sharing links and contact details on a final slide makes sense: include a QR code for the audience to photograph.
### [Josh Clark](https://www.linkedin.com/in/je-clark/) on [LinkedIn](https://www.linkedin.com/feed/update/urn:li:activity:7323429734250418176?commentUrn=urn%3Ali%3Acomment%3A%28activity%3A7323429734250418176%2C7326995404191870977%29&dashCommentUrn=urn%3Ali%3Afsd_comment%3A%287326995404191870977%2Curn%3Ali%3Aactivity%3A7323429734250418176%29)
> In demos, make sure the relevant info (e.g. terminal commands) is in the middle third of the screen. The top of the projector screen can often be washed out or reflecting the overhead lights in the room, and the bottom is often cut off by people's heads.
This is actually really good advice for all your slides. Having padding around the edges of your presentation theme will help you deal with this issue and any wonky projector issues that you might face.
> And when I'm juggling multiple windows (slides, terminal, wireshark, etc) in a presentation, I like to set them up in individual virtual desktops on my mac. In rehearsals, I make sure the order of those desktops is conducive to swiping between them.
This is a great practical tip for how to handle the "cleaning your desktop" advice I have above! Much slicker than moving around windows or finding the appropriate tab or whatever.

View File

@@ -0,0 +1,278 @@
---
layout: post.html
date: 2025-09-28
title: Investigating and fixing "StopPodSandbox from runtime service failed" Kubelet errors
tags: Kubernetes, Kubelet, CNI
summary: |
For months now (maybe years, who knows 🤷) I've had the following error sprinkled throughout my Kubelet logs across multiple of my worker nodes:
```plain
StopPodSandbox from runtime service failed: rpc error: code = Unknown desc = failed to destroy network for sandbox "055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708": failed to get network "cbr0" cached result: decoding version from network config: unexpected end of JSON input
```
As there has never been any associated impact with this that I could see I just ignored it... for a long time!
Well, fast forward to last Friday night where a sudden hyper-focus hit and I decided I wanted to know what was going on and how to stop it - here's how it went.
---
For months now (maybe years, who knows 🤷) I've had the following error sprinkled throughout my Kubelet logs across multiple of my worker nodes:
```plain
StopPodSandbox from runtime service failed: rpc error: code = Unknown desc = failed to destroy network for sandbox "055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708": failed to get network "cbr0" cached result: decoding version from network config: unexpected end of JSON input
```
As there has never been any associated impact with this that I could see I just ignored it... for a long time!
Well, fast forward to last Friday night where a sudden hyper-focus hit and I decided I wanted to know what was going on and how to stop it - here's how it went.
## A little context
Before we dive into things I think it's worth providing a bit of context first so we all know what we're working with. The Kubernetes cluster in question is my "homelab" cluster that is running [Talos Linux](https://www.talos.dev) within a [Proxmox](https://www.proxmox.com/en/) cluster. The cluster has a single control plane node and 5 worker nodes. The worker nodes are where we'll be focussing. Beyond that there's nothing too special about this cluster. CNI is provided by [Flannel](https://github.com/flannel-io/flannel) with default configuration. The only thing that is kinda unusual is my use of [SMB for my CSI driver](https://github.com/kubernetes-csi/csi-driver-smb) - I only mention this because it's possible that a failure here _might_ have been the thing that triggered the initial issue but I don't know for sure.
## Identifying the issue
As I said, I had ignored this for months and not thought much of it until Friday. I was cleaning up some unrelated stuff in my cluster and was checking my Kubelet logs stored in [Loki](https://grafana.com/docs/loki/latest/) to see if my cleaning up had been applied. This is how I (re-)noticed the high number of sandbox errors from various Kubelets. So I took a closer look with:
```promql
{service="kubelet"} |= "failed to get network \"cbr0\" cached result"
```
There was a LOT of these errors in the logs, almost 1000 per hour! 😳
Even though there was a lot of errors there was actually only a handful of sandbox IDs causing the problem so it wasn't widespread, just frequent.
## Figuring out what was wrong
Because I'm a Professional Kubernetes Platform Engineer™ I knew exactly how to tackle this... ask the internet! So I fired up DuckDuckGo, pasted in my error message and started wading through a LOT of useless results.
* I found a couple [similar](https://github.com/projectcalico/calico/issues/4084) [issues](https://github.com/projectcalico/calico/issues/7647) that people had experienced with Calico but didn't seem to fit what I was seeing.
* There was a [closed issue](https://github.com/containerd/containerd/issues/6565) against Containerd where [this comment](https://github.com/containerd/containerd/issues/6565#issuecomment-1632089810) gave me some hope in solving it with a loopback CNI but turned out unrelated also.
These did give me enough info to start looking in the right places though. There was just one problem - Talos Linux doesn't have a shell you can SSH into, it's all API driven.
`kubectl debug` to the rescue!!!
Thankfully, it's possible to mimic an SSH session using `kubectl debug` against the node. As I knew I'd need a bunch of container-related tools to help with debugging I launched a debug container using the [alpine-containertools](https://github.com/raesene/alpine-containertools) image from [Rory McCune](https://www.mccune.org.uk). Due to how the debug command mounts the host filesystem I also had to override the `CONTAINER_RUNTIME_ENDPOINT` environment variable to point to the correct location.
```bash
kubectl debug -it --image raesene/alpine-containertools \
--env CONTAINER_RUNTIME_ENDPOINT=unix:///host/run/containerd/containerd.sock \
node/talos-192-168-1-15 \
-- sh
```
Now we can start poking around on the node. First, let's see if we can force delete the sandbox pod ourselves.
```bash
crictl rmp --force 055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708
Failed to stop sandbox before removing: rpc error: code = Unknown desc = failed to destroy network for sandbox "055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708": failed to get network "cbr0" cached result: decoding version from network config: unexpected end of JSON input
```
No luck! But we've been able to reproduce what Kubelet is hitting at least.
I spent a bit more time poking at `crictl` to get an understanding of the state of things. Doing so I was able to identify which pods were broken on each node.
```bash
# Get the full details about the pod in containerd
crictl inspectp --output=json 055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708
# List all pods known to containerd, and filter to those in a NotReady state
crictl pods | grep NotReady
```
Eventually, after wading through all the unrelated search results one did seem relevant, a GitHub issue on `containerd`: "[Empty cache files causes "KillPodSandbox" errors when deleting pods](https://github.com/containerd/containerd/issues/8197)".
## Cleaning things up
This final GitHub issue contained one important piece of information I was missing - the location of the network cache: `/var/lib/cni/results`. This is the directory where the CNI stores a JSON file for each network it creates. So let's go take a look:
```bash
> ls -la /host/var/lib/cni/results
total 148
drwx------ 2 root root 4096 Sep 27 05:48 .
drwx------ 5 root root 52 Jun 23 2022 ..
-rw------- 1 root root 1647 Sep 27 05:47 cbr0-01d576d2f808a969bb7cb8da11d1ee3117ec4e9792d4aea33abec55318b74e01-eth0
-rw------- 1 root root 0 Jun 15 21:28 cbr0-055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708-eth0
-rw------- 1 root root 1608 Sep 27 05:48 cbr0-225486a2df0c2c8be5038245d099b4c48dc03ede648735a2b4676154650e7741-eth0
-rw------- 1 root root 2045 Sep 27 05:46 cbr0-2e848e89543e7b6491d0b47eb2be7ec620865024d942012303ede16842aa1108-eth0
-rw------- 1 root root 1638 Sep 27 05:48 cbr0-415cd18f24ec2740486a7bf254054f871bd8abe4c26bc6e27dbf1f5a10c29f69-eth0
-rw------- 1 root root 1638 Sep 27 05:46 cbr0-47d021d26d2d1935025cee58941ff68ad28e09f1337b867e0865be56dde64a2a-eth0
-rw------- 1 root root 1616 Sep 27 05:46 cbr0-5d40d2f86c6f4be7a85969f24e844ea049b41af14b97cfa3f5a655f52d4fc695-eth0
-rw------- 1 root root 1616 Sep 27 05:48 cbr0-68f252ab057496036e8852a4f6103c1b6c564f6976d04676010858a5588a9a10-eth0
-rw------- 1 root root 1596 Sep 27 05:48 cbr0-91d8805fe4b95c92f4c53e1321813b5fe9f71467c5692328bb7646649db57b22-eth0
-rw------- 1 root root 1764 Sep 27 05:48 cbr0-a0eaa73bd701c9b1b1167efd49487b48074936211de1c6be37beec5184e42691-eth0
```
Notice anything suspicious? The file size of the file associated with our failing sandbox ID is `0`. 🤔 That explains why the CNI is unable to load from the cache then - it's empty. Taking a quick look at one of the other files shows a valid JSON object with various network configuration stored within.
So let's delete this file - `rm -rf /host/var/lib/cni/results/cbr0-055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708-eth0`
With that gone, let's try again to delete the pod:
```bash
crictl rmp 055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708
Removed sandbox 055b221e44a28ce8d9120f771d5e1ef201f2457ce49c58999a0147104cca2708
```
SUCCESS! 🎉
Now we know what to clean up it's time to repeat the process for each failing sandbox ID on each node. Once that is done the kubelet logs are looking _muuuucccchhhhhhh_ better!
## Preventing this happening again
As I'm still unclear why or how these pods first failed I can't be sure it won't happen again. My current leading theories are it was either caused by a previous power failure that left the nodes in a bad state or was related to my NAS crashing as the few pods I took a close look at all had SMB volumes associated with them.
Regardless of what caused it I want to make sure I'm being more proactive going forward and leaving my cluster in a better state than it was before. I could set up a dashboard that includes the log query from Loki that we used at the start of this post but that requires me to check it often and... I'm lazy. What we really want is some alerts to annoy me when / if this happens again.
I already have quite a decent monitoring and alerting setup in my homelab made up of [VictoriaMetrics](https://victoriametrics.com), [AlertManager](https://prometheus.io/docs/alerting/latest/alertmanager/) and various metric exporters so I'm in a good starting point.
But what I am lacking is containerd metrics.
A quick search led me to this [great post](https://collabnix.com/monitoring-containerd/) by Abraham Dahunsi covering a lot of details about monitoring containerd. This gave me the information on what metrics to work with and what I can do with them but I was still missing the actual metrics. In Talos Linux the containerd metrics aren't enabled by default but thankfully it's [quite simple to enable](https://www.talos.dev/v1.11/talos-guides/configuration/containerd/), I just needed to add the following to my machine config for each of my nodes:
```yaml
machine:
files:
- content: |
[metrics]
address = "0.0.0.0:11234"
path: /etc/cri/conf.d/20-customization.part
op: create
```
I then just need to instruct [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) to scrape this new endpoint:
```yaml
scrape_configs:
- job_name: containerd
scrape_interval: 30s
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}:11234/proxy/v1/metrics
- source_labels: [__meta_kubernetes_node_name]
action: replace
target_label: kubernetes_node
```
Now I have the metrics coming in, we can put them to use at identifying issues.
**Identify failures starting a pod sandbox:**
```promql
sum by(grpc_code, instance) (
rate(grpc_server_handled_total{job="containerd", grpc_code!="OK", grpc_method="RunPodSandbox"})
) > 0
```
**Identify failures stopping or removing a pod sandbox:** (This is the issue we handled in this post)
```promql
sum by(grpc_code, grpc_method, instance) (
rate(grpc_server_handled_total{job="containerd", grpc_code!="OK", grpc_method=~"StopPodSandbox|RemovePodSandbox"})
) > 0
```
**Find all pods known to containerd that aren't known to Kubernetes:**
```promql
(
container_pids_current{namespace="k8s.io"}
unless on (container_id)
label_replace(
kube_pod_container_info,
"container_id", "$1", "container_id", "containerd://(.+)"
)
unless on (container_id)
label_replace(
kube_pod_init_container_info,
"container_id", "$1", "container_id", "containerd://(.+)"
)
)
unless on (container_id)
(
container_memory_swap_limit_bytes > 0
)
```
This last one has a lot going on but it breaks down to:
1. List all `pids` counts for all containers in the `k8s.io` namespace in containerd
2. Unless the `container_id` is also found on the `kube_pod_container_info` metric (where we need to strip the `containerd://` prefix)
3. Or the `container_id` is also found on the `kube_pod_init_container_info` metric
4. And finally only if the `container_memory_swap_limit_bytes` is greater than 0 which will filter out the sandbox containers (pause container) which aren't exposed to Kubernetes anyway.
These three then become new alerts that can notify me if any match.
```yaml
- name: containerd
rules:
- alert: ContainerdPodStartFailed
annotations:
description: |
Containerd on **{{ .Labels.instance }}** has failed to start a pod sandbox due to **{{ .Labels.grpc_code }}**.
expr: |
sum by(grpc_code, instance) (rate(grpc_server_handled_total{job="containerd",grpc_code!="OK", grpc_method="RunPodSandbox"})) > 0
for: 5m
labels:
severity: notify
- alert: ContainerdPodRemoveFailed
annotations:
description: |
Containerd on **{{ .Labels.instance }}** has failed to {{ .Labels.grpc_method }} due to **{{ .Labels.grpc_code }}**.
expr: |
sum by(grpc_code, grpc_method, instance) (rate(grpc_server_handled_total{job="containerd", grpc_code!="OK", grpc_method=~"StopPodSandbox|RemovePodSandbox"})) > 0
for: 5m
labels:
severity: notify
- alert: ContainerdHasLeftoverContainersRunning
annotations:
description: |
Containerd on **{{ .Labels.instance }}** has extra containers that aren't known to Kubernetes.
expr: |
(
container_pids_current{namespace="k8s.io"}
unless on (container_id)
label_replace(
kube_pod_container_info,
"container_id", "$1", "container_id", "containerd://(.+)"
)
unless on (container_id)
label_replace(
kube_pod_init_container_info,
"container_id", "$1", "container_id", "containerd://(.+)"
)
)
unless on (container_id)
(
container_memory_swap_limit_bytes > 0
)
for: 15m
labels:
severity: notify
```
## Things I learnt
I'd never really used `crictl` much before this so it was good to be able to better understand it. One thing I didn't realise was it also has the concept of "pods" and "containers" which are different, but map to, those found in Kubernetes. A "pod" seems to refer to the sandbox container - that is, the container that is created to setup all the networking etc. that is needed for the workloads (this is the "pause container"). Pods can be listed with `crictl pods` and inspected with `crictl inspectp [ID]`. Containers can be listed with `crictl ps` and inspected with `crictl inspect`.
Before this I didn't really know what a CNI _actually did_ in terms of things it created etc., and I still mostly don't, but I've at least seen some hints of it in the way of the network cache files that it creates.
Debugging Talos when it goes wrong at the host level is _tricky_ as you have limited access to the node itself. This is often good as it prevents you from being able to make live changes that could be forgotten or reverted accidentally but when you actually need to fix something like this it just gets in the way. I'm very thankful for `kubectl debug` (and Rory's wonderful `alpine-containertools` image 💙) that makes this debugging possible and relatively painless.

View File

@@ -35,8 +35,10 @@
<link rel="stylesheet" href="/css/zenburn.css" async defer>
<script type="text/javascript" src="/js/highlight.min.js"></script>
<script type="text/javascript" src="/js/terraform.js"></script>
<script type="text/javascript" src="/js/promql.js"></script>
<script>
hljs.registerLanguage('terraform', window.hljsDefineTerraform);
hljs.registerLanguage('promql', window.hljsDefinePromQL);
hljs.highlightAll();
</script>