Connection pool: Add client death tracking

While running this in production I noticed a number of ghost
processes with all their clients dead before they released the connection,
so let's track them to log it and remove them from clients
This commit is contained in:
rinpatch 2020-05-08 18:18:59 +03:00
parent e94ba05e52
commit 1b15cb066c
2 changed files with 45 additions and 2 deletions

View file

@ -20,7 +20,10 @@ def init([key, uri, opts, client_pid]) do
end)
send(client_pid, {:conn_pid, conn_pid})
{:ok, %{key: key, timer: nil}, :hibernate}
{:ok,
%{key: key, timer: nil, client_monitors: %{client_pid => Process.monitor(client_pid)}},
:hibernate}
else
err -> {:stop, err}
end
@ -45,6 +48,9 @@ def handle_cast({:add_client, client_pid, send_pid_back}, %{key: key} = state) d
state
end
ref = Process.monitor(client_pid)
state = put_in(state.client_monitors[client_pid], ref)
{:noreply, state, :hibernate}
end
@ -55,6 +61,9 @@ def handle_cast({:remove_client, client_pid}, %{key: key} = state) do
{conn_pid, List.delete(used_by, client_pid), crf, last_reference}
end)
{ref, state} = pop_in(state.client_monitors[client_pid])
Process.demonitor(ref)
timer =
if used_by == [] do
max_idle = Pleroma.Config.get([:connections_pool, :max_idle_time], 30_000)
@ -85,6 +94,26 @@ def handle_info({:gun_down, _pid, _protocol, _reason, _killed_streams} = down_me
{:stop, {:error, down_message}, state}
end
@impl true
def handle_info({:DOWN, _ref, :process, pid, reason}, state) do
# Sometimes the client is dead before we demonitor it in :remove_client, so the message
# arrives anyway
case state.client_monitors[pid] do
nil ->
{:noreply, state, :hibernate}
_ref ->
:telemetry.execute(
[:pleroma, :connection_pool, :client_death],
%{client_pid: pid, reason: reason},
%{key: state.key}
)
handle_cast({:remove_client, pid}, state)
end
end
# LRFU policy: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.55.1478
defp crf(time_delta, prev_crf) do
1 + :math.pow(0.5, time_delta / 100) * prev_crf

View file

@ -6,7 +6,8 @@ defmodule Pleroma.Telemetry.Logger do
@events [
[:pleroma, :connection_pool, :reclaim, :start],
[:pleroma, :connection_pool, :reclaim, :stop],
[:pleroma, :connection_pool, :provision_failure]
[:pleroma, :connection_pool, :provision_failure],
[:pleroma, :connection_pool, :client_death]
]
def attach do
:telemetry.attach_many("pleroma-logger", @events, &handle_event/4, [])
@ -59,4 +60,17 @@ def handle_event(
"Connection pool had to refuse opening a connection to #{key} due to connection limit exhaustion"
end)
end
def handle_event(
[:pleroma, :connection_pool, :client_death],
%{client_pid: client_pid, reason: reason},
%{key: key},
_
) do
Logger.warn(fn ->
"Pool worker for #{key}: Client #{inspect(client_pid)} died before releasing the connection with #{
inspect(reason)
}"
end)
end
end