From 57eef6d76492e772f83acba2402d50ecb6a69f6b Mon Sep 17 00:00:00 2001 From: ilja Date: Sun, 8 Jan 2023 18:22:53 +0100 Subject: [PATCH] prune_objects can prune orphaned activities who reference an array of objects E.g. Flag activities have an array of objects We prune the activity when NONE of the objects can be found Note that the cost of finding and deleting these is ~4x higher than finding and deleting the non-array ones Only string: Delete on activities (cost=506573.48..506580.38 rows=0 width=0) Only Array: Delete on activities (cost=3570359.68..4276365.34 rows=0 width=0) (They are still executed separately, so the total cost is the sum of the two) --- lib/mix/tasks/pleroma/database.ex | 47 ++++++++++------- test/mix/tasks/pleroma/database_test.exs | 65 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 0f428ca03..726a22d41 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -172,35 +172,48 @@ def run(["prune_objects" | args]) do |> Repo.delete_all(timeout: :infinity) if Keyword.get(options, :prune_orphaned_activities) do + # Prune activities who link to a single object """ delete from public.activities where id in ( - select a.id from public.activities a - left join public.objects o on a.data ->> 'object' = o.data ->> 'id' - left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' - left join public.users u on a.data ->> 'object' = u.ap_id - -- Only clean up remote activities - where not a.local - -- For now we only focus on activities with direct links to objects - -- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects) - and jsonb_typeof(a."data" -> 'object') = 'string' - -- Find Activities that don't have existing objects - and o.id is null - and a2.id is null - and u.id is null + select a.id from public.activities a + left join public.objects o on a.data ->> 'object' = o.data ->> 'id' + left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id' + left join public.users u on a.data ->> 'object' = u.ap_id + where not a.local + and jsonb_typeof(a."data" -> 'object') = 'string' + and o.id is null + and a2.id is null + and u.id is null ) """ - |> Repo.query() + |> Repo.query([], timeout: :infinity) + + # Prune activities who link to an array of objects + """ + delete from public.activities + where id in ( + select a.id from public.activities a + join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array' + left join public.objects o on j.value = o.data ->> 'id' + left join public.activities a2 on j.value = a2.data ->> 'id' + left join public.users u on j.value = u.ap_id + group by a.id + having max(o.data ->> 'id') is null + and max(a2.data ->> 'id') is null + and max(u.ap_id) is null + ) + """ + |> Repo.query([], timeout: :infinity) end - prune_hashtags_query = """ + """ DELETE FROM hashtags AS ht WHERE NOT EXISTS ( SELECT 1 FROM hashtags_objects hto WHERE ht.id = hto.hashtag_id) """ - - Repo.query(prune_hashtags_query) + |> Repo.query() if Keyword.get(options, :vacuum) do Maintenance.vacuum("full") diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 7f5cd91a9..402856f3d 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -354,7 +354,7 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end - test "We don't have unexpected tables which can contain objects that are referenced by activities" do + test "We don't have unexpected tables which may contain objects that are referenced by activities" do # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we # add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities. @@ -481,6 +481,69 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do assert length(activities) == 4 end + + test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do + %Object{} |> Map.merge(%{data: %{"id" => "existing_object"}}) |> Repo.insert() + %User{} |> Map.merge(%{ap_id: "existing_actor"}) |> Repo.insert() + + # Multiple objects, one object exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_object", + "object" => ["non_ existing_object", "existing_object"] + } + }) + |> Repo.insert() + + # Multiple objects, one actor exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_actor", + "object" => ["non_ existing_object", "existing_actor"] + } + }) + |> Repo.insert() + + # Multiple objects, one activity exists (keep) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_existing_activity", + "object" => ["non_ existing_object", "remote_activity_existing_actor"] + } + }) + |> Repo.insert() + + # Multiple objects none exist (prune) + %Activity{} + |> Map.merge(%{ + local: false, + data: %{ + "id" => "remote_activity_without_existing_referenced_object", + "object" => ["owo", "whats_this"] + } + }) + |> Repo.insert() + + assert length(Repo.all(Activity)) == 4 + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Activity)) == 4 + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"]) + activities = Repo.all(Activity) + assert length(activities) == 3 + + assert "remote_activity_without_existing_referenced_object" not in Enum.map( + activities, + fn a -> a.data["id"] end + ) + + assert length(activities) == 3 + end end describe "running update_users_following_followers_counts" do