From 18234cc44e6bc989e3e3cf15714c54b4fa05b9dd Mon Sep 17 00:00:00 2001 From: Sachin Joshi Date: Tue, 16 Jul 2019 22:37:36 +0545 Subject: [PATCH 1/3] add the rich media ttl based on image exp time --- CHANGELOG.md | 1 + config/config.exs | 3 +- ..._set_richmedia_cache_ttl_based_on_image.md | 32 +++++++++++ lib/pleroma/web/rich_media/parser.ex | 41 ++++++++++++++ .../rich_media/parsers/ttl/aws_signed_url.ex | 54 +++++++++++++++++++ test/fixtures/rich_media/amz.html | 5 ++ test/web/rich_media/aws_signed_url_test.exs | 37 +++++++++++++ 7 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 docs/config/howto_set_richmedia_cache_ttl_based_on_image.md create mode 100644 lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex create mode 100644 test/fixtures/rich_media/amz.html create mode 100644 test/web/rich_media/aws_signed_url_test.exs diff --git a/CHANGELOG.md b/CHANGELOG.md index f3630a1c5..4e58b0a9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Configuration: Filter.AnonymizeFilename added ability to retain file extension with custom text - Admin API: changed json structure for saving config settings. - RichMedia: parsers and their order are configured in `rich_media` config. +- RichMedia: add the rich media ttl based on image expiration time. ## [1.0.1] - 2019-07-14 ### Security diff --git a/config/config.exs b/config/config.exs index 7d539f994..aa5bd0da9 100644 --- a/config/config.exs +++ b/config/config.exs @@ -344,7 +344,8 @@ Pleroma.Web.RichMedia.Parsers.TwitterCard, Pleroma.Web.RichMedia.Parsers.OGP, Pleroma.Web.RichMedia.Parsers.OEmbed - ] + ], + ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl] config :pleroma, :media_proxy, enabled: false, diff --git a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md new file mode 100644 index 000000000..489f9ece8 --- /dev/null +++ b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md @@ -0,0 +1,32 @@ +# How to set rich media cache ttl based on image ttl +## Explanation + +Richmedia are cached without the ttl but the rich media may have image which can expire, like aws signed url. +In such cases the old image url (expired) is returned from the media cache. + +So to avoid such situation we can define a moddule that will set ttl based no image. + +The module must have a `run` function and it should be registered in the config. + +### Example + +```exs +defmodule MyModule do + def run(data, url) do + image_url = Map.get(data, :image) + # do some parsing in the url and get the ttl of the image + # ttl is unix time + ttl = parse_ttl_from_url(image_url) + Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + end +end +``` + +And update the config + +```exs +config :pleroma, :rich_media, + ttl_setters: [Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl, MyModule] +``` + +> For reference there is a parser for AWS signed URL `Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl`, it's enabled by default. diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index 0d2523338..ba8dc6f2a 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -24,6 +24,7 @@ def parse(url) do Cachex.fetch!(:rich_media_cache, url, fn _ -> {:commit, parse_url(url)} end) + |> set_ttl_based_on_image(url) rescue e -> {:error, "Cachex error: #{inspect(e)}"} @@ -31,6 +32,46 @@ def parse(url) do end end + @doc """ + Set the rich media cache based on the expiration time of image. + + Define a module that has `run` function + + ## Example + + defmodule MyModule do + def run(data, url) do + image_url = Map.get(data, :image) + # do some parsing in the url and get the ttl of the image + # ttl is unix time + ttl = parse_ttl_from_url(image_url) + Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + end + end + + Define the module in the config + + config :pleroma, :rich_media, + ttl_setters: [MyModule] + """ + def set_ttl_based_on_image({:ok, data}, url) do + case Cachex.ttl(:rich_media_cache, url) do + {:ok, nil} -> + modules = Pleroma.Config.get([:rich_media, :ttl_setters]) + + if Enum.count(modules) > 0 do + Enum.each(modules, & &1.run(data, url)) + end + + {:ok, data} + + _ -> + {:ok, data} + end + end + + def set_ttl_based_on_image(data, _url), do: data + defp parse_url(url) do try do {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options) diff --git a/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex new file mode 100644 index 000000000..d57107939 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex @@ -0,0 +1,54 @@ +defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do + def run(data, url) do + image = Map.get(data, :image) + + if is_aws_signed_url(image) do + image + |> parse_query_params() + |> format_query_params() + |> get_expiration_timestamp() + |> set_ttl(url) + end + end + + defp is_aws_signed_url(""), do: nil + defp is_aws_signed_url(nil), do: nil + + defp is_aws_signed_url(image) when is_binary(image) do + %URI{host: host, query: query} = URI.parse(image) + + if String.contains?(host, "amazonaws.com") and + String.contains?(query, "X-Amz-Expires") do + image + else + nil + end + end + + defp is_aws_signed_url(_), do: nil + + defp parse_query_params(image) do + %URI{query: query} = URI.parse(image) + query + end + + defp format_query_params(query) do + query + |> String.split(~r/&|=/) + |> Enum.chunk_every(2) + |> Map.new(fn [k, v] -> {k, v} end) + end + + defp get_expiration_timestamp(params) when is_map(params) do + {:ok, date} = + params + |> Map.get("X-Amz-Date") + |> Timex.parse("{ISO:Basic:Z}") + + Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires")) + end + + defp set_ttl(ttl, url) do + Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + end +end diff --git a/test/fixtures/rich_media/amz.html b/test/fixtures/rich_media/amz.html new file mode 100644 index 000000000..d4f8bd1a3 --- /dev/null +++ b/test/fixtures/rich_media/amz.html @@ -0,0 +1,5 @@ + + + + + diff --git a/test/web/rich_media/aws_signed_url_test.exs b/test/web/rich_media/aws_signed_url_test.exs new file mode 100644 index 000000000..75bf6c6df --- /dev/null +++ b/test/web/rich_media/aws_signed_url_test.exs @@ -0,0 +1,37 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2019 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.TTL.AwsSignedUrlTest do + use ExUnit.Case, async: true + + test "amazon signed url is parsed and correct ttl is set for rich media" do + url = "https://pleroma.social/amz" + + {:ok, timestamp} = + Timex.now() + |> DateTime.truncate(:second) + |> Timex.format("{ISO:Basic:Z}") + + # in seconds + valid_till = 30 + + data = %{ + image: + "https://pleroma.s3.ap-southeast-1.amazonaws.com/sachin%20%281%29%20_a%20-%25%2Aasdasd%20BNN%20bnnn%20.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIBLWWK6RGDQXDLJQ%2F20190716%2Fap-southeast-1%2Fs3%2Faws4_request&X-Amz-Date=#{ + timestamp + }&X-Amz-Expires=#{valid_till}&X-Amz-Signature=04ffd6b98634f4b1bbabc62e0fac4879093cd54a6eed24fe8eb38e8369526bbf&X-Amz-SignedHeaders=host", + locale: "en_US", + site_name: "Pleroma", + title: "PLeroma", + url: url + } + + Cachex.put(:rich_media_cache, url, data) + assert {:ok, _} = Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.run(data, url) + {:ok, cache_ttl} = Cachex.ttl(:rich_media_cache, url) + + # as there is delay in setting and pulling the data from cache we ignore 1 second + assert_in_delta(valid_till * 1000, cache_ttl, 1000) + end +end From de9906ad56bd25d6c8c38bef1307192df2e95445 Mon Sep 17 00:00:00 2001 From: Sachin Joshi Date: Fri, 19 Jul 2019 11:43:42 +0545 Subject: [PATCH 2/3] change the structure of image ttl parsar --- ..._set_richmedia_cache_ttl_based_on_image.md | 2 +- lib/pleroma/web/rich_media/parser.ex | 36 +++++----- .../rich_media/parsers/ttl/aws_signed_url.ex | 10 ++- lib/pleroma/web/rich_media/parsers/ttl/ttl.ex | 3 + test/web/rich_media/aws_signed_url_test.exs | 70 +++++++++++++++---- 5 files changed, 85 insertions(+), 36 deletions(-) create mode 100644 lib/pleroma/web/rich_media/parsers/ttl/ttl.ex diff --git a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md index 489f9ece8..5846b6ab0 100644 --- a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md +++ b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md @@ -4,7 +4,7 @@ Richmedia are cached without the ttl but the rich media may have image which can expire, like aws signed url. In such cases the old image url (expired) is returned from the media cache. -So to avoid such situation we can define a moddule that will set ttl based no image. +So to avoid such situation we can define a moddule that will set ttl based on image. The module must have a `run` function and it should be registered in the config. diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index ba8dc6f2a..b69b2be61 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -35,17 +35,17 @@ def parse(url) do @doc """ Set the rich media cache based on the expiration time of image. - Define a module that has `run` function + Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL` ## Example defmodule MyModule do - def run(data, url) do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + def ttl(data, url) do image_url = Map.get(data, :image) # do some parsing in the url and get the ttl of the image - # ttl is unix time - ttl = parse_ttl_from_url(image_url) - Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + # and return ttl is unix time + parse_ttl_from_url(image_url) end end @@ -55,22 +55,26 @@ def run(data, url) do ttl_setters: [MyModule] """ def set_ttl_based_on_image({:ok, data}, url) do - case Cachex.ttl(:rich_media_cache, url) do - {:ok, nil} -> - modules = Pleroma.Config.get([:rich_media, :ttl_setters]) - - if Enum.count(modules) > 0 do - Enum.each(modules, & &1.run(data, url)) - end - - {:ok, data} - + with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url) do + ttl = get_ttl_from_image(data, url) + Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + {:ok, data} + else _ -> {:ok, data} end end - def set_ttl_based_on_image(data, _url), do: data + defp get_ttl_from_image(data, url) do + Pleroma.Config.get([:rich_media, :ttl_setters]) + |> Enum.reduce({:ok, nil}, fn + module, {:ok, _ttl} -> + module.ttl(data, url) + + _, error -> + error + end) + end defp parse_url(url) do try do diff --git a/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex index d57107939..014c0935f 100644 --- a/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex +++ b/lib/pleroma/web/rich_media/parsers/ttl/aws_signed_url.ex @@ -1,5 +1,8 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do - def run(data, url) do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + + @impl Pleroma.Web.RichMedia.Parser.TTL + def ttl(data, _url) do image = Map.get(data, :image) if is_aws_signed_url(image) do @@ -7,7 +10,6 @@ def run(data, url) do |> parse_query_params() |> format_query_params() |> get_expiration_timestamp() - |> set_ttl(url) end end @@ -47,8 +49,4 @@ defp get_expiration_timestamp(params) when is_map(params) do Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires")) end - - defp set_ttl(ttl, url) do - Cachex.expire_at(:rich_media_cache, url, ttl * 1000) - end end diff --git a/lib/pleroma/web/rich_media/parsers/ttl/ttl.ex b/lib/pleroma/web/rich_media/parsers/ttl/ttl.ex new file mode 100644 index 000000000..6b3ec6d30 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/ttl/ttl.ex @@ -0,0 +1,3 @@ +defmodule Pleroma.Web.RichMedia.Parser.TTL do + @callback ttl(Map.t(), String.t()) :: {:ok, Integer.t()} | {:error, String.t()} +end diff --git a/test/web/rich_media/aws_signed_url_test.exs b/test/web/rich_media/aws_signed_url_test.exs index 75bf6c6df..122787bc2 100644 --- a/test/web/rich_media/aws_signed_url_test.exs +++ b/test/web/rich_media/aws_signed_url_test.exs @@ -5,7 +5,7 @@ defmodule Pleroma.Web.RichMedia.TTL.AwsSignedUrlTest do use ExUnit.Case, async: true - test "amazon signed url is parsed and correct ttl is set for rich media" do + test "s3 signed url is parsed correct for expiration time" do url = "https://pleroma.social/amz" {:ok, timestamp} = @@ -16,22 +16,66 @@ test "amazon signed url is parsed and correct ttl is set for rich media" do # in seconds valid_till = 30 - data = %{ - image: - "https://pleroma.s3.ap-southeast-1.amazonaws.com/sachin%20%281%29%20_a%20-%25%2Aasdasd%20BNN%20bnnn%20.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIBLWWK6RGDQXDLJQ%2F20190716%2Fap-southeast-1%2Fs3%2Faws4_request&X-Amz-Date=#{ - timestamp - }&X-Amz-Expires=#{valid_till}&X-Amz-Signature=04ffd6b98634f4b1bbabc62e0fac4879093cd54a6eed24fe8eb38e8369526bbf&X-Amz-SignedHeaders=host", - locale: "en_US", - site_name: "Pleroma", - title: "PLeroma", - url: url - } + metadata = construct_metadata(timestamp, valid_till, url) + + expire_time = + Timex.parse!(timestamp, "{ISO:Basic:Z}") |> Timex.to_unix() |> Kernel.+(valid_till) + + assert expire_time == Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.ttl(metadata, url) + end + + test "s3 signed url is parsed and correct ttl is set for rich media" do + url = "https://pleroma.social/amz" + + {:ok, timestamp} = + Timex.now() + |> DateTime.truncate(:second) + |> Timex.format("{ISO:Basic:Z}") + + # in seconds + valid_till = 30 + + metadata = construct_metadata(timestamp, valid_till, url) + + body = """ + + + + + + """ + + Tesla.Mock.mock(fn + %{ + method: :get, + url: "https://pleroma.social/amz" + } -> + %Tesla.Env{status: 200, body: body} + end) + + Cachex.put(:rich_media_cache, url, metadata) + + Pleroma.Web.RichMedia.Parser.set_ttl_based_on_image({:ok, metadata}, url) - Cachex.put(:rich_media_cache, url, data) - assert {:ok, _} = Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.run(data, url) {:ok, cache_ttl} = Cachex.ttl(:rich_media_cache, url) # as there is delay in setting and pulling the data from cache we ignore 1 second assert_in_delta(valid_till * 1000, cache_ttl, 1000) end + + defp construct_s3_url(timestamp, valid_till) do + "https://pleroma.s3.ap-southeast-1.amazonaws.com/sachin%20%281%29%20_a%20-%25%2Aasdasd%20BNN%20bnnn%20.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIBLWWK6RGDQXDLJQ%2F20190716%2Fap-southeast-1%2Fs3%2Faws4_request&X-Amz-Date=#{ + timestamp + }&X-Amz-Expires=#{valid_till}&X-Amz-Signature=04ffd6b98634f4b1bbabc62e0fac4879093cd54a6eed24fe8eb38e8369526bbf&X-Amz-SignedHeaders=host" + end + + defp construct_metadata(timestamp, valid_till, url) do + %{ + image: construct_s3_url(timestamp, valid_till), + site: "Pleroma", + title: "Pleroma", + description: "Pleroma", + url: url + } + end end From 581756ccc50cc08823957a2f24f506bf23c7cd22 Mon Sep 17 00:00:00 2001 From: Sachin Joshi Date: Fri, 19 Jul 2019 11:50:47 +0545 Subject: [PATCH 3/3] update the docs --- ...owto_set_richmedia_cache_ttl_based_on_image.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md index 5846b6ab0..bfee5a9e6 100644 --- a/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md +++ b/docs/config/howto_set_richmedia_cache_ttl_based_on_image.md @@ -4,20 +4,21 @@ Richmedia are cached without the ttl but the rich media may have image which can expire, like aws signed url. In such cases the old image url (expired) is returned from the media cache. -So to avoid such situation we can define a moddule that will set ttl based on image. - -The module must have a `run` function and it should be registered in the config. +So to avoid such situation we can define a module that will set ttl based on image. +The module must adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL` ### Example ```exs defmodule MyModule do - def run(data, url) do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + + @impl Pleroma.Web.RichMedia.Parser.TTL + def ttl(data, url) do image_url = Map.get(data, :image) # do some parsing in the url and get the ttl of the image - # ttl is unix time - ttl = parse_ttl_from_url(image_url) - Cachex.expire_at(:rich_media_cache, url, ttl * 1000) + # return ttl is unix time + parse_ttl_from_url(image_url) end end ```