Redmage/api/download_subreddit_images.go

458 lines
14 KiB
Go
Raw Normal View History

2024-04-09 22:37:26 +07:00
package api
import (
"context"
"errors"
2024-04-14 00:32:55 +07:00
"image/jpeg"
"io"
2024-04-10 17:13:07 +07:00
"net/http"
2024-04-14 00:32:55 +07:00
"net/url"
"os"
"path"
"strings"
"sync"
"time"
"github.com/aarondl/opt/omit"
2024-04-14 00:32:55 +07:00
"github.com/disintegration/imaging"
"github.com/tigorlazuardi/redmage/api/reddit"
2024-04-25 12:31:20 +07:00
"github.com/tigorlazuardi/redmage/models"
"github.com/tigorlazuardi/redmage/pkg/errs"
2024-04-14 00:32:55 +07:00
"github.com/tigorlazuardi/redmage/pkg/log"
"github.com/tigorlazuardi/redmage/pkg/telemetry"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
2024-04-09 22:37:26 +07:00
type DownloadSubredditParams struct {
2024-04-14 00:32:55 +07:00
Countback int
2024-04-25 12:31:20 +07:00
Devices models.DeviceSlice
2024-04-14 00:32:55 +07:00
SubredditType reddit.SubredditType
2024-04-09 22:37:26 +07:00
}
// Sentinel errors returned (wrapped) by DownloadSubredditImages when its
// preconditions are not met. Match them with errors.Is.
var (
	// ErrNoDevices is reported when a download is requested with an empty
	// device list.
	ErrNoDevices = errors.New("api: no devices set")
	// ErrDownloadDirNotSet is reported when the "download.directory"
	// configuration value is empty.
	ErrDownloadDirNotSet = errors.New("api: download directory not set")
)
func (api *API) DownloadSubredditImages(ctx context.Context, subreddit *models.Subreddit, devices models.DeviceSlice) error {
2024-04-10 17:13:07 +07:00
downloadDir := api.config.String("download.directory")
if downloadDir == "" {
return errs.Wrapw(ErrDownloadDirNotSet, "download directory must be set before images can be downloaded").Code(http.StatusBadRequest)
}
if len(devices) == 0 {
2024-04-10 17:13:07 +07:00
return errs.Wrapw(ErrNoDevices, "downloading images requires at least one device configured").Code(http.StatusBadRequest)
}
2024-04-10 17:13:07 +07:00
ctx, span := tracer.Start(ctx, "*API.DownloadSubredditImages", trace.WithAttributes(attribute.String("subreddit", subreddit.Name)))
2024-04-14 00:32:55 +07:00
defer span.End()
wg := sync.WaitGroup{}
countback := int(subreddit.Countback)
2024-04-14 00:32:55 +07:00
2024-04-25 12:31:20 +07:00
var (
list reddit.Listing
err error
)
for countback > 0 {
limit := 100
if limit > countback {
limit = countback
2024-04-14 00:32:55 +07:00
}
log.New(ctx).Debug("getting posts", "subreddit", subreddit, "current_countback", countback, "current_limit", limit)
2024-04-25 12:31:20 +07:00
list, err = api.reddit.GetPosts(ctx, reddit.GetPostsParam{
Subreddit: subreddit.Name,
2024-04-14 00:32:55 +07:00
Limit: limit,
2024-04-25 12:31:20 +07:00
After: list.GetLastAfter(),
SubredditType: reddit.SubredditType(subreddit.Subtype),
2024-04-14 00:32:55 +07:00
})
if err != nil {
return errs.Wrapw(err, "failed to get posts", "subreddit", subreddit)
2024-04-14 00:32:55 +07:00
}
wg.Add(1)
go func(ctx context.Context, posts reddit.Listing) {
defer wg.Done()
err := api.downloadSubredditListImage(ctx, list, subreddit, devices)
2024-04-14 00:32:55 +07:00
if err != nil {
log.New(ctx).Err(err).Error("failed to download image")
}
}(ctx, list)
2024-04-25 12:31:20 +07:00
if len(list.GetPosts()) == 0 {
break
}
2024-04-14 00:32:55 +07:00
countback -= len(list.GetPosts())
}
wg.Wait()
return nil
}
func (api *API) downloadSubredditListImage(ctx context.Context, list reddit.Listing, subreddit *models.Subreddit, devices models.DeviceSlice) error {
2024-04-14 00:49:36 +07:00
ctx, span := tracer.Start(ctx, "*API.downloadSubredditListImage")
2024-04-14 00:32:55 +07:00
defer span.End()
wg := sync.WaitGroup{}
for _, post := range list.GetPosts() {
if !post.IsImagePost() {
continue
}
2024-04-30 14:12:33 +07:00
acceptedDevices := api.getDevicesThatAcceptPost(ctx, post, devices)
if len(acceptedDevices) == 0 {
2024-04-14 00:32:55 +07:00
continue
}
2024-04-30 14:12:33 +07:00
log.New(ctx).Debug("downloading image", "post_id", post.GetID(), "post_url", post.GetImageURL(), "devices", acceptedDevices)
2024-04-14 00:32:55 +07:00
wg.Add(1)
api.imageSemaphore <- struct{}{}
go func(ctx context.Context, post reddit.Post) {
defer func() {
<-api.imageSemaphore
wg.Done()
}()
2024-04-30 14:12:33 +07:00
if imageFile := api.findImageFileForDevices(ctx, post, devices); imageFile != nil {
err := api.saveImageToFSAndDatabase(ctx, imageFile, subreddit, post, acceptedDevices)
if err != nil {
log.New(ctx).Err(err).Error("failed to download subreddit image")
}
return
}
if err := api.downloadSubredditImage(ctx, post, subreddit, acceptedDevices); err != nil {
2024-04-14 00:49:36 +07:00
log.New(ctx).Err(err).Error("failed to download subreddit image")
2024-04-14 00:32:55 +07:00
}
2024-04-14 00:49:36 +07:00
}(ctx, post)
}
2024-04-14 00:32:55 +07:00
2024-04-14 00:49:36 +07:00
wg.Wait()
2024-04-14 00:32:55 +07:00
2024-04-14 00:49:36 +07:00
return nil
}
2024-04-14 00:32:55 +07:00
func (api *API) downloadSubredditImage(ctx context.Context, post reddit.Post, subreddit *models.Subreddit, devices models.DeviceSlice) error {
2024-04-14 00:49:36 +07:00
ctx, span := tracer.Start(ctx, "*API.downloadSubredditImage")
defer span.End()
2024-04-14 00:32:55 +07:00
2024-04-14 00:49:36 +07:00
imageHandler, err := api.reddit.DownloadImage(ctx, post, api.downloadBroadcast)
if err != nil {
return errs.Wrapw(err, "failed to download image")
}
defer imageHandler.Close()
2024-04-14 00:32:55 +07:00
2024-04-14 00:49:36 +07:00
// copy to temp dir first to avoid copying incomplete files.
tmpImageFile, err := api.copyImageToTempDir(ctx, imageHandler)
if err != nil {
return errs.Wrapw(err, "failed to download image to temp file")
2024-04-14 00:32:55 +07:00
}
2024-04-14 00:49:36 +07:00
defer tmpImageFile.Close()
2024-04-14 00:32:55 +07:00
2024-04-14 00:49:36 +07:00
thumbnailPath := post.GetThumbnailTargetPath(api.config)
_, errStat := os.Stat(thumbnailPath)
if errStat == nil {
// file exist
return nil
}
if !errors.Is(errStat, os.ErrNotExist) {
2024-04-14 13:11:10 +07:00
return errs.Wrapw(err, "failed to check thumbnail existence", "path", thumbnailPath)
2024-04-14 00:49:36 +07:00
}
2024-04-26 22:13:04 +07:00
_ = os.MkdirAll(post.GetThumbnailTargetDir(api.config), 0o777)
2024-04-14 00:49:36 +07:00
thumbnailSource, err := imaging.Open(tmpImageFile.filename)
if err != nil {
return errs.Wrapw(err, "failed to open temp thumbnail file",
"filename", tmpImageFile.filename,
"post_url", post.GetPermalink(),
"image_url", post.GetImageURL(),
)
2024-04-14 00:49:36 +07:00
}
thumbnail := imaging.Resize(thumbnailSource, 256, 0, imaging.Lanczos)
thumbnailFile, err := os.Create(thumbnailPath)
if err != nil {
return errs.Wrapw(err, "failed to create thumbnail file", "filename", thumbnailPath)
}
defer thumbnailFile.Close()
err = jpeg.Encode(thumbnailFile, thumbnail, nil)
if err != nil {
return errs.Wrapw(err, "failed to encode thumbnail file to jpeg", "filename", thumbnailPath)
}
2024-04-14 00:32:55 +07:00
2024-04-30 14:12:33 +07:00
return api.saveImageToFSAndDatabase(ctx, tmpImageFile, subreddit, post, devices)
}
func (api *API) saveImageToFSAndDatabase(ctx context.Context, image io.ReadCloser, subreddit *models.Subreddit, post reddit.Post, devices models.DeviceSlice) (err error) {
ctx, span := tracer.Start(ctx, "*API.saveImageToFSAndDatabase")
defer span.End()
defer image.Close()
w, close, err := api.createDeviceImageWriters(post, devices)
if err != nil {
return errs.Wrapw(err, "failed to create image files")
}
log.New(ctx).Debug("saving image files", "post_id", post.GetID(), "post_url", post.GetImageURL(), "devices", devices)
defer close()
2024-04-30 14:12:33 +07:00
_, err = io.Copy(w, image)
if err != nil {
return errs.Wrapw(err, "failed to save image files")
}
var many []*models.ImageSetter
now := time.Now()
for _, device := range devices {
var nsfw int32
if post.IsNSFW() {
nsfw = 1
}
2024-04-30 14:12:33 +07:00
width, height := post.GetImageSize()
var size int64
if fi, err := os.Stat(post.GetImageTargetPath(api.config, device)); err == nil {
size = fi.Size()
}
many = append(many, &models.ImageSetter{
2024-04-30 14:12:33 +07:00
Subreddit: omit.From(subreddit.Name),
Device: omit.From(device.Slug),
PostTitle: omit.From(post.GetTitle()),
PostURL: omit.From(post.GetPostURL()),
PostCreated: omit.From(post.GetCreated().Unix()),
PostName: omit.From(post.GetName()),
2024-04-30 14:12:33 +07:00
PostAuthor: omit.From(post.GetAuthor()),
PostAuthorURL: omit.From(post.GetAuthorURL()),
ImageWidth: omit.From(int32(width)),
ImageHeight: omit.From(int32(height)),
ImageSize: omit.From(size),
ImageRelativePath: omit.From(post.GetImageRelativePath(device)),
ThumbnailRelativePath: omit.From(post.GetThumbnailRelativePath()),
ImageOriginalURL: omit.From(post.GetImageURL()),
NSFW: omit.From(nsfw),
CreatedAt: omit.From(now.Unix()),
UpdatedAt: omit.From(now.Unix()),
})
}
log.New(ctx).Debug("inserting images to database", "images", many)
_, err = models.Images.InsertMany(ctx, api.db, many...)
if err != nil {
return errs.Wrapw(err, "failed to insert images to database", "params", many)
}
2024-04-09 22:37:26 +07:00
return nil
}
2024-04-14 00:32:55 +07:00
2024-04-25 12:31:20 +07:00
func (api *API) createDeviceImageWriters(post reddit.Post, devices models.DeviceSlice) (writer io.Writer, close func(), err error) {
2024-04-14 00:32:55 +07:00
// open file for each device
var files []*os.File
var writers []io.Writer
for _, device := range devices {
var filename string
if device.WindowsWallpaperMode == 1 {
filename = post.GetWindowsWallpaperImageTargetPath(api.config, device)
2024-04-26 22:13:04 +07:00
dir := post.GetWindowsWallpaperImageTargetDir(api.config, device)
_ = os.MkdirAll(dir, 0o777)
2024-04-14 00:32:55 +07:00
} else {
filename = post.GetImageTargetPath(api.config, device)
2024-04-26 22:13:04 +07:00
dir := post.GetImageTargetDir(api.config, device)
if err := os.MkdirAll(dir, 0o777); err != nil {
for _, f := range files {
_ = f.Close()
}
return nil, nil, errs.Wrapw(err, "failed to create target image dir")
}
2024-04-14 00:32:55 +07:00
}
2024-04-25 12:31:20 +07:00
file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644)
2024-04-14 00:32:55 +07:00
if err != nil {
for _, f := range files {
_ = f.Close()
}
2024-04-26 22:13:04 +07:00
return nil, nil, errs.Wrapw(err, "failed to open target image file",
2024-04-14 00:32:55 +07:00
"device_name", device.Name,
2024-04-14 17:30:04 +07:00
"device_slug", device.Slug,
2024-04-14 00:32:55 +07:00
"filename", filename,
)
}
files = append(files, file)
writers = append(writers, file)
}
return io.MultiWriter(writers...), func() {
for _, file := range files {
_ = file.Close()
}
}, nil
}
2024-04-25 12:31:20 +07:00
func (api *API) getDevicesThatAcceptPost(ctx context.Context, post reddit.Post, devices models.DeviceSlice) (devs models.DeviceSlice) {
2024-04-14 00:32:55 +07:00
for _, device := range devices {
2024-04-30 14:12:33 +07:00
if shouldDownloadPostForDevice(post, device) && !api.isImageEntryExists(ctx, post, device) {
devs = append(devs, device)
2024-04-14 00:32:55 +07:00
}
}
return devs
}
2024-04-30 14:12:33 +07:00
// isImageEntryExists checks if the image entry already exists in the database and
// the image file actually exists in the filesystem.
func (api *API) isImageEntryExists(ctx context.Context, post reddit.Post, device *models.Device) (found bool) {
2024-04-25 12:31:20 +07:00
ctx, span := tracer.Start(ctx, "*API.IsImageExists")
defer span.End()
_, errQuery := models.Images.Query(ctx, api.db,
2024-04-30 14:12:33 +07:00
models.SelectWhere.Images.Device.EQ(device.Slug),
models.SelectWhere.Images.PostName.EQ(post.GetName()),
2024-04-25 12:31:20 +07:00
).One()
_, errStat := os.Stat(post.GetImageTargetPath(api.config, device))
2024-04-25 12:31:20 +07:00
return errQuery == nil && errStat == nil
2024-04-25 12:31:20 +07:00
}
2024-04-30 14:12:33 +07:00
// findImageFileForDevices returns an open, read-only handle to the first
// existing image file of the post among the given devices.
//
// This helps to avoid downloading the same image again for different devices.
//
// It returns nil if no image file could be opened for any of the devices. A
// file that does not exist is skipped silently; any other open failure is
// logged and the search continues with the next device.
//
// The caller must close the returned file after use.
func (api *API) findImageFileForDevices(ctx context.Context, post reddit.Post, devices models.DeviceSlice) (file *os.File) {
	for _, device := range devices {
		filename := post.GetImageTargetPath(api.config, device)
		// Open directly rather than Stat-then-Open: one syscall fewer, and no
		// window in which the file can vanish between the two calls.
		f, err := os.Open(filename)
		if err != nil {
			if !errors.Is(err, os.ErrNotExist) {
				log.New(ctx).Err(err).Error("failed to open image file", "filename", filename)
			}
			continue
		}
		return f
	}
	return nil
}
2024-04-25 12:31:20 +07:00
func shouldDownloadPostForDevice(post reddit.Post, device *models.Device) bool {
if post.IsNSFW() && device.NSFW == 0 {
2024-04-14 00:32:55 +07:00
return false
}
devAspectRatio := deviceAspectRatio(device)
rangeStart := devAspectRatio - device.AspectRatioTolerance
rangeEnd := devAspectRatio + device.AspectRatioTolerance
imgAspectRatio := post.GetImageAspectRatio()
width, height := post.GetImageSize()
log.New(context.Background()).Debug("checking image aspect ratio",
"device", device.Slug,
"device_height", device.ResolutionY,
"device_width", device.ResolutionX,
"device_aspect_ratio", devAspectRatio,
"image_aspect_ratio", imgAspectRatio,
"range_start", rangeStart,
"range_end", rangeEnd,
"success_fulfill_download_range_start", (imgAspectRatio > rangeStart),
"success_fulfill_download_range_end", (imgAspectRatio < rangeEnd),
"url", post.GetImageURL(),
"image.width", width,
"image.height", height,
)
if imgAspectRatio < rangeStart {
return false
}
if imgAspectRatio > rangeEnd {
2024-04-14 00:32:55 +07:00
return false
}
2024-04-25 12:31:20 +07:00
if device.MaxX > 0 && width > int64(device.MaxX) {
2024-04-14 00:32:55 +07:00
return false
}
2024-04-25 12:31:20 +07:00
if device.MaxY > 0 && height > int64(device.MaxY) {
2024-04-14 00:32:55 +07:00
return false
}
2024-04-25 12:31:20 +07:00
if device.MinX > 0 && width < int64(device.MinX) {
2024-04-14 00:32:55 +07:00
return false
}
2024-04-25 12:31:20 +07:00
if device.MinY > 0 && height < int64(device.MinY) {
2024-04-14 00:32:55 +07:00
return false
}
return true
}
2024-04-25 12:31:20 +07:00
func deviceAspectRatio(device *models.Device) float64 {
2024-04-14 00:32:55 +07:00
return float64(device.ResolutionX) / float64(device.ResolutionY)
}
// tempFile pairs an open file handle with the path it was opened from, for
// callers that need to stream the content and also reopen the file by name.
//
// It provides Read and Close by delegating to the wrapped handle. Closing does
// not delete the file from disk.
type tempFile struct {
	// filename is the path the handle in file was opened from.
	filename string
	// file is the open handle all reads are served from.
	file *os.File
}

// Read reads from the wrapped file handle into p.
func (t *tempFile) Read(p []byte) (n int, err error) {
	n, err = t.file.Read(p)
	return n, err
}

// Close closes the wrapped file handle and returns its error.
func (t *tempFile) Close() error {
	closeErr := t.file.Close()
	return closeErr
}
// copyImageToTempDir copies the image to a temporary directory and returns the file handle
//
// file must be closed by the caller after use.
//
// file is nil if an error occurred.
func (api *API) copyImageToTempDir(ctx context.Context, img reddit.PostImage) (tmp *tempFile, err error) {
_, span := tracer.Start(ctx, "*API.copyImageToTempDir")
defer func() { telemetry.EndWithStatus(span, err) }()
// ignore error because url is always valid if this
// function is called
url, _ := url.Parse(img.URL)
split := strings.Split(url.Path, "/")
imageFilename := split[len(split)-1]
tmpDirname := path.Join(os.TempDir(), "redmage")
err = os.MkdirAll(tmpDirname, 0o777)
2024-04-26 22:13:04 +07:00
if err != nil {
return nil, errs.Wrapw(err, "failed to create temporary dir", "dir_name", tmpDirname)
}
2024-04-14 00:32:55 +07:00
tmpFilename := path.Join(tmpDirname, imageFilename)
2024-04-26 22:13:04 +07:00
file, err := os.OpenFile(tmpFilename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o777)
2024-04-14 00:32:55 +07:00
if err != nil {
return nil, errs.Wrapw(err, "failed to open temp image file",
"temp_file_path", tmpFilename,
"image_url", img.URL,
)
}
2024-04-26 22:13:04 +07:00
// File must be closed by end of function because kernel stuffs.
//
// A fresh fd must be used to properly get the new data.
defer file.Close()
2024-04-14 00:32:55 +07:00
_, err = io.Copy(file, img.File)
if err != nil {
2024-04-26 22:13:04 +07:00
return nil, errs.Wrapw(err, "failed to download image to temp file",
"temp_file_path", tmpFilename,
"image_url", img.URL,
)
}
filew, err := os.OpenFile(tmpFilename, os.O_RDONLY, 0o777)
if err != nil {
2024-04-14 00:32:55 +07:00
return nil, errs.Wrapw(err, "failed to download image to temp file",
"temp_file_path", tmpFilename,
"image_url", img.URL,
)
}
return &tempFile{
2024-04-26 22:13:04 +07:00
file: filew,
2024-04-14 00:32:55 +07:00
filename: tmpFilename,
}, err
}