ausglamr/blogs/management/commands/check_feeds.py

144 lines
5.1 KiB
Python
Raw Normal View History

2024-01-04 11:54:56 +11:00
"""call this from cron to run through all the feeds to find new posts"""
2024-01-07 16:58:56 +11:00
import logging
import traceback
2024-01-04 11:54:56 +11:00
from datetime import datetime, timedelta, timezone
import feedparser
from django.core.management.base import BaseCommand
from django.db.models import Q
from django.utils import timezone as django_timezone
from blogs import models
def date_to_tz_aware(date_tuple):
    """Turn a 9-tuple time value into a timezone-aware UTC datetime.

    Args:
        date_tuple: a ``time.struct_time``-style sequence whose first six
            items are year, month, day, hour, minute and second (this is
            what the feed entries' ``*_parsed`` fields are passed in as).

    Returns:
        A ``datetime`` built from those six fields with ``tzinfo`` set to UTC.
    """
    # we are assuming all dates are UTC which is a bit dodgy but it works
    # Stop at index 6: the seventh item of a struct_time is the weekday, and
    # passing it on would silently populate ``microsecond`` with 0-6.
    return datetime(*date_tuple[:6], tzinfo=timezone.utc)
def get_tags(dictionary):
    """Map a feed entry's tag objects onto ``Tag`` model instances.

    Every item in ``dictionary`` must expose a ``term`` attribute. Terms are
    lower-cased, and the placeholder term "uncategorized" is skipped. For each
    remaining term the first ``Tag`` with that name is used, or a new one is
    created when no such row exists.

    Returns:
        A list of ``Tag`` instances, in the order the terms were supplied.
    """
    resolved = []
    for entry_tag in dictionary:
        name = entry_tag.term.lower()
        if name == "uncategorized":
            continue
        existing = models.Tag.objects.filter(name=name).first()
        resolved.append(existing or models.Tag.objects.create(name=name))
    return resolved
class Command(BaseCommand):
    """the check_feeds command

    Walks every approved, active, non-suspended blog, parses its feed and
    stores entries that are not yet in the database as ``Article`` rows.
    """

    # Lower-cased tags an author can put on a post to keep it from being
    # ingested.
    OPT_OUT_TAGS = frozenset(
        {
            "notglam",
            "notglamr",
            "notausglamblogs",
            "notausglamr",
            "notglamblogs",
            "#notglam",
        }
    )

    def add_arguments(self, parser):
        """Register the ``-q`` flag that silences the start/finish messages."""
        parser.add_argument(
            "-q",
            action="store_true",
            help="Suppress non-error messages",
        )

    def handle(self, *args, **options):
        """check feeds and update database"""
        quiet = options["q"]

        if not quiet:
            logging.info(
                f"checking feeds at {django_timezone.localtime(django_timezone.now())}"
            )

        blogs = models.Blog.objects.filter(
            approved=True, suspended=False, active=True
        )

        for blog in blogs:
            # The catch-all sits inside the loop so that one broken feed is
            # flagged and logged without stopping the remaining blogs.
            try:
                data = feedparser.parse(blog.feed)
                for article in data.entries:
                    self._ingest_entry(blog, article)
            except Exception as e:
                blog.set_failing()
                logging.error(f"ERROR WITH BLOG {blog.title} - {blog.url}")
                # logging.exception keeps the traceback alongside the message
                logging.exception(e)

        if not quiet:
            logging.info(
                f"completed run at {django_timezone.localtime(django_timezone.now())}"
            )

    def _ingest_entry(self, blog, article):
        """Store one feed entry as an Article unless it is known, stale or opted out."""
        already_known = models.Article.objects.filter(
            Q(url=article.link) | Q(guid=article.id)
        ).exists()
        if already_known:
            return

        if blog.suspension_lifted and (
            blog.suspension_lifted > date_to_tz_aware(article.updated_parsed)
        ):
            # don't ingest posts published prior to suspension being lifted
            # (we should already have older ones from prior to suspension)
            return

        taglist = getattr(article, "tags", None) or getattr(article, "categories", [])

        # don't include posts with opt out tags
        terms = {tag.term.lower() for tag in taglist}
        if not self.OPT_OUT_TAGS.isdisjoint(terms):
            return

        author_name = getattr(article, "author", None) or getattr(
            blog, "author", None
        )
        updated = date_to_tz_aware(article.updated_parsed)
        instance = models.Article.objects.create(
            title=article.title,
            author_name=author_name,
            url=article.link,
            description=article.summary,
            updateddate=updated,
            blog=blog,
            pubdate=date_to_tz_aware(article.published_parsed),
            guid=article.id,
        )

        for tag in get_tags(taglist):
            instance.tags.add(tag)

        instance.save()

        # Only posts published within the last three days are announced.
        cutoff = django_timezone.now() - timedelta(days=3)
        if instance.pubdate > cutoff:
            instance.announce()

        # NOTE(review): success is recorded once per newly stored article -
        # confirm this nesting matches the intent.
        blog.set_success(updateddate=updated)