ausglamr/blogs/management/commands/check_feeds.py

119 lines
4.3 KiB
Python
Raw Normal View History

2024-01-04 11:54:56 +11:00
"""call this from cron to run through all the feeds to find new posts"""
from datetime import datetime, timedelta, timezone
import feedparser
from django.core.management.base import BaseCommand
from django.db.models import Q
from django.utils import timezone as django_timezone
from blogs import models
def date_to_tz_aware(date_tuple):
    """turn a 9-tuple (time.struct_time layout) into a tz-aware datetime

    Args:
        date_tuple: sequence laid out as (year, month, day, hour, minute,
            second, weekday, yearday, isdst). Only the first six fields
            are used.

    Returns:
        A ``datetime`` carrying the same wall-clock values, tagged as UTC.
    """
    # we are assuming all dates are UTC which is a bit dodgy but it works
    # Only take year..second: the 7th field is the weekday, and passing it on
    # would land in datetime's ``microsecond`` slot.
    return datetime(*date_tuple[:6], tzinfo=timezone.utc)
def get_tags(dictionary):
    """parse out tags from blog and upsert as tag instances

    Args:
        dictionary: iterable of tag objects exposing a ``term`` attribute
            (despite the parameter name it is iterated like a list).
            ``None`` or an empty iterable means "no tags".

    Returns:
        A list of ``models.Tag`` instances, one per input tag that has a
        non-empty term, looked up (or created) by lower-cased name.
    """
    tags = []
    for tag_obj in dictionary or []:
        term = getattr(tag_obj, "term", None)
        if not term:
            # A tag with no term gives us nothing to name a Tag after; skip it
            # rather than letting ``None.lower()`` abort the whole feed.
            continue
        name = term.lower()
        tag = models.Tag.objects.filter(name=name).first()
        if not tag:
            tag = models.Tag.objects.create(name=name)
        tags.append(tag)
    return tags
class Command(BaseCommand):
    """the check_feeds command"""

    # Tags an author can put on a post to keep it out of the aggregator.
    OPT_OUT_TAGS = frozenset(
        {
            "notglam",
            "notglamr",
            "notausglamblogs",
            "notausglamr",
            "notglamblogs",
            "#notglam",
        }
    )

    # Articles published within this many days of the run get announced.
    ANNOUNCE_WINDOW_DAYS = 3

    # we could add arguments but we don't really need any
    def handle(self, *args, **options):
        """check feeds and update database

        Iterates over every approved, non-suspended blog, parses its feed and
        ingests entries that are not already stored. A blog whose feed is
        processed without error is marked successful; any exception marks it
        as failing and the run moves on to the next blog.
        """
        print(f"checking feeds at {django_timezone.localtime(django_timezone.now())}")
        blogs = models.Blog.objects.filter(approved=True, suspended=False)
        for blog in blogs:
            try:
                data = feedparser.parse(blog.feed)
                for article in data.entries:
                    self._ingest_article(blog, article)
                blog.set_success()
            except Exception as e:
                # Deliberately broad: one broken feed must not stop the run.
                blog.set_failing()
                print(f"ERROR WITH BLOG {blog.title} - {blog.url}")
                print(e)
        print(f"completed run at {django_timezone.localtime(django_timezone.now())}")

    def _ingest_article(self, blog, article):
        """store one feed entry as an Article unless it should be skipped

        An entry is skipped when it is already stored (same url or guid),
        when it dates from before the blog's suspension was lifted, or when
        it carries one of the opt-out tags.
        """
        already_stored = models.Article.objects.filter(
            Q(url=article.link) | Q(guid=article.id)
        ).exists()
        if already_stored:
            return

        updated, published = self._entry_dates(article)

        # don't ingest posts published during a suspension: those are the ones
        # dated *before* the suspension was lifted.
        # NOTE(review): assumes suspension_lifted is a tz-aware datetime - confirm
        if blog.suspension_lifted and updated < blog.suspension_lifted:
            return

        tags = get_tags(
            getattr(article, "tags", None) or getattr(article, "categories", [])
        )

        # don't include posts with opt out tags
        if any(tag.name in self.OPT_OUT_TAGS for tag in tags):
            return

        author_name = getattr(article, "author", None) or getattr(
            blog, "author", None
        )
        instance = models.Article.objects.create(
            title=article.title,
            author_name=author_name,
            url=article.link,
            description=article.summary,
            updateddate=updated,
            blog=blog,
            pubdate=published,
            guid=article.id,
        )
        instance.tags.add(*tags)
        instance.save()

        # only announce reasonably fresh posts, so a newly added blog does not
        # flood followers with its back catalogue
        cutoff = django_timezone.now() - timedelta(days=self.ANNOUNCE_WINDOW_DAYS)
        if instance.pubdate > cutoff:
            instance.announce()

    @staticmethod
    def _entry_dates(article):
        """return (updated, published) datetimes for a feed entry

        Feeds may supply only one of the two dates, so each falls back to the
        other. If neither is present ``date_to_tz_aware`` raises and the
        caller's error handling marks the blog as failing.
        """
        updated_parsed = getattr(article, "updated_parsed", None) or getattr(
            article, "published_parsed", None
        )
        published_parsed = getattr(article, "published_parsed", None) or updated_parsed
        return date_to_tz_aware(updated_parsed), date_to_tz_aware(published_parsed)