ausglamr/blogs/management/commands/check_feeds.py

264 lines
11 KiB
Python

"""call this from cron to run through all the feeds to find new posts"""
import logging
import traceback
from datetime import datetime, timedelta, timezone
import feedparser
from django.core.management.base import BaseCommand
from django.db.models import Q
from django.utils import html
from django.utils import timezone as django_timezone
from blogs import models
from django.forms.models import model_to_dict
# passed as the ``agent`` argument to every feedparser.parse() call in this module
agent = "AusGLAMR/1.0 +https://ausglamr.newcardigan.org"
def date_to_tz_aware(date_tuple):
    """turn a 9-tuple into something usable

    Args:
        date_tuple: a ``time.struct_time``-style sequence laid out as
            (year, month, day, hour, minute, second, weekday, yearday, isdst).

    Returns:
        A timezone-aware ``datetime`` tagged as UTC, with whole-second
        precision.
    """
    # we are assuming all dates are UTC which is a bit dodgy but it works
    # Only the first six fields are date/time components. Index 6 is the
    # weekday, which would otherwise be consumed as datetime()'s microsecond.
    return datetime(*date_tuple[:6], tzinfo=timezone.utc)
def get_tags(dictionary):
    """parse out tags from blog and upsert as tag instances

    Args:
        dictionary: iterable of tag objects from a feed entry; only the
            ``term`` attribute of each one is read.

    Returns:
        A list of ``models.Tag`` instances in feed order, one per usable
        term. Terms are lower-cased before lookup; "uncategorized" and tags
        without a term are skipped.
    """
    tags = []
    for tag_obj in dictionary:
        # a tag may carry no term at all; skip it rather than crash on
        # None.lower() or create a Tag with an empty name
        term = getattr(tag_obj, "term", None)
        if not term:
            continue
        name = term.lower()
        if name == "uncategorized":
            continue
        tag = models.Tag.objects.filter(name=name).first()
        if not tag:
            tag = models.Tag.objects.create(name=name)
        tags.append(tag)
    return tags
class Command(BaseCommand):
    """the check_feeds command

    Parses the feed of every eligible blog and newsletter, stores entries
    that are not in the database yet and calls ``announce()`` on the ones
    published within ``ANNOUNCE_WINDOW``.
    """

    # lower-cased tags that keep a blog post from being ingested
    OPT_OUT_TAGS = frozenset(
        {
            "notglam",
            "notglamr",
            "notausglamblogs",
            "notausglamr",
            "notglamblogs",
            "#notglam",
        }
    )
    # entries with a pubdate older than this are stored but not announced
    ANNOUNCE_WINDOW = timedelta(days=3)

    def add_arguments(self, parser):
        """register the -q, -blogs and -newsletters flags"""
        parser.add_argument(
            "-q",
            action="store_true",
            help="Suppress non-error messages",
        )
        parser.add_argument(
            "-blogs",
            action="store_true",
            help="Only check blog posts",
        )
        parser.add_argument(
            "-newsletters",
            action="store_true",
            help="Only check editions",
        )

    def handle(self, *args, **options):
        """check feeds and update database

        Blogs are skipped when ``-newsletters`` is given and newsletters are
        skipped when ``-blogs`` is given. A failure in one feed is logged and
        does not stop the remaining feeds from being checked.
        """
        if not options["q"]:
            logging.info(
                f"checking feeds at {django_timezone.localtime(django_timezone.now())}"
            )
        if not options["newsletters"]:
            self._check_blogs()
        if not options["blogs"]:
            self._check_newsletters()
        if not options["q"]:
            logging.info(
                f"completed run at {django_timezone.localtime(django_timezone.now())}"
            )

    @staticmethod
    def _teaser(entry):
        """return a plain-text teaser for a feed entry, or "" if it has no text

        The first non-empty field among ``summary``, ``description`` and
        ``content[0].value`` is stripped of HTML tags, cut to 200 characters
        and suffixed with "...".
        """
        text = None
        for field in ("summary", "description"):
            value = getattr(entry, field, None)
            if value:
                text = html.strip_tags(value)
                break
        else:
            content = getattr(entry, "content", None)
            if content:
                text = html.strip_tags(content[0].value)
        if text:
            return text[:200] + "..."
        return ""

    def _check_blogs(self):
        """parse every approved, active, unsuspended blog feed"""
        blogs = models.Blog.objects.filter(
            approved=True, suspended=False, active=True
        ).all()
        for blog in blogs:
            # reset for each blog so the error handler never hits an unbound
            # name or logs an entry left over from a previous feed
            article = None
            try:
                data = feedparser.parse(blog.feed, agent=agent)
                for article in data.entries:
                    self._ingest_article(blog, article)
            except Exception as e:
                blog.set_failing()
                logging.error(f"ERROR WITH BLOG {blog.title} - {blog.url}")
                logging.info(article)
                logging.error(e)

    def _ingest_article(self, blog, article):
        """store one blog feed entry unless it is known, too old or opted out"""
        guid = getattr(article, "id", article.link)
        if models.Article.objects.filter(Q(url=article.link) | Q(guid=guid)).exists():
            return
        # don't ingest posts published prior to suspension being lifted
        # (we should already have older ones from prior to suspension)
        if blog.suspension_lifted and (
            blog.suspension_lifted > date_to_tz_aware(article.updated_parsed)
        ):
            return
        taglist = getattr(article, "tags", None) or getattr(article, "categories", [])
        # tags without a term cannot be matched or stored, so drop them here
        taglist = [tag for tag in taglist if getattr(tag, "term", None)]
        # don't include posts with opt out tags
        if self.OPT_OUT_TAGS & {tag.term.lower() for tag in taglist}:
            return
        author_name = getattr(article, "author", None) or getattr(blog, "author", "")
        updated = date_to_tz_aware(article.updated_parsed)
        instance = models.Article.objects.create(
            title=article.title,
            author_name=author_name,
            url=article.link,
            description=self._teaser(article),
            updateddate=updated,
            blog=blog,
            pubdate=date_to_tz_aware(article.published_parsed),
            guid=guid,
        )
        for tag in get_tags(taglist):
            instance.tags.add(tag)
        instance.save()
        if instance.pubdate > django_timezone.now() - self.ANNOUNCE_WINDOW:
            instance.announce()
            # NOTE(review): the blog is only marked successful when a recent
            # entry is announced - confirm that is intended
            blog.set_success(updateddate=updated)

    def _check_newsletters(self):
        """parse every approved, active newsletter that has a feed"""
        newsletters = models.Newsletter.objects.filter(
            approved=True, active=True, feed__isnull=False
        ).all()
        for newsletter in newsletters:
            try:
                data = feedparser.parse(newsletter.feed, agent=agent)
                for edition in data.entries:
                    self._ingest_edition(newsletter, edition)
            except Exception as e:
                newsletter.set_failing()
                logging.error(
                    f"ERROR WITH NEWSLETTER {newsletter.name} - {newsletter.url}"
                )
                logging.error(e)

    def _ingest_edition(self, newsletter, edition):
        """store one newsletter feed entry unless it is already known"""
        guid = getattr(edition, "id", edition.link)
        if models.Edition.objects.filter(Q(url=edition.link) | Q(guid=guid)).exists():
            return
        # NOTE(review): the blog path falls back to the blog's author; a
        # newsletter-level fallback may have been intended here - confirm
        author_name = getattr(edition, "author", None) or ""
        updated = date_to_tz_aware(edition.updated_parsed)
        instance = models.Edition.objects.create(
            title=edition.title,
            author_name=author_name,
            url=edition.link,
            description=self._teaser(edition),
            updateddate=updated,
            newsletter=newsletter,
            pubdate=date_to_tz_aware(edition.published_parsed),
            guid=guid,
        )
        instance.save()
        if instance.pubdate > django_timezone.now() - self.ANNOUNCE_WINDOW:
            instance.announce()
            # NOTE(review): success is recorded only alongside an
            # announcement, mirroring the blog path - confirm that is intended
            newsletter.set_success(updateddate=updated)