You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
290 lines
11 KiB
290 lines
11 KiB
# -*- coding: utf-8 -*-
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
|
|
from codecs import open as codecs_open
|
|
|
|
from pelican import signals
|
|
|
|
from pytz import timezone
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
if sys.version_info >= (3, 0):
|
|
from urllib.parse import urljoin
|
|
else:
|
|
from urlparse import urljoin
|
|
|
|
|
|
class ConfigurationError(Exception):
|
|
"""
|
|
Exception class for wrong configurations.
|
|
"""
|
|
pass
|
|
|
|
|
|
class SitemapGenerator(object):
|
|
"""
|
|
Class for generating a sitemap.xml.
|
|
"""
|
|
|
|
xml_wrap = """<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="%(SITEURL)ssitemap-stylesheet.xsl"?>
|
|
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
%(urls)s
|
|
</urlset>"""
|
|
|
|
template_url = """<url>
|
|
{}
|
|
</url>"""
|
|
|
|
settings_default = {
|
|
'priorities': {
|
|
'index': 1.0,
|
|
'articles': 0.8,
|
|
'pages': 0.5,
|
|
'others': 0.4
|
|
},
|
|
'changefrequencies': {
|
|
'index': 'daily',
|
|
'articles': 'weekly',
|
|
'pages': 'monthly',
|
|
'others': 'monthly',
|
|
}
|
|
}
|
|
|
|
def __init__(self, context, settings, path, theme, output_path, **kwargs):
|
|
"""
|
|
Initializes the generator class.
|
|
:param context: the generated context, mix of settings and transformed content
|
|
:type context: dict
|
|
:param settings: the pelican project settings
|
|
:type settings: dict
|
|
:param path: the path to the content files
|
|
:type path: str
|
|
:param theme: the path to the theme
|
|
:type theme: str
|
|
:param output_path: the path where the generated output is put
|
|
:type output_path: str
|
|
:param kwargs: additional keyword arguments
|
|
:type kwargs: dict
|
|
"""
|
|
self.pelican_settings = settings
|
|
self.path_content = path
|
|
self.path_output = output_path
|
|
self.context = context
|
|
if settings.get('TIMEZONE', None) is None:
|
|
raise ConfigurationError('Please specify the TIMEZONE setting!')
|
|
self.timezone = timezone(settings.get('TIMEZONE'))
|
|
self.url_site = settings.get('SITEURL')
|
|
if len(self.url_site) == 0:
|
|
self.url_site = "https://phschoen.de"
|
|
# Pelican strips off trailing slashes during settings initialization.
|
|
# The later used urljoin function strips of path elements not ending with a trailing slash,
|
|
# a slash is added here if it is not already present
|
|
if not self.url_site.endswith('/'):
|
|
self.url_site += '/'
|
|
self.settings = settings.get('EXTENDED_SITEMAP_PLUGIN', self.settings_default)
|
|
|
|
def generate_output(self, writer):
|
|
"""
|
|
Generates the sitemap file and the stylesheet file and puts them into the content dir.
|
|
:param writer: the writer instance
|
|
:type writer: pelican.writers.Writer
|
|
"""
|
|
# write xml stylesheet
|
|
with codecs_open(os.path.join(os.path.dirname(__file__), 'sitemap-stylesheet.xsl'), 'r', encoding='utf-8') as fd_origin:
|
|
with codecs_open(os.path.join(self.path_output, 'sitemap-stylesheet.xsl'), 'w', encoding='utf-8') as fd_destination:
|
|
xsl = fd_origin.read()
|
|
# replace some template markers
|
|
# TODO use pelican template magic
|
|
xsl = xsl.replace('{{ SITENAME }}', self.context.get('SITENAME'))
|
|
fd_destination.write(xsl)
|
|
|
|
# will contain the url nodes as text
|
|
urls = ''
|
|
|
|
# get all articles sorted by time
|
|
articles_sorted = sorted(self.context['articles'], key=self.__get_date_key, reverse=True)
|
|
|
|
# get all pages with date/modified date
|
|
pages_with_date = list(
|
|
filter(
|
|
lambda p: getattr(p, 'modified', False) or getattr(p, 'date', False),
|
|
self.context.get('pages')
|
|
)
|
|
)
|
|
pages_with_date_sorted = sorted(pages_with_date, key=self.__get_date_key, reverse=True)
|
|
|
|
# get all pages without date
|
|
pages_without_date = list(
|
|
filter(
|
|
lambda p: getattr(p, 'modified', None) is None and getattr(p, 'date', None) is None,
|
|
self.context.get('pages')
|
|
)
|
|
)
|
|
pages_without_date_sorted = sorted(pages_without_date, key=self.__get_title_key, reverse=False)
|
|
|
|
# join them, first date sorted, then title sorted
|
|
pages_sorted = pages_with_date_sorted + pages_without_date_sorted
|
|
|
|
# the landing page
|
|
if 'index' in self.context.get('DIRECT_TEMPLATES'):
|
|
# assume that the index page has changed with the most current article or page
|
|
# use the first article or page if no articles
|
|
index_reference = None
|
|
if len(articles_sorted) > 0:
|
|
index_reference = articles_sorted[0]
|
|
elif len(pages_sorted) > 0:
|
|
index_reference = pages_sorted[0]
|
|
|
|
if index_reference is not None:
|
|
urls += self.__create_url_node_for_content(
|
|
index_reference,
|
|
'index',
|
|
url=self.url_site,
|
|
)
|
|
|
|
# process articles
|
|
for article in articles_sorted:
|
|
urls += self.__create_url_node_for_content(
|
|
article,
|
|
'articles',
|
|
url=urljoin(self.url_site, article.url)
|
|
)
|
|
|
|
# process pages
|
|
for page in pages_sorted:
|
|
urls += self.__create_url_node_for_content(
|
|
page,
|
|
'pages',
|
|
url=urljoin(self.url_site, page.url)
|
|
)
|
|
|
|
# process category pages
|
|
if self.context.get('CATEGORY_URL'):
|
|
urls += self.__process_url_wrapper_elements(self.context.get('categories'))
|
|
|
|
# process tag pages
|
|
if self.context.get('TAG_URL'):
|
|
urls += self.__process_url_wrapper_elements(sorted(self.context.get('tags'), key=lambda x: x[0].name))
|
|
|
|
# process author pages
|
|
if self.context.get('AUTHOR_URL'):
|
|
urls += self.__process_url_wrapper_elements(self.context.get('authors'))
|
|
|
|
# handle all DIRECT_TEMPLATES but "index"
|
|
for direct_template in list(filter(lambda p: p != 'index', self.context.get('DIRECT_TEMPLATES'))):
|
|
# we assume the modification date of the last article as modification date for the listings of
|
|
# categories, authors and archives (all values of DIRECT_TEMPLATES but "index")
|
|
modification_time = getattr(articles_sorted[0], 'modified', getattr(articles_sorted[0], 'date', None))
|
|
url = self.__get_direct_template_url(direct_template)
|
|
urls += self.__create_url_node_for_content(None, 'others', url, modification_time)
|
|
|
|
# write the final sitemap file
|
|
with codecs_open(os.path.join(self.path_output, 'sitemap.xml'), 'w', encoding='utf-8') as fd:
|
|
fd.write(self.xml_wrap % {
|
|
'SITEURL': self.url_site,
|
|
'urls': urls
|
|
})
|
|
|
|
def __get_direct_template_url(self, name):
|
|
"""
|
|
Returns the URL for the given DIRECT_TEMPLATE name.
|
|
Resolution order is:
|
|
1. ${DIRECT_TEMPLATE}_URL (custom property, no Pelican default)
|
|
2. ${DIRECT_TEMPLATE}_SAVE_AS
|
|
3. Default path
|
|
:param name: name of the direct template
|
|
:return: str
|
|
"""
|
|
name_upper = name.upper()
|
|
url = self.pelican_settings.get(
|
|
'{}_URL'.format(name_upper),
|
|
self.pelican_settings.get(
|
|
'{}_SAVE_AS'.format(name_upper),
|
|
'{}.html'.format(name)
|
|
)
|
|
)
|
|
return urljoin(self.url_site, url)
|
|
|
|
def __process_url_wrapper_elements(self, elements):
|
|
"""
|
|
Creates the url nodes for pelican.urlwrappers.Category and pelican.urlwrappers.Tag.
|
|
:param elements: list of wrapper elements
|
|
:type elements: list
|
|
:return: the processes urls as HTML
|
|
:rtype: str
|
|
"""
|
|
urls = ''
|
|
for url_wrapper, articles in elements:
|
|
urls += self.__create_url_node_for_content(
|
|
url_wrapper,
|
|
'others',
|
|
url=urljoin(self.url_site, url_wrapper.url),
|
|
modification_time=self.__get_date_key(sorted(articles, key=self.__get_date_key, reverse=True)[0])
|
|
)
|
|
return urls
|
|
|
|
def __create_url_node_for_content(self, content, content_type, url=None, modification_time=None):
|
|
"""
|
|
Creates the required <url> node for the sitemap xml.
|
|
:param content: the content class to handle
|
|
:type content: pelican.contents.Content | None
|
|
:param content_type: the type of the given content to match settings.EXTENDED_SITEMAP_PLUGIN
|
|
:type content_type; str
|
|
:param url; if given, the URL to use instead of the url of the content instance
|
|
:type url: str
|
|
:param modification_time: the modification time of the url, will be used instead of content date if given
|
|
:type modification_time: datetime.datetime | None
|
|
:returns: the text node
|
|
:rtype: str
|
|
"""
|
|
loc = url
|
|
if loc is None:
|
|
loc = urljoin(self.url_site, self.context.get('ARTICLE_URL').format(**content.url_format))
|
|
lastmod = None
|
|
if modification_time is not None:
|
|
lastmod = modification_time.strftime('%Y-%m-%d')
|
|
else:
|
|
if content is not None:
|
|
if getattr(content, 'modified', None) is not None:
|
|
lastmod = getattr(content, 'modified').strftime('%Y-%m-%d')
|
|
elif getattr(content, 'date', None) is not None:
|
|
lastmod = getattr(content, 'date').strftime('%Y-%m-%d')
|
|
|
|
output = "<loc>{}</loc>".format(loc)
|
|
if lastmod is not None:
|
|
output += "\n<lastmod>{}</lastmod>".format(lastmod)
|
|
output += "\n<changefreq>{}</changefreq>".format(self.settings.get('changefrequencies').get(content_type))
|
|
output += "\n<priority>{:.2f}</priority>".format(self.settings.get('priorities').get(content_type))
|
|
|
|
return self.template_url.format(output)
|
|
|
|
@staticmethod
|
|
def __get_date_key(obj):
|
|
return getattr(obj, 'modified', None) or obj.date
|
|
|
|
@staticmethod
|
|
def __get_title_key(obj):
|
|
return getattr(obj, 'title')
|
|
|
|
|
|
def get_generators(generators):
|
|
"""
|
|
Returns the generators of this plugin,
|
|
:param generators: current generators
|
|
:type generators: pelican.Pelican
|
|
:returns: the sitemap generator type
|
|
:rtype: type
|
|
"""
|
|
return SitemapGenerator
|
|
|
|
|
|
def register():
|
|
"""
|
|
Registers the sitemap generator.
|
|
"""
|
|
signals.get_generators.connect(get_generators)
|