You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

290 lines
11 KiB

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function

import logging
import os
import sys
from codecs import open as codecs_open
from xml.sax.saxutils import escape

from pelican import signals
from pytz import timezone

if sys.version_info >= (3, 0):
    from urllib.parse import urljoin
else:
    from urlparse import urljoin

logger = logging.getLogger(__name__)
class ConfigurationError(Exception):
    """
    Exception class for wrong configurations.

    Raised by the sitemap generator when a required Pelican setting is missing.
    """
class SitemapGenerator(object):
    """
    Class for generating a sitemap.xml.

    Besides the sitemap itself, a copy of the bundled XSL stylesheet is written to the
    output directory so that the sitemap can be rendered by a browser.
    """

    xml_wrap = """<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="%(SITEURL)ssitemap-stylesheet.xsl"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
%(urls)s
</urlset>"""

    template_url = """<url>
{}
</url>"""

    # used for every priority/changefrequency that the user configuration does not provide
    settings_default = {
        'priorities': {
            'index': 1.0,
            'articles': 0.8,
            'pages': 0.5,
            'others': 0.4
        },
        'changefrequencies': {
            'index': 'daily',
            'articles': 'weekly',
            'pages': 'monthly',
            'others': 'monthly',
        }
    }

    # site URL used when the SITEURL setting is empty or missing
    url_site_fallback = "https://phschoen.de"

    def __init__(self, context, settings, path, theme, output_path, **kwargs):
        """
        Initializes the generator class.

        :param context: the generated context, mix of settings and transformed content
        :type context: dict
        :param settings: the pelican project settings
        :type settings: dict
        :param path: the path to the content files
        :type path: str
        :param theme: the path to the theme (not used by this generator)
        :type theme: str
        :param output_path: the path where the generated output is put
        :type output_path: str
        :param kwargs: additional keyword arguments (ignored)
        :type kwargs: dict
        :raises ConfigurationError: if the TIMEZONE setting is missing
        """
        self.pelican_settings = settings
        self.path_content = path
        self.path_output = output_path
        self.context = context

        timezone_name = settings.get('TIMEZONE')
        if timezone_name is None:
            raise ConfigurationError('Please specify the TIMEZONE setting!')
        self.timezone = timezone(timezone_name)

        # an unset SITEURL (None) is treated like an empty one instead of failing on len(None)
        self.url_site = settings.get('SITEURL') or self.url_site_fallback
        # Pelican strips off trailing slashes during settings initialization.
        # The later used urljoin function strips off path elements not ending with a trailing slash,
        # so a slash is added here if it is not already present
        if not self.url_site.endswith('/'):
            self.url_site += '/'

        # missing sections/keys are resolved against settings_default at lookup time
        self.settings = settings.get('EXTENDED_SITEMAP_PLUGIN', self.settings_default)

    def generate_output(self, writer):
        """
        Generates the sitemap file and the stylesheet file and puts them into the output dir.

        :param writer: the writer instance (not used, the files are written directly)
        :type writer: pelican.writers.Writer
        """
        self.__write_stylesheet()

        # the nodes are concatenated without a separator, each one already ends with </url>
        urls = ''.join(self.__build_url_nodes())

        # write the final sitemap file
        with codecs_open(os.path.join(self.path_output, 'sitemap.xml'), 'w', encoding='utf-8') as fd:
            fd.write(self.xml_wrap % {
                'SITEURL': self.url_site,
                'urls': urls
            })

    def __write_stylesheet(self):
        """
        Copies the bundled XSL stylesheet into the output dir and fills in the site name.
        """
        origin = os.path.join(os.path.dirname(__file__), 'sitemap-stylesheet.xsl')
        destination = os.path.join(self.path_output, 'sitemap-stylesheet.xsl')

        # read first, so a failure while reading does not leave a truncated destination file
        with codecs_open(origin, 'r', encoding='utf-8') as fd_origin:
            xsl = fd_origin.read()

        # replace some template markers
        # TODO use pelican template magic
        xsl = xsl.replace('{{ SITENAME }}', self.context.get('SITENAME') or '')

        with codecs_open(destination, 'w', encoding='utf-8') as fd_destination:
            fd_destination.write(xsl)

    def __build_url_nodes(self):
        """
        Creates the <url> nodes for all content known to the context.

        The order is: landing page, articles, pages, categories, tags, authors and finally
        all DIRECT_TEMPLATES but "index".

        :returns: the url nodes as text
        :rtype: list
        """
        # get all articles sorted by time, newest first
        articles_sorted = sorted(self.context.get('articles') or [], key=self.__get_date_key, reverse=True)

        # dated pages are sorted by time, all remaining pages by title; the two groups are
        # exact complements so that no page can be dropped
        pages = self.context.get('pages') or []
        pages_with_date = [page for page in pages if self.__has_date(page)]
        pages_without_date = [page for page in pages if not self.__has_date(page)]
        pages_sorted = (
            sorted(pages_with_date, key=self.__get_date_key, reverse=True) +
            sorted(pages_without_date, key=self.__get_title_key)
        )

        direct_templates = self.context.get('DIRECT_TEMPLATES') or ()
        nodes = []

        # the landing page
        if 'index' in direct_templates:
            # assume that the index page has changed with the most current article,
            # use the first page if there are no articles
            index_reference = None
            if articles_sorted:
                index_reference = articles_sorted[0]
            elif pages_sorted:
                index_reference = pages_sorted[0]
            if index_reference is not None:
                nodes.append(self.__create_url_node_for_content(index_reference, 'index', url=self.url_site))

        # process articles
        for article in articles_sorted:
            nodes.append(
                self.__create_url_node_for_content(article, 'articles', url=urljoin(self.url_site, article.url))
            )

        # process pages
        for page in pages_sorted:
            nodes.append(
                self.__create_url_node_for_content(page, 'pages', url=urljoin(self.url_site, page.url))
            )

        # process category pages
        if self.context.get('CATEGORY_URL'):
            nodes.append(self.__process_url_wrapper_elements(self.context.get('categories')))

        # process tag pages
        if self.context.get('TAG_URL'):
            tags_sorted = sorted(self.context.get('tags') or [], key=lambda element: element[0].name)
            nodes.append(self.__process_url_wrapper_elements(tags_sorted))

        # process author pages
        if self.context.get('AUTHOR_URL'):
            nodes.append(self.__process_url_wrapper_elements(self.context.get('authors')))

        # handle all DIRECT_TEMPLATES but "index"
        # we assume the modification date of the last article as modification date for the listings of
        # categories, authors and archives; without any article no <lastmod> is written
        modification_time = None
        if articles_sorted:
            newest = articles_sorted[0]
            modification_time = getattr(newest, 'modified', None) or getattr(newest, 'date', None)
        for direct_template in direct_templates:
            if direct_template == 'index':
                continue
            url = self.__get_direct_template_url(direct_template)
            nodes.append(self.__create_url_node_for_content(None, 'others', url, modification_time))

        return nodes

    def __get_direct_template_url(self, name):
        """
        Returns the URL for the given DIRECT_TEMPLATE name.

        Resolution order is:
        1. ${DIRECT_TEMPLATE}_URL (custom property, no Pelican default)
        2. ${DIRECT_TEMPLATE}_SAVE_AS
        3. Default path (``<name>.html``)

        :param name: name of the direct template
        :type name: str
        :returns: the absolute URL of the direct template
        :rtype: str
        """
        name_upper = name.upper()
        url = self.pelican_settings.get(
            '{}_URL'.format(name_upper),
            self.pelican_settings.get(
                '{}_SAVE_AS'.format(name_upper),
                '{}.html'.format(name)
            )
        )
        return urljoin(self.url_site, url)

    def __process_url_wrapper_elements(self, elements):
        """
        Creates the url nodes for pelican.urlwrappers.Category and pelican.urlwrappers.Tag.

        The newest article of a wrapper determines its modification time; a wrapper
        without articles gets no modification time.

        :param elements: list of (wrapper, articles) pairs
        :type elements: list | None
        :returns: the processed urls as text
        :rtype: str
        """
        nodes = []
        for url_wrapper, articles in elements or []:
            articles = list(articles)
            modification_time = None
            if articles:
                # max() picks the same element as the first one of a descending stable sort
                modification_time = self.__get_date_key(max(articles, key=self.__get_date_key))
            nodes.append(
                self.__create_url_node_for_content(
                    url_wrapper,
                    'others',
                    url=urljoin(self.url_site, url_wrapper.url),
                    modification_time=modification_time
                )
            )
        return ''.join(nodes)

    def __lookup_setting(self, section, content_type):
        """
        Returns the configured value for the content type, falling back to settings_default.

        :param section: 'priorities' or 'changefrequencies'
        :type section: str
        :param content_type: 'index', 'articles', 'pages' or 'others'
        :type content_type: str
        :returns: the configured value, or the default if the user configuration has none
        """
        configured = (self.settings or {}).get(section) or {}
        value = configured.get(content_type)
        if value is None:
            defaults = self.settings_default[section]
            value = defaults.get(content_type, defaults['others'])
        return value

    def __create_url_node_for_content(self, content, content_type, url=None, modification_time=None):
        """
        Creates the required <url> node for the sitemap xml.

        :param content: the content class to handle
        :type content: pelican.contents.Content | None
        :param content_type: the type of the given content to match settings.EXTENDED_SITEMAP_PLUGIN
        :type content_type: str
        :param url: if given, the URL to use instead of the url of the content instance
        :type url: str
        :param modification_time: the modification time of the url, will be used instead of content date if given
        :type modification_time: datetime.datetime | None
        :returns: the text node
        :rtype: str
        """
        loc = url
        if loc is None:
            loc = urljoin(self.url_site, self.context.get('ARTICLE_URL').format(**content.url_format))

        # fall back to the dates of the content, the modification date wins over the creation date
        if modification_time is None and content is not None:
            modification_time = getattr(content, 'modified', None)
            if modification_time is None:
                modification_time = getattr(content, 'date', None)

        # the sitemap protocol requires entity escaped URLs (e.g. "&" as "&amp;")
        lines = ["<loc>{}</loc>".format(escape(loc))]
        if modification_time is not None:
            lines.append("<lastmod>{}</lastmod>".format(modification_time.strftime('%Y-%m-%d')))
        lines.append(
            "<changefreq>{}</changefreq>".format(self.__lookup_setting('changefrequencies', content_type))
        )
        lines.append(
            "<priority>{:.2f}</priority>".format(float(self.__lookup_setting('priorities', content_type)))
        )
        return self.template_url.format("\n".join(lines))

    @staticmethod
    def __has_date(obj):
        """
        Tells whether the object carries a usable modification or creation date.
        """
        return bool(getattr(obj, 'modified', None) or getattr(obj, 'date', None))

    @staticmethod
    def __get_date_key(obj):
        """
        Sort key: the modification date if set, the creation date otherwise.
        """
        return getattr(obj, 'modified', None) or obj.date

    @staticmethod
    def __get_title_key(obj):
        """
        Sort key: the title of the object.
        """
        return getattr(obj, 'title')
def get_generators(generators):
    """
    Returns the generators of this plugin.

    Connected to the ``get_generators`` signal by :func:`register`.

    :param generators: the object passed along with the signal (not used here)
    :type generators: pelican.Pelican
    :returns: the sitemap generator type
    :rtype: type
    """
    return SitemapGenerator
def register():
    """
    Registers the sitemap generator.

    Connects :func:`get_generators` to the ``get_generators`` signal of Pelican.
    """
    signals.get_generators.connect(get_generators)