Skip to content

Commit

Permalink
Make FileSystemHandler (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
webb-ben committed Jul 28, 2023
1 parent c81f3c3 commit 25f1276
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 108 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ jobs:

steps:
- uses: actions/checkout@v3
- name: Checkout Geoconnex Namespace
- name: Checkout Cached Geoconnex Namespace
run: |
git clone -b master https://github.com/internetofwater/geoconnex.us.git geoconnex.us
cd geoconnex.us
git checkout e0e1c2ba0d023bfd80f3e6d76c85c01fab35c581
- uses: actions/setup-python@v2
name: Setup Python ${{ matrix.python-version }}
with:
Expand Down
53 changes: 53 additions & 0 deletions sitemap_generator/handler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# =================================================================
#
# Authors: Benjamin Webb <[email protected]>
#
# Copyright (c) 2023 Benjamin Webb
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

'''Handler classes'''

import click
from pathlib import Path

from sitemap_generator.handler.filesystem import FileSystemHandler
from sitemap_generator.util import OPTION_VERBOSITY


@click.command()
@click.pass_context
@OPTION_VERBOSITY
@click.argument('filepath', type=click.Path())
@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/',
              help='uri stem to be removed from short url for keyword')
def run(ctx, verbosity, filepath, uri_stem):
    """
    Generate sitemaps for the directory at FILEPATH

    :param ctx: click context (not used in this function body)
    :param verbosity: value of the verbosity option (not used in this
                      function body)
    :param filepath: path of the directory to handle
    :param uri_stem: `str` of uri stem passed on to the handler

    :returns: `None`
    """
    filepath = Path(filepath)

    if not filepath.is_dir():
        # Only directories have a handler so far. Report the skipped input
        # instead of exiting silently, so a mistyped path is noticeable.
        # The exit status is intentionally left unchanged.
        click.echo(f'{filepath} is not a directory; nothing to do',
                   err=True)
        return

    handler = FileSystemHandler(filepath, uri_stem)
    handler.handle()


# Invoke the click command when this module is executed as a script.
if __name__ == '__main__':
    run()
124 changes: 38 additions & 86 deletions sitemap_generator/handler.py → sitemap_generator/handler/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
#
# =================================================================

import click

from datetime import datetime as dt
from git import Repo
from datetime import datetime
import logging
import os
from pathlib import Path
Expand All @@ -39,24 +36,16 @@

from sitemap_generator.util import (url_join, get_smi, add_smi_node,
get_urlset, add_urlset_node,
write_tree, walk_path,
parse, OPTION_VERBOSITY)
write_tree)

LOGGER = logging.getLogger(__name__)

# Environment Vars for Git Repository to source last mod
SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us')
SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces')
# Git Repository objects
REPO = Repo(SOURCE_REPO)
TREE = REPO.heads.master.commit.tree
NAMESPACE = TREE / SOURCE_REPO_PATH

# Sitemap directory objects
SITEMAP_DIR = Path(os.environ.get('SITEMAP_DIR', '/sitemap'))


class Handler:
class BaseHandler:
"""Sitemap Generator Handler"""

def __init__(self, filepath: Path, uri_stem: str) -> None:
Expand All @@ -77,16 +66,35 @@ def handle(self) -> None:
:returns: `None`
"""
raise NotImplementedError

def parse(self) -> None:
"""
Parse sitemap creation sitemapindex
:returns: `None`
"""
raise NotImplementedError

def get_filetime(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns file_time: `str` of file lastmod as W3C Datetime
"""
raise NotImplementedError

LOGGER.debug('Making urlsets')
[self.make_urlset(file)
for file in walk_path(self.root_path, r'.*.csv')]
def get_rel_path(self, filename: Path) -> str:
"""
Gets relative path to file.
LOGGER.debug('Making sitemap index')
urlsets = walk_path(self.root_path, r'.*.xml')
self.make_sitemap(urlsets)
:param filename: `Path` of file
LOGGER.debug('Finished task')
:returns parent: `str` of parent path
"""
raise NotImplementedError

def make_urlset(self, filename: Path) -> None:
"""
Expand All @@ -97,8 +105,8 @@ def make_urlset(self, filename: Path) -> None:
:returns: `None`
"""
LOGGER.debug(f'Making urlset for {filename}')
file_time = self._get_filetime(filename)
urlsets = parse(filename)
file_time = self.get_filetime(filename)
urlsets = self.parse(filename)

for i, chunk in enumerate(urlsets):
# Build sitemaps for each csv file
Expand All @@ -115,6 +123,11 @@ def make_urlset(self, filename: Path) -> None:
sitemap_file = (filename.parent / fidx).with_suffix('.xml')
write_tree(tree, sitemap_file)

_ = datetime.strptime(file_time, '%Y-%m-%dT%H:%M:%SZ')
mtime = _.timestamp()
atime = datetime.now().timestamp()
os.utime(sitemap_file, (atime, mtime))

def make_sitemap(self, files: Iterator[Path]) -> None:
"""
Create sitemapindex
Expand All @@ -135,78 +148,17 @@ def make_sitemap(self, files: Iterator[Path]) -> None:
continue

# Move xml to /sitemaps
filepath = (SITEMAP_DIR / self._get_rel_path(f))
filepath = (SITEMAP_DIR / self.get_rel_path(f))
filepath.mkdir(parents=True, exist_ok=True)
file_path = filepath / f.name
LOGGER.debug(f'Copying urlset to {filepath}')
copy2(f, file_path)

# create to link /sitemap/_sitemap.xml
file_time = self._get_filetime(file_path)
file_time = self.get_filetime(file_path)
url_ = url_join(self.uri_stem, file_path)
add_smi_node(root, url_, file_time)

sitemap_out = SITEMAP_DIR / '_sitemap.xml'
LOGGER.debug(f'Writing sitemapindex to {sitemap_out}')
write_tree(tree, sitemap_out)

def _get_filetime(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns file_time: `str` of file lastmod as W3C Datetime
"""
try:
LOGGER.debug('Getting filetime from Git commit')
blob = (NAMESPACE / self._get_rel_path(filename))
commits = REPO.iter_commits(paths=blob.path, max_count=1)
commit = next(commits)
file_time = commit.committed_datetime

except KeyError as err:
LOGGER.warning(err)
_ = os.path.getmtime(filename)
file_time = dt.fromtimestamp(_)

except OSError as err:
LOGGER.warning(err)
file_time = dt.now()

return file_time.strftime('%Y-%m-%dT%H:%M:%SZ')

def _get_rel_path(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns parent: `str` of parent path
"""
full_path = str(filename.resolve())
LOGGER.debug(f'Resolving relative path for {full_path}')
if self.root_path in full_path:
LOGGER.debug('File in namespaces context')
parent = filename.parent.relative_to(self.root_path)
else:
LOGGER.debug('File in sitemap context')
parent = filename.parent.relative_to(SITEMAP_DIR)

LOGGER.debug(f'Parent dir of file is: {parent}')
return str(parent)


@click.command()
@click.pass_context
@OPTION_VERBOSITY
@click.argument('filepath', type=click.Path())
@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/',
help='uri stem to be removed from short url for keyword')
def run(ctx, verbosity, filepath, uri_stem):
handler = Handler(filepath, uri_stem)
handler.handle()


if __name__ == '__main__':
run()
134 changes: 134 additions & 0 deletions sitemap_generator/handler/filesystem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# =================================================================
#
# Authors: Benjamin Webb <[email protected]>
#
# Copyright (c) 2023 Benjamin Webb
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

from datetime import datetime as dt
from git import Repo
import logging
import os
from pathlib import Path

from sitemap_generator.handler.base import BaseHandler, SITEMAP_DIR
from sitemap_generator.util import walk_path, parse

LOGGER = logging.getLogger(__name__)

# Environment Vars for Git Repository to source last mod
SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us')
SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces')


class FileSystemHandler(BaseHandler):
    """
    Sitemap handler for a directory tree on the local filesystem.

    Files below the root path are walked to build urlsets and a
    sitemapindex. Last modified times are taken from the Git repository
    at ``SOURCE_REPO`` when the file is found there, and from the
    filesystem otherwise.
    """

    def __init__(self, filepath: Path, uri_stem: str) -> None:
        """
        Sitemap handler initializer

        :param filepath: `Path` of filepath to handle
        :param uri_stem: `str` of sitemap location

        :returns: `None`
        """
        # The path is handed to the base class as a string; get_rel_path
        # below does a string containment test against self.root_path.
        # NOTE(review): presumably the base class stores this value as
        # self.root_path - confirm against BaseHandler.__init__.
        super().__init__(str(filepath), uri_stem)
        # Git Repository objects
        self.repo = Repo(SOURCE_REPO)
        self.tree = self.repo.heads.master.commit.tree
        self.namespace = self.tree / SOURCE_REPO_PATH

    def handle(self) -> None:
        """
        Handle sitemap creation sitemapindex

        Builds a urlset for every file matched by the csv pattern, then
        builds the sitemapindex from the files matched by the xml pattern.

        :returns: `None`
        """
        LOGGER.debug('Making urlsets')
        # Plain loop: make_urlset is called for its side effects only.
        for file in walk_path(self.root_path, r'.*.csv'):
            self.make_urlset(file)

        LOGGER.debug('Making sitemap index')
        urlsets = walk_path(self.root_path, r'.*.xml')
        self.make_sitemap(urlsets)

        LOGGER.debug('Finished task')

    def parse(self, filename: Path, n: int = 50000) -> list:
        """
        Parses a source file into chunks

        Thin wrapper around `sitemap_generator.util.parse`.

        :param filename: `Path` of source file to parse
        :param n: `int` size of each chunk

        :returns: `list`
        """
        return parse(filename, n)

    def get_filetime(self, filename: Path) -> str:
        """
        Gets last modified time of file.

        The time of the latest Git commit touching the file is preferred.
        If the file is not found in the Git tree, or no commit is found
        for it, the filesystem modification time is used. If that cannot
        be read either, the current time is used.

        :param filename: `Path` of file

        :returns file_time: `str` of file lastmod as W3C Datetime
        """
        file_time = None
        try:
            LOGGER.debug(f'Getting filetime from Git commit for {filename}')
            relative_path = self.get_rel_path(filename)
            blob = (self.namespace / relative_path / filename.name)
            commits = self.repo.iter_commits(paths=blob.path, max_count=1)
            # Use a default so an empty history does not leak StopIteration
            commit = next(commits, None)
            if commit is None:
                LOGGER.warning(f'No commit found for {blob.path}')
            else:
                file_time = commit.committed_datetime

        except KeyError as err:
            # Path is not part of the Git tree
            LOGGER.warning(err)

        except OSError as err:
            LOGGER.warning(err)
            file_time = dt.now()

        if file_time is None:
            file_time = self._get_mtime_or_now(filename)

        # NOTE(review): the value is formatted with a literal 'Z' without
        # any timezone conversion. This is left as is because the base
        # class parses the string back with the same format - confirm
        # before changing.
        return file_time.strftime('%Y-%m-%dT%H:%M:%SZ')

    @staticmethod
    def _get_mtime_or_now(filename: Path) -> dt:
        """
        Gets filesystem modification time, or current time on failure.

        :param filename: `Path` of file

        :returns: `datetime` of file modification, or of now if the
                  modification time cannot be read
        """
        try:
            return dt.fromtimestamp(os.path.getmtime(filename))
        except OSError as err:
            LOGGER.warning(err)
            return dt.now()

    def get_rel_path(self, filename: Path) -> str:
        """
        Gets relative path to file.

        The parent directory of the file is made relative to the root
        path when the root path occurs in the resolved file path, and
        relative to ``SITEMAP_DIR`` otherwise.

        :param filename: `Path` of file

        :returns parent: `str` of parent path
        """
        full_path = str(filename.resolve())
        LOGGER.debug(f'Resolving relative path for {full_path}')
        # String containment test, not a path prefix test
        if self.root_path in full_path:
            LOGGER.debug('File in namespaces context')
            parent = filename.parent.relative_to(self.root_path)
        else:
            LOGGER.debug('File in sitemap context')
            parent = filename.parent.relative_to(SITEMAP_DIR)

        LOGGER.debug(f'Parent dir of file is: {parent}')
        return str(parent)
Loading

0 comments on commit 25f1276

Please sign in to comment.