Skip to content

Commit

Permalink
Merge pull request #147 from roskakori/105-add-option-to-merge-embedd…
Browse files Browse the repository at this point in the history
…ed-languages

#105 Add option to merge embedded languages
  • Loading branch information
roskakori authored May 12, 2024
2 parents d671461 + 4676492 commit 8127879
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
run: |
poetry run sh scripts/build_documentation.sh
- name: Update coveralls
- name: Update coveralls statistics
if: ${{ matrix.python-version == env.MAIN_PYTHON_VERSION }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
3 changes: 3 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ Changes

This chapter describes the changes coming with each new version of pygount.

Version 1.7.0, 2023-07-02
Version 1.7.0, 2024-05-13

* Add command line option ``--merge-embedded-languages`` to merge embedded
languages into their base language. For example, "HTML+Django/Jinja" counts
as "HTML" (issue `#105 <https://github.com/roskakori/pygount/issues/105>`_).
* Add Python 3.12 and made it the main version for CI (issue
`#145 <https://github.com/roskakori/pygount/issues/145>`_).

Expand Down
16 changes: 16 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,22 @@ to the data.
For further processing the results of pygount, ``--format=json`` should be the
easiest to deal with. For more information see :doc:`json`.

.. option:: --merge-embedded-languages

Some languages such as HTML or JavaScript allow embedding other languages in their source code. In that case, the source code is assigned to a language
that contains both the base and the embedded language in its name, for example:

- HTML+Jinja
- JavaScript+Lasso

If you prefer to count all variants of a base language only under its own name,
specify ``--merge-embedded-languages``. The example above will then show as:

- HTML
- JavaScript

Consequently, multiple different embedded languages will all count toward their
common base language.

Remote repositories
-------------------
Expand Down
61 changes: 41 additions & 20 deletions pygount/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import re
from enum import Enum
from io import SEEK_CUR, BufferedIOBase, IOBase, RawIOBase, TextIOBase
from typing import Dict, Iterator, List, Optional, Pattern, Sequence, Set, Tuple, Union
from typing import Iterator, List, Optional, Pattern, Sequence, Set, Tuple, Union

import pygments.lexer
import pygments.lexers
Expand Down Expand Up @@ -48,6 +48,8 @@
#: Pygments token type; we need to define our own type because pygments' ``_TokenType`` is internal.
TokenType = type(pygments.token.Token)

# Matches names of the form "<base>+<embedded>": a non-empty base without "+",
# followed by "+" and at least one further character that is not "+". Names
# such as "C++" or "JavaScript+" consequently do not match.
_BASE_LANGUAGE_REGEX = re.compile(r"^(?P<base_language>[^+]+)\+[^+].*$")


class SourceState(Enum):
"""
Expand Down Expand Up @@ -113,7 +115,7 @@ class SourceState(Enum):
"news",
"readme",
"thanks",
# Github community recommendations, see
# GitHub community recommendations, see
# <https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions>.
# By now, in practice most projects use a suffix like "*.md" but some older ones
# still might have such files without suffix.
Expand Down Expand Up @@ -246,12 +248,9 @@ def from_state(
@staticmethod
def _check_state_info(state: SourceState, state_info: Optional[str]):
states_that_require_state_info = [SourceState.duplicate, SourceState.error, SourceState.generated]
assert (state in states_that_require_state_info) == (
state_info is not None
), "state={} and state_info={} but state_info must be specified for the following states: {}".format(
state,
state_info,
states_that_require_state_info,
assert (state in states_that_require_state_info) == (state_info is not None), (
f"state={state} and state_info={state_info} "
f"but state_info must be specified for the following states: {states_that_require_state_info}"
)

@staticmethod
Expand All @@ -260,16 +259,17 @@ def from_file(
group: str,
encoding: str = "automatic",
fallback_encoding: str = "cp1252",
generated_regexes=pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT),
generated_regexes: Optional[List[Pattern]] = None,
duplicate_pool: Optional[DuplicatePool] = None,
file_handle: Optional[IOBase] = None,
merge_embedded_language: bool = False,
) -> "SourceAnalysis":
"""
Factory method to create a :py:class:`SourceAnalysis` by analyzing
the source code in ``source_path`` or the open file ``file_handle``.
:param source_path: path to source code to analyze
:param group: name of a logical group the sourc code belongs to, e.g. a
:param group: name of a logical group the source code belongs to, e.g. a
package.
:param encoding: encoding according to :func:`encoding_for`
:param fallback_encoding: fallback encoding according to
Expand All @@ -281,9 +281,11 @@ def from_file(
:param file_handle: a file-like object, or ``None`` to read and open the file from
``source_path``. If the file is open in text mode, it must be opened with the correct
encoding.
:param merge_embedded_language: If pygments detects a base and embedded language, the source
code counts towards the base language. For example: "JavaScript+Lasso" counts as
"JavaScript".
"""
assert encoding is not None
assert generated_regexes is not None

result = None
lexer = None
Expand Down Expand Up @@ -323,8 +325,15 @@ def from_file(
if result is None:
lexer = guess_lexer(source_path, source_code)
assert lexer is not None
if (result is None) and (len(generated_regexes) != 0):
number_line_and_regex = matching_number_line_and_regex(pygount.common.lines(source_code), generated_regexes)
actual_generated_regexes = (
generated_regexes
if generated_regexes is not None
else pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT)
)
if (result is None) and (len(actual_generated_regexes) != 0):
number_line_and_regex = matching_number_line_and_regex(
pygount.common.lines(source_code), actual_generated_regexes
)
if number_line_and_regex is not None:
number, _, regex = number_line_and_regex
message = f"line {number} matches {regex}"
Expand All @@ -333,7 +342,7 @@ def from_file(
if result is None:
assert lexer is not None
assert source_code is not None
language = lexer.name
language = base_language(lexer.name) if merge_embedded_language else lexer.name
if ("xml" in language.lower()) or (language == "Genshi"):
dialect = pygount.xmldialect.xml_dialect(source_path, source_code)
if dialect is not None:
Expand Down Expand Up @@ -452,7 +461,7 @@ def state_info(self) -> Optional[Union[str, Exception]]:
the :py:attr:`path` is a duplicate of
* :py:attr:`SourceState.error`: the :py:exc:`Exception` causing the
error
* :py:attr:`SourceState.generated`: a human readable explanation why
* :py:attr:`SourceState.generated`: a human-readable explanation why
the file is considered to be generated
"""
return self._state_info
Expand Down Expand Up @@ -625,7 +634,7 @@ def source_paths(self) -> Iterator[str]:


def matching_number_line_and_regex(
source_lines: Sequence[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15
source_lines: Iterator[str], generated_regexes: Sequence[Pattern], max_line_count: int = 15
) -> Optional[Tuple[int, str, Pattern]]:
"""
The first line and its number (starting with 0) in the source code that
Expand Down Expand Up @@ -661,7 +670,7 @@ def white_characters(language_id: str) -> str:
return "(),:;[]{}"


def white_code_words(language_id: str) -> Dict[str, List[str]]:
def white_code_words(language_id: str) -> Set[str]:
"""
Words that do not count as code if it is the only word in a line.
"""
Expand All @@ -683,7 +692,7 @@ def _delined_tokens(tokens: Sequence[Tuple[TokenType, str]]) -> Iterator[TokenTy

def _pythonized_comments(tokens: Sequence[Tuple[TokenType, str]]) -> Iterator[TokenType]:
"""
Similar to tokens but converts strings after a colon (:) to comments.
Similar to tokens but converts strings after a colon (`:`) to comments.
"""
is_after_colon = True
for token_type, token_text in tokens:
Expand Down Expand Up @@ -890,7 +899,19 @@ def source_analysis(
group,
encoding="automatic",
fallback_encoding="cp1252",
generated_regexes=pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT),
generated_regexes: Optional[List[Pattern]] = None,
duplicate_pool: Optional[DuplicatePool] = None,
):
return SourceAnalysis.from_file(source_path, group, encoding, fallback_encoding, generated_regexes, duplicate_pool)
actual_generated_regexes = (
generated_regexes
if generated_regexes is not None
else pygount.common.regexes_from(DEFAULT_GENERATED_PATTERNS_TEXT)
)
return SourceAnalysis.from_file(
source_path, group, encoding, fallback_encoding, actual_generated_regexes, duplicate_pool
)


def base_language(language: str) -> str:
    """
    The base language of ``language`` if it names a base language with an
    embedded one joined by a plus sign; for example, "JavaScript+Lasso"
    yields "JavaScript". Any name that does not fit this pattern is returned
    unchanged.
    """
    embedded_match = _BASE_LANGUAGE_REGEX.match(language)
    if embedded_match is None:
        return language
    return embedded_match.group("base_language")
19 changes: 19 additions & 0 deletions pygount/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
_HELP_GENERATED = """comma separated list of regular expressions to detect
generated code; default: %(default)s"""

_HELP_MERGE_EMBEDDED_LANGUAGES = """merge counts for embedded languages into
their base language; for example, HTML+Jinja2 counts as HTML"""

_HELP_FOLDERS_TO_SKIP = """comma separated list of glob patterns for folder
names not to analyze. Use "..." as first entry to append patterns to the
default patterns; default: %(default)s"""
Expand Down Expand Up @@ -102,6 +105,7 @@ def __init__(self):
self._generated_regexs = pygount.common.regexes_from(pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT)
self._has_duplicates = False
self._has_summary = False
self._has_to_merge_embedded_languages = False
self._is_verbose = False
self._names_to_skip = pygount.common.regexes_from(pygount.analysis.DEFAULT_NAME_PATTERNS_TO_SKIP_TEXT)
self._output = _DEFAULT_OUTPUT
Expand Down Expand Up @@ -168,6 +172,13 @@ def has_duplicates(self):
def set_has_duplicates(self, has_duplicates, source=None):
self._has_duplicates = bool(has_duplicates)

@property
def has_to_merge_embedded_languages(self):
    """
    ``True`` if counts for embedded languages should be merged into their
    base language.
    """
    return self._has_to_merge_embedded_languages

def set_has_to_merge_embedded_languages(self, has_to_merge_embedded_languages, source=None):
    """
    Remember whether embedded languages should be merged into their base
    language. The value is coerced to ``bool``.

    :param has_to_merge_embedded_languages: any value; its truthiness is stored
    :param source: accepted for symmetry with the other setters but not used
      by this method
    """
    merge_enabled = bool(has_to_merge_embedded_languages)
    self._has_to_merge_embedded_languages = merge_enabled

@property
def is_verbose(self):
return self._is_verbose
Expand Down Expand Up @@ -247,6 +258,12 @@ def argument_parser(self):
default=pygount.analysis.DEFAULT_GENERATED_PATTERNS_TEXT,
help=_HELP_GENERATED,
)
parser.add_argument(
"--merge-embedded-languages",
"-m",
action="store_true",
help=_HELP_MERGE_EMBEDDED_LANGUAGES,
)
parser.add_argument(
"--names-to-skip",
"-N",
Expand Down Expand Up @@ -313,6 +330,7 @@ def apply_arguments(self, arguments=None):
self.set_folders_to_skip(args.folders_to_skip, "option --folders-to-skip")
self.set_generated_regexps(args.generated, "option --generated")
self.set_has_duplicates(args.duplicates, "option --duplicates")
self.set_has_to_merge_embedded_languages(args.merge_embedded_languages, "option --merge-embedded-languages")
self.set_is_verbose(args.verbose, "option --verbose")
self.set_names_to_skip(args.names_to_skip, "option --folders-to-skip")
self.set_output(args.out, "option --out")
Expand Down Expand Up @@ -346,6 +364,7 @@ def execute(self):
self.fallback_encoding,
generated_regexes=self._generated_regexs,
duplicate_pool=duplicate_pool,
merge_embedded_language=self.has_to_merge_embedded_languages,
)
)
finally:
Expand Down
31 changes: 30 additions & 1 deletion tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from pygount import Error as PygountError
from pygount import analysis, common
from pygount.analysis import guess_lexer
from pygount.analysis import base_language, guess_lexer

from ._common import PYGOUNT_PROJECT_FOLDER, PYGOUNT_SOURCE_FOLDER, TempFolderTest
from .test_xmldialect import EXAMPLE_ANT_CODE
Expand Down Expand Up @@ -230,6 +230,26 @@ def test_can_analyze_bytesio(self):
assert source_analysis.language == "Python"
assert source_analysis.code_count == 2

def test_can_analyze_embedded_language(self):
    # Without merging, the reported language is expected to name both the
    # base and the embedded language.
    html_django_lines = ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />']
    html_django_path = self.create_temp_file("some.html", html_django_lines)
    embedded_analysis = analysis.SourceAnalysis.from_file(html_django_path, "test", encoding="utf-8")
    assert embedded_analysis.language.lower() == "html+django/jinja"
    assert embedded_analysis.code_count == 3

def test_can_merge_embedded_language(self):
    # With merging enabled, the same template is expected to count as plain
    # HTML while the code count stays the same.
    html_django_lines = ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />']
    html_django_path = self.create_temp_file("some.html", html_django_lines)
    merged_analysis = analysis.SourceAnalysis.from_file(
        html_django_path,
        "test",
        encoding="utf-8",
        merge_embedded_language=True,
    )
    assert merged_analysis.language.lower() == "html"
    assert merged_analysis.code_count == 3

def test_fails_on_non_seekable_file_handle_with_encoding_automatic(self):
file_handle = _NonSeekableEmptyBytesIO()

Expand Down Expand Up @@ -474,6 +494,15 @@ def test_can_match_deprecated_functions():
)


def test_can_compute_base_language():
    # Pairs of (language name, expected base language).
    expectations = [
        ("JavaScript", "JavaScript"),
        ("JavaScript+Lasso", "JavaScript"),
        ("JavaScript+", "JavaScript+"),  # no actual language
        ("C++", "C++"),
        ("++C", "++C"),  # no actual language
        ("", ""),  # no actual language, but should not crash either
    ]
    for language, expected_base_language in expectations:
        assert base_language(language) == expected_base_language


class DuplicatePoolTest(TempFolderTest):
def test_can_distinguish_different_files(self):
some_path = self.create_temp_file(__name__ + "_some", "some")
Expand Down
16 changes: 16 additions & 0 deletions tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,19 @@ def test_can_write_all_output_formats(self):
for output_format in VALID_OUTPUT_FORMATS:
exit_code = command.pygount_command(["--format", output_format, PYGOUNT_SOURCE_FOLDER])
self.assertEqual(exit_code, 0)

def test_can_merge_embedded_languages(self):
    # Run the command line interface with merging enabled and check that the
    # cloc XML report lists the template under the language "HTML".
    html_django_lines = ["<!DOCTYPE html>", "{% load i18n %}", '<html lang="{{ language_code }}" />']
    html_django_path = self.create_temp_file("some.html", html_django_lines)
    cloc_xml_path = os.path.join(self.tests_temp_folder, "cloc.xml")
    arguments = [
        "--merge-embedded-languages",
        "--format",
        "cloc-xml",
        "--out",
        cloc_xml_path,
        html_django_path,
    ]
    exit_code = command.pygount_command(arguments)
    assert exit_code == 0
    assert os.path.exists(cloc_xml_path)
    html_file_elements = ElementTree.parse(cloc_xml_path).findall("files/file[@language='HTML']")
    assert html_file_elements is not None
    assert len(html_file_elements) == 1

0 comments on commit 8127879

Please sign in to comment.