Add format type classification
Format type classification is added to the output so that reports are
updated automatically when it is available. Heading consistency across
demystify has also been improved.
ross-spencer committed Mar 24, 2024
1 parent 6a2ce9f commit bb20900
Showing 5 changed files with 109 additions and 19 deletions.
49 changes: 30 additions & 19 deletions src/demystify/i18n/internationalstrings.py
@@ -36,7 +36,7 @@ class AnalysisStringsEN:
)

SUMMARY_UNIQUE_EXTENSIONS = "Total Unique Extensions Across Accession/Extract"
SUMMARY_ZERO_BYTE = "Total Zero-byte Files in Accession/Extract"
SUMMARY_ZERO_BYTE = "Total Zero-byte Files in the Accession/Extract"
SUMMARY_IDENTICAL_FILES = "Total Files with Identical Content (Checksum Value)"
SUMMARY_PERCENTAGE_IDENTIFIED = "Percentage of Accession/Extract Identified"
SUMMARY_PERCENTAGE_UNIDENTIFIED = "Percentage of Accession/Extract Unidentified"
@@ -123,9 +123,7 @@ class AnalysisStringsEN:
"understand an accession/extract."
)

HEADING_BINARY_ID = (
"Aggregated Frequency of File Format Signature Identifiers in Accession/Extract"
)
HEADING_BINARY_ID = "Aggregated Frequency of File Format Signature Identifiers in the Accession/Extract"
HEADING_DESC_BINARY_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using file format signatures. Some tools can report what the basis "
@@ -135,7 +133,7 @@ class AnalysisStringsEN:
+ COUNT_TEXT
)

HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in Accession/Extract"
HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in the Accession/Extract"
HEADING_DESC_XML_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using XML based identification. Some tools can report what the basis "
@@ -145,7 +143,9 @@ class AnalysisStringsEN:
+ COUNT_TEXT
)

HEADING_TEXT_ID = "Aggregated Frequency of Text Identifiers in Accession/Extract"
HEADING_TEXT_ID = (
"Aggregated Frequency of Text Identifiers in the Accession/Extract"
)
HEADING_DESC_TEXT_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using Text based identification. Some tools can report what the basis "
@@ -156,7 +156,7 @@ class AnalysisStringsEN:
)

HEADING_FILENAME_ID = (
"Aggregated Frequency of Filename Identifiers in Accession/Extract"
"Aggregated Frequency of Filename Identifiers in the Accession/Extract"
)
HEADING_DESC_FILENAME_ID = (
"Listing of format identification, namespace, count, and match basis for files "
@@ -234,7 +234,7 @@ class AnalysisStringsEN:
)

HEADING_AGGREGATE_BINARY_IDENTIFIED = (
"Aggregated File Format Signature Identifiers in Accession/Extract"
"Aggregated File Format Signature Identifiers in the Accession/Extract"
)
HEADING_DESC_IDENTIFIED = (
"A list of ID values and format names to provide a clear picture of diversity/complexity of the "
@@ -248,20 +248,23 @@ class AnalysisStringsEN:
HEADING_FREQUENCY_PUIDS_IDENTIFIED = (
"Frequency of File Format Signature Identified IDs"
)
HEADING_CLASSIFICATION = (
"Range of format type classifciation in the Accession/Extract"
)
HEADING_DATE_RANGE = "Date Range of Items in the Accession/Extract"
HEADING_EXTENSION_ONLY = "Extension Only Identification in the Accession/Extract"
HEADING_ID_METHOD = "Identification Method Frequency"
HEADING_FREQUENCY_EXTENSION_ONLY = (
"Frequency of Extension Only Identification In Accession/Extract"
"Frequency of Extension Only Identification in the Accession/Extract"
)
HEADING_UNIQUE_EXTENSIONS = (
"Unique Extensions Identified Across All Objects (ID and non-ID)"
)
HEADING_LIST_MULTIPLE = "List of Files With Multiple Identifications"
HEADING_FREQUENCY_EXTENSIONS_ALL = "Frequency of All Extensions"
HEADING_FREQUENCY_MIME = "MIME Type (Internet Media Type) Frequency"
HEADING_LIST_ZERO_BYTES = "Zero-byte files in Accession/Extract"
HEADING_ARCHIVE_FORMATS = "Archive File Types in Accession/Extract"
HEADING_LIST_ZERO_BYTES = "Zero-byte files in the Accession/Extract"
HEADING_ARCHIVE_FORMATS = "Archive File Types in the Accession/Extract"
HEADING_IDENTICAL_CONTENT = "Files With Identical Content (Checksum Value)"
HEADING_TROUBLESOME_FILENAMES = "Identifying Non-ASCII and System File Names"
HEADING_TROUBLESOME_DIRNAMES = "Identifying Non-ASCII and System Directory Names"
@@ -363,6 +366,12 @@ class AnalysisStringsEN:
"is represented in the accession/extract, in a descending list from most frequent to least."
)

HEADING_DESC_CLASSIFICATION = (
"Count and visialization giving a clear illustration of the distribution of file format type classifications "
"across the accession/extract. The list is sorted by count in descending order and provides a measure "
"of how homogenous a collection is which is potentially indicative of its complexity to preserve."
)

HEADING_DESC_DATE_RANGE = (
"Count and visualization giving a clear illustration of the distribution of file modification dates across the "
"accession/extract. The list is in descending order based on the number of files last modified on any given year. "
@@ -403,7 +412,7 @@ class AnalysisStringsEN:
)

HEADING_DESC_FREQUENCY_EXTENSIONS_ALL = (
"Lists the gamut of file extensions alongside how many times they appear in accession/extract "
"Lists the gamut of file extensions alongside how many times they appear in the accession/extract "
"in descending order. This information can be used to identify the diversity/complexity of the "
"accession/extract, but also to identify the consistency with which extensions may have been "
"used in the accession/extract and may indicate how much work may be needed to correct "
@@ -454,10 +463,12 @@ class AnalysisStringsEN:
"<a href='https://msdn.microsoft.com/en-nz/library/windows/desktop/aa365247%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396'>Microsoft</a>"
)

HEADING_DENYLIST_IDS = "Denylist Identifiers found in Accession/Extract"
HEADING_DENYLIST_EXTS = "Denylist Filename Extensions found in Accession/Extract"
HEADING_DENYLIST_DIRS = "Denylist Directory Names found in Accession/Extract"
HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in Accession/Extract"
HEADING_DENYLIST_IDS = "Denylist Identifiers found in the Accession/Extract"
HEADING_DENYLIST_EXTS = (
"Denylist Filename Extensions found in the Accession/Extract"
)
HEADING_DENYLIST_DIRS = "Denylist Directory Names found in the Accession/Extract"
HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in the Accession/Extract"
HEADING_DESC_DENYLIST = """Lists objects which are considered undesirable inside an accession/extract.
The denylist is entirely configurable by user and may contain files that can be
identified as undesirable, e.g. system files; or file names that may need pre-conditioning
@@ -466,15 +477,15 @@ class AnalysisStringsEN:
strings using a wildcard search. IDs e.g. PUID identifiers, and filename extension searches
are understandably more precise."""

TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in Accession/Extract"
TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in Accession/Extract"
TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in the Accession/Extract"
TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in the Accession/Extract"

COLUMN_HEADER_VALUES_NAMESPACE = "Namespace"
COLUMN_HEADER_VALUES_ID = "ID"
COLUMN_HEADER_VALUES_FORMAT = "Format Name"
COLUMN_HEADER_VALUES_COUNT = "Count"
COLUMN_HEADER_VALUES_YEAR = "Year"
COLUMN_HEADER_VALUES_YEAR = "Volume"
COLUMN_HEADER_VALUES_CLASSIFICATION = "Classification"

FNAME_CHECK_ASCII = "contains, characters outside of ASCII range"
FNAME_CHECK_PERIOD = "has a period '.' as its last character"
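The output handlers read these constants through a STRINGS handle; a minimal hedged illustration, assuming the English strings class and the import path shown below:

# Hedged illustration: read one of the new heading constants directly off
# the English strings class; the import path is an assumption for this example.
from demystify.i18n.internationalstrings import AnalysisStringsEN

print(AnalysisStringsEN.HEADING_CLASSIFICATION)
# -> "Range of Format Type Classification in the Accession/Extract"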
21 changes: 21 additions & 0 deletions src/demystify/libs/AnalysisQueriesClass.py
@@ -103,6 +103,27 @@ def methods_return_ns_sort(self, ns_id):
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
)

SELECT_CLASSIFICATION_COUNT = (
"SELECT COUNT(IDDATA.classification)\n"
"FROM IDRESULTS\n"
"JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n"
"JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n"
"WHERE (NSDATA.NS_NAME='pronom')\n"
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
"AND IDDATA.classification != 'None'"
)

SELECT_CLASSIFICATION_FREQUENCY = (
"SELECT IDDATA.classification,\n"
"COUNT(*) as TOTAL\n"
"FROM IDRESULTS\n"
"JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n"
"JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n"
"WHERE (NSDATA.NS_NAME='pronom')\n"
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
"GROUP BY IDDATA.classification ORDER BY TOTAL DESC"
)

# PRONOM and OTHERS Text identifiers as one result
# PRONOM and OTHERS Text identifiers as one result
@staticmethod
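As a rough sketch of how the new query strings behave, the example below runs SELECT_CLASSIFICATION_FREQUENCY directly against a sqlite3 connection. The database path, the bare sqlite3 usage, and the import path are assumptions for illustration; demystify itself executes these strings through its _querydb() wrapper, as the DemystifyAnalysisClass.py change below shows.

# Minimal sketch: run the new frequency query directly with sqlite3.
# DATABASE_PATH and the import path are assumptions for this example;
# demystify executes these strings via its own _querydb() helper instead.
import sqlite3

from demystify.libs.AnalysisQueriesClass import AnalysisQueries

DATABASE_PATH = "analysis.db"  # hypothetical demystify results database

connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
cursor.execute(AnalysisQueries.SELECT_CLASSIFICATION_FREQUENCY)

# Each row is a (classification, count) pair, ordered by count descending,
# e.g. ("Word Processor", 120), ("Image (Raster)", 45), ("None", 3).
for classification, total in cursor.fetchall():
    print(f"{classification}: {total}")

connection.close()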
3 changes: 3 additions & 0 deletions src/demystify/libs/AnalysisResultsClass.py
@@ -96,6 +96,9 @@ def __init__(self):
self.badFileNames = None
self.badDirNames = None

self.classifications_count = 0
self.classifications = []

# Hash related values.
self.hashused = False
self.duplicateHASHlisting = None
12 changes: 12 additions & 0 deletions src/demystify/libs/DemystifyAnalysisClass.py
@@ -979,6 +979,18 @@ def queryDB(self):
if self.rogueanalysis:
self._handle_rogue_analysis()

self.analysis_results.classifications_count = int(
self._querydb(
AnalysisQueries.SELECT_CLASSIFICATION_COUNT,
True,
True,
)
)

self.analysis_results.classifications = self._querydb(
AnalysisQueries.SELECT_CLASSIFICATION_FREQUENCY
)

return self.analysis_results

def _handle_rogue_analysis(self):
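Once queryDB() has run, the two new fields sit on the returned results object alongside the existing analysis values. A hedged sketch of their expected shape, with illustrative values and an assumed DemystifyAnalysis instance named analysis:

# Illustrative only: the instance name and the concrete values are assumptions.
results = analysis.queryDB()

print(results.classifications_count)  # e.g. 168 files carrying a format type classification
print(results.classifications)        # e.g. [("Word Processor", 120), ("Image (Raster)", 45), ("None", 3)]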
43 changes: 43 additions & 0 deletions src/demystify/libs/outputhandlers/htmloutputclass.py
@@ -768,6 +768,49 @@ def generateHTML(self):
)
self.identifierchart(countlist)

#######################################################################
### CLASSIFICATION
#######################################################################

if self.analysis_results.classifications_count > 0:
self._outputheading(
self.STRINGS.HEADING_CLASSIFICATION,
self.STRINGS.HEADING_DESC_CLASSIFICATION,
)
self.printFormattedText("<table>")
self.printFormattedText(
'<tr><th style="text-align: left;">{}</th><th style="text-align: left;">{}</th></tr>'.format(
self.STRINGS.COLUMN_HEADER_VALUES_CLASSIFICATION,
self.STRINGS.COLUMN_HEADER_VALUES_COUNT,
)
)

for format_classification in self.analysis_results.classifications:
classification = format_classification[0]
if classification.lower() == "none":
classification = "No format type classification"
self.printFormattedText('<tr><td style="width: 300px;">')
self.printFormattedText(f"{classification}")
self.printFormattedText(
"</td><td>{}</td>".format(format_classification[1])
)

# Render a meter to visualize the classification count against the total file count.
self.printFormattedText(
self._outputmeter(
format_classification[1], 0, self.analysis_results.filecount
)
)
self.printFormattedText("</tr>")

self.printFormattedText("</table>")
self._htmlnewline()
self.printFormattedText("<hr/>")

#######################################################################
### DATE RANGE
#######################################################################

if self.analysis_results.dateFrequency is not None:
# Date Ranges
self._outputheading(
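For reference, a standalone sketch of the table this block emits, using hypothetical counts and a plain print() helper in place of printFormattedText(); the meter markup is omitted:

# Standalone sketch of the classification table; the sample data and the
# render() helper are assumptions standing in for the report machinery.
classifications = [("Word Processor", 120), ("Image (Raster)", 45), ("None", 3)]


def render(text):
    print(text)


render("<table>")
render(
    '<tr><th style="text-align: left;">Classification</th>'
    '<th style="text-align: left;">Count</th></tr>'
)
for classification, count in classifications:
    if classification.lower() == "none":
        classification = "No format type classification"
    render(
        '<tr><td style="width: 300px;">{}</td><td>{}</td></tr>'.format(
            classification, count
        )
    )
render("</table>")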
