diff --git a/src/demystify/i18n/internationalstrings.py b/src/demystify/i18n/internationalstrings.py index d1221c8..c7bf2a4 100644 --- a/src/demystify/i18n/internationalstrings.py +++ b/src/demystify/i18n/internationalstrings.py @@ -36,7 +36,7 @@ class AnalysisStringsEN: ) SUMMARY_UNIQUE_EXTENSIONS = "Total Unique Extensions Across Accession/Extract" - SUMMARY_ZERO_BYTE = "Total Zero-byte Files in Accession/Extract" + SUMMARY_ZERO_BYTE = "Total Zero-byte Files in the Accession/Extract" SUMMARY_IDENTICAL_FILES = "Total Files with Identical Content (Checksum Value)" SUMMARY_PERCENTAGE_IDENTIFIED = "Percentage of Accession/Extract Identified" SUMMARY_PERCENTAGE_UNIDENTIFIED = "Percentage of Accession/Extract Unidentified" @@ -123,9 +123,7 @@ class AnalysisStringsEN: "understand an accession/extract." ) - HEADING_BINARY_ID = ( - "Aggregated Frequency of File Format Signature Identifiers in Accession/Extract" - ) + HEADING_BINARY_ID = "Aggregated Frequency of File Format Signature Identifiers in the Accession/Extract" HEADING_DESC_BINARY_ID = ( "Listing of format identification, namespace, count, and match basis for files " "identified using file format signatures. Some tools can report what the basis " @@ -135,7 +133,7 @@ class AnalysisStringsEN: + COUNT_TEXT ) - HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in Accession/Extract" + HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in the Accession/Extract" HEADING_DESC_XML_ID = ( "Listing of format identification, namespace, count, and match basis for files " "identified using XML based identification. 
Some tools can report what the basis " @@ -145,7 +143,9 @@ class AnalysisStringsEN: + COUNT_TEXT ) - HEADING_TEXT_ID = "Aggregated Frequency of Text Identifiers in Accession/Extract" + HEADING_TEXT_ID = ( + "Aggregated Frequency of Text Identifiers in the Accession/Extract" + ) HEADING_DESC_TEXT_ID = ( "Listing of format identification, namespace, count, and match basis for files " "identified using Text based identification. Some tools can report what the basis " @@ -156,7 +156,7 @@ class AnalysisStringsEN: ) HEADING_FILENAME_ID = ( - "Aggregated Frequency of Filename Identifiers in Accession/Extract" + "Aggregated Frequency of Filename Identifiers in the Accession/Extract" ) HEADING_DESC_FILENAME_ID = ( "Listing of format identification, namespace, count, and match basis for files " @@ -234,7 +234,7 @@ class AnalysisStringsEN: ) HEADING_AGGREGATE_BINARY_IDENTIFIED = ( - "Aggregated File Format Signature Identifiers in Accession/Extract" + "Aggregated File Format Signature Identifiers in the Accession/Extract" ) HEADING_DESC_IDENTIFIED = ( "A list of ID values and format names to provide a clear picture of diversity/complexity of the " @@ -248,11 +248,14 @@ class AnalysisStringsEN: HEADING_FREQUENCY_PUIDS_IDENTIFIED = ( "Frequency of File Format Signature Identified IDs" ) + HEADING_CLASSIFICATION = ( + "Range of format type classification in the Accession/Extract" + ) HEADING_DATE_RANGE = "Date Range of Items in the Accession/Extract" HEADING_EXTENSION_ONLY = "Extension Only Identification in the Accession/Extract" HEADING_ID_METHOD = "Identification Method Frequency" HEADING_FREQUENCY_EXTENSION_ONLY = ( - "Frequency of Extension Only Identification In Accession/Extract" + "Frequency of Extension Only Identification in the Accession/Extract" ) HEADING_UNIQUE_EXTENSIONS = ( "Unique Extensions Identified Across All Objects (ID and non-ID)" @@ -260,8 +263,8 @@ class AnalysisStringsEN: HEADING_LIST_MULTIPLE = "List of Files With Multiple Identifications" 
HEADING_FREQUENCY_EXTENSIONS_ALL = "Frequency of All Extensions" HEADING_FREQUENCY_MIME = "MIME Type (Internet Media Type) Frequency" - HEADING_LIST_ZERO_BYTES = "Zero-byte files in Accession/Extract" - HEADING_ARCHIVE_FORMATS = "Archive File Types in Accession/Extract" + HEADING_LIST_ZERO_BYTES = "Zero-byte files in the Accession/Extract" + HEADING_ARCHIVE_FORMATS = "Archive File Types in the Accession/Extract" HEADING_IDENTICAL_CONTENT = "Files With Identical Content (Checksum Value)" HEADING_TROUBLESOME_FILENAMES = "Identifying Non-ASCII and System File Names" HEADING_TROUBLESOME_DIRNAMES = "Identifying Non-ASCII and System Directory Names" @@ -363,6 +366,12 @@ class AnalysisStringsEN: "is represented in the accession/extract, in a descending list from most frequent to least." ) + HEADING_DESC_CLASSIFICATION = ( + "Count and visualization giving a clear illustration of the distribution of file format type classifications " + "across the accession/extract. The list is sorted by count in descending order and provides a measure " + "of how homogenous a collection is which is potentially indicative of its complexity to preserve." + ) + HEADING_DESC_DATE_RANGE = ( "Count and visualization giving a clear illustration of the distribution of file modification dates across the " "accession/extract. The list is in descending order based on the number of files last modified on any given year. " @@ -403,7 +412,7 @@ class AnalysisStringsEN: ) HEADING_DESC_FREQUENCY_EXTENSIONS_ALL = ( - "Lists the gamut of file extensions alongside how many times they appear in accession/extract " + "Lists the gamut of file extensions alongside how many times they appear in the accession/extract " "in descending order. 
This information can be used to identify the diversity/complexity of the " "accession/extract, but also to identify the consistency with which extensions may have been " "used in the accession/extract and may indicate how much work may be needed to correct " @@ -454,10 +463,12 @@ class AnalysisStringsEN: "Microsoft" ) - HEADING_DENYLIST_IDS = "Denylist Identifiers found in Accession/Extract" - HEADING_DENYLIST_EXTS = "Denylist Filename Extensions found in Accession/Extract" - HEADING_DENYLIST_DIRS = "Denylist Directory Names found in Accession/Extract" - HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in Accession/Extract" + HEADING_DENYLIST_IDS = "Denylist Identifiers found in the Accession/Extract" + HEADING_DENYLIST_EXTS = ( + "Denylist Filename Extensions found in the Accession/Extract" + ) + HEADING_DENYLIST_DIRS = "Denylist Directory Names found in the Accession/Extract" + HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in the Accession/Extract" HEADING_DESC_DENYLIST = """Lists objects which are considered undesirable inside an accession/extract. The denylist is entirely configurable by user and may contain files that can be identified as undesirable, e.g. system files; or file names that may need pre-conditioning @@ -466,15 +477,15 @@ class AnalysisStringsEN: strings using a wildcard search. IDs e.g. 
PUID identifiers, and filename extension searches are understandably more precise.""" - TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in Accession/Extract" - TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in Accession/Extract" + TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in the Accession/Extract" + TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in the Accession/Extract" COLUMN_HEADER_VALUES_NAMESPACE = "Namespace" COLUMN_HEADER_VALUES_ID = "ID" COLUMN_HEADER_VALUES_FORMAT = "Format Name" COLUMN_HEADER_VALUES_COUNT = "Count" COLUMN_HEADER_VALUES_YEAR = "Year" - COLUMN_HEADER_VALUES_YEAR = "Volume" + COLUMN_HEADER_VALUES_CLASSIFICATION = "Classification" FNAME_CHECK_ASCII = "contains, characters outside of ASCII range" FNAME_CHECK_PERIOD = "has a period '.' as its last character" diff --git a/src/demystify/libs/AnalysisQueriesClass.py b/src/demystify/libs/AnalysisQueriesClass.py index a6c00c2..506e38e 100644 --- a/src/demystify/libs/AnalysisQueriesClass.py +++ b/src/demystify/libs/AnalysisQueriesClass.py @@ -103,6 +103,27 @@ def methods_return_ns_sort(self, ns_id): "AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')" ) + SELECT_CLASSIFICATION_COUNT = ( + "SELECT COUNT(IDDATA.classification)\n" + "FROM IDRESULTS\n" + "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" + "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" + "WHERE (NSDATA.NS_NAME='pronom')\n" + "AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')" + "AND IDDATA.classification != 'None'" + ) + + SELECT_CLASSIFICATION_FREQUENCY = ( + "SELECT IDDATA.classification,\n" + "COUNT(*) as TOTAL\n" + "FROM IDRESULTS\n" + "JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n" + "JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n" + "WHERE (NSDATA.NS_NAME='pronom')\n" + "AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')" + "GROUP BY IDDATA.classification ORDER BY TOTAL DESC" + ) + # PRONOM and OTHERS Text identifiers as one result # PRONOM and OTHERS Text identifiers as one result 
@staticmethod diff --git a/src/demystify/libs/AnalysisResultsClass.py b/src/demystify/libs/AnalysisResultsClass.py index 57e032a..2308023 100644 --- a/src/demystify/libs/AnalysisResultsClass.py +++ b/src/demystify/libs/AnalysisResultsClass.py @@ -96,6 +96,9 @@ def __init__(self): self.badFileNames = None self.badDirNames = None + self.classifications_count = 0 + self.classifications = [] + # Hash related values. self.hashused = False self.duplicateHASHlisting = None diff --git a/src/demystify/libs/DemystifyAnalysisClass.py b/src/demystify/libs/DemystifyAnalysisClass.py index 7aecbc3..7294796 100644 --- a/src/demystify/libs/DemystifyAnalysisClass.py +++ b/src/demystify/libs/DemystifyAnalysisClass.py @@ -979,6 +979,18 @@ def queryDB(self): if self.rogueanalysis: self._handle_rogue_analysis() + self.analysis_results.classifications_count = int( + self._querydb( + AnalysisQueries.SELECT_CLASSIFICATION_COUNT, + True, + True, + ) + ) + + self.analysis_results.classifications = self._querydb( + AnalysisQueries.SELECT_CLASSIFICATION_FREQUENCY + ) + return self.analysis_results def _handle_rogue_analysis(self): diff --git a/src/demystify/libs/outputhandlers/htmloutputclass.py b/src/demystify/libs/outputhandlers/htmloutputclass.py index 3984173..299d3b6 100644 --- a/src/demystify/libs/outputhandlers/htmloutputclass.py +++ b/src/demystify/libs/outputhandlers/htmloutputclass.py @@ -768,6 +768,49 @@ def generateHTML(self): ) self.identifierchart(countlist) + ####################################################################### + ### CLASSIFICATION + ####################################################################### + + if self.analysis_results.classifications_count > 0: + self._outputheading( + self.STRINGS.HEADING_CLASSIFICATION, + self.STRINGS.HEADING_DESC_CLASSIFICATION, + ) + self.printFormattedText("") + self.printFormattedText( + '
'.format( + self.STRINGS.COLUMN_HEADER_VALUES_CLASSIFICATION, + self.STRINGS.COLUMN_HEADER_VALUES_COUNT, + ) + ) + + for format_classification in self.analysis_results.classifications: + classification = format_classification[0] + if classification.lower() == "none": + classification = "No format type classification" + self.printFormattedText('".format(format_classification[1]) + ) + + # Unused Meter Code... + self.printFormattedText( + self._outputmeter( + format_classification[1], 0, self.analysis_results.filecount + ) + ) + self.printFormattedText("") + + self.printFormattedText("
{}{}
') + self.printFormattedText(f"{classification}") + self.printFormattedText( + "{}
") + self._htmlnewline() + self.printFormattedText("
") + + ####################################################################### + ### DATE RANGE + ####################################################################### + if self.analysis_results.dateFrequency is not None: # Date Ranges self._outputheading(